1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
65 #include "context.h"
66 #include "pass_manager.h"
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
70 static rtx legitimize_pe_coff_symbol (rtx, bool);
72 #ifndef CHECK_STACK_LIMIT
73 #define CHECK_STACK_LIMIT (-1)
74 #endif
76 /* Return index of given mode in mult and division cost tables. */
77 #define MODE_INDEX(mode) \
78 ((mode) == QImode ? 0 \
79 : (mode) == HImode ? 1 \
80 : (mode) == SImode ? 2 \
81 : (mode) == DImode ? 3 \
82 : 4)
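/* Illustrative sketch, not part of the upstream file: the five-entry cost
   arrays below ({QI, HI, SI, DI, other}) are meant to be indexed through
   MODE_INDEX, so a multiply cost lookup looks roughly like

       cost->mult_init[MODE_INDEX (mode)] + nbits * cost->mult_bit

   The field names mult_init and mult_bit are inferred from the comments in
   the tables below; struct processor_costs in i386.h is authoritative.  */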
84 /* Processor costs (relative to an add) */
85 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
86 #define COSTS_N_BYTES(N) ((N) * 2)
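/* Worked example of the scaling assumed above: an addition is one insn and
   two bytes, so it costs COSTS_N_INSNS (1) == 4 on the speed scale and
   COSTS_N_BYTES (2) == 4 on the size scale, keeping the size-tuning table
   below directly comparable with the speed-tuning tables.  */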
88 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
90 static stringop_algs ix86_size_memcpy[2] = {
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
92 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
93 static stringop_algs ix86_size_memset[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
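/* Sketch of how these tables are read (inferred from the initializers, not a
   restatement of the struct definition): each stringop_algs value names the
   algorithm for blocks of unknown size, followed by {max, alg, noalign}
   entries that pick an algorithm for known sizes up to "max" bytes, with
   max == -1 as the catch-all.  The two array elements appear to correspond
   to 32-bit and 64-bit targets; DUMMY_STRINGOP_ALGS marks a variant that is
   never used.  */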
97 const
98 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
99 COSTS_N_BYTES (2), /* cost of an add instruction */
100 COSTS_N_BYTES (3), /* cost of a lea instruction */
101 COSTS_N_BYTES (2), /* variable shift costs */
102 COSTS_N_BYTES (3), /* constant shift costs */
103 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
104 COSTS_N_BYTES (3), /* HI */
105 COSTS_N_BYTES (3), /* SI */
106 COSTS_N_BYTES (3), /* DI */
107 COSTS_N_BYTES (5)}, /* other */
108 0, /* cost of multiply per each bit set */
109 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 COSTS_N_BYTES (3), /* cost of movsx */
115 COSTS_N_BYTES (3), /* cost of movzx */
116 0, /* "large" insn */
117 2, /* MOVE_RATIO */
118 2, /* cost for loading QImode using movzbl */
119 {2, 2, 2}, /* cost of loading integer registers
120 in QImode, HImode and SImode.
121 Relative to reg-reg move (2). */
122 {2, 2, 2}, /* cost of storing integer registers */
123 2, /* cost of reg,reg fld/fst */
124 {2, 2, 2}, /* cost of loading fp registers
125 in SFmode, DFmode and XFmode */
126 {2, 2, 2}, /* cost of storing fp registers
127 in SFmode, DFmode and XFmode */
128 3, /* cost of moving MMX register */
129 {3, 3}, /* cost of loading MMX registers
130 in SImode and DImode */
131 {3, 3}, /* cost of storing MMX registers
132 in SImode and DImode */
133 3, /* cost of moving SSE register */
134 {3, 3, 3}, /* cost of loading SSE registers
135 in SImode, DImode and TImode */
136 {3, 3, 3}, /* cost of storing SSE registers
137 in SImode, DImode and TImode */
138 3, /* MMX or SSE register to integer */
139 0, /* size of l1 cache */
140 0, /* size of l2 cache */
141 0, /* size of prefetch block */
142 0, /* number of parallel prefetches */
143 2, /* Branch cost */
144 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
145 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
146 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
147 COSTS_N_BYTES (2), /* cost of FABS instruction. */
148 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
149 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
150 ix86_size_memcpy,
151 ix86_size_memset,
152 1, /* scalar_stmt_cost. */
153 1, /* scalar load_cost. */
154 1, /* scalar_store_cost. */
155 1, /* vec_stmt_cost. */
156 1, /* vec_to_scalar_cost. */
157 1, /* scalar_to_vec_cost. */
158 1, /* vec_align_load_cost. */
159 1, /* vec_unalign_load_cost. */
160 1, /* vec_store_cost. */
161 1, /* cond_taken_branch_cost. */
162 1, /* cond_not_taken_branch_cost. */
163 };
165 /* Processor costs (relative to an add) */
166 static stringop_algs i386_memcpy[2] = {
167 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
168 DUMMY_STRINGOP_ALGS};
169 static stringop_algs i386_memset[2] = {
170 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
171 DUMMY_STRINGOP_ALGS};
173 static const
174 struct processor_costs i386_cost = { /* 386 specific costs */
175 COSTS_N_INSNS (1), /* cost of an add instruction */
176 COSTS_N_INSNS (1), /* cost of a lea instruction */
177 COSTS_N_INSNS (3), /* variable shift costs */
178 COSTS_N_INSNS (2), /* constant shift costs */
179 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
180 COSTS_N_INSNS (6), /* HI */
181 COSTS_N_INSNS (6), /* SI */
182 COSTS_N_INSNS (6), /* DI */
183 COSTS_N_INSNS (6)}, /* other */
184 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
185 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
186 COSTS_N_INSNS (23), /* HI */
187 COSTS_N_INSNS (23), /* SI */
188 COSTS_N_INSNS (23), /* DI */
189 COSTS_N_INSNS (23)}, /* other */
190 COSTS_N_INSNS (3), /* cost of movsx */
191 COSTS_N_INSNS (2), /* cost of movzx */
192 15, /* "large" insn */
193 3, /* MOVE_RATIO */
194 4, /* cost for loading QImode using movzbl */
195 {2, 4, 2}, /* cost of loading integer registers
196 in QImode, HImode and SImode.
197 Relative to reg-reg move (2). */
198 {2, 4, 2}, /* cost of storing integer registers */
199 2, /* cost of reg,reg fld/fst */
200 {8, 8, 8}, /* cost of loading fp registers
201 in SFmode, DFmode and XFmode */
202 {8, 8, 8}, /* cost of storing fp registers
203 in SFmode, DFmode and XFmode */
204 2, /* cost of moving MMX register */
205 {4, 8}, /* cost of loading MMX registers
206 in SImode and DImode */
207 {4, 8}, /* cost of storing MMX registers
208 in SImode and DImode */
209 2, /* cost of moving SSE register */
210 {4, 8, 16}, /* cost of loading SSE registers
211 in SImode, DImode and TImode */
212 {4, 8, 16}, /* cost of storing SSE registers
213 in SImode, DImode and TImode */
214 3, /* MMX or SSE register to integer */
215 0, /* size of l1 cache */
216 0, /* size of l2 cache */
217 0, /* size of prefetch block */
218 0, /* number of parallel prefetches */
219 1, /* Branch cost */
220 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
221 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
222 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
223 COSTS_N_INSNS (22), /* cost of FABS instruction. */
224 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
225 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
226 i386_memcpy,
227 i386_memset,
228 1, /* scalar_stmt_cost. */
229 1, /* scalar load_cost. */
230 1, /* scalar_store_cost. */
231 1, /* vec_stmt_cost. */
232 1, /* vec_to_scalar_cost. */
233 1, /* scalar_to_vec_cost. */
234 1, /* vec_align_load_cost. */
235 2, /* vec_unalign_load_cost. */
236 1, /* vec_store_cost. */
237 3, /* cond_taken_branch_cost. */
238 1, /* cond_not_taken_branch_cost. */
239 };
241 static stringop_algs i486_memcpy[2] = {
242 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
243 DUMMY_STRINGOP_ALGS};
244 static stringop_algs i486_memset[2] = {
245 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
246 DUMMY_STRINGOP_ALGS};
248 static const
249 struct processor_costs i486_cost = { /* 486 specific costs */
250 COSTS_N_INSNS (1), /* cost of an add instruction */
251 COSTS_N_INSNS (1), /* cost of a lea instruction */
252 COSTS_N_INSNS (3), /* variable shift costs */
253 COSTS_N_INSNS (2), /* constant shift costs */
254 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
255 COSTS_N_INSNS (12), /* HI */
256 COSTS_N_INSNS (12), /* SI */
257 COSTS_N_INSNS (12), /* DI */
258 COSTS_N_INSNS (12)}, /* other */
259 1, /* cost of multiply per each bit set */
260 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
261 COSTS_N_INSNS (40), /* HI */
262 COSTS_N_INSNS (40), /* SI */
263 COSTS_N_INSNS (40), /* DI */
264 COSTS_N_INSNS (40)}, /* other */
265 COSTS_N_INSNS (3), /* cost of movsx */
266 COSTS_N_INSNS (2), /* cost of movzx */
267 15, /* "large" insn */
268 3, /* MOVE_RATIO */
269 4, /* cost for loading QImode using movzbl */
270 {2, 4, 2}, /* cost of loading integer registers
271 in QImode, HImode and SImode.
272 Relative to reg-reg move (2). */
273 {2, 4, 2}, /* cost of storing integer registers */
274 2, /* cost of reg,reg fld/fst */
275 {8, 8, 8}, /* cost of loading fp registers
276 in SFmode, DFmode and XFmode */
277 {8, 8, 8}, /* cost of storing fp registers
278 in SFmode, DFmode and XFmode */
279 2, /* cost of moving MMX register */
280 {4, 8}, /* cost of loading MMX registers
281 in SImode and DImode */
282 {4, 8}, /* cost of storing MMX registers
283 in SImode and DImode */
284 2, /* cost of moving SSE register */
285 {4, 8, 16}, /* cost of loading SSE registers
286 in SImode, DImode and TImode */
287 {4, 8, 16}, /* cost of storing SSE registers
288 in SImode, DImode and TImode */
289 3, /* MMX or SSE register to integer */
290 4, /* size of l1 cache. 486 has 8kB cache
291 shared for code and data, so 4kB is
292 not really precise. */
293 4, /* size of l2 cache */
294 0, /* size of prefetch block */
295 0, /* number of parallel prefetches */
296 1, /* Branch cost */
297 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
298 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
299 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
300 COSTS_N_INSNS (3), /* cost of FABS instruction. */
301 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
302 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
303 i486_memcpy,
304 i486_memset,
305 1, /* scalar_stmt_cost. */
306 1, /* scalar load_cost. */
307 1, /* scalar_store_cost. */
308 1, /* vec_stmt_cost. */
309 1, /* vec_to_scalar_cost. */
310 1, /* scalar_to_vec_cost. */
311 1, /* vec_align_load_cost. */
312 2, /* vec_unalign_load_cost. */
313 1, /* vec_store_cost. */
314 3, /* cond_taken_branch_cost. */
315 1, /* cond_not_taken_branch_cost. */
316 };
318 static stringop_algs pentium_memcpy[2] = {
319 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
320 DUMMY_STRINGOP_ALGS};
321 static stringop_algs pentium_memset[2] = {
322 {libcall, {{-1, rep_prefix_4_byte, false}}},
323 DUMMY_STRINGOP_ALGS};
325 static const
326 struct processor_costs pentium_cost = {
327 COSTS_N_INSNS (1), /* cost of an add instruction */
328 COSTS_N_INSNS (1), /* cost of a lea instruction */
329 COSTS_N_INSNS (4), /* variable shift costs */
330 COSTS_N_INSNS (1), /* constant shift costs */
331 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
332 COSTS_N_INSNS (11), /* HI */
333 COSTS_N_INSNS (11), /* SI */
334 COSTS_N_INSNS (11), /* DI */
335 COSTS_N_INSNS (11)}, /* other */
336 0, /* cost of multiply per each bit set */
337 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
338 COSTS_N_INSNS (25), /* HI */
339 COSTS_N_INSNS (25), /* SI */
340 COSTS_N_INSNS (25), /* DI */
341 COSTS_N_INSNS (25)}, /* other */
342 COSTS_N_INSNS (3), /* cost of movsx */
343 COSTS_N_INSNS (2), /* cost of movzx */
344 8, /* "large" insn */
345 6, /* MOVE_RATIO */
346 6, /* cost for loading QImode using movzbl */
347 {2, 4, 2}, /* cost of loading integer registers
348 in QImode, HImode and SImode.
349 Relative to reg-reg move (2). */
350 {2, 4, 2}, /* cost of storing integer registers */
351 2, /* cost of reg,reg fld/fst */
352 {2, 2, 6}, /* cost of loading fp registers
353 in SFmode, DFmode and XFmode */
354 {4, 4, 6}, /* cost of storing fp registers
355 in SFmode, DFmode and XFmode */
356 8, /* cost of moving MMX register */
357 {8, 8}, /* cost of loading MMX registers
358 in SImode and DImode */
359 {8, 8}, /* cost of storing MMX registers
360 in SImode and DImode */
361 2, /* cost of moving SSE register */
362 {4, 8, 16}, /* cost of loading SSE registers
363 in SImode, DImode and TImode */
364 {4, 8, 16}, /* cost of storing SSE registers
365 in SImode, DImode and TImode */
366 3, /* MMX or SSE register to integer */
367 8, /* size of l1 cache. */
368 8, /* size of l2 cache */
369 0, /* size of prefetch block */
370 0, /* number of parallel prefetches */
371 2, /* Branch cost */
372 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
373 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
374 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
375 COSTS_N_INSNS (1), /* cost of FABS instruction. */
376 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
377 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
378 pentium_memcpy,
379 pentium_memset,
380 1, /* scalar_stmt_cost. */
381 1, /* scalar load_cost. */
382 1, /* scalar_store_cost. */
383 1, /* vec_stmt_cost. */
384 1, /* vec_to_scalar_cost. */
385 1, /* scalar_to_vec_cost. */
386 1, /* vec_align_load_cost. */
387 2, /* vec_unalign_load_cost. */
388 1, /* vec_store_cost. */
389 3, /* cond_taken_branch_cost. */
390 1, /* cond_not_taken_branch_cost. */
391 };
393 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
394 (we ensure the alignment). For small blocks an inline loop is still a
395 noticeable win; for bigger blocks either rep movsl or rep movsb is the
396 way to go. Rep movsb apparently has a more expensive startup time in the
397 CPU, but after 4K the difference is down in the noise. */
398 static stringop_algs pentiumpro_memcpy[2] = {
399 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
400 {8192, rep_prefix_4_byte, false},
401 {-1, rep_prefix_1_byte, false}}},
402 DUMMY_STRINGOP_ALGS};
403 static stringop_algs pentiumpro_memset[2] = {
404 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
405 {8192, rep_prefix_4_byte, false},
406 {-1, libcall, false}}},
407 DUMMY_STRINGOP_ALGS};
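/* Reading pentiumpro_memcpy above against the comment: known copies up to
   128 bytes use an inline loop, up to 1 kB an unrolled loop, up to 8 kB
   rep movsl (rep_prefix_4_byte), and anything larger rep movsb
   (rep_prefix_1_byte); unknown sizes fall back to rep movsl.  */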
408 static const
409 struct processor_costs pentiumpro_cost = {
410 COSTS_N_INSNS (1), /* cost of an add instruction */
411 COSTS_N_INSNS (1), /* cost of a lea instruction */
412 COSTS_N_INSNS (1), /* variable shift costs */
413 COSTS_N_INSNS (1), /* constant shift costs */
414 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
415 COSTS_N_INSNS (4), /* HI */
416 COSTS_N_INSNS (4), /* SI */
417 COSTS_N_INSNS (4), /* DI */
418 COSTS_N_INSNS (4)}, /* other */
419 0, /* cost of multiply per each bit set */
420 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
421 COSTS_N_INSNS (17), /* HI */
422 COSTS_N_INSNS (17), /* SI */
423 COSTS_N_INSNS (17), /* DI */
424 COSTS_N_INSNS (17)}, /* other */
425 COSTS_N_INSNS (1), /* cost of movsx */
426 COSTS_N_INSNS (1), /* cost of movzx */
427 8, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 4, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 2, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of storing fp registers
438 in SFmode, DFmode and XFmode */
439 2, /* cost of moving MMX register */
440 {2, 2}, /* cost of loading MMX registers
441 in SImode and DImode */
442 {2, 2}, /* cost of storing MMX registers
443 in SImode and DImode */
444 2, /* cost of moving SSE register */
445 {2, 2, 8}, /* cost of loading SSE registers
446 in SImode, DImode and TImode */
447 {2, 2, 8}, /* cost of storing SSE registers
448 in SImode, DImode and TImode */
449 3, /* MMX or SSE register to integer */
450 8, /* size of l1 cache. */
451 256, /* size of l2 cache */
452 32, /* size of prefetch block */
453 6, /* number of parallel prefetches */
454 2, /* Branch cost */
455 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
456 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
457 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
458 COSTS_N_INSNS (2), /* cost of FABS instruction. */
459 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
460 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
461 pentiumpro_memcpy,
462 pentiumpro_memset,
463 1, /* scalar_stmt_cost. */
464 1, /* scalar load_cost. */
465 1, /* scalar_store_cost. */
466 1, /* vec_stmt_cost. */
467 1, /* vec_to_scalar_cost. */
468 1, /* scalar_to_vec_cost. */
469 1, /* vec_align_load_cost. */
470 2, /* vec_unalign_load_cost. */
471 1, /* vec_store_cost. */
472 3, /* cond_taken_branch_cost. */
473 1, /* cond_not_taken_branch_cost. */
474 };
476 static stringop_algs geode_memcpy[2] = {
477 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
478 DUMMY_STRINGOP_ALGS};
479 static stringop_algs geode_memset[2] = {
480 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static const
483 struct processor_costs geode_cost = {
484 COSTS_N_INSNS (1), /* cost of an add instruction */
485 COSTS_N_INSNS (1), /* cost of a lea instruction */
486 COSTS_N_INSNS (2), /* variable shift costs */
487 COSTS_N_INSNS (1), /* constant shift costs */
488 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
489 COSTS_N_INSNS (4), /* HI */
490 COSTS_N_INSNS (7), /* SI */
491 COSTS_N_INSNS (7), /* DI */
492 COSTS_N_INSNS (7)}, /* other */
493 0, /* cost of multiply per each bit set */
494 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
495 COSTS_N_INSNS (23), /* HI */
496 COSTS_N_INSNS (39), /* SI */
497 COSTS_N_INSNS (39), /* DI */
498 COSTS_N_INSNS (39)}, /* other */
499 COSTS_N_INSNS (1), /* cost of movsx */
500 COSTS_N_INSNS (1), /* cost of movzx */
501 8, /* "large" insn */
502 4, /* MOVE_RATIO */
503 1, /* cost for loading QImode using movzbl */
504 {1, 1, 1}, /* cost of loading integer registers
505 in QImode, HImode and SImode.
506 Relative to reg-reg move (2). */
507 {1, 1, 1}, /* cost of storing integer registers */
508 1, /* cost of reg,reg fld/fst */
509 {1, 1, 1}, /* cost of loading fp registers
510 in SFmode, DFmode and XFmode */
511 {4, 6, 6}, /* cost of storing fp registers
512 in SFmode, DFmode and XFmode */
514 1, /* cost of moving MMX register */
515 {1, 1}, /* cost of loading MMX registers
516 in SImode and DImode */
517 {1, 1}, /* cost of storing MMX registers
518 in SImode and DImode */
519 1, /* cost of moving SSE register */
520 {1, 1, 1}, /* cost of loading SSE registers
521 in SImode, DImode and TImode */
522 {1, 1, 1}, /* cost of storing SSE registers
523 in SImode, DImode and TImode */
524 1, /* MMX or SSE register to integer */
525 64, /* size of l1 cache. */
526 128, /* size of l2 cache. */
527 32, /* size of prefetch block */
528 1, /* number of parallel prefetches */
529 1, /* Branch cost */
530 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
531 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
532 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
533 COSTS_N_INSNS (1), /* cost of FABS instruction. */
534 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
535 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
536 geode_memcpy,
537 geode_memset,
538 1, /* scalar_stmt_cost. */
539 1, /* scalar load_cost. */
540 1, /* scalar_store_cost. */
541 1, /* vec_stmt_cost. */
542 1, /* vec_to_scalar_cost. */
543 1, /* scalar_to_vec_cost. */
544 1, /* vec_align_load_cost. */
545 2, /* vec_unalign_load_cost. */
546 1, /* vec_store_cost. */
547 3, /* cond_taken_branch_cost. */
548 1, /* cond_not_taken_branch_cost. */
549 };
551 static stringop_algs k6_memcpy[2] = {
552 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
553 DUMMY_STRINGOP_ALGS};
554 static stringop_algs k6_memset[2] = {
555 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
556 DUMMY_STRINGOP_ALGS};
557 static const
558 struct processor_costs k6_cost = {
559 COSTS_N_INSNS (1), /* cost of an add instruction */
560 COSTS_N_INSNS (2), /* cost of a lea instruction */
561 COSTS_N_INSNS (1), /* variable shift costs */
562 COSTS_N_INSNS (1), /* constant shift costs */
563 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
564 COSTS_N_INSNS (3), /* HI */
565 COSTS_N_INSNS (3), /* SI */
566 COSTS_N_INSNS (3), /* DI */
567 COSTS_N_INSNS (3)}, /* other */
568 0, /* cost of multiply per each bit set */
569 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
570 COSTS_N_INSNS (18), /* HI */
571 COSTS_N_INSNS (18), /* SI */
572 COSTS_N_INSNS (18), /* DI */
573 COSTS_N_INSNS (18)}, /* other */
574 COSTS_N_INSNS (2), /* cost of movsx */
575 COSTS_N_INSNS (2), /* cost of movzx */
576 8, /* "large" insn */
577 4, /* MOVE_RATIO */
578 3, /* cost for loading QImode using movzbl */
579 {4, 5, 4}, /* cost of loading integer registers
580 in QImode, HImode and SImode.
581 Relative to reg-reg move (2). */
582 {2, 3, 2}, /* cost of storing integer registers */
583 4, /* cost of reg,reg fld/fst */
584 {6, 6, 6}, /* cost of loading fp registers
585 in SFmode, DFmode and XFmode */
586 {4, 4, 4}, /* cost of storing fp registers
587 in SFmode, DFmode and XFmode */
588 2, /* cost of moving MMX register */
589 {2, 2}, /* cost of loading MMX registers
590 in SImode and DImode */
591 {2, 2}, /* cost of storing MMX registers
592 in SImode and DImode */
593 2, /* cost of moving SSE register */
594 {2, 2, 8}, /* cost of loading SSE registers
595 in SImode, DImode and TImode */
596 {2, 2, 8}, /* cost of storing SSE registers
597 in SImode, DImode and TImode */
598 6, /* MMX or SSE register to integer */
599 32, /* size of l1 cache. */
600 32, /* size of l2 cache. Some models
601 have integrated l2 cache, but
602 optimizing for k6 is not important
603 enough to worry about that. */
604 32, /* size of prefetch block */
605 1, /* number of parallel prefetches */
606 1, /* Branch cost */
607 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
608 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
609 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
612 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
613 k6_memcpy,
614 k6_memset,
615 1, /* scalar_stmt_cost. */
616 1, /* scalar load_cost. */
617 1, /* scalar_store_cost. */
618 1, /* vec_stmt_cost. */
619 1, /* vec_to_scalar_cost. */
620 1, /* scalar_to_vec_cost. */
621 1, /* vec_align_load_cost. */
622 2, /* vec_unalign_load_cost. */
623 1, /* vec_store_cost. */
624 3, /* cond_taken_branch_cost. */
625 1, /* cond_not_taken_branch_cost. */
626 };
628 /* For some reason, Athlon deals better with the REP prefix (relative to
629 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
630 and 128 bytes for memset. */
631 static stringop_algs athlon_memcpy[2] = {
632 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
633 DUMMY_STRINGOP_ALGS};
634 static stringop_algs athlon_memset[2] = {
635 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
636 DUMMY_STRINGOP_ALGS};
637 static const
638 struct processor_costs athlon_cost = {
639 COSTS_N_INSNS (1), /* cost of an add instruction */
640 COSTS_N_INSNS (2), /* cost of a lea instruction */
641 COSTS_N_INSNS (1), /* variable shift costs */
642 COSTS_N_INSNS (1), /* constant shift costs */
643 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
644 COSTS_N_INSNS (5), /* HI */
645 COSTS_N_INSNS (5), /* SI */
646 COSTS_N_INSNS (5), /* DI */
647 COSTS_N_INSNS (5)}, /* other */
648 0, /* cost of multiply per each bit set */
649 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
650 COSTS_N_INSNS (26), /* HI */
651 COSTS_N_INSNS (42), /* SI */
652 COSTS_N_INSNS (74), /* DI */
653 COSTS_N_INSNS (74)}, /* other */
654 COSTS_N_INSNS (1), /* cost of movsx */
655 COSTS_N_INSNS (1), /* cost of movzx */
656 8, /* "large" insn */
657 9, /* MOVE_RATIO */
658 4, /* cost for loading QImode using movzbl */
659 {3, 4, 3}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {3, 4, 3}, /* cost of storing integer registers */
663 4, /* cost of reg,reg fld/fst */
664 {4, 4, 12}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {6, 6, 8}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {4, 4}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {4, 4}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, /* cost of moving SSE register */
674 {4, 4, 6}, /* cost of loading SSE registers
675 in SImode, DImode and TImode */
676 {4, 4, 5}, /* cost of storing SSE registers
677 in SImode, DImode and TImode */
678 5, /* MMX or SSE register to integer */
679 64, /* size of l1 cache. */
680 256, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 6, /* number of parallel prefetches */
683 5, /* Branch cost */
684 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
685 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
686 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
687 COSTS_N_INSNS (2), /* cost of FABS instruction. */
688 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
689 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
690 athlon_memcpy,
691 athlon_memset,
692 1, /* scalar_stmt_cost. */
693 1, /* scalar load_cost. */
694 1, /* scalar_store_cost. */
695 1, /* vec_stmt_cost. */
696 1, /* vec_to_scalar_cost. */
697 1, /* scalar_to_vec_cost. */
698 1, /* vec_align_load_cost. */
699 2, /* vec_unalign_load_cost. */
700 1, /* vec_store_cost. */
701 3, /* cond_taken_branch_cost. */
702 1, /* cond_not_taken_branch_cost. */
703 };
705 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
706 small blocks it is better to use a loop. For large blocks, a libcall can
707 do nontemporal accesses and beat inline code considerably. */
708 static stringop_algs k8_memcpy[2] = {
709 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
710 {-1, rep_prefix_4_byte, false}}},
711 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
712 {-1, libcall, false}}}};
713 static stringop_algs k8_memset[2] = {
714 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
715 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
716 {libcall, {{48, unrolled_loop, false},
717 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
718 static const
719 struct processor_costs k8_cost = {
720 COSTS_N_INSNS (1), /* cost of an add instruction */
721 COSTS_N_INSNS (2), /* cost of a lea instruction */
722 COSTS_N_INSNS (1), /* variable shift costs */
723 COSTS_N_INSNS (1), /* constant shift costs */
724 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
725 COSTS_N_INSNS (4), /* HI */
726 COSTS_N_INSNS (3), /* SI */
727 COSTS_N_INSNS (4), /* DI */
728 COSTS_N_INSNS (5)}, /* other */
729 0, /* cost of multiply per each bit set */
730 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
731 COSTS_N_INSNS (26), /* HI */
732 COSTS_N_INSNS (42), /* SI */
733 COSTS_N_INSNS (74), /* DI */
734 COSTS_N_INSNS (74)}, /* other */
735 COSTS_N_INSNS (1), /* cost of movsx */
736 COSTS_N_INSNS (1), /* cost of movzx */
737 8, /* "large" insn */
738 9, /* MOVE_RATIO */
739 4, /* cost for loading QImode using movzbl */
740 {3, 4, 3}, /* cost of loading integer registers
741 in QImode, HImode and SImode.
742 Relative to reg-reg move (2). */
743 {3, 4, 3}, /* cost of storing integer registers */
744 4, /* cost of reg,reg fld/fst */
745 {4, 4, 12}, /* cost of loading fp registers
746 in SFmode, DFmode and XFmode */
747 {6, 6, 8}, /* cost of storing fp registers
748 in SFmode, DFmode and XFmode */
749 2, /* cost of moving MMX register */
750 {3, 3}, /* cost of loading MMX registers
751 in SImode and DImode */
752 {4, 4}, /* cost of storing MMX registers
753 in SImode and DImode */
754 2, /* cost of moving SSE register */
755 {4, 3, 6}, /* cost of loading SSE registers
756 in SImode, DImode and TImode */
757 {4, 4, 5}, /* cost of storing SSE registers
758 in SImode, DImode and TImode */
759 5, /* MMX or SSE register to integer */
760 64, /* size of l1 cache. */
761 512, /* size of l2 cache. */
762 64, /* size of prefetch block */
763 /* New AMD processors never drop prefetches; if they cannot be performed
764 immediately, they are queued. We set the number of simultaneous prefetches
765 to a large constant to reflect this (it is probably not a good idea to
766 leave the number of prefetches entirely unlimited, as their execution also
767 takes some time). */
768 100, /* number of parallel prefetches */
769 3, /* Branch cost */
770 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
771 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
772 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
773 COSTS_N_INSNS (2), /* cost of FABS instruction. */
774 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
775 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
777 k8_memcpy,
778 k8_memset,
779 4, /* scalar_stmt_cost. */
780 2, /* scalar load_cost. */
781 2, /* scalar_store_cost. */
782 5, /* vec_stmt_cost. */
783 0, /* vec_to_scalar_cost. */
784 2, /* scalar_to_vec_cost. */
785 2, /* vec_align_load_cost. */
786 3, /* vec_unalign_load_cost. */
787 3, /* vec_store_cost. */
788 3, /* cond_taken_branch_cost. */
789 2, /* cond_not_taken_branch_cost. */
790 };
792 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
793 for very small blocks it is better to use a loop. For large blocks, a
794 libcall can do nontemporal accesses and beat inline code considerably. */
795 static stringop_algs amdfam10_memcpy[2] = {
796 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
797 {-1, rep_prefix_4_byte, false}}},
798 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
800 static stringop_algs amdfam10_memset[2] = {
801 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
802 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
803 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 struct processor_costs amdfam10_cost = {
806 COSTS_N_INSNS (1), /* cost of an add instruction */
807 COSTS_N_INSNS (2), /* cost of a lea instruction */
808 COSTS_N_INSNS (1), /* variable shift costs */
809 COSTS_N_INSNS (1), /* constant shift costs */
810 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
811 COSTS_N_INSNS (4), /* HI */
812 COSTS_N_INSNS (3), /* SI */
813 COSTS_N_INSNS (4), /* DI */
814 COSTS_N_INSNS (5)}, /* other */
815 0, /* cost of multiply per each bit set */
816 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
817 COSTS_N_INSNS (35), /* HI */
818 COSTS_N_INSNS (51), /* SI */
819 COSTS_N_INSNS (83), /* DI */
820 COSTS_N_INSNS (83)}, /* other */
821 COSTS_N_INSNS (1), /* cost of movsx */
822 COSTS_N_INSNS (1), /* cost of movzx */
823 8, /* "large" insn */
824 9, /* MOVE_RATIO */
825 4, /* cost for loading QImode using movzbl */
826 {3, 4, 3}, /* cost of loading integer registers
827 in QImode, HImode and SImode.
828 Relative to reg-reg move (2). */
829 {3, 4, 3}, /* cost of storing integer registers */
830 4, /* cost of reg,reg fld/fst */
831 {4, 4, 12}, /* cost of loading fp registers
832 in SFmode, DFmode and XFmode */
833 {6, 6, 8}, /* cost of storing fp registers
834 in SFmode, DFmode and XFmode */
835 2, /* cost of moving MMX register */
836 {3, 3}, /* cost of loading MMX registers
837 in SImode and DImode */
838 {4, 4}, /* cost of storing MMX registers
839 in SImode and DImode */
840 2, /* cost of moving SSE register */
841 {4, 4, 3}, /* cost of loading SSE registers
842 in SImode, DImode and TImode */
843 {4, 4, 5}, /* cost of storing SSE registers
844 in SImode, DImode and TImode */
845 3, /* MMX or SSE register to integer */
846 /* On K8:
847 MOVD reg64, xmmreg Double FSTORE 4
848 MOVD reg32, xmmreg Double FSTORE 4
849 On AMDFAM10:
850 MOVD reg64, xmmreg Double FADD 3
851 1/1 1/1
852 MOVD reg32, xmmreg Double FADD 3
853 1/1 1/1 */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set the number of simultaneous prefetches
859 to a large constant to reflect this (it is probably not a good idea to
860 leave the number of prefetches entirely unlimited, as their execution also
861 takes some time). */
862 100, /* number of parallel prefetches */
863 2, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
871 amdfam10_memcpy,
872 amdfam10_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 6, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 2, /* vec_unalign_load_cost. */
881 2, /* vec_store_cost. */
882 2, /* cond_taken_branch_cost. */
883 1, /* cond_not_taken_branch_cost. */
884 };
886 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
887 very small blocks it is better to use a loop. For large blocks, a libcall
888 can do nontemporal accesses and beat inline code considerably. */
889 static stringop_algs bdver1_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs bdver1_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
900 const struct processor_costs bdver1_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (1), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (4), /* SI */
908 COSTS_N_INSNS (6), /* DI */
909 COSTS_N_INSNS (6)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {5, 5, 4}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {4, 4, 4}, /* cost of storing integer registers */
925 2, /* cost of reg,reg fld/fst */
926 {5, 5, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {4, 4, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {4, 4}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 4}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 4}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 2, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 16, /* size of l1 cache. */
950 2048, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set the number of simultaneous prefetches
954 to a large constant to reflect this (it is probably not a good idea to
955 leave the number of prefetches entirely unlimited, as their execution also
956 takes some time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
966 bdver1_memcpy,
967 bdver1_memset,
968 6, /* scalar_stmt_cost. */
969 4, /* scalar load_cost. */
970 4, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 4, /* vec_align_load_cost. */
975 4, /* vec_unalign_load_cost. */
976 4, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
979 };
981 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
982 very small blocks it is better to use a loop. For large blocks, a libcall
983 can do nontemporal accesses and beat inline code considerably. */
985 static stringop_algs bdver2_memcpy[2] = {
986 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
987 {-1, rep_prefix_4_byte, false}}},
988 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990 static stringop_algs bdver2_memset[2] = {
991 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
992 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
993 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
994 {-1, libcall, false}}}};
996 const struct processor_costs bdver2_cost = {
997 COSTS_N_INSNS (1), /* cost of an add instruction */
998 COSTS_N_INSNS (1), /* cost of a lea instruction */
999 COSTS_N_INSNS (1), /* variable shift costs */
1000 COSTS_N_INSNS (1), /* constant shift costs */
1001 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1002 COSTS_N_INSNS (4), /* HI */
1003 COSTS_N_INSNS (4), /* SI */
1004 COSTS_N_INSNS (6), /* DI */
1005 COSTS_N_INSNS (6)}, /* other */
1006 0, /* cost of multiply per each bit set */
1007 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1008 COSTS_N_INSNS (35), /* HI */
1009 COSTS_N_INSNS (51), /* SI */
1010 COSTS_N_INSNS (83), /* DI */
1011 COSTS_N_INSNS (83)}, /* other */
1012 COSTS_N_INSNS (1), /* cost of movsx */
1013 COSTS_N_INSNS (1), /* cost of movzx */
1014 8, /* "large" insn */
1015 9, /* MOVE_RATIO */
1016 4, /* cost for loading QImode using movzbl */
1017 {5, 5, 4}, /* cost of loading integer registers
1018 in QImode, HImode and SImode.
1019 Relative to reg-reg move (2). */
1020 {4, 4, 4}, /* cost of storing integer registers */
1021 2, /* cost of reg,reg fld/fst */
1022 {5, 5, 12}, /* cost of loading fp registers
1023 in SFmode, DFmode and XFmode */
1024 {4, 4, 8}, /* cost of storing fp registers
1025 in SFmode, DFmode and XFmode */
1026 2, /* cost of moving MMX register */
1027 {4, 4}, /* cost of loading MMX registers
1028 in SImode and DImode */
1029 {4, 4}, /* cost of storing MMX registers
1030 in SImode and DImode */
1031 2, /* cost of moving SSE register */
1032 {4, 4, 4}, /* cost of loading SSE registers
1033 in SImode, DImode and TImode */
1034 {4, 4, 4}, /* cost of storing SSE registers
1035 in SImode, DImode and TImode */
1036 2, /* MMX or SSE register to integer */
1037 /* On K8:
1038 MOVD reg64, xmmreg Double FSTORE 4
1039 MOVD reg32, xmmreg Double FSTORE 4
1040 On AMDFAM10:
1041 MOVD reg64, xmmreg Double FADD 3
1042 1/1 1/1
1043 MOVD reg32, xmmreg Double FADD 3
1044 1/1 1/1 */
1045 16, /* size of l1 cache. */
1046 2048, /* size of l2 cache. */
1047 64, /* size of prefetch block */
1048 /* New AMD processors never drop prefetches; if they cannot be performed
1049 immediately, they are queued. We set the number of simultaneous prefetches
1050 to a large constant to reflect this (it is probably not a good idea to
1051 leave the number of prefetches entirely unlimited, as their execution also
1052 takes some time). */
1053 100, /* number of parallel prefetches */
1054 2, /* Branch cost */
1055 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1056 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1057 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1058 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1059 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1060 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1062 bdver2_memcpy,
1063 bdver2_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 2, /* cond_taken_branch_cost. */
1074 1, /* cond_not_taken_branch_cost. */
1075 };
1078 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1079 very small blocks it is better to use a loop. For large blocks, a libcall
1080 can do nontemporal accesses and beat inline code considerably. */
1081 static stringop_algs bdver3_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086 static stringop_algs bdver3_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091 struct processor_costs bdver3_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 16, /* size of l1 cache. */
1133 2048, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set the number of simultaneous prefetches
1137 to a large constant to reflect this (it is probably not a good idea to
1138 leave the number of prefetches entirely unlimited, as their execution also
1139 takes some time). */
1140 100, /* number of parallel prefetches */
1141 2, /* Branch cost */
1142 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1149 bdver3_memcpy,
1150 bdver3_memset,
1151 6, /* scalar_stmt_cost. */
1152 4, /* scalar load_cost. */
1153 4, /* scalar_store_cost. */
1154 6, /* vec_stmt_cost. */
1155 0, /* vec_to_scalar_cost. */
1156 2, /* scalar_to_vec_cost. */
1157 4, /* vec_align_load_cost. */
1158 4, /* vec_unalign_load_cost. */
1159 4, /* vec_store_cost. */
1160 2, /* cond_taken_branch_cost. */
1161 1, /* cond_not_taken_branch_cost. */
1162 };
1164 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1165 very small blocks it is better to use a loop. For large blocks, a libcall
1166 can do nontemporal accesses and beat inline code considerably. */
1167 static stringop_algs btver1_memcpy[2] = {
1168 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1169 {-1, rep_prefix_4_byte, false}}},
1170 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
1172 static stringop_algs btver1_memset[2] = {
1173 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1174 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1175 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 const struct processor_costs btver1_cost = {
1178 COSTS_N_INSNS (1), /* cost of an add instruction */
1179 COSTS_N_INSNS (2), /* cost of a lea instruction */
1180 COSTS_N_INSNS (1), /* variable shift costs */
1181 COSTS_N_INSNS (1), /* constant shift costs */
1182 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1183 COSTS_N_INSNS (4), /* HI */
1184 COSTS_N_INSNS (3), /* SI */
1185 COSTS_N_INSNS (4), /* DI */
1186 COSTS_N_INSNS (5)}, /* other */
1187 0, /* cost of multiply per each bit set */
1188 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1189 COSTS_N_INSNS (35), /* HI */
1190 COSTS_N_INSNS (51), /* SI */
1191 COSTS_N_INSNS (83), /* DI */
1192 COSTS_N_INSNS (83)}, /* other */
1193 COSTS_N_INSNS (1), /* cost of movsx */
1194 COSTS_N_INSNS (1), /* cost of movzx */
1195 8, /* "large" insn */
1196 9, /* MOVE_RATIO */
1197 4, /* cost for loading QImode using movzbl */
1198 {3, 4, 3}, /* cost of loading integer registers
1199 in QImode, HImode and SImode.
1200 Relative to reg-reg move (2). */
1201 {3, 4, 3}, /* cost of storing integer registers */
1202 4, /* cost of reg,reg fld/fst */
1203 {4, 4, 12}, /* cost of loading fp registers
1204 in SFmode, DFmode and XFmode */
1205 {6, 6, 8}, /* cost of storing fp registers
1206 in SFmode, DFmode and XFmode */
1207 2, /* cost of moving MMX register */
1208 {3, 3}, /* cost of loading MMX registers
1209 in SImode and DImode */
1210 {4, 4}, /* cost of storing MMX registers
1211 in SImode and DImode */
1212 2, /* cost of moving SSE register */
1213 {4, 4, 3}, /* cost of loading SSE registers
1214 in SImode, DImode and TImode */
1215 {4, 4, 5}, /* cost of storing SSE registers
1216 in SImode, DImode and TImode */
1217 3, /* MMX or SSE register to integer */
1218 /* On K8:
1219 MOVD reg64, xmmreg Double FSTORE 4
1220 MOVD reg32, xmmreg Double FSTORE 4
1221 On AMDFAM10:
1222 MOVD reg64, xmmreg Double FADD 3
1223 1/1 1/1
1224 MOVD reg32, xmmreg Double FADD 3
1225 1/1 1/1 */
1226 32, /* size of l1 cache. */
1227 512, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 100, /* number of parallel prefetches */
1230 2, /* Branch cost */
1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1238 btver1_memcpy,
1239 btver1_memset,
1240 4, /* scalar_stmt_cost. */
1241 2, /* scalar load_cost. */
1242 2, /* scalar_store_cost. */
1243 6, /* vec_stmt_cost. */
1244 0, /* vec_to_scalar_cost. */
1245 2, /* scalar_to_vec_cost. */
1246 2, /* vec_align_load_cost. */
1247 2, /* vec_unalign_load_cost. */
1248 2, /* vec_store_cost. */
1249 2, /* cond_taken_branch_cost. */
1250 1, /* cond_not_taken_branch_cost. */
1251 };
1253 static stringop_algs btver2_memcpy[2] = {
1254 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1255 {-1, rep_prefix_4_byte, false}}},
1256 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1257 {-1, libcall, false}}}};
1258 static stringop_algs btver2_memset[2] = {
1259 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1260 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1261 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 const struct processor_costs btver2_cost = {
1264 COSTS_N_INSNS (1), /* cost of an add instruction */
1265 COSTS_N_INSNS (2), /* cost of a lea instruction */
1266 COSTS_N_INSNS (1), /* variable shift costs */
1267 COSTS_N_INSNS (1), /* constant shift costs */
1268 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1269 COSTS_N_INSNS (4), /* HI */
1270 COSTS_N_INSNS (3), /* SI */
1271 COSTS_N_INSNS (4), /* DI */
1272 COSTS_N_INSNS (5)}, /* other */
1273 0, /* cost of multiply per each bit set */
1274 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1275 COSTS_N_INSNS (35), /* HI */
1276 COSTS_N_INSNS (51), /* SI */
1277 COSTS_N_INSNS (83), /* DI */
1278 COSTS_N_INSNS (83)}, /* other */
1279 COSTS_N_INSNS (1), /* cost of movsx */
1280 COSTS_N_INSNS (1), /* cost of movzx */
1281 8, /* "large" insn */
1282 9, /* MOVE_RATIO */
1283 4, /* cost for loading QImode using movzbl */
1284 {3, 4, 3}, /* cost of loading integer registers
1285 in QImode, HImode and SImode.
1286 Relative to reg-reg move (2). */
1287 {3, 4, 3}, /* cost of storing integer registers */
1288 4, /* cost of reg,reg fld/fst */
1289 {4, 4, 12}, /* cost of loading fp registers
1290 in SFmode, DFmode and XFmode */
1291 {6, 6, 8}, /* cost of storing fp registers
1292 in SFmode, DFmode and XFmode */
1293 2, /* cost of moving MMX register */
1294 {3, 3}, /* cost of loading MMX registers
1295 in SImode and DImode */
1296 {4, 4}, /* cost of storing MMX registers
1297 in SImode and DImode */
1298 2, /* cost of moving SSE register */
1299 {4, 4, 3}, /* cost of loading SSE registers
1300 in SImode, DImode and TImode */
1301 {4, 4, 5}, /* cost of storing SSE registers
1302 in SImode, DImode and TImode */
1303 3, /* MMX or SSE register to integer */
1304 /* On K8:
1305 MOVD reg64, xmmreg Double FSTORE 4
1306 MOVD reg32, xmmreg Double FSTORE 4
1307 On AMDFAM10:
1308 MOVD reg64, xmmreg Double FADD 3
1309 1/1 1/1
1310 MOVD reg32, xmmreg Double FADD 3
1311 1/1 1/1 */
1312 32, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 100, /* number of parallel prefetches */
1316 2, /* Branch cost */
1317 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1318 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1319 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1320 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1321 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1322 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1323 btver2_memcpy,
1324 btver2_memset,
1325 4, /* scalar_stmt_cost. */
1326 2, /* scalar load_cost. */
1327 2, /* scalar_store_cost. */
1328 6, /* vec_stmt_cost. */
1329 0, /* vec_to_scalar_cost. */
1330 2, /* scalar_to_vec_cost. */
1331 2, /* vec_align_load_cost. */
1332 2, /* vec_unalign_load_cost. */
1333 2, /* vec_store_cost. */
1334 2, /* cond_taken_branch_cost. */
1335 1, /* cond_not_taken_branch_cost. */
1336 };
1338 static stringop_algs pentium4_memcpy[2] = {
1339 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1340 DUMMY_STRINGOP_ALGS};
1341 static stringop_algs pentium4_memset[2] = {
1342 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1343 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1344 DUMMY_STRINGOP_ALGS};
1346 static const
1347 struct processor_costs pentium4_cost = {
1348 COSTS_N_INSNS (1), /* cost of an add instruction */
1349 COSTS_N_INSNS (3), /* cost of a lea instruction */
1350 COSTS_N_INSNS (4), /* variable shift costs */
1351 COSTS_N_INSNS (4), /* constant shift costs */
1352 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1353 COSTS_N_INSNS (15), /* HI */
1354 COSTS_N_INSNS (15), /* SI */
1355 COSTS_N_INSNS (15), /* DI */
1356 COSTS_N_INSNS (15)}, /* other */
1357 0, /* cost of multiply per each bit set */
1358 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1359 COSTS_N_INSNS (56), /* HI */
1360 COSTS_N_INSNS (56), /* SI */
1361 COSTS_N_INSNS (56), /* DI */
1362 COSTS_N_INSNS (56)}, /* other */
1363 COSTS_N_INSNS (1), /* cost of movsx */
1364 COSTS_N_INSNS (1), /* cost of movzx */
1365 16, /* "large" insn */
1366 6, /* MOVE_RATIO */
1367 2, /* cost for loading QImode using movzbl */
1368 {4, 5, 4}, /* cost of loading integer registers
1369 in QImode, HImode and SImode.
1370 Relative to reg-reg move (2). */
1371 {2, 3, 2}, /* cost of storing integer registers */
1372 2, /* cost of reg,reg fld/fst */
1373 {2, 2, 6}, /* cost of loading fp registers
1374 in SFmode, DFmode and XFmode */
1375 {4, 4, 6}, /* cost of storing fp registers
1376 in SFmode, DFmode and XFmode */
1377 2, /* cost of moving MMX register */
1378 {2, 2}, /* cost of loading MMX registers
1379 in SImode and DImode */
1380 {2, 2}, /* cost of storing MMX registers
1381 in SImode and DImode */
1382 12, /* cost of moving SSE register */
1383 {12, 12, 12}, /* cost of loading SSE registers
1384 in SImode, DImode and TImode */
1385 {2, 2, 8}, /* cost of storing SSE registers
1386 in SImode, DImode and TImode */
1387 10, /* MMX or SSE register to integer */
1388 8, /* size of l1 cache. */
1389 256, /* size of l2 cache. */
1390 64, /* size of prefetch block */
1391 6, /* number of parallel prefetches */
1392 2, /* Branch cost */
1393 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1394 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1395 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1396 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1397 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1398 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1399 pentium4_memcpy,
1400 pentium4_memset,
1401 1, /* scalar_stmt_cost. */
1402 1, /* scalar load_cost. */
1403 1, /* scalar_store_cost. */
1404 1, /* vec_stmt_cost. */
1405 1, /* vec_to_scalar_cost. */
1406 1, /* scalar_to_vec_cost. */
1407 1, /* vec_align_load_cost. */
1408 2, /* vec_unalign_load_cost. */
1409 1, /* vec_store_cost. */
1410 3, /* cond_taken_branch_cost. */
1411 1, /* cond_not_taken_branch_cost. */
1412 };
1414 static stringop_algs nocona_memcpy[2] = {
1415 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1416 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1417 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1419 static stringop_algs nocona_memset[2] = {
1420 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1421 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1422 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1423 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1425 static const
1426 struct processor_costs nocona_cost = {
1427 COSTS_N_INSNS (1), /* cost of an add instruction */
1428 COSTS_N_INSNS (1), /* cost of a lea instruction */
1429 COSTS_N_INSNS (1), /* variable shift costs */
1430 COSTS_N_INSNS (1), /* constant shift costs */
1431 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1432 COSTS_N_INSNS (10), /* HI */
1433 COSTS_N_INSNS (10), /* SI */
1434 COSTS_N_INSNS (10), /* DI */
1435 COSTS_N_INSNS (10)}, /* other */
1436 0, /* cost of multiply per each bit set */
1437 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1438 COSTS_N_INSNS (66), /* HI */
1439 COSTS_N_INSNS (66), /* SI */
1440 COSTS_N_INSNS (66), /* DI */
1441 COSTS_N_INSNS (66)}, /* other */
1442 COSTS_N_INSNS (1), /* cost of movsx */
1443 COSTS_N_INSNS (1), /* cost of movzx */
1444 16, /* "large" insn */
1445 17, /* MOVE_RATIO */
1446 4, /* cost for loading QImode using movzbl */
1447 {4, 4, 4}, /* cost of loading integer registers
1448 in QImode, HImode and SImode.
1449 Relative to reg-reg move (2). */
1450 {4, 4, 4}, /* cost of storing integer registers */
1451 3, /* cost of reg,reg fld/fst */
1452 {12, 12, 12}, /* cost of loading fp registers
1453 in SFmode, DFmode and XFmode */
1454 {4, 4, 4}, /* cost of storing fp registers
1455 in SFmode, DFmode and XFmode */
1456 6, /* cost of moving MMX register */
1457 {12, 12}, /* cost of loading MMX registers
1458 in SImode and DImode */
1459 {12, 12}, /* cost of storing MMX registers
1460 in SImode and DImode */
1461 6, /* cost of moving SSE register */
1462 {12, 12, 12}, /* cost of loading SSE registers
1463 in SImode, DImode and TImode */
1464 {12, 12, 12}, /* cost of storing SSE registers
1465 in SImode, DImode and TImode */
1466 8, /* MMX or SSE register to integer */
1467 8, /* size of l1 cache. */
1468 1024, /* size of l2 cache. */
1469 128, /* size of prefetch block */
1470 8, /* number of parallel prefetches */
1471 1, /* Branch cost */
1472 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1473 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1474 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1475 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1476 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1477 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1478 nocona_memcpy,
1479 nocona_memset,
1480 1, /* scalar_stmt_cost. */
1481 1, /* scalar load_cost. */
1482 1, /* scalar_store_cost. */
1483 1, /* vec_stmt_cost. */
1484 1, /* vec_to_scalar_cost. */
1485 1, /* scalar_to_vec_cost. */
1486 1, /* vec_align_load_cost. */
1487 2, /* vec_unalign_load_cost. */
1488 1, /* vec_store_cost. */
1489 3, /* cond_taken_branch_cost. */
1490   1,					/* cond_not_taken_branch_cost.  */
1491 };
1493 static stringop_algs atom_memcpy[2] = {
1494 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1495 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1496 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1497 static stringop_algs atom_memset[2] = {
1498 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1499 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1500 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1501 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1502 static const
1503 struct processor_costs atom_cost = {
1504 COSTS_N_INSNS (1), /* cost of an add instruction */
1505 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1506 COSTS_N_INSNS (1), /* variable shift costs */
1507 COSTS_N_INSNS (1), /* constant shift costs */
1508 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1509 COSTS_N_INSNS (4), /* HI */
1510 COSTS_N_INSNS (3), /* SI */
1511 COSTS_N_INSNS (4), /* DI */
1512 COSTS_N_INSNS (2)}, /* other */
1513 0, /* cost of multiply per each bit set */
1514 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1515 COSTS_N_INSNS (26), /* HI */
1516 COSTS_N_INSNS (42), /* SI */
1517 COSTS_N_INSNS (74), /* DI */
1518 COSTS_N_INSNS (74)}, /* other */
1519 COSTS_N_INSNS (1), /* cost of movsx */
1520 COSTS_N_INSNS (1), /* cost of movzx */
1521 8, /* "large" insn */
1522 17, /* MOVE_RATIO */
1523 4, /* cost for loading QImode using movzbl */
1524 {4, 4, 4}, /* cost of loading integer registers
1525 in QImode, HImode and SImode.
1526 Relative to reg-reg move (2). */
1527 {4, 4, 4}, /* cost of storing integer registers */
1528 4, /* cost of reg,reg fld/fst */
1529 {12, 12, 12}, /* cost of loading fp registers
1530 in SFmode, DFmode and XFmode */
1531 {6, 6, 8}, /* cost of storing fp registers
1532 in SFmode, DFmode and XFmode */
1533 2, /* cost of moving MMX register */
1534 {8, 8}, /* cost of loading MMX registers
1535 in SImode and DImode */
1536 {8, 8}, /* cost of storing MMX registers
1537 in SImode and DImode */
1538 2, /* cost of moving SSE register */
1539 {8, 8, 8}, /* cost of loading SSE registers
1540 in SImode, DImode and TImode */
1541 {8, 8, 8}, /* cost of storing SSE registers
1542 in SImode, DImode and TImode */
1543 5, /* MMX or SSE register to integer */
1544 32, /* size of l1 cache. */
1545 256, /* size of l2 cache. */
1546 64, /* size of prefetch block */
1547 6, /* number of parallel prefetches */
1548 3, /* Branch cost */
1549 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1551 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1552 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1553 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1554 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1555 atom_memcpy,
1556 atom_memset,
1557 1, /* scalar_stmt_cost. */
1558 1, /* scalar load_cost. */
1559 1, /* scalar_store_cost. */
1560 1, /* vec_stmt_cost. */
1561 1, /* vec_to_scalar_cost. */
1562 1, /* scalar_to_vec_cost. */
1563 1, /* vec_align_load_cost. */
1564 2, /* vec_unalign_load_cost. */
1565 1, /* vec_store_cost. */
1566 3, /* cond_taken_branch_cost. */
1567   1,					/* cond_not_taken_branch_cost.  */
1568 };
1570 static stringop_algs slm_memcpy[2] = {
1571 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1572 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1573 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1574 static stringop_algs slm_memset[2] = {
1575 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1576 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1577 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1578 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1579 static const
1580 struct processor_costs slm_cost = {
1581 COSTS_N_INSNS (1), /* cost of an add instruction */
1582 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1583 COSTS_N_INSNS (1), /* variable shift costs */
1584 COSTS_N_INSNS (1), /* constant shift costs */
1585 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1586 COSTS_N_INSNS (4), /* HI */
1587 COSTS_N_INSNS (3), /* SI */
1588 COSTS_N_INSNS (4), /* DI */
1589 COSTS_N_INSNS (2)}, /* other */
1590 0, /* cost of multiply per each bit set */
1591 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1592 COSTS_N_INSNS (26), /* HI */
1593 COSTS_N_INSNS (42), /* SI */
1594 COSTS_N_INSNS (74), /* DI */
1595 COSTS_N_INSNS (74)}, /* other */
1596 COSTS_N_INSNS (1), /* cost of movsx */
1597 COSTS_N_INSNS (1), /* cost of movzx */
1598 8, /* "large" insn */
1599 17, /* MOVE_RATIO */
1600 4, /* cost for loading QImode using movzbl */
1601 {4, 4, 4}, /* cost of loading integer registers
1602 in QImode, HImode and SImode.
1603 Relative to reg-reg move (2). */
1604 {4, 4, 4}, /* cost of storing integer registers */
1605 4, /* cost of reg,reg fld/fst */
1606 {12, 12, 12}, /* cost of loading fp registers
1607 in SFmode, DFmode and XFmode */
1608 {6, 6, 8}, /* cost of storing fp registers
1609 in SFmode, DFmode and XFmode */
1610 2, /* cost of moving MMX register */
1611 {8, 8}, /* cost of loading MMX registers
1612 in SImode and DImode */
1613 {8, 8}, /* cost of storing MMX registers
1614 in SImode and DImode */
1615 2, /* cost of moving SSE register */
1616 {8, 8, 8}, /* cost of loading SSE registers
1617 in SImode, DImode and TImode */
1618 {8, 8, 8}, /* cost of storing SSE registers
1619 in SImode, DImode and TImode */
1620 5, /* MMX or SSE register to integer */
1621 32, /* size of l1 cache. */
1622 256, /* size of l2 cache. */
1623 64, /* size of prefetch block */
1624 6, /* number of parallel prefetches */
1625 3, /* Branch cost */
1626 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1627 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1628 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1629 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1630 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1631 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1632 slm_memcpy,
1633 slm_memset,
1634 1, /* scalar_stmt_cost. */
1635 1, /* scalar load_cost. */
1636 1, /* scalar_store_cost. */
1637 1, /* vec_stmt_cost. */
1638 1, /* vec_to_scalar_cost. */
1639 1, /* scalar_to_vec_cost. */
1640 1, /* vec_align_load_cost. */
1641 2, /* vec_unalign_load_cost. */
1642 1, /* vec_store_cost. */
1643 3, /* cond_taken_branch_cost. */
1644   1,					/* cond_not_taken_branch_cost.  */
1645 };
1647 /* Generic64 should produce code tuned for Nocona and K8. */
1649 static stringop_algs generic64_memcpy[2] = {
1650 DUMMY_STRINGOP_ALGS,
1651 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1652 {-1, libcall, false}}}};
1653 static stringop_algs generic64_memset[2] = {
1654 DUMMY_STRINGOP_ALGS,
1655 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1656 {-1, libcall, false}}}};
1657 static const
1658 struct processor_costs generic64_cost = {
1659 COSTS_N_INSNS (1), /* cost of an add instruction */
1660 /* On all chips taken into consideration lea is 2 cycles and more. With
1661 this cost however our current implementation of synth_mult results in
1662 use of unnecessary temporary registers causing regression on several
1663 SPECfp benchmarks. */
1664 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1665 COSTS_N_INSNS (1), /* variable shift costs */
1666 COSTS_N_INSNS (1), /* constant shift costs */
1667 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1668 COSTS_N_INSNS (4), /* HI */
1669 COSTS_N_INSNS (3), /* SI */
1670 COSTS_N_INSNS (4), /* DI */
1671 COSTS_N_INSNS (2)}, /* other */
1672 0, /* cost of multiply per each bit set */
1673 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1674 COSTS_N_INSNS (26), /* HI */
1675 COSTS_N_INSNS (42), /* SI */
1676 COSTS_N_INSNS (74), /* DI */
1677 COSTS_N_INSNS (74)}, /* other */
1678 COSTS_N_INSNS (1), /* cost of movsx */
1679 COSTS_N_INSNS (1), /* cost of movzx */
1680 8, /* "large" insn */
1681 17, /* MOVE_RATIO */
1682 4, /* cost for loading QImode using movzbl */
1683 {4, 4, 4}, /* cost of loading integer registers
1684 in QImode, HImode and SImode.
1685 Relative to reg-reg move (2). */
1686 {4, 4, 4}, /* cost of storing integer registers */
1687 4, /* cost of reg,reg fld/fst */
1688 {12, 12, 12}, /* cost of loading fp registers
1689 in SFmode, DFmode and XFmode */
1690 {6, 6, 8}, /* cost of storing fp registers
1691 in SFmode, DFmode and XFmode */
1692 2, /* cost of moving MMX register */
1693 {8, 8}, /* cost of loading MMX registers
1694 in SImode and DImode */
1695 {8, 8}, /* cost of storing MMX registers
1696 in SImode and DImode */
1697 2, /* cost of moving SSE register */
1698 {8, 8, 8}, /* cost of loading SSE registers
1699 in SImode, DImode and TImode */
1700 {8, 8, 8}, /* cost of storing SSE registers
1701 in SImode, DImode and TImode */
1702 5, /* MMX or SSE register to integer */
1703 32, /* size of l1 cache. */
1704 512, /* size of l2 cache. */
1705 64, /* size of prefetch block */
1706 6, /* number of parallel prefetches */
1707   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1708      value is increased to the perhaps more appropriate value of 5.  */
1709 3, /* Branch cost */
1710 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1711 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1712 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1713 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1714 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1715 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1716 generic64_memcpy,
1717 generic64_memset,
1718 1, /* scalar_stmt_cost. */
1719 1, /* scalar load_cost. */
1720 1, /* scalar_store_cost. */
1721 1, /* vec_stmt_cost. */
1722 1, /* vec_to_scalar_cost. */
1723 1, /* scalar_to_vec_cost. */
1724 1, /* vec_align_load_cost. */
1725 2, /* vec_unalign_load_cost. */
1726 1, /* vec_store_cost. */
1727 3, /* cond_taken_branch_cost. */
1728   1,					/* cond_not_taken_branch_cost.  */
1729 };
1731 /* core_cost should produce code tuned for the Core family of CPUs.  */
1732 static stringop_algs core_memcpy[2] = {
1733 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1734 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1735 {-1, libcall, false}}}};
1736 static stringop_algs core_memset[2] = {
1737 {libcall, {{6, loop_1_byte, true},
1738 {24, loop, true},
1739 {8192, rep_prefix_4_byte, true},
1740 {-1, libcall, false}}},
1741 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1742 {-1, libcall, false}}}};
1744 static const
1745 struct processor_costs core_cost = {
1746 COSTS_N_INSNS (1), /* cost of an add instruction */
1747 /* On all chips taken into consideration lea is 2 cycles and more. With
1748 this cost however our current implementation of synth_mult results in
1749 use of unnecessary temporary registers causing regression on several
1750 SPECfp benchmarks. */
1751 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1752 COSTS_N_INSNS (1), /* variable shift costs */
1753 COSTS_N_INSNS (1), /* constant shift costs */
1754 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1755 COSTS_N_INSNS (4), /* HI */
1756 COSTS_N_INSNS (3), /* SI */
1757 COSTS_N_INSNS (4), /* DI */
1758 COSTS_N_INSNS (2)}, /* other */
1759 0, /* cost of multiply per each bit set */
1760 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1761 COSTS_N_INSNS (26), /* HI */
1762 COSTS_N_INSNS (42), /* SI */
1763 COSTS_N_INSNS (74), /* DI */
1764 COSTS_N_INSNS (74)}, /* other */
1765 COSTS_N_INSNS (1), /* cost of movsx */
1766 COSTS_N_INSNS (1), /* cost of movzx */
1767 8, /* "large" insn */
1768 17, /* MOVE_RATIO */
1769 4, /* cost for loading QImode using movzbl */
1770 {4, 4, 4}, /* cost of loading integer registers
1771 in QImode, HImode and SImode.
1772 Relative to reg-reg move (2). */
1773 {4, 4, 4}, /* cost of storing integer registers */
1774 4, /* cost of reg,reg fld/fst */
1775 {12, 12, 12}, /* cost of loading fp registers
1776 in SFmode, DFmode and XFmode */
1777 {6, 6, 8}, /* cost of storing fp registers
1778 in SFmode, DFmode and XFmode */
1779 2, /* cost of moving MMX register */
1780 {8, 8}, /* cost of loading MMX registers
1781 in SImode and DImode */
1782 {8, 8}, /* cost of storing MMX registers
1783 in SImode and DImode */
1784 2, /* cost of moving SSE register */
1785 {8, 8, 8}, /* cost of loading SSE registers
1786 in SImode, DImode and TImode */
1787 {8, 8, 8}, /* cost of storing SSE registers
1788 in SImode, DImode and TImode */
1789 5, /* MMX or SSE register to integer */
1790 64, /* size of l1 cache. */
1791 512, /* size of l2 cache. */
1792 64, /* size of prefetch block */
1793 6, /* number of parallel prefetches */
1794 /* FIXME perhaps more appropriate value is 5. */
1795 3, /* Branch cost */
1796 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1797 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1798 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1799 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1800 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1801 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1802 core_memcpy,
1803 core_memset,
1804 1, /* scalar_stmt_cost. */
1805 1, /* scalar load_cost. */
1806 1, /* scalar_store_cost. */
1807 1, /* vec_stmt_cost. */
1808 1, /* vec_to_scalar_cost. */
1809 1, /* scalar_to_vec_cost. */
1810 1, /* vec_align_load_cost. */
1811 2, /* vec_unalign_load_cost. */
1812 1, /* vec_store_cost. */
1813 3, /* cond_taken_branch_cost. */
1814   1,					/* cond_not_taken_branch_cost.  */
1815 };
1817 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1818 Athlon and K8. */
1819 static stringop_algs generic32_memcpy[2] = {
1820 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1821 {-1, libcall, false}}},
1822 DUMMY_STRINGOP_ALGS};
1823 static stringop_algs generic32_memset[2] = {
1824 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1825 {-1, libcall, false}}},
1826 DUMMY_STRINGOP_ALGS};
1827 static const
1828 struct processor_costs generic32_cost = {
1829 COSTS_N_INSNS (1), /* cost of an add instruction */
1830 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1831 COSTS_N_INSNS (1), /* variable shift costs */
1832 COSTS_N_INSNS (1), /* constant shift costs */
1833 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1834 COSTS_N_INSNS (4), /* HI */
1835 COSTS_N_INSNS (3), /* SI */
1836 COSTS_N_INSNS (4), /* DI */
1837 COSTS_N_INSNS (2)}, /* other */
1838 0, /* cost of multiply per each bit set */
1839 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1840 COSTS_N_INSNS (26), /* HI */
1841 COSTS_N_INSNS (42), /* SI */
1842 COSTS_N_INSNS (74), /* DI */
1843 COSTS_N_INSNS (74)}, /* other */
1844 COSTS_N_INSNS (1), /* cost of movsx */
1845 COSTS_N_INSNS (1), /* cost of movzx */
1846 8, /* "large" insn */
1847 17, /* MOVE_RATIO */
1848 4, /* cost for loading QImode using movzbl */
1849 {4, 4, 4}, /* cost of loading integer registers
1850 in QImode, HImode and SImode.
1851 Relative to reg-reg move (2). */
1852 {4, 4, 4}, /* cost of storing integer registers */
1853 4, /* cost of reg,reg fld/fst */
1854 {12, 12, 12}, /* cost of loading fp registers
1855 in SFmode, DFmode and XFmode */
1856 {6, 6, 8}, /* cost of storing fp registers
1857 in SFmode, DFmode and XFmode */
1858 2, /* cost of moving MMX register */
1859 {8, 8}, /* cost of loading MMX registers
1860 in SImode and DImode */
1861 {8, 8}, /* cost of storing MMX registers
1862 in SImode and DImode */
1863 2, /* cost of moving SSE register */
1864 {8, 8, 8}, /* cost of loading SSE registers
1865 in SImode, DImode and TImode */
1866 {8, 8, 8}, /* cost of storing SSE registers
1867 in SImode, DImode and TImode */
1868 5, /* MMX or SSE register to integer */
1869 32, /* size of l1 cache. */
1870 256, /* size of l2 cache. */
1871 64, /* size of prefetch block */
1872 6, /* number of parallel prefetches */
1873 3, /* Branch cost */
1874 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1875 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1876 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1877 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1878 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1879 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1880 generic32_memcpy,
1881 generic32_memset,
1882 1, /* scalar_stmt_cost. */
1883 1, /* scalar load_cost. */
1884 1, /* scalar_store_cost. */
1885 1, /* vec_stmt_cost. */
1886 1, /* vec_to_scalar_cost. */
1887 1, /* scalar_to_vec_cost. */
1888 1, /* vec_align_load_cost. */
1889 2, /* vec_unalign_load_cost. */
1890 1, /* vec_store_cost. */
1891 3, /* cond_taken_branch_cost. */
1892   1,					/* cond_not_taken_branch_cost.  */
1893 };
1895 /* Set by -mtune. */
1896 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1898 /* Set by -mtune or -Os. */
1899 const struct processor_costs *ix86_cost = &pentium_cost;
1901 /* Processor feature/optimization bitmasks. */
1902 #define m_386 (1<<PROCESSOR_I386)
1903 #define m_486 (1<<PROCESSOR_I486)
1904 #define m_PENT (1<<PROCESSOR_PENTIUM)
1905 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1906 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1907 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1908 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1909 #define m_CORE2 (1<<PROCESSOR_CORE2)
1910 #define m_COREI7 (1<<PROCESSOR_COREI7)
1911 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1912 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1913 #define m_ATOM (1<<PROCESSOR_ATOM)
1914 #define m_SLM (1<<PROCESSOR_SLM)
1916 #define m_GEODE (1<<PROCESSOR_GEODE)
1917 #define m_K6 (1<<PROCESSOR_K6)
1918 #define m_K6_GEODE (m_K6 | m_GEODE)
1919 #define m_K8 (1<<PROCESSOR_K8)
1920 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1921 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1922 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1923 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1924 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1925 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1926 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1927 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1928 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1929 #define m_BTVER (m_BTVER1 | m_BTVER2)
1930 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1932 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1933 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1935 /* Generic instruction choice should be a common subset of the supported CPUs
1936 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1937 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
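/* For illustration: the m_* macros above are bitmasks over enum
   processor_type, so a tuning selector such as

     m_P4_NOCONA | m_CORE_ALL | m_GENERIC

   (a hypothetical combination) enables a feature for the CPU chosen by
   -mtune exactly when (selector & (1U << ix86_tune)) != 0, which is the
   test applied by set_ix86_tune_features below.  */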
1939 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1940 #undef DEF_TUNE
1941 #define DEF_TUNE(tune, name, selector) name,
1942 #include "x86-tune.def"
1943 #undef DEF_TUNE
1944 };
1946 /* Feature tests against the various tunings. */
1947 unsigned char ix86_tune_features[X86_TUNE_LAST];
1949 /* Feature tests against the various tunings used to create ix86_tune_features
1950 based on the processor mask. */
1951 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1952 #undef DEF_TUNE
1953 #define DEF_TUNE(tune, name, selector) selector,
1954 #include "x86-tune.def"
1955 #undef DEF_TUNE
1956 };
1958 /* Feature tests against the various architecture variations. */
1959 unsigned char ix86_arch_features[X86_ARCH_LAST];
1961 /* Feature tests against the various architecture variations, used to create
1962 ix86_arch_features based on the processor mask. */
1963 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1964 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1965 ~(m_386 | m_486 | m_PENT | m_K6),
1967 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1968 ~m_386,
1970 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1971 ~(m_386 | m_486),
1973 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1974 ~m_386,
1976 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1977   ~m_386,
1978 };
1980 static const unsigned int x86_accumulate_outgoing_args
1981 = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
1983 static const unsigned int x86_arch_always_fancy_math_387
1984 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
1986 static const unsigned int x86_avx256_split_unaligned_load
1987 = m_COREI7 | m_GENERIC;
1989 static const unsigned int x86_avx256_split_unaligned_store
1990 = m_COREI7 | m_BDVER | m_GENERIC;
1992 /* In case the average insn count for single function invocation is
1993 lower than this constant, emit fast (but longer) prologue and
1994 epilogue code. */
1995 #define FAST_PROLOGUE_INSN_COUNT 20
1997 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
1998 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1999 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2000 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2002 /* Array of the smallest class containing reg number REGNO, indexed by
2003 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2005 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2006 {
2007   /* ax, dx, cx, bx */
2008 AREG, DREG, CREG, BREG,
2009 /* si, di, bp, sp */
2010 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2011 /* FP registers */
2012 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2013 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2014 /* arg pointer */
2015 NON_Q_REGS,
2016 /* flags, fpsr, fpcr, frame */
2017 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2018 /* SSE registers */
2019 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2020 SSE_REGS, SSE_REGS,
2021 /* MMX registers */
2022 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2023 MMX_REGS, MMX_REGS,
2024 /* REX registers */
2025 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2026 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2027 /* SSE REX registers */
2028 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2029   SSE_REGS, SSE_REGS,
2030 };
2032 /* The "default" register map used in 32bit mode. */
2034 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2035 {
2036   0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
2037 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2038 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2039 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2040 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2041 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2042   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2043 };
2045 /* The "default" register map used in 64bit mode. */
2047 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2048 {
2049   0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
2050 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2051 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2052 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2053 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2054 8,9,10,11,12,13,14,15, /* extended integer registers */
2055   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
2056 };
2058 /* Define the register numbers to be used in Dwarf debugging information.
2059 The SVR4 reference port C compiler uses the following register numbers
2060 in its Dwarf output code:
2061 0 for %eax (gcc regno = 0)
2062 1 for %ecx (gcc regno = 2)
2063 2 for %edx (gcc regno = 1)
2064 3 for %ebx (gcc regno = 3)
2065 4 for %esp (gcc regno = 7)
2066 5 for %ebp (gcc regno = 6)
2067 6 for %esi (gcc regno = 4)
2068 7 for %edi (gcc regno = 5)
2069 The following three DWARF register numbers are never generated by
2070 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2071 believes these numbers have these meanings.
2072 8 for %eip (no gcc equivalent)
2073 9 for %eflags (gcc regno = 17)
2074 10 for %trapno (no gcc equivalent)
2075 It is not at all clear how we should number the FP stack registers
2076 for the x86 architecture. If the version of SDB on x86/svr4 were
2077 a bit less brain dead with respect to floating-point then we would
2078 have a precedent to follow with respect to DWARF register numbers
2079 for x86 FP registers, but the SDB on x86/svr4 is so completely
2080 broken with respect to FP registers that it is hardly worth thinking
2081 of it as something to strive for compatibility with.
2082 The version of x86/svr4 SDB I have at the moment does (partially)
2083 seem to believe that DWARF register number 11 is associated with
2084 the x86 register %st(0), but that's about all. Higher DWARF
2085 register numbers don't seem to be associated with anything in
2086 particular, and even for DWARF regno 11, SDB only seems to under-
2087 stand that it should say that a variable lives in %st(0) (when
2088 asked via an `=' command) if we said it was in DWARF regno 11,
2089 but SDB still prints garbage when asked for the value of the
2090 variable in question (via a `/' command).
2091 (Also note that the labels SDB prints for various FP stack regs
2092 when doing an `x' command are all wrong.)
2093 Note that these problems generally don't affect the native SVR4
2094 C compiler because it doesn't allow the use of -O with -g and
2095 because when it is *not* optimizing, it allocates a memory
2096 location for each floating-point variable, and the memory
2097 location is what gets described in the DWARF AT_location
2098 attribute for the variable in question.
2099 Regardless of the severe mental illness of the x86/svr4 SDB, we
2100 do something sensible here and we use the following DWARF
2101 register numbers. Note that these are all stack-top-relative
2102 numbers.
2103 11 for %st(0) (gcc regno = 8)
2104 12 for %st(1) (gcc regno = 9)
2105 13 for %st(2) (gcc regno = 10)
2106 14 for %st(3) (gcc regno = 11)
2107 15 for %st(4) (gcc regno = 12)
2108 16 for %st(5) (gcc regno = 13)
2109 17 for %st(6) (gcc regno = 14)
2110 	18 for %st(7) (gcc regno = 15)
2111 */
2112 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 {
2114   0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
2115 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2116 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2117 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2118 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2120   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2121 };
2123 /* Define parameter passing and return registers. */
2125 static int const x86_64_int_parameter_registers[6] =
2126 {
2127   DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2128 };
2130 static int const x86_64_ms_abi_int_parameter_registers[4] =
2131 {
2132   CX_REG, DX_REG, R8_REG, R9_REG
2133 };
2135 static int const x86_64_int_return_registers[4] =
2136 {
2137   AX_REG, DX_REG, DI_REG, SI_REG
2138 };
2140 /* Additional registers that are clobbered by SYSV calls. */
2142 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2143 {
2144   SI_REG, DI_REG,
2145 XMM6_REG, XMM7_REG,
2146 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2147   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2148 };
2150 /* Define the structure for the machine field in struct function. */
2152 struct GTY(()) stack_local_entry {
2153 unsigned short mode;
2154 unsigned short n;
2155 rtx rtl;
2156   struct stack_local_entry *next;
2157 };
2159 /* Structure describing stack frame layout.
2160 Stack grows downward:
2162 [arguments]
2163 <- ARG_POINTER
2164 saved pc
2166 saved static chain if ix86_static_chain_on_stack
2168 saved frame pointer if frame_pointer_needed
2169 <- HARD_FRAME_POINTER
2170 [saved regs]
2171 <- regs_save_offset
2172 [padding0]
2174 [saved SSE regs]
2175 <- sse_regs_save_offset
2176 [padding1] |
2177 | <- FRAME_POINTER
2178 [va_arg registers] |
2180 [frame] |
2182 [padding2] | = to_allocate
2183 						<- STACK_POINTER
2184   */
2185 struct ix86_frame
2186 {
2187   int nsseregs;
2188 int nregs;
2189 int va_arg_size;
2190 int red_zone_size;
2191 int outgoing_arguments_size;
2193 /* The offsets relative to ARG_POINTER. */
2194 HOST_WIDE_INT frame_pointer_offset;
2195 HOST_WIDE_INT hard_frame_pointer_offset;
2196 HOST_WIDE_INT stack_pointer_offset;
2197 HOST_WIDE_INT hfp_save_offset;
2198 HOST_WIDE_INT reg_save_offset;
2199 HOST_WIDE_INT sse_reg_save_offset;
2201 /* When save_regs_using_mov is set, emit prologue using
2202 move instead of push instructions. */
2203   bool save_regs_using_mov;
2204 };
2206 /* Which cpu are we scheduling for. */
2207 enum attr_cpu ix86_schedule;
2209 /* Which cpu are we optimizing for. */
2210 enum processor_type ix86_tune;
2212 /* Which instruction set architecture to use. */
2213 enum processor_type ix86_arch;
2215 /* True if processor has SSE prefetch instruction. */
2216 unsigned char x86_prefetch_sse;
2218 /* -mstackrealign option */
2219 static const char ix86_force_align_arg_pointer_string[]
2220 = "force_align_arg_pointer";
2222 static rtx (*ix86_gen_leave) (void);
2223 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2224 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2225 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2226 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2227 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2228 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2229 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2230 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2231 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2232 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2233 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2235 /* Preferred alignment for stack boundary in bits. */
2236 unsigned int ix86_preferred_stack_boundary;
2238 /* Alignment for incoming stack boundary in bits specified at
2239 command line. */
2240 static unsigned int ix86_user_incoming_stack_boundary;
2242 /* Default alignment for incoming stack boundary in bits. */
2243 static unsigned int ix86_default_incoming_stack_boundary;
2245 /* Alignment for incoming stack boundary in bits. */
2246 unsigned int ix86_incoming_stack_boundary;
2248 /* Calling abi specific va_list type nodes. */
2249 static GTY(()) tree sysv_va_list_type_node;
2250 static GTY(()) tree ms_va_list_type_node;
2252 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2253 char internal_label_prefix[16];
2254 int internal_label_prefix_len;
2256 /* Fence to use after loop using movnt. */
2257 tree x86_mfence;
2259 /* Register class used for passing given 64bit part of the argument.
2260    These represent classes as documented by the psABI, with the exception
2261 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
2262 use SF or DFmode move instead of DImode to avoid reformatting penalties.
2264 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2265 whenever possible (upper half does contain padding). */
2266 enum x86_64_reg_class
2267 {
2268   X86_64_NO_CLASS,
2269 X86_64_INTEGER_CLASS,
2270 X86_64_INTEGERSI_CLASS,
2271 X86_64_SSE_CLASS,
2272 X86_64_SSESF_CLASS,
2273 X86_64_SSEDF_CLASS,
2274 X86_64_SSEUP_CLASS,
2275 X86_64_X87_CLASS,
2276 X86_64_X87UP_CLASS,
2277 X86_64_COMPLEX_X87_CLASS,
2278   X86_64_MEMORY_CLASS
2279 };
2281 #define MAX_CLASSES 4
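/* For illustration (a hypothetical example, not taken from this file):
   a 16-byte argument such as struct { double d; int i; } occupies two
   eightbytes; the first would classify as X86_64_SSEDF_CLASS (the double
   travels in an SSE register) and the second as X86_64_INTEGERSI_CLASS
   (the int travels in the low half of a general register).  MAX_CLASSES
   bounds how many eightbyte classes a single argument may require.  */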
2283 /* Table of constants used by fldpi, fldln2, etc.... */
2284 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2285 static bool ext_80387_constants_init = 0;
2288 static struct machine_function * ix86_init_machine_status (void);
2289 static rtx ix86_function_value (const_tree, const_tree, bool);
2290 static bool ix86_function_value_regno_p (const unsigned int);
2291 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2292 const_tree);
2293 static rtx ix86_static_chain (const_tree, bool);
2294 static int ix86_function_regparm (const_tree, const_tree);
2295 static void ix86_compute_frame_layout (struct ix86_frame *);
2296 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2297 rtx, rtx, int);
2298 static void ix86_add_new_builtins (HOST_WIDE_INT);
2299 static tree ix86_canonical_va_list_type (tree);
2300 static void predict_jump (int);
2301 static unsigned int split_stack_prologue_scratch_regno (void);
2302 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2304 enum ix86_function_specific_strings
2305 {
2306   IX86_FUNCTION_SPECIFIC_ARCH,
2307 IX86_FUNCTION_SPECIFIC_TUNE,
2308   IX86_FUNCTION_SPECIFIC_MAX
2309 };
2311 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2312 const char *, enum fpmath_unit, bool);
2313 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2314 static void ix86_function_specific_save (struct cl_target_option *);
2315 static void ix86_function_specific_restore (struct cl_target_option *);
2316 static void ix86_function_specific_print (FILE *, int,
2317 struct cl_target_option *);
2318 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2319 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2320 struct gcc_options *);
2321 static bool ix86_can_inline_p (tree, tree);
2322 static void ix86_set_current_function (tree);
2323 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2325 static enum calling_abi ix86_function_abi (const_tree);
2328 #ifndef SUBTARGET32_DEFAULT_CPU
2329 #define SUBTARGET32_DEFAULT_CPU "i386"
2330 #endif
2332 /* Whether -mtune= or -march= were specified */
2333 static int ix86_tune_defaulted;
2334 static int ix86_arch_specified;
2336 /* Vectorization library interface and handlers. */
2337 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2339 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2340 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2342 /* Processor target table, indexed by processor number */
2343 struct ptt
2344 {
2345   const struct processor_costs *cost;		/* Processor costs */
2346 const int align_loop; /* Default alignments. */
2347 const int align_loop_max_skip;
2348 const int align_jump;
2349 const int align_jump_max_skip;
2350   const int align_func;
2351 };
2353 static const struct ptt processor_target_table[PROCESSOR_max] =
2354 {
2355   {&i386_cost, 4, 3, 4, 3, 4},
2356 {&i486_cost, 16, 15, 16, 15, 16},
2357 {&pentium_cost, 16, 7, 16, 7, 16},
2358 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2359 {&geode_cost, 0, 0, 0, 0, 0},
2360 {&k6_cost, 32, 7, 32, 7, 32},
2361 {&athlon_cost, 16, 7, 16, 7, 16},
2362 {&pentium4_cost, 0, 0, 0, 0, 0},
2363 {&k8_cost, 16, 7, 16, 7, 16},
2364 {&nocona_cost, 0, 0, 0, 0, 0},
2365 /* Core 2 */
2366 {&core_cost, 16, 10, 16, 10, 16},
2367 /* Core i7 */
2368 {&core_cost, 16, 10, 16, 10, 16},
2369 /* Core avx2 */
2370 {&core_cost, 16, 10, 16, 10, 16},
2371 {&generic32_cost, 16, 7, 16, 7, 16},
2372 {&generic64_cost, 16, 10, 16, 10, 16},
2373 {&amdfam10_cost, 32, 24, 32, 7, 32},
2374 {&bdver1_cost, 16, 10, 16, 7, 11},
2375 {&bdver2_cost, 16, 10, 16, 7, 11},
2376 {&bdver3_cost, 16, 10, 16, 7, 11},
2377 {&btver1_cost, 16, 10, 16, 7, 11},
2378 {&btver2_cost, 16, 10, 16, 7, 11},
2379 {&atom_cost, 16, 15, 16, 7, 16},
2380   {&slm_cost, 16, 15, 16, 7, 16}
2381 };
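/* For illustration: reading an entry of processor_target_table against
   struct ptt, {&core_cost, 16, 10, 16, 10, 16} means: use core_cost for
   instruction costs, align loops and jumps to 16 bytes while skipping at
   most 10 bytes of padding for each, and align function entries to 16
   bytes.  */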
2383 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2384 {
2385   "generic",
2386 "i386",
2387 "i486",
2388 "pentium",
2389 "pentium-mmx",
2390 "pentiumpro",
2391 "pentium2",
2392 "pentium3",
2393 "pentium4",
2394 "pentium-m",
2395 "prescott",
2396 "nocona",
2397 "core2",
2398 "corei7",
2399 "core-avx2",
2400 "atom",
2401 "slm",
2402 "geode",
2403 "k6",
2404 "k6-2",
2405 "k6-3",
2406 "athlon",
2407 "athlon-4",
2408 "k8",
2409 "amdfam10",
2410 "bdver1",
2411 "bdver2",
2412 "bdver3",
2413 "btver1",
2414   "btver2"
2415 };
2417 static bool
2418 gate_insert_vzeroupper (void)
2419 {
2420   return TARGET_AVX && TARGET_VZEROUPPER;
2421 }
2423 static unsigned int
2424 rest_of_handle_insert_vzeroupper (void)
2425 {
2426   int i;
2428 /* vzeroupper instructions are inserted immediately after reload to
2429 account for possible spills from 256bit registers. The pass
2430 reuses mode switching infrastructure by re-running mode insertion
2431 pass, so disable entities that have already been processed. */
2432 for (i = 0; i < MAX_386_ENTITIES; i++)
2433 ix86_optimize_mode_switching[i] = 0;
2435 ix86_optimize_mode_switching[AVX_U128] = 1;
2437 /* Call optimize_mode_switching. */
2438 g->get_passes ()->execute_pass_mode_switching ();
2439   return 0;
2440 }
2442 namespace {
2444 const pass_data pass_data_insert_vzeroupper =
2445 {
2446   RTL_PASS, /* type */
2447 "vzeroupper", /* name */
2448 OPTGROUP_NONE, /* optinfo_flags */
2449 true, /* has_gate */
2450 true, /* has_execute */
2451 TV_NONE, /* tv_id */
2452 0, /* properties_required */
2453 0, /* properties_provided */
2454 0, /* properties_destroyed */
2455 0, /* todo_flags_start */
2456   ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2457 };
2459 class pass_insert_vzeroupper : public rtl_opt_pass
2460 {
2461 public:
2462 pass_insert_vzeroupper(gcc::context *ctxt)
2463     : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2464   {}
2466 /* opt_pass methods: */
2467 bool gate () { return gate_insert_vzeroupper (); }
2468 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2470 }; // class pass_insert_vzeroupper
2472 } // anon namespace
2474 rtl_opt_pass *
2475 make_pass_insert_vzeroupper (gcc::context *ctxt)
2476 {
2477   return new pass_insert_vzeroupper (ctxt);
2478 }
2480 /* Return true if a red-zone is in use. */
2482 static inline bool
2483 ix86_using_red_zone (void)
2484 {
2485   return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2486 }
2488 /* Return a string that documents the current -m options. The caller is
2489 responsible for freeing the string. */
2491 static char *
2492 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2493 const char *tune, enum fpmath_unit fpmath,
2494 bool add_nl_p)
2495 {
2496   struct ix86_target_opts
2498 const char *option; /* option string */
2499 HOST_WIDE_INT mask; /* isa mask options */
2502   /* This table is ordered so that options like -msse4.2, which imply
2503      preceding options, are matched first.  */
2504 static struct ix86_target_opts isa_opts[] =
2506 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2507 { "-mfma", OPTION_MASK_ISA_FMA },
2508 { "-mxop", OPTION_MASK_ISA_XOP },
2509 { "-mlwp", OPTION_MASK_ISA_LWP },
2510 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2511 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2512 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2513 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2514 { "-msse3", OPTION_MASK_ISA_SSE3 },
2515 { "-msse2", OPTION_MASK_ISA_SSE2 },
2516 { "-msse", OPTION_MASK_ISA_SSE },
2517 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2518 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2519 { "-mmmx", OPTION_MASK_ISA_MMX },
2520 { "-mabm", OPTION_MASK_ISA_ABM },
2521 { "-mbmi", OPTION_MASK_ISA_BMI },
2522 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2523 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2524 { "-mhle", OPTION_MASK_ISA_HLE },
2525 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2526 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2527 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2528 { "-madx", OPTION_MASK_ISA_ADX },
2529 { "-mtbm", OPTION_MASK_ISA_TBM },
2530 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2531 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2532 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2533 { "-maes", OPTION_MASK_ISA_AES },
2534 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2535 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2536 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2537 { "-mf16c", OPTION_MASK_ISA_F16C },
2538 { "-mrtm", OPTION_MASK_ISA_RTM },
2539 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2540 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2543 /* Flag options. */
2544 static struct ix86_target_opts flag_opts[] =
2546 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2547 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2548 { "-m80387", MASK_80387 },
2549 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2550 { "-malign-double", MASK_ALIGN_DOUBLE },
2551 { "-mcld", MASK_CLD },
2552 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2553 { "-mieee-fp", MASK_IEEE_FP },
2554 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2555 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2556 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2557 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2558 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2559 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2560 { "-mno-red-zone", MASK_NO_RED_ZONE },
2561 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2562 { "-mrecip", MASK_RECIP },
2563 { "-mrtd", MASK_RTD },
2564 { "-msseregparm", MASK_SSEREGPARM },
2565 { "-mstack-arg-probe", MASK_STACK_PROBE },
2566 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2567 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2568 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2569 { "-mvzeroupper", MASK_VZEROUPPER },
2570 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2571 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2572 { "-mprefer-avx128", MASK_PREFER_AVX128},
2575 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2577 char isa_other[40];
2578 char target_other[40];
2579 unsigned num = 0;
2580 unsigned i, j;
2581 char *ret;
2582 char *ptr;
2583 size_t len;
2584 size_t line_len;
2585 size_t sep_len;
2586 const char *abi;
2588 memset (opts, '\0', sizeof (opts));
2590 /* Add -march= option. */
2591 if (arch)
2593 opts[num][0] = "-march=";
2594 opts[num++][1] = arch;
2597 /* Add -mtune= option. */
2598 if (tune)
2600 opts[num][0] = "-mtune=";
2601 opts[num++][1] = tune;
2604 /* Add -m32/-m64/-mx32. */
2605 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2607 if ((isa & OPTION_MASK_ABI_64) != 0)
2608 abi = "-m64";
2609 else
2610 abi = "-mx32";
2611 isa &= ~ (OPTION_MASK_ISA_64BIT
2612 | OPTION_MASK_ABI_64
2613 | OPTION_MASK_ABI_X32);
2615 else
2616 abi = "-m32";
2617 opts[num++][0] = abi;
2619 /* Pick out the options in isa options. */
2620 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2622 if ((isa & isa_opts[i].mask) != 0)
2624 opts[num++][0] = isa_opts[i].option;
2625 isa &= ~ isa_opts[i].mask;
2629 if (isa && add_nl_p)
2631 opts[num++][0] = isa_other;
2632 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2633 isa);
2636 /* Add flag options. */
2637 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2639 if ((flags & flag_opts[i].mask) != 0)
2641 opts[num++][0] = flag_opts[i].option;
2642 flags &= ~ flag_opts[i].mask;
2646 if (flags && add_nl_p)
2648 opts[num++][0] = target_other;
2649 sprintf (target_other, "(other flags: %#x)", flags);
2652 /* Add -fpmath= option. */
2653 if (fpmath)
2655 opts[num][0] = "-mfpmath=";
2656 switch ((int) fpmath)
2658 case FPMATH_387:
2659 opts[num++][1] = "387";
2660 break;
2662 case FPMATH_SSE:
2663 opts[num++][1] = "sse";
2664 break;
2666 case FPMATH_387 | FPMATH_SSE:
2667 opts[num++][1] = "sse+387";
2668 break;
2670 default:
2671 gcc_unreachable ();
2675 /* Any options? */
2676 if (num == 0)
2677 return NULL;
2679 gcc_assert (num < ARRAY_SIZE (opts));
2681 /* Size the string. */
2682 len = 0;
2683 sep_len = (add_nl_p) ? 3 : 1;
2684 for (i = 0; i < num; i++)
2686 len += sep_len;
2687 for (j = 0; j < 2; j++)
2688 if (opts[i][j])
2689 len += strlen (opts[i][j]);
2692 /* Build the string. */
2693 ret = ptr = (char *) xmalloc (len);
2694 line_len = 0;
2696 for (i = 0; i < num; i++)
2698 size_t len2[2];
2700 for (j = 0; j < 2; j++)
2701 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2703 if (i != 0)
2705 *ptr++ = ' ';
2706 line_len++;
2708 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2710 *ptr++ = '\\';
2711 *ptr++ = '\n';
2712 line_len = 0;
2716 for (j = 0; j < 2; j++)
2717 if (opts[i][j])
2719 memcpy (ptr, opts[i][j], len2[j]);
2720 ptr += len2[j];
2721 line_len += len2[j];
2725 *ptr = '\0';
2726 gcc_assert (ret + len >= ptr);
2728   return ret;
2729 }
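/* For illustration: for a hypothetical -march=core-avx-i -mtune=core-avx-i
   -m64 -mfpmath=sse configuration the string built above would look
   roughly like

     "-march=core-avx-i -mtune=core-avx-i -m64 -msse4.2 ... -mfpmath=sse"

   with ISA options emitted in the order of the isa_opts table and, when
   ADD_NL_P is true, long lines wrapped with a backslash-newline.  */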
2731 /* Return true if profiling code should be emitted before the
2732    prologue and false otherwise.
2733    Note: on x86 this is the case when -mfentry ("hotfix" support) is used.  */
2734 static bool
2735 ix86_profile_before_prologue (void)
2736 {
2737   return flag_fentry != 0;
2738 }
2740 /* Function that is callable from the debugger to print the current
2741 options. */
2742 void
2743 ix86_debug_options (void)
2744 {
2745   char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2746 ix86_arch_string, ix86_tune_string,
2747 ix86_fpmath, true);
2749 if (opts)
2750     {
2751       fprintf (stderr, "%s\n\n", opts);
2752       free (opts);
2753     }
2754 else
2755 fputs ("<no options>\n\n", stderr);
2757   return;
2758 }
2760 static const char *stringop_alg_names[] = {
2761 #define DEF_ENUM
2762 #define DEF_ALG(alg, name) #name,
2763 #include "stringop.def"
2764 #undef DEF_ENUM
2765 #undef DEF_ALG
2766 };
2768 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2769 The string is of the following form (or comma separated list of it):
2771 strategy_alg:max_size:[align|noalign]
2773 where the full size range for the strategy is either [0, max_size] or
2774 [min_size, max_size], in which min_size is the max_size + 1 of the
2775 preceding range. The last size range must have max_size == -1.
2777 Examples:
2780 -mmemcpy-strategy=libcall:-1:noalign
2782 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2786 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2788 This is to tell the compiler to use the following strategy for memset
2789 1) when the expected size is between [1, 16], use rep_8byte strategy;
2790 2) when the size is between [17, 2048], use vector_loop;
2791 3) when the size is > 2048, use libcall. */
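/* For illustration: with the memset example above, the parsing loop below
   would fill input_ranges[] as

     { {16,   rep_prefix_8_byte, true },
       {2048, vector_loop,       false},
       {-1,   libcall,           true } }

   i.e. MAX is the upper bound of each size range, ALG the algorithm
   chosen for it, and NOALIGN is true for the ranges marked "noalign".  */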
2793 struct stringop_size_range
2794 {
2795   int max;
2796 stringop_alg alg;
2797   bool noalign;
2798 };
2800 static void
2801 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2803 const struct stringop_algs *default_algs;
2804 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2805 char *curr_range_str, *next_range_str;
2806 int i = 0, n = 0;
2808 if (is_memset)
2809 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2810 else
2811 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2813 curr_range_str = strategy_str;
2817 int maxs;
2818 stringop_alg alg;
2819 char alg_name[128];
2820 char align[16];
2821 next_range_str = strchr (curr_range_str, ',');
2822 if (next_range_str)
2823 *next_range_str++ = '\0';
2825 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2826 alg_name, &maxs, align))
2828 error ("wrong arg %s to option %s", curr_range_str,
2829 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2830 return;
2833 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2835 error ("size ranges of option %s should be increasing",
2836 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2837 return;
2840 for (i = 0; i < last_alg; i++)
2842 if (!strcmp (alg_name, stringop_alg_names[i]))
2844 alg = (stringop_alg) i;
2845 break;
2849 if (i == last_alg)
2851 error ("wrong stringop strategy name %s specified for option %s",
2852 alg_name,
2853 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2854 return;
2857 input_ranges[n].max = maxs;
2858 input_ranges[n].alg = alg;
2859 if (!strcmp (align, "align"))
2860 input_ranges[n].noalign = false;
2861 else if (!strcmp (align, "noalign"))
2862 input_ranges[n].noalign = true;
2863 else
2865 error ("unknown alignment %s specified for option %s",
2866 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2867 return;
2869 n++;
2870 curr_range_str = next_range_str;
2872 while (curr_range_str);
2874 if (input_ranges[n - 1].max != -1)
2876 error ("the max value for the last size range should be -1"
2877 " for option %s",
2878 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2879 return;
2882 if (n > MAX_STRINGOP_ALGS)
2884 error ("too many size ranges specified in option %s",
2885 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2886 return;
2889 /* Now override the default algs array. */
2890 for (i = 0; i < n; i++)
2892 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2893 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2894 = input_ranges[i].alg;
2895 *const_cast<int *>(&default_algs->size[i].noalign)
2896 = input_ranges[i].noalign;
2901 /* Parse the -mtune-ctrl= option.  When DUMP is true,
2902 print the features that are explicitly set. */
2904 static void
2905 parse_mtune_ctrl_str (bool dump)
2906 {
2907   if (!ix86_tune_ctrl_string)
2908 return;
2910 char *next_feature_string = NULL;
2911 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2912 char *orig = curr_feature_string;
2913 int i;
2916 bool clear = false;
2918 next_feature_string = strchr (curr_feature_string, ',');
2919 if (next_feature_string)
2920 *next_feature_string++ = '\0';
2921 if (*curr_feature_string == '^')
2923 curr_feature_string++;
2924 clear = true;
2926 for (i = 0; i < X86_TUNE_LAST; i++)
2928 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2930 ix86_tune_features[i] = !clear;
2931 if (dump)
2932 fprintf (stderr, "Explicitly %s feature %s\n",
2933 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2934 break;
2937 if (i == X86_TUNE_LAST)
2938 error ("Unknown parameter to option -mtune-ctrl: %s",
2939 clear ? curr_feature_string - 1 : curr_feature_string);
2940 curr_feature_string = next_feature_string;
2942 while (curr_feature_string);
2943   free (orig);
2944 }
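/* For illustration: a hypothetical

     -mtune-ctrl=feature_a,^feature_b

   (where feature_a and feature_b stand for names from x86-tune.def) forces
   feature_a on and feature_b off, overriding whatever the -mtune processor
   mask selected in set_ix86_tune_features below.  */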
2946 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2947 processor type. */
2949 static void
2950 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2951 {
2952   unsigned int ix86_tune_mask = 1u << ix86_tune;
2953 int i;
2955 for (i = 0; i < X86_TUNE_LAST; ++i)
2957 if (ix86_tune_no_default)
2958 ix86_tune_features[i] = 0;
2959 else
2960 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2963 if (dump)
2965 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2966 for (i = 0; i < X86_TUNE_LAST; i++)
2967 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2968 ix86_tune_features[i] ? "on" : "off");
2971   parse_mtune_ctrl_str (dump);
2972 }
2975 /* Override various settings based on options. If MAIN_ARGS_P, the
2976 options are from the command line, otherwise they are from
2977 attributes. */
2979 static void
2980 ix86_option_override_internal (bool main_args_p)
2982 int i;
2983 unsigned int ix86_arch_mask, ix86_tune_mask;
2984 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2985 const char *prefix;
2986 const char *suffix;
2987 const char *sw;
2989 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2990 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2991 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2992 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2993 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2994 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2995 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2996 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2997 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2998 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2999 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3000 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3001 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3002 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3003 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3004 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3005 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3006 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3007 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3008 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3009 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3010 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3011 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3012 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3013 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3014 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3015 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3016 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3017 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3018 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3019 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3020 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3021 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3022 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3023 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3024 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3025 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3026 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3027 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3028 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3030 /* If this reaches 64, we need to widen the struct pta flags field below.  */
3032 static struct pta
3034 const char *const name; /* processor name or nickname. */
3035 const enum processor_type processor;
3036 const enum attr_cpu schedule;
3037 const unsigned HOST_WIDE_INT flags;
3039 const processor_alias_table[] =
3041 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3042 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3043 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3044 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3045 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3046 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3047 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3048 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3049 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3050 PTA_MMX | PTA_SSE | PTA_FXSR},
3051 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3052 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3053 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3054 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3055 PTA_MMX | PTA_SSE | PTA_FXSR},
3056 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3057 PTA_MMX | PTA_SSE | PTA_FXSR},
3058 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3059 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3060 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3061 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3062 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3063 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3064 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3065 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3066 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3067 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3068 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3069 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3070 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3071 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3072 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3073 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3074 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3075 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3076 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3077 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3078 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3079 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3080 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3081 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3082 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3083 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3084 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3085 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3086 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3087 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3088 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3089 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3090 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3091 | PTA_XSAVEOPT},
3092 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3093 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3094 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3095 {"slm", PROCESSOR_SLM, CPU_SLM,
3096 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3097 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3098 | PTA_FXSR},
3099 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3100 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3101 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3102 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3103 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3104 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3105 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3106 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3107 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3108 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3109 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3110 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3111 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3112 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3113 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3114 {"x86-64", PROCESSOR_K8, CPU_K8,
3115 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3116 {"k8", PROCESSOR_K8, CPU_K8,
3117 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3118 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3119 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3120 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3121 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3122 {"opteron", PROCESSOR_K8, CPU_K8,
3123 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3124 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3125 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3126 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3127 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3128 {"athlon64", PROCESSOR_K8, CPU_K8,
3129 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3130 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3131 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3132 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3133 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3134 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3135 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3136 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3137 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3138 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3139 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3140 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3141 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3142 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3143 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3144 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3145 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3146 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3147 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3148 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3149 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3150 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3151 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3152 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3153 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3154 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3155 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3156 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3157 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3158 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3159 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3160 | PTA_XSAVEOPT | PTA_FSGSBASE},
3161 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3162 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3163 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3164 | PTA_FXSR | PTA_XSAVE},
3165 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3166 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3167 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3168 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3169 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3170 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3172 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3173 PTA_HLE /* flags are only used for -march switch. */ },
3174 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3175 PTA_64BIT
3176 | PTA_HLE /* flags are only used for -march switch. */ },
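/* Illustrative sketch only, not a real alias: a new -march name is added to
   processor_alias_table as a name, a processor, a scheduling model and a
   PTA_* bitmask.  Assuming a hypothetical "examplecpu" that schedules like
   core2 and additionally supports SSE4.1, its entry would look like the one
   below; the PTA bits are translated into OPTION_MASK_ISA_* flags by the
   table scan further down in this function.  */
#if 0
  {"examplecpu", PROCESSOR_CORE2, CPU_CORE2,
   PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4_1 | PTA_CX16 | PTA_FXSR},
#endif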
3179 /* -mrecip options. */
3180 static struct
3182 const char *string; /* option name */
3183 unsigned int mask; /* mask bits to set */
3185 const recip_options[] =
3187 { "all", RECIP_MASK_ALL },
3188 { "none", RECIP_MASK_NONE },
3189 { "div", RECIP_MASK_DIV },
3190 { "sqrt", RECIP_MASK_SQRT },
3191 { "vec-div", RECIP_MASK_VEC_DIV },
3192 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3195 int const pta_size = ARRAY_SIZE (processor_alias_table);
3197 /* Set up prefix/suffix so the error messages refer to either the command
3198 line argument, or the attribute(target). */
3199 if (main_args_p)
3201 prefix = "-m";
3202 suffix = "";
3203 sw = "switch";
3205 else
3207 prefix = "option(\"";
3208 suffix = "\")";
3209 sw = "attribute";
3212 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3213 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3214 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3215 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3216 #ifdef TARGET_BI_ARCH
3217 else
3219 #if TARGET_BI_ARCH == 1
3220 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3221 is on and OPTION_MASK_ABI_X32 is off. We turn off
3222 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3223 -mx32. */
3224 if (TARGET_X32)
3225 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3226 #else
3227 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3228 on and OPTION_MASK_ABI_64 is off. We turn off
3229 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3230 -m64. */
3231 if (TARGET_LP64)
3232 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3233 #endif
3235 #endif
3237 if (TARGET_X32)
3239 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3240 OPTION_MASK_ABI_64 for TARGET_X32. */
3241 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3242 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3244 else if (TARGET_LP64)
3246 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3247 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3248 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3249 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
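/* A minimal worked example of the ABI bits above, assuming a bi-arch
   compiler that defaults to 64-bit code (TARGET_BI_ARCH == 1):
     -m64   ends with OPTION_MASK_ISA_64BIT and OPTION_MASK_ABI_64 set and
            OPTION_MASK_ABI_X32 clear;
     -mx32  ends with OPTION_MASK_ISA_64BIT and OPTION_MASK_ABI_X32 set and
            OPTION_MASK_ABI_64 clear;
     -m32   ends with all three of them clear.  */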
3252 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3253 SUBTARGET_OVERRIDE_OPTIONS;
3254 #endif
3256 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3257 SUBSUBTARGET_OVERRIDE_OPTIONS;
3258 #endif
3260 /* -fPIC is the default for x86_64. */
3261 if (TARGET_MACHO && TARGET_64BIT)
3262 flag_pic = 2;
3264 /* Need to check -mtune=generic first. */
3265 if (ix86_tune_string)
3267 if (!strcmp (ix86_tune_string, "generic")
3268 || !strcmp (ix86_tune_string, "i686")
3269 /* As special support for cross compilers we read -mtune=native
3270 as -mtune=generic. With native compilers we won't see the
3271 -mtune=native, as it was changed by the driver. */
3272 || !strcmp (ix86_tune_string, "native"))
3274 if (TARGET_64BIT)
3275 ix86_tune_string = "generic64";
3276 else
3277 ix86_tune_string = "generic32";
3279 /* If this call is for setting the option attribute, allow the
3280 generic32/generic64 that was previously set. */
3281 else if (!main_args_p
3282 && (!strcmp (ix86_tune_string, "generic32")
3283 || !strcmp (ix86_tune_string, "generic64")))
3285 else if (!strncmp (ix86_tune_string, "generic", 7))
3286 error ("bad value (%s) for %stune=%s %s",
3287 ix86_tune_string, prefix, suffix, sw);
3288 else if (!strcmp (ix86_tune_string, "x86-64"))
3289 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3290 "%stune=k8%s or %stune=generic%s instead as appropriate",
3291 prefix, suffix, prefix, suffix, prefix, suffix);
3293 else
3295 if (ix86_arch_string)
3296 ix86_tune_string = ix86_arch_string;
3297 if (!ix86_tune_string)
3299 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3300 ix86_tune_defaulted = 1;
3303 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3304 need to use a sensible tune option. */
3305 if (!strcmp (ix86_tune_string, "generic")
3306 || !strcmp (ix86_tune_string, "x86-64")
3307 || !strcmp (ix86_tune_string, "i686"))
3309 if (TARGET_64BIT)
3310 ix86_tune_string = "generic64";
3311 else
3312 ix86_tune_string = "generic32";
3316 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3318 /* rep; movq isn't available in 32-bit code. */
3319 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3320 ix86_stringop_alg = no_stringop;
3323 if (!ix86_arch_string)
3324 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3325 else
3326 ix86_arch_specified = 1;
3328 if (global_options_set.x_ix86_pmode)
3330 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3331 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3332 error ("address mode %qs not supported in the %s bit mode",
3333 TARGET_64BIT ? "short" : "long",
3334 TARGET_64BIT ? "64" : "32");
3336 else
3337 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3339 if (!global_options_set.x_ix86_abi)
3340 ix86_abi = DEFAULT_ABI;
3342 if (global_options_set.x_ix86_cmodel)
3344 switch (ix86_cmodel)
3346 case CM_SMALL:
3347 case CM_SMALL_PIC:
3348 if (flag_pic)
3349 ix86_cmodel = CM_SMALL_PIC;
3350 if (!TARGET_64BIT)
3351 error ("code model %qs not supported in the %s bit mode",
3352 "small", "32");
3353 break;
3355 case CM_MEDIUM:
3356 case CM_MEDIUM_PIC:
3357 if (flag_pic)
3358 ix86_cmodel = CM_MEDIUM_PIC;
3359 if (!TARGET_64BIT)
3360 error ("code model %qs not supported in the %s bit mode",
3361 "medium", "32");
3362 else if (TARGET_X32)
3363 error ("code model %qs not supported in x32 mode",
3364 "medium");
3365 break;
3367 case CM_LARGE:
3368 case CM_LARGE_PIC:
3369 if (flag_pic)
3370 ix86_cmodel = CM_LARGE_PIC;
3371 if (!TARGET_64BIT)
3372 error ("code model %qs not supported in the %s bit mode",
3373 "large", "32");
3374 else if (TARGET_X32)
3375 error ("code model %qs not supported in x32 mode",
3376 "large");
3377 break;
3379 case CM_32:
3380 if (flag_pic)
3381 error ("code model %s does not support PIC mode", "32");
3382 if (TARGET_64BIT)
3383 error ("code model %qs not supported in the %s bit mode",
3384 "32", "64");
3385 break;
3387 case CM_KERNEL:
3388 if (flag_pic)
3390 error ("code model %s does not support PIC mode", "kernel");
3391 ix86_cmodel = CM_32;
3393 if (!TARGET_64BIT)
3394 error ("code model %qs not supported in the %s bit mode",
3395 "kernel", "32");
3396 break;
3398 default:
3399 gcc_unreachable ();
3402 else
3404 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3405 use of rip-relative addressing. This eliminates fixups that
3406 would otherwise be needed if this object is to be placed in a
3407 DLL, and is essentially just as efficient as direct addressing. */
3408 if (TARGET_64BIT && (TARGET_RDOS || TARGET_PECOFF))
3409 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3410 else if (TARGET_64BIT)
3411 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3412 else
3413 ix86_cmodel = CM_32;
3415 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3417 error ("-masm=intel not supported in this configuration");
3418 ix86_asm_dialect = ASM_ATT;
3420 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3421 sorry ("%i-bit mode not compiled in",
3422 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3424 for (i = 0; i < pta_size; i++)
3425 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3427 ix86_schedule = processor_alias_table[i].schedule;
3428 ix86_arch = processor_alias_table[i].processor;
3429 /* Default cpu tuning to the architecture. */
3430 ix86_tune = ix86_arch;
3432 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3433 error ("CPU you selected does not support x86-64 "
3434 "instruction set");
3436 if (processor_alias_table[i].flags & PTA_MMX
3437 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3438 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3439 if (processor_alias_table[i].flags & PTA_3DNOW
3440 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3441 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3442 if (processor_alias_table[i].flags & PTA_3DNOW_A
3443 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3444 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3445 if (processor_alias_table[i].flags & PTA_SSE
3446 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3447 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3448 if (processor_alias_table[i].flags & PTA_SSE2
3449 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3450 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3451 if (processor_alias_table[i].flags & PTA_SSE3
3452 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3453 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3454 if (processor_alias_table[i].flags & PTA_SSSE3
3455 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3456 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3457 if (processor_alias_table[i].flags & PTA_SSE4_1
3458 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3459 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3460 if (processor_alias_table[i].flags & PTA_SSE4_2
3461 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3462 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3463 if (processor_alias_table[i].flags & PTA_AVX
3464 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3465 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3466 if (processor_alias_table[i].flags & PTA_AVX2
3467 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3468 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3469 if (processor_alias_table[i].flags & PTA_FMA
3470 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3471 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3472 if (processor_alias_table[i].flags & PTA_SSE4A
3473 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3474 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3475 if (processor_alias_table[i].flags & PTA_FMA4
3476 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3477 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3478 if (processor_alias_table[i].flags & PTA_XOP
3479 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3480 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3481 if (processor_alias_table[i].flags & PTA_LWP
3482 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3483 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3484 if (processor_alias_table[i].flags & PTA_ABM
3485 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3486 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3487 if (processor_alias_table[i].flags & PTA_BMI
3488 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3489 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3490 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3491 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3492 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3493 if (processor_alias_table[i].flags & PTA_TBM
3494 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3495 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3496 if (processor_alias_table[i].flags & PTA_BMI2
3497 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3498 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3499 if (processor_alias_table[i].flags & PTA_CX16
3500 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3501 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3502 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3503 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3504 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3505 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3506 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3507 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3508 if (processor_alias_table[i].flags & PTA_MOVBE
3509 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3510 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3511 if (processor_alias_table[i].flags & PTA_AES
3512 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3513 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3514 if (processor_alias_table[i].flags & PTA_PCLMUL
3515 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3516 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3517 if (processor_alias_table[i].flags & PTA_FSGSBASE
3518 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3519 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3520 if (processor_alias_table[i].flags & PTA_RDRND
3521 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3522 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3523 if (processor_alias_table[i].flags & PTA_F16C
3524 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3525 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3526 if (processor_alias_table[i].flags & PTA_RTM
3527 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3528 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3529 if (processor_alias_table[i].flags & PTA_HLE
3530 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3531 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3532 if (processor_alias_table[i].flags & PTA_PRFCHW
3533 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3534 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3535 if (processor_alias_table[i].flags & PTA_RDSEED
3536 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3537 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3538 if (processor_alias_table[i].flags & PTA_ADX
3539 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3540 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3541 if (processor_alias_table[i].flags & PTA_FXSR
3542 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3543 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3544 if (processor_alias_table[i].flags & PTA_XSAVE
3545 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3546 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3547 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3548 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3549 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3550 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3551 x86_prefetch_sse = true;
3553 break;
3556 if (!strcmp (ix86_arch_string, "generic"))
3557 error ("generic CPU can be used only for %stune=%s %s",
3558 prefix, suffix, sw);
3559 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3560 error ("bad value (%s) for %sarch=%s %s",
3561 ix86_arch_string, prefix, suffix, sw);
3563 ix86_arch_mask = 1u << ix86_arch;
3564 for (i = 0; i < X86_ARCH_LAST; ++i)
3565 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3567 for (i = 0; i < pta_size; i++)
3568 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3570 ix86_schedule = processor_alias_table[i].schedule;
3571 ix86_tune = processor_alias_table[i].processor;
3572 if (TARGET_64BIT)
3574 if (!(processor_alias_table[i].flags & PTA_64BIT))
3576 if (ix86_tune_defaulted)
3578 ix86_tune_string = "x86-64";
3579 for (i = 0; i < pta_size; i++)
3580 if (! strcmp (ix86_tune_string,
3581 processor_alias_table[i].name))
3582 break;
3583 ix86_schedule = processor_alias_table[i].schedule;
3584 ix86_tune = processor_alias_table[i].processor;
3586 else
3587 error ("CPU you selected does not support x86-64 "
3588 "instruction set");
3591 else
3593 /* Adjust tuning when compiling for 32-bit ABI. */
3594 switch (ix86_tune)
3596 case PROCESSOR_GENERIC64:
3597 ix86_tune = PROCESSOR_GENERIC32;
3598 ix86_schedule = CPU_PENTIUMPRO;
3599 break;
3601 default:
3602 break;
3605 /* Intel CPUs have always interpreted SSE prefetch instructions as
3606 NOPs; so, we can enable SSE prefetch instructions even when
3607 -mtune (rather than -march) points us to a processor that has them.
3608 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3609 higher processors. */
3610 if (TARGET_CMOV
3611 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3612 x86_prefetch_sse = true;
3613 break;
3616 if (ix86_tune_specified && i == pta_size)
3617 error ("bad value (%s) for %stune=%s %s",
3618 ix86_tune_string, prefix, suffix, sw);
3620 set_ix86_tune_features (ix86_tune, ix86_dump_tunes);
3622 #ifndef USE_IX86_FRAME_POINTER
3623 #define USE_IX86_FRAME_POINTER 0
3624 #endif
3626 #ifndef USE_X86_64_FRAME_POINTER
3627 #define USE_X86_64_FRAME_POINTER 0
3628 #endif
3630 /* Set the default values for switches whose default depends on TARGET_64BIT
3631 in case they weren't overwritten by command line options. */
3632 if (TARGET_64BIT)
3634 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3635 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3636 if (flag_asynchronous_unwind_tables == 2)
3637 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3638 if (flag_pcc_struct_return == 2)
3639 flag_pcc_struct_return = 0;
3641 else
3643 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3644 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3645 if (flag_asynchronous_unwind_tables == 2)
3646 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3647 if (flag_pcc_struct_return == 2)
3648 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3651 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3652 if (optimize_size)
3653 ix86_cost = &ix86_size_cost;
3654 else
3655 ix86_cost = ix86_tune_cost;
3657 /* Arrange to set up i386_stack_locals for all functions. */
3658 init_machine_status = ix86_init_machine_status;
3660 /* Validate -mregparm= value. */
3661 if (global_options_set.x_ix86_regparm)
3663 if (TARGET_64BIT)
3664 warning (0, "-mregparm is ignored in 64-bit mode");
3665 if (ix86_regparm > REGPARM_MAX)
3667 error ("-mregparm=%d is not between 0 and %d",
3668 ix86_regparm, REGPARM_MAX);
3669 ix86_regparm = 0;
3672 if (TARGET_64BIT)
3673 ix86_regparm = REGPARM_MAX;
3675 /* Default align_* from the processor table. */
3676 if (align_loops == 0)
3678 align_loops = processor_target_table[ix86_tune].align_loop;
3679 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3681 if (align_jumps == 0)
3683 align_jumps = processor_target_table[ix86_tune].align_jump;
3684 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3686 if (align_functions == 0)
3688 align_functions = processor_target_table[ix86_tune].align_func;
3691 /* Provide default for -mbranch-cost= value. */
3692 if (!global_options_set.x_ix86_branch_cost)
3693 ix86_branch_cost = ix86_cost->branch_cost;
3695 if (TARGET_64BIT)
3697 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3699 /* Enable by default the SSE and MMX builtins. Do allow the user to
3700 explicitly disable any of these. In particular, disabling SSE and
3701 MMX for kernel code is extremely useful. */
3702 if (!ix86_arch_specified)
3703 ix86_isa_flags
3704 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3705 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3707 if (TARGET_RTD)
3708 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3710 else
3712 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3714 if (!ix86_arch_specified)
3715 ix86_isa_flags
3716 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3718 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3719 when the programmer takes care to keep the stack from being destroyed. */
3720 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3721 target_flags |= MASK_NO_RED_ZONE;
3724 /* Keep nonleaf frame pointers. */
3725 if (flag_omit_frame_pointer)
3726 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3727 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3728 flag_omit_frame_pointer = 1;
3730 /* If we're doing fast math, we don't care about comparison order
3731 wrt NaNs. This lets us use a shorter comparison sequence. */
3732 if (flag_finite_math_only)
3733 target_flags &= ~MASK_IEEE_FP;
3735 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3736 since the insns won't need emulation. */
3737 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3738 target_flags &= ~MASK_NO_FANCY_MATH_387;
3740 /* Likewise, if the target doesn't have a 387, or we've specified
3741 software floating point, don't use 387 inline intrinsics. */
3742 if (!TARGET_80387)
3743 target_flags |= MASK_NO_FANCY_MATH_387;
3745 /* Turn on MMX builtins for -msse. */
3746 if (TARGET_SSE)
3747 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3749 /* Enable SSE prefetch. */
3750 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3751 x86_prefetch_sse = true;
3753 /* Enable prefetch{,w} instructions for -m3dnow. */
3754 if (TARGET_3DNOW)
3755 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3757 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3758 if (TARGET_SSE4_2 || TARGET_ABM)
3759 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3761 /* Enable lzcnt instruction for -mabm. */
3762 if (TARGET_ABM)
3763 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3765 /* Validate -mpreferred-stack-boundary= value or default it to
3766 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3767 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3768 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3770 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3771 int max = (TARGET_SEH ? 4 : 12);
3773 if (ix86_preferred_stack_boundary_arg < min
3774 || ix86_preferred_stack_boundary_arg > max)
3776 if (min == max)
3777 error ("-mpreferred-stack-boundary is not supported "
3778 "for this target");
3779 else
3780 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3781 ix86_preferred_stack_boundary_arg, min, max);
3783 else
3784 ix86_preferred_stack_boundary
3785 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
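/* Worked example of the computation above: -mpreferred-stack-boundary=4
   yields (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. a 16-byte
   aligned stack, matching the 64-bit psABI requirement.  */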
3788 /* Set the default value for -mstackrealign. */
3789 if (ix86_force_align_arg_pointer == -1)
3790 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3792 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3794 /* Validate -mincoming-stack-boundary= value or default it to
3795 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3796 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3797 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3799 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3800 || ix86_incoming_stack_boundary_arg > 12)
3801 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3802 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3803 else
3805 ix86_user_incoming_stack_boundary
3806 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3807 ix86_incoming_stack_boundary
3808 = ix86_user_incoming_stack_boundary;
3812 /* Accept -msseregparm only if at least SSE support is enabled. */
3813 if (TARGET_SSEREGPARM
3814 && ! TARGET_SSE)
3815 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3817 if (global_options_set.x_ix86_fpmath)
3819 if (ix86_fpmath & FPMATH_SSE)
3821 if (!TARGET_SSE)
3823 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3824 ix86_fpmath = FPMATH_387;
3826 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3828 warning (0, "387 instruction set disabled, using SSE arithmetics");
3829 ix86_fpmath = FPMATH_SSE;
3833 else
3834 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3836 /* If the i387 is disabled, then do not return values in it. */
3837 if (!TARGET_80387)
3838 target_flags &= ~MASK_FLOAT_RETURNS;
3840 /* Use external vectorized library in vectorizing intrinsics. */
3841 if (global_options_set.x_ix86_veclibabi_type)
3842 switch (ix86_veclibabi_type)
3844 case ix86_veclibabi_type_svml:
3845 ix86_veclib_handler = ix86_veclibabi_svml;
3846 break;
3848 case ix86_veclibabi_type_acml:
3849 ix86_veclib_handler = ix86_veclibabi_acml;
3850 break;
3852 default:
3853 gcc_unreachable ();
3856 ix86_tune_mask = 1u << ix86_tune;
3857 if ((!USE_IX86_FRAME_POINTER
3858 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3859 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3860 && !optimize_size)
3861 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3863 /* ??? Unwind info is not correct around the CFG unless either a frame
3864 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3865 unwind info generation to be aware of the CFG and propagating states
3866 around edges. */
3867 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3868 || flag_exceptions || flag_non_call_exceptions)
3869 && flag_omit_frame_pointer
3870 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3872 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3873 warning (0, "unwind tables currently require either a frame pointer "
3874 "or %saccumulate-outgoing-args%s for correctness",
3875 prefix, suffix);
3876 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3879 /* If stack probes are required, the space used for large function
3880 arguments on the stack must also be probed, so enable
3881 -maccumulate-outgoing-args so this happens in the prologue. */
3882 if (TARGET_STACK_PROBE
3883 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3885 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3886 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3887 "for correctness", prefix, suffix);
3888 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3891 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3893 char *p;
3894 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3895 p = strchr (internal_label_prefix, 'X');
3896 internal_label_prefix_len = p - internal_label_prefix;
3897 *p = '\0';
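/* Sketch of the computation above: if ASM_GENERATE_INTERNAL_LABEL were to
   produce ".LX0" (the exact string is target-dependent), the 'X' is found at
   offset 2, so internal_label_prefix becomes ".L" and
   internal_label_prefix_len is 2.  */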
3900 /* When scheduling description is not available, disable scheduler pass
3901 so it won't slow down the compilation and make x87 code slower. */
3902 if (!TARGET_SCHEDULE)
3903 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3905 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3906 ix86_tune_cost->simultaneous_prefetches,
3907 global_options.x_param_values,
3908 global_options_set.x_param_values);
3909 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3910 ix86_tune_cost->prefetch_block,
3911 global_options.x_param_values,
3912 global_options_set.x_param_values);
3913 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3914 ix86_tune_cost->l1_cache_size,
3915 global_options.x_param_values,
3916 global_options_set.x_param_values);
3917 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3918 ix86_tune_cost->l2_cache_size,
3919 global_options.x_param_values,
3920 global_options_set.x_param_values);
3922 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3923 if (flag_prefetch_loop_arrays < 0
3924 && HAVE_prefetch
3925 && (optimize >= 3 || flag_profile_use)
3926 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3927 flag_prefetch_loop_arrays = 1;
3929 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3930 can be optimized to ap = __builtin_next_arg (0). */
3931 if (!TARGET_64BIT && !flag_split_stack)
3932 targetm.expand_builtin_va_start = NULL;
3934 if (TARGET_64BIT)
3936 ix86_gen_leave = gen_leave_rex64;
3937 if (Pmode == DImode)
3939 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3940 ix86_gen_tls_local_dynamic_base_64
3941 = gen_tls_local_dynamic_base_64_di;
3943 else
3945 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3946 ix86_gen_tls_local_dynamic_base_64
3947 = gen_tls_local_dynamic_base_64_si;
3950 else
3951 ix86_gen_leave = gen_leave;
3953 if (Pmode == DImode)
3955 ix86_gen_add3 = gen_adddi3;
3956 ix86_gen_sub3 = gen_subdi3;
3957 ix86_gen_sub3_carry = gen_subdi3_carry;
3958 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3959 ix86_gen_andsp = gen_anddi3;
3960 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3961 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3962 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3963 ix86_gen_monitor = gen_sse3_monitor_di;
3965 else
3967 ix86_gen_add3 = gen_addsi3;
3968 ix86_gen_sub3 = gen_subsi3;
3969 ix86_gen_sub3_carry = gen_subsi3_carry;
3970 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3971 ix86_gen_andsp = gen_andsi3;
3972 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3973 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3974 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3975 ix86_gen_monitor = gen_sse3_monitor_si;
3978 #ifdef USE_IX86_CLD
3979 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3980 if (!TARGET_64BIT)
3981 target_flags |= MASK_CLD & ~target_flags_explicit;
3982 #endif
3984 if (!TARGET_64BIT && flag_pic)
3986 if (flag_fentry > 0)
3987 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3988 "with -fpic");
3989 flag_fentry = 0;
3991 else if (TARGET_SEH)
3993 if (flag_fentry == 0)
3994 sorry ("-mno-fentry isn%'t compatible with SEH");
3995 flag_fentry = 1;
3997 else if (flag_fentry < 0)
3999 #if defined(PROFILE_BEFORE_PROLOGUE)
4000 flag_fentry = 1;
4001 #else
4002 flag_fentry = 0;
4003 #endif
4006 /* When not optimizing for size, enable vzeroupper optimization for
4007 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4008 AVX unaligned load/store. */
4009 if (!optimize_size)
4011 if (flag_expensive_optimizations
4012 && !(target_flags_explicit & MASK_VZEROUPPER))
4013 target_flags |= MASK_VZEROUPPER;
4014 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4015 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4016 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4017 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4018 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4019 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4020 /* Enable 128-bit AVX instruction generation
4021 for the auto-vectorizer. */
4022 if (TARGET_AVX128_OPTIMAL
4023 && !(target_flags_explicit & MASK_PREFER_AVX128))
4024 target_flags |= MASK_PREFER_AVX128;
4027 if (ix86_recip_name)
4029 char *p = ASTRDUP (ix86_recip_name);
4030 char *q;
4031 unsigned int mask, i;
4032 bool invert;
4034 while ((q = strtok (p, ",")) != NULL)
4036 p = NULL;
4037 if (*q == '!')
4039 invert = true;
4040 q++;
4042 else
4043 invert = false;
4045 if (!strcmp (q, "default"))
4046 mask = RECIP_MASK_ALL;
4047 else
4049 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4050 if (!strcmp (q, recip_options[i].string))
4052 mask = recip_options[i].mask;
4053 break;
4056 if (i == ARRAY_SIZE (recip_options))
4058 error ("unknown option for -mrecip=%s", q);
4059 invert = false;
4060 mask = RECIP_MASK_NONE;
4064 recip_mask_explicit |= mask;
4065 if (invert)
4066 recip_mask &= ~mask;
4067 else
4068 recip_mask |= mask;
4072 if (TARGET_RECIP)
4073 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4074 else if (target_flags_explicit & MASK_RECIP)
4075 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
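/* Worked example with a hypothetical command line: -mrecip=all,!sqrt first
   sets every RECIP_MASK_* bit via "all", then the "!sqrt" token clears
   RECIP_MASK_SQRT, so reciprocal approximations are used for everything
   except scalar square roots.  */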
4077 /* Default long double to 64-bit for Bionic. */
4078 if (TARGET_HAS_BIONIC
4079 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4080 target_flags |= MASK_LONG_DOUBLE_64;
4082 /* Save the initial options in case the user does function specific
4083 options. */
4084 if (main_args_p)
4085 target_option_default_node = target_option_current_node
4086 = build_target_option_node ();
4088 /* Handle stack protector */
4089 if (!global_options_set.x_ix86_stack_protector_guard)
4090 ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4092 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4093 if (ix86_tune_memcpy_strategy)
4095 char *str = xstrdup (ix86_tune_memcpy_strategy);
4096 ix86_parse_stringop_strategy_string (str, false);
4097 free (str);
4100 if (ix86_tune_memset_strategy)
4102 char *str = xstrdup (ix86_tune_memset_strategy);
4103 ix86_parse_stringop_strategy_string (str, true);
4104 free (str);
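/* Example of the strategy strings handled above (hedged; the GCC option
   documentation is authoritative): the value is a comma-separated list of
   alg:max_size:dest_align triplets, e.g.
     -mmemcpy-strategy=rep_8byte:64:noalign,libcall:-1:noalign
   which would use "rep movsq" for copies of up to 64 bytes and a library
   call for anything larger.  */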
4108 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4110 static void
4111 ix86_option_override (void)
4113 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4114 static struct register_pass_info insert_vzeroupper_info
4115 = { pass_insert_vzeroupper, "reload",
4116 1, PASS_POS_INSERT_AFTER
4119 ix86_option_override_internal (true);
4122 /* This needs to be done at start up. It's convenient to do it here. */
4123 register_pass (&insert_vzeroupper_info);
4126 /* Update register usage after having seen the compiler flags. */
4128 static void
4129 ix86_conditional_register_usage (void)
4131 int i, c_mask;
4132 unsigned int j;
4134 /* The PIC register, if it exists, is fixed. */
4135 j = PIC_OFFSET_TABLE_REGNUM;
4136 if (j != INVALID_REGNUM)
4137 fixed_regs[j] = call_used_regs[j] = 1;
4139 /* For 32-bit targets, squash the REX registers. */
4140 if (! TARGET_64BIT)
4142 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4143 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4144 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4145 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4148 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4149 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4150 : TARGET_64BIT ? (1 << 2)
4151 : (1 << 1));
4153 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4155 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4157 /* Set/reset conditionally defined registers from
4158 CALL_USED_REGISTERS initializer. */
4159 if (call_used_regs[i] > 1)
4160 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4162 /* Calculate the CLOBBERED_REGS register set as the call-used
4163 registers from the GENERAL_REGS register set. */
4164 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4165 && call_used_regs[i])
4166 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
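/* Purely hypothetical example of the masking above: an initializer entry of
   6 (bits 1 and 2 set) would mean "call-used in 32-bit and 64-bit SysV code
   but not in the 64-bit MS ABI"; with the 64-bit SysV c_mask of (1 << 2) the
   register stays call-used, while with the MS ABI c_mask of (1 << 3) it
   becomes call-saved.  */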
4169 /* If MMX is disabled, squash the registers. */
4170 if (! TARGET_MMX)
4171 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4172 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4173 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4175 /* If SSE is disabled, squash the registers. */
4176 if (! TARGET_SSE)
4177 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4178 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4179 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4181 /* If the FPU is disabled, squash the registers. */
4182 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4183 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4184 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4185 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4189 /* Save the current options */
4191 static void
4192 ix86_function_specific_save (struct cl_target_option *ptr)
4194 ptr->arch = ix86_arch;
4195 ptr->schedule = ix86_schedule;
4196 ptr->tune = ix86_tune;
4197 ptr->branch_cost = ix86_branch_cost;
4198 ptr->tune_defaulted = ix86_tune_defaulted;
4199 ptr->arch_specified = ix86_arch_specified;
4200 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4201 ptr->ix86_target_flags_explicit = target_flags_explicit;
4202 ptr->x_recip_mask_explicit = recip_mask_explicit;
4204 /* The fields are char but the variables are not; make sure the
4205 values fit in the fields. */
4206 gcc_assert (ptr->arch == ix86_arch);
4207 gcc_assert (ptr->schedule == ix86_schedule);
4208 gcc_assert (ptr->tune == ix86_tune);
4209 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4212 /* Restore the current options */
4214 static void
4215 ix86_function_specific_restore (struct cl_target_option *ptr)
4217 enum processor_type old_tune = ix86_tune;
4218 enum processor_type old_arch = ix86_arch;
4219 unsigned int ix86_arch_mask;
4220 int i;
4222 ix86_arch = (enum processor_type) ptr->arch;
4223 ix86_schedule = (enum attr_cpu) ptr->schedule;
4224 ix86_tune = (enum processor_type) ptr->tune;
4225 ix86_branch_cost = ptr->branch_cost;
4226 ix86_tune_defaulted = ptr->tune_defaulted;
4227 ix86_arch_specified = ptr->arch_specified;
4228 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4229 target_flags_explicit = ptr->ix86_target_flags_explicit;
4230 recip_mask_explicit = ptr->x_recip_mask_explicit;
4232 /* Recreate the arch feature tests if the arch changed */
4233 if (old_arch != ix86_arch)
4235 ix86_arch_mask = 1u << ix86_arch;
4236 for (i = 0; i < X86_ARCH_LAST; ++i)
4237 ix86_arch_features[i]
4238 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4241 /* Recreate the tune optimization tests */
4242 if (old_tune != ix86_tune)
4243 set_ix86_tune_features (ix86_tune, false);
4246 /* Print the current options */
4248 static void
4249 ix86_function_specific_print (FILE *file, int indent,
4250 struct cl_target_option *ptr)
4252 char *target_string
4253 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4254 NULL, NULL, ptr->x_ix86_fpmath, false);
4256 fprintf (file, "%*sarch = %d (%s)\n",
4257 indent, "",
4258 ptr->arch,
4259 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4260 ? cpu_names[ptr->arch]
4261 : "<unknown>"));
4263 fprintf (file, "%*stune = %d (%s)\n",
4264 indent, "",
4265 ptr->tune,
4266 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4267 ? cpu_names[ptr->tune]
4268 : "<unknown>"));
4270 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4272 if (target_string)
4274 fprintf (file, "%*s%s\n", indent, "", target_string);
4275 free (target_string);
4280 /* Inner function to process the attribute((target(...))), take an argument and
4281 set the current options from the argument. If we have a list, recursively go
4282 over the list. */
4284 static bool
4285 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4286 struct gcc_options *enum_opts_set)
4288 char *next_optstr;
4289 bool ret = true;
4291 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4292 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4293 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4294 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4295 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4297 enum ix86_opt_type
4299 ix86_opt_unknown,
4300 ix86_opt_yes,
4301 ix86_opt_no,
4302 ix86_opt_str,
4303 ix86_opt_enum,
4304 ix86_opt_isa
4307 static const struct
4309 const char *string;
4310 size_t len;
4311 enum ix86_opt_type type;
4312 int opt;
4313 int mask;
4314 } attrs[] = {
4315 /* isa options */
4316 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4317 IX86_ATTR_ISA ("abm", OPT_mabm),
4318 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4319 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4320 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4321 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4322 IX86_ATTR_ISA ("aes", OPT_maes),
4323 IX86_ATTR_ISA ("avx", OPT_mavx),
4324 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4325 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4326 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4327 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4328 IX86_ATTR_ISA ("sse", OPT_msse),
4329 IX86_ATTR_ISA ("sse2", OPT_msse2),
4330 IX86_ATTR_ISA ("sse3", OPT_msse3),
4331 IX86_ATTR_ISA ("sse4", OPT_msse4),
4332 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4333 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4334 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4335 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4336 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4337 IX86_ATTR_ISA ("fma", OPT_mfma),
4338 IX86_ATTR_ISA ("xop", OPT_mxop),
4339 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4340 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4341 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4342 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4343 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4344 IX86_ATTR_ISA ("hle", OPT_mhle),
4345 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4346 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4347 IX86_ATTR_ISA ("adx", OPT_madx),
4348 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4349 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4350 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4352 /* enum options */
4353 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4355 /* string options */
4356 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4357 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4359 /* flag options */
4360 IX86_ATTR_YES ("cld",
4361 OPT_mcld,
4362 MASK_CLD),
4364 IX86_ATTR_NO ("fancy-math-387",
4365 OPT_mfancy_math_387,
4366 MASK_NO_FANCY_MATH_387),
4368 IX86_ATTR_YES ("ieee-fp",
4369 OPT_mieee_fp,
4370 MASK_IEEE_FP),
4372 IX86_ATTR_YES ("inline-all-stringops",
4373 OPT_minline_all_stringops,
4374 MASK_INLINE_ALL_STRINGOPS),
4376 IX86_ATTR_YES ("inline-stringops-dynamically",
4377 OPT_minline_stringops_dynamically,
4378 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4380 IX86_ATTR_NO ("align-stringops",
4381 OPT_mno_align_stringops,
4382 MASK_NO_ALIGN_STRINGOPS),
4384 IX86_ATTR_YES ("recip",
4385 OPT_mrecip,
4386 MASK_RECIP),
4390 /* If this is a list, recurse to get the options. */
4391 if (TREE_CODE (args) == TREE_LIST)
4393 bool ret = true;
4395 for (; args; args = TREE_CHAIN (args))
4396 if (TREE_VALUE (args)
4397 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4398 p_strings, enum_opts_set))
4399 ret = false;
4401 return ret;
4404 else if (TREE_CODE (args) != STRING_CST)
4406 error ("attribute %<target%> argument not a string");
4407 return false;
4410 /* Handle multiple arguments separated by commas. */
4411 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4413 while (next_optstr && *next_optstr != '\0')
4415 char *p = next_optstr;
4416 char *orig_p = p;
4417 char *comma = strchr (next_optstr, ',');
4418 const char *opt_string;
4419 size_t len, opt_len;
4420 int opt;
4421 bool opt_set_p;
4422 char ch;
4423 unsigned i;
4424 enum ix86_opt_type type = ix86_opt_unknown;
4425 int mask = 0;
4427 if (comma)
4429 *comma = '\0';
4430 len = comma - next_optstr;
4431 next_optstr = comma + 1;
4433 else
4435 len = strlen (p);
4436 next_optstr = NULL;
4439 /* Recognize no-xxx. */
4440 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4442 opt_set_p = false;
4443 p += 3;
4444 len -= 3;
4446 else
4447 opt_set_p = true;
4449 /* Find the option. */
4450 ch = *p;
4451 opt = N_OPTS;
4452 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4454 type = attrs[i].type;
4455 opt_len = attrs[i].len;
4456 if (ch == attrs[i].string[0]
4457 && ((type != ix86_opt_str && type != ix86_opt_enum)
4458 ? len == opt_len
4459 : len > opt_len)
4460 && memcmp (p, attrs[i].string, opt_len) == 0)
4462 opt = attrs[i].opt;
4463 mask = attrs[i].mask;
4464 opt_string = attrs[i].string;
4465 break;
4469 /* Process the option. */
4470 if (opt == N_OPTS)
4472 error ("attribute(target(\"%s\")) is unknown", orig_p);
4473 ret = false;
4476 else if (type == ix86_opt_isa)
4478 struct cl_decoded_option decoded;
4480 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4481 ix86_handle_option (&global_options, &global_options_set,
4482 &decoded, input_location);
4485 else if (type == ix86_opt_yes || type == ix86_opt_no)
4487 if (type == ix86_opt_no)
4488 opt_set_p = !opt_set_p;
4490 if (opt_set_p)
4491 target_flags |= mask;
4492 else
4493 target_flags &= ~mask;
4496 else if (type == ix86_opt_str)
4498 if (p_strings[opt])
4500 error ("option(\"%s\") was already specified", opt_string);
4501 ret = false;
4503 else
4504 p_strings[opt] = xstrdup (p + opt_len);
4507 else if (type == ix86_opt_enum)
4509 bool arg_ok;
4510 int value;
4512 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4513 if (arg_ok)
4514 set_option (&global_options, enum_opts_set, opt, value,
4515 p + opt_len, DK_UNSPECIFIED, input_location,
4516 global_dc);
4517 else
4519 error ("attribute(target(\"%s\")) is unknown", orig_p);
4520 ret = false;
4524 else
4525 gcc_unreachable ();
4528 return ret;
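/* Illustrative user-level usage of the attribute strings parsed above
   (a sketch, not part of this file): a comma-separated list may mix ISA
   names, "no-" prefixes and the arch=/tune=/fpmath= string and enum
   options.  */
#if 0
__attribute__((target ("arch=core2,sse4.1,no-3dnow,fpmath=sse")))
static int
example_dot_product (const int *a, const int *b, int n)
{
  int i, sum = 0;
  for (i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}
#endif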
4531 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4533 tree
4534 ix86_valid_target_attribute_tree (tree args)
4536 const char *orig_arch_string = ix86_arch_string;
4537 const char *orig_tune_string = ix86_tune_string;
4538 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4539 int orig_tune_defaulted = ix86_tune_defaulted;
4540 int orig_arch_specified = ix86_arch_specified;
4541 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4542 tree t = NULL_TREE;
4543 int i;
4544 struct cl_target_option *def
4545 = TREE_TARGET_OPTION (target_option_default_node);
4546 struct gcc_options enum_opts_set;
4548 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4550 /* Process each of the options on the chain. */
4551 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4552 &enum_opts_set))
4553 return error_mark_node;
4555 /* If the changed options are different from the default, rerun
4556 ix86_option_override_internal, and then save the options away.
4557 The string options are attribute options, and will be undone
4558 when we copy the save structure. */
4559 if (ix86_isa_flags != def->x_ix86_isa_flags
4560 || target_flags != def->x_target_flags
4561 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4562 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4563 || enum_opts_set.x_ix86_fpmath)
4565 /* If we are using the default tune= or arch=, undo the string assigned,
4566 and use the default. */
4567 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4568 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4569 else if (!orig_arch_specified)
4570 ix86_arch_string = NULL;
4572 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4573 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4574 else if (orig_tune_defaulted)
4575 ix86_tune_string = NULL;
4577 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4578 if (enum_opts_set.x_ix86_fpmath)
4579 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4580 else if (!TARGET_64BIT && TARGET_SSE)
4582 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4583 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4586 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4587 ix86_option_override_internal (false);
4589 /* Add any builtin functions with the new isa if any. */
4590 ix86_add_new_builtins (ix86_isa_flags);
4592 /* Save the current options unless we are validating options for
4593 #pragma. */
4594 t = build_target_option_node ();
4596 ix86_arch_string = orig_arch_string;
4597 ix86_tune_string = orig_tune_string;
4598 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4600 /* Free up memory allocated to hold the strings */
4601 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4602 free (option_strings[i]);
4605 return t;
4608 /* Hook to validate attribute((target("string"))). */
4610 static bool
4611 ix86_valid_target_attribute_p (tree fndecl,
4612 tree ARG_UNUSED (name),
4613 tree args,
4614 int ARG_UNUSED (flags))
4616 struct cl_target_option cur_target;
4617 bool ret = true;
4619 /* attribute((target("default"))) does nothing, beyond
4620 affecting multi-versioning. */
4621 if (TREE_VALUE (args)
4622 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4623 && TREE_CHAIN (args) == NULL_TREE
4624 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4625 return true;
4627 tree old_optimize = build_optimization_node ();
4628 tree new_target, new_optimize;
4629 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4631 /* If the function changed the optimization levels as well as setting target
4632 options, start with the optimizations specified. */
4633 if (func_optimize && func_optimize != old_optimize)
4634 cl_optimization_restore (&global_options,
4635 TREE_OPTIMIZATION (func_optimize));
4637 /* The target attributes may also change some optimization flags, so update
4638 the optimization options if necessary. */
4639 cl_target_option_save (&cur_target, &global_options);
4640 new_target = ix86_valid_target_attribute_tree (args);
4641 new_optimize = build_optimization_node ();
4643 if (new_target == error_mark_node)
4644 ret = false;
4646 else if (fndecl && new_target)
4648 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4650 if (old_optimize != new_optimize)
4651 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4654 cl_target_option_restore (&global_options, &cur_target);
4656 if (old_optimize != new_optimize)
4657 cl_optimization_restore (&global_options,
4658 TREE_OPTIMIZATION (old_optimize));
4660 return ret;
4664 /* Hook to determine if one function can safely inline another. */
4666 static bool
4667 ix86_can_inline_p (tree caller, tree callee)
4669 bool ret = false;
4670 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4671 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4673 /* If callee has no option attributes, then it is ok to inline. */
4674 if (!callee_tree)
4675 ret = true;
4677 /* If caller has no option attributes, but callee does then it is not ok to
4678 inline. */
4679 else if (!caller_tree)
4680 ret = false;
4682 else
4684 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4685 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4687 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
4688 can inline an SSE2 function but an SSE2 function can't inline an SSE4
4689 function. */
4690 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4691 != callee_opts->x_ix86_isa_flags)
4692 ret = false;
4694 /* See if we have the same non-isa options. */
4695 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4696 ret = false;
4698 /* See if arch, tune, etc. are the same. */
4699 else if (caller_opts->arch != callee_opts->arch)
4700 ret = false;
4702 else if (caller_opts->tune != callee_opts->tune)
4703 ret = false;
4705 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4706 ret = false;
4708 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4709 ret = false;
4711 else
4712 ret = true;
4715 return ret;
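/* Example of the subset rule implemented above (user code, for illustration
   only): a caller declared with __attribute__((target ("sse4.2"))) may
   inline a callee declared with __attribute__((target ("sse2"))), because
   the callee's ISA flags are a subset of the caller's; inlining in the
   opposite direction is rejected.  */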
4719 /* Remember the last target of ix86_set_current_function. */
4720 static GTY(()) tree ix86_previous_fndecl;
4722 /* Invalidate ix86_previous_fndecl cache. */
4723 void
4724 ix86_reset_previous_fndecl (void)
4726 ix86_previous_fndecl = NULL_TREE;
4729 /* Establish appropriate back-end context for processing the function
4730 FNDECL. The argument might be NULL to indicate processing at top
4731 level, outside of any function scope. */
4732 static void
4733 ix86_set_current_function (tree fndecl)
4735 /* Only change the context if the function changes. This hook is called
4736 several times in the course of compiling a function, and we don't want to
4737 slow things down too much or call target_reinit when it isn't safe. */
4738 if (fndecl && fndecl != ix86_previous_fndecl)
4740 tree old_tree = (ix86_previous_fndecl
4741 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4742 : NULL_TREE);
4744 tree new_tree = (fndecl
4745 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4746 : NULL_TREE);
4748 ix86_previous_fndecl = fndecl;
4749 if (old_tree == new_tree)
4752 else if (new_tree)
4754 cl_target_option_restore (&global_options,
4755 TREE_TARGET_OPTION (new_tree));
4756 target_reinit ();
4759 else if (old_tree)
4761 struct cl_target_option *def
4762 = TREE_TARGET_OPTION (target_option_current_node);
4764 cl_target_option_restore (&global_options, def);
4765 target_reinit ();
4771 /* Return true if this goes in large data/bss. */
4773 static bool
4774 ix86_in_large_data_p (tree exp)
4776 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4777 return false;
4779 /* Functions are never large data. */
4780 if (TREE_CODE (exp) == FUNCTION_DECL)
4781 return false;
4783 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4785 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4786 if (strcmp (section, ".ldata") == 0
4787 || strcmp (section, ".lbss") == 0)
4788 return true;
4789 return false;
4791 else
4793 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4795 /* If this is an incomplete type with size 0, then we can't put it
4796 in data because it might be too big when completed. */
4797 if (!size || size > ix86_section_threshold)
4798 return true;
4801 return false;
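/* Sketch of what the predicate above treats as large data, assuming
   -mcmodel=medium and the default -mlarge-data-threshold (user code, shown
   only for illustration):  */
#if 0
static char big_buffer[1 << 20];   /* larger than the threshold: .lbss */
static int small_counter;          /* smaller than the threshold: .bss */
static int tagged_var __attribute__((section (".ldata")));  /* explicit */
#endif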
4804 /* Switch to the appropriate section for output of DECL.
4805 DECL is either a `VAR_DECL' node or a constant of some sort.
4806 RELOC indicates whether forming the initial value of DECL requires
4807 link-time relocations. */
4809 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4810 ATTRIBUTE_UNUSED;
4812 static section *
4813 x86_64_elf_select_section (tree decl, int reloc,
4814 unsigned HOST_WIDE_INT align)
4816 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4817 && ix86_in_large_data_p (decl))
4819 const char *sname = NULL;
4820 unsigned int flags = SECTION_WRITE;
4821 switch (categorize_decl_for_section (decl, reloc))
4823 case SECCAT_DATA:
4824 sname = ".ldata";
4825 break;
4826 case SECCAT_DATA_REL:
4827 sname = ".ldata.rel";
4828 break;
4829 case SECCAT_DATA_REL_LOCAL:
4830 sname = ".ldata.rel.local";
4831 break;
4832 case SECCAT_DATA_REL_RO:
4833 sname = ".ldata.rel.ro";
4834 break;
4835 case SECCAT_DATA_REL_RO_LOCAL:
4836 sname = ".ldata.rel.ro.local";
4837 break;
4838 case SECCAT_BSS:
4839 sname = ".lbss";
4840 flags |= SECTION_BSS;
4841 break;
4842 case SECCAT_RODATA:
4843 case SECCAT_RODATA_MERGE_STR:
4844 case SECCAT_RODATA_MERGE_STR_INIT:
4845 case SECCAT_RODATA_MERGE_CONST:
4846 sname = ".lrodata";
4847 flags = 0;
4848 break;
4849 case SECCAT_SRODATA:
4850 case SECCAT_SDATA:
4851 case SECCAT_SBSS:
4852 gcc_unreachable ();
4853 case SECCAT_TEXT:
4854 case SECCAT_TDATA:
4855 case SECCAT_TBSS:
4856 /* We don't split these for the medium model. Place them into
4857 default sections and hope for the best. */
4858 break;
4860 if (sname)
4862 /* We might get called with string constants, but get_named_section
4863 doesn't like them as they are not DECLs. Also, we need to set
4864 flags in that case. */
4865 if (!DECL_P (decl))
4866 return get_section (sname, flags, NULL);
4867 return get_named_section (decl, sname, reloc);
4870 return default_elf_select_section (decl, reloc, align);
4873 /* Build up a unique section name, expressed as a
4874 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4875 RELOC indicates whether the initial value of EXP requires
4876 link-time relocations. */
4878 static void ATTRIBUTE_UNUSED
4879 x86_64_elf_unique_section (tree decl, int reloc)
4881 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4882 && ix86_in_large_data_p (decl))
4884 const char *prefix = NULL;
4885 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4886 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4888 switch (categorize_decl_for_section (decl, reloc))
4890 case SECCAT_DATA:
4891 case SECCAT_DATA_REL:
4892 case SECCAT_DATA_REL_LOCAL:
4893 case SECCAT_DATA_REL_RO:
4894 case SECCAT_DATA_REL_RO_LOCAL:
4895 prefix = one_only ? ".ld" : ".ldata";
4896 break;
4897 case SECCAT_BSS:
4898 prefix = one_only ? ".lb" : ".lbss";
4899 break;
4900 case SECCAT_RODATA:
4901 case SECCAT_RODATA_MERGE_STR:
4902 case SECCAT_RODATA_MERGE_STR_INIT:
4903 case SECCAT_RODATA_MERGE_CONST:
4904 prefix = one_only ? ".lr" : ".lrodata";
4905 break;
4906 case SECCAT_SRODATA:
4907 case SECCAT_SDATA:
4908 case SECCAT_SBSS:
4909 gcc_unreachable ();
4910 case SECCAT_TEXT:
4911 case SECCAT_TDATA:
4912 case SECCAT_TBSS:
4913 /* We don't split these for the medium model. Place them into
4914 default sections and hope for the best. */
4915 break;
4917 if (prefix)
4919 const char *name, *linkonce;
4920 char *string;
4922 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4923 name = targetm.strip_name_encoding (name);
4925 /* If we're using one_only, then there needs to be a .gnu.linkonce
4926 prefix to the section name. */
4927 linkonce = one_only ? ".gnu.linkonce" : "";
4929 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4931 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4932 return;
4935 default_unique_section (decl, reloc);
4938 #ifdef COMMON_ASM_OP
4939 /* This says how to output assembler code to declare an
4940 uninitialized external-linkage data object.
4942 For medium-model x86-64 we need to use the .largecomm directive for
4943 large objects. */
4944 void
4945 x86_elf_aligned_common (FILE *file,
4946 const char *name, unsigned HOST_WIDE_INT size,
4947 int align)
4949 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4950 && size > (unsigned int)ix86_section_threshold)
4951 fputs (".largecomm\t", file);
4952 else
4953 fputs (COMMON_ASM_OP, file);
4954 assemble_name (file, name);
4955 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4956 size, align / BITS_PER_UNIT);
4958 #endif
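/* For example (illustrative only): with -mcmodel=medium, a common symbol
   larger than the section threshold is announced roughly as

       .largecomm	big_buffer,1048576,32

   whereas smaller commons keep using the ordinary COMMON_ASM_OP (".comm").  */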
4960 /* Utility function for targets to use in implementing
4961 ASM_OUTPUT_ALIGNED_BSS. */
4963 void
4964 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4965 const char *name, unsigned HOST_WIDE_INT size,
4966 int align)
4968 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4969 && size > (unsigned int)ix86_section_threshold)
4970 switch_to_section (get_named_section (decl, ".lbss", 0));
4971 else
4972 switch_to_section (bss_section);
4973 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4974 #ifdef ASM_DECLARE_OBJECT_NAME
4975 last_assemble_variable_decl = decl;
4976 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4977 #else
4978 /* The standard thing is just to output a label for the object. */
4979 ASM_OUTPUT_LABEL (file, name);
4980 #endif /* ASM_DECLARE_OBJECT_NAME */
4981 ASM_OUTPUT_SKIP (file, size ? size : 1);
4984 /* Decide whether we must probe the stack before any space allocation
4985 on this target. It's essentially TARGET_STACK_PROBE except when
4986 -fstack-check causes the stack to be already probed differently. */
4988 bool
4989 ix86_target_stack_probe (void)
4991 /* Do not probe the stack twice if static stack checking is enabled. */
4992 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4993 return false;
4995 return TARGET_STACK_PROBE;
4998 /* Decide whether we can make a sibling call to a function. DECL is the
4999 declaration of the function being targeted by the call and EXP is the
5000 CALL_EXPR representing the call. */
5002 static bool
5003 ix86_function_ok_for_sibcall (tree decl, tree exp)
5005 tree type, decl_or_type;
5006 rtx a, b;
5008 /* If we are generating position-independent code, we cannot sibcall
5009 optimize any indirect call, or a direct call to a global function,
5010 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5011 if (!TARGET_MACHO
5012 && !TARGET_64BIT
5013 && flag_pic
5014 && (!decl || !targetm.binds_local_p (decl)))
5015 return false;
5017 /* If we need to align the outgoing stack, then sibcalling would
5018 unalign the stack, which may break the called function. */
5019 if (ix86_minimum_incoming_stack_boundary (true)
5020 < PREFERRED_STACK_BOUNDARY)
5021 return false;
5023 if (decl)
5025 decl_or_type = decl;
5026 type = TREE_TYPE (decl);
5028 else
5030 /* We're looking at the CALL_EXPR, we need the type of the function. */
5031 type = CALL_EXPR_FN (exp); /* pointer expression */
5032 type = TREE_TYPE (type); /* pointer type */
5033 type = TREE_TYPE (type); /* function type */
5034 decl_or_type = type;
5037 /* Check that the return value locations are the same. Like
5038 if we are returning floats on the 80387 register stack, we cannot
5039 make a sibcall from a function that doesn't return a float to a
5040 function that does or, conversely, from a function that does return
5041 a float to a function that doesn't; the necessary stack adjustment
5042 would not be executed. This is also the place we notice
5043 differences in the return value ABI. Note that it is ok for one
5044 of the functions to have void return type as long as the return
5045 value of the other is passed in a register. */
5046 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5047 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5048 cfun->decl, false);
5049 if (STACK_REG_P (a) || STACK_REG_P (b))
5051 if (!rtx_equal_p (a, b))
5052 return false;
5054 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5056 else if (!rtx_equal_p (a, b))
5057 return false;
5059 if (TARGET_64BIT)
5061 /* The SYSV ABI has more call-clobbered registers;
5062 disallow sibcalls from MS to SYSV. */
5063 if (cfun->machine->call_abi == MS_ABI
5064 && ix86_function_type_abi (type) == SYSV_ABI)
5065 return false;
5067 else
5069 /* If this call is indirect, we'll need to be able to use a
5070 call-clobbered register for the address of the target function.
5071 Make sure that all such registers are not used for passing
5072 parameters. Note that DLLIMPORT functions are indirect. */
5073 if (!decl
5074 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5076 if (ix86_function_regparm (type, NULL) >= 3)
5078 /* ??? Need to count the actual number of registers to be used,
5079 not the possible number of registers. Fix later. */
5080 return false;
5085 /* Otherwise okay. That also includes certain types of indirect calls. */
5086 return true;
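/* For example (illustrative only): in 32-bit PIC code,

       extern void helper (void);
       void wrapper (void) { helper (); }

   is not sibcall-optimized, because calling the global helper through the
   PLT requires %ebx to be live; the same wrapper around a local (static)
   helper can be turned into a plain jmp.  */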
5089 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5090 and "sseregparm" calling convention attributes;
5091 arguments as in struct attribute_spec.handler. */
5093 static tree
5094 ix86_handle_cconv_attribute (tree *node, tree name,
5095 tree args,
5096 int flags ATTRIBUTE_UNUSED,
5097 bool *no_add_attrs)
5099 if (TREE_CODE (*node) != FUNCTION_TYPE
5100 && TREE_CODE (*node) != METHOD_TYPE
5101 && TREE_CODE (*node) != FIELD_DECL
5102 && TREE_CODE (*node) != TYPE_DECL)
5104 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5105 name);
5106 *no_add_attrs = true;
5107 return NULL_TREE;
5110 /* Can combine regparm with all attributes but fastcall and thiscall. */
5111 if (is_attribute_p ("regparm", name))
5113 tree cst;
5115 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5117 error ("fastcall and regparm attributes are not compatible");
5120 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5122 error ("regparm and thiscall attributes are not compatible");
5125 cst = TREE_VALUE (args);
5126 if (TREE_CODE (cst) != INTEGER_CST)
5128 warning (OPT_Wattributes,
5129 "%qE attribute requires an integer constant argument",
5130 name);
5131 *no_add_attrs = true;
5133 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5135 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5136 name, REGPARM_MAX);
5137 *no_add_attrs = true;
5140 return NULL_TREE;
5143 if (TARGET_64BIT)
5145 /* Do not warn when emulating the MS ABI. */
5146 if ((TREE_CODE (*node) != FUNCTION_TYPE
5147 && TREE_CODE (*node) != METHOD_TYPE)
5148 || ix86_function_type_abi (*node) != MS_ABI)
5149 warning (OPT_Wattributes, "%qE attribute ignored",
5150 name);
5151 *no_add_attrs = true;
5152 return NULL_TREE;
5155 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5156 if (is_attribute_p ("fastcall", name))
5158 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5160 error ("fastcall and cdecl attributes are not compatible");
5162 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5164 error ("fastcall and stdcall attributes are not compatible");
5166 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5168 error ("fastcall and regparm attributes are not compatible");
5170 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5172 error ("fastcall and thiscall attributes are not compatible");
5176 /* Can combine stdcall with fastcall (redundant), regparm and
5177 sseregparm. */
5178 else if (is_attribute_p ("stdcall", name))
5180 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5182 error ("stdcall and cdecl attributes are not compatible");
5184 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5186 error ("stdcall and fastcall attributes are not compatible");
5188 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5190 error ("stdcall and thiscall attributes are not compatible");
5194 /* Can combine cdecl with regparm and sseregparm. */
5195 else if (is_attribute_p ("cdecl", name))
5197 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5199 error ("stdcall and cdecl attributes are not compatible");
5201 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5203 error ("fastcall and cdecl attributes are not compatible");
5205 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5207 error ("cdecl and thiscall attributes are not compatible");
5210 else if (is_attribute_p ("thiscall", name))
5212 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5213 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5214 name);
5215 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5217 error ("stdcall and thiscall attributes are not compatible");
5219 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5221 error ("fastcall and thiscall attributes are not compatible");
5223 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5225 error ("cdecl and thiscall attributes are not compatible");
5229 /* Can combine sseregparm with all attributes. */
5231 return NULL_TREE;
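/* For example (illustrative only): the checks above accept

       int __attribute__((stdcall, regparm (2))) f (int, int);

   but reject combinations such as

       int __attribute__((fastcall, regparm (2))) g (int, int);

   with "fastcall and regparm attributes are not compatible".  */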
5234 /* The transactional memory builtins are implicitly regparm or fastcall
5235 depending on the ABI. Override the generic do-nothing attribute that
5236 these builtins were declared with, and replace it with one of the two
5237 attributes that we expect elsewhere. */
5239 static tree
5240 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5241 tree args ATTRIBUTE_UNUSED,
5242 int flags ATTRIBUTE_UNUSED,
5243 bool *no_add_attrs)
5245 tree alt;
5247 /* In no case do we want to add the placeholder attribute. */
5248 *no_add_attrs = true;
5250 /* The 64-bit ABI is unchanged for transactional memory. */
5251 if (TARGET_64BIT)
5252 return NULL_TREE;
5254 /* ??? Is there a better way to validate 32-bit windows? We have
5255 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5256 if (CHECK_STACK_LIMIT > 0)
5257 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5258 else
5260 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5261 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5263 decl_attributes (node, alt, flags);
5265 return NULL_TREE;
5268 /* This function determines from TYPE the calling-convention. */
5270 unsigned int
5271 ix86_get_callcvt (const_tree type)
5273 unsigned int ret = 0;
5274 bool is_stdarg;
5275 tree attrs;
5277 if (TARGET_64BIT)
5278 return IX86_CALLCVT_CDECL;
5280 attrs = TYPE_ATTRIBUTES (type);
5281 if (attrs != NULL_TREE)
5283 if (lookup_attribute ("cdecl", attrs))
5284 ret |= IX86_CALLCVT_CDECL;
5285 else if (lookup_attribute ("stdcall", attrs))
5286 ret |= IX86_CALLCVT_STDCALL;
5287 else if (lookup_attribute ("fastcall", attrs))
5288 ret |= IX86_CALLCVT_FASTCALL;
5289 else if (lookup_attribute ("thiscall", attrs))
5290 ret |= IX86_CALLCVT_THISCALL;
5292 /* Regparm isn't allowed for thiscall and fastcall. */
5293 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5295 if (lookup_attribute ("regparm", attrs))
5296 ret |= IX86_CALLCVT_REGPARM;
5297 if (lookup_attribute ("sseregparm", attrs))
5298 ret |= IX86_CALLCVT_SSEREGPARM;
5301 if (IX86_BASE_CALLCVT(ret) != 0)
5302 return ret;
5305 is_stdarg = stdarg_p (type);
5306 if (TARGET_RTD && !is_stdarg)
5307 return IX86_CALLCVT_STDCALL | ret;
5309 if (ret != 0
5310 || is_stdarg
5311 || TREE_CODE (type) != METHOD_TYPE
5312 || ix86_function_type_abi (type) != MS_ABI)
5313 return IX86_CALLCVT_CDECL | ret;
5315 return IX86_CALLCVT_THISCALL;
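/* For example (illustrative only): in 32-bit code a plain prototype yields
   IX86_CALLCVT_CDECL; a declaration carrying __attribute__((fastcall))
   yields IX86_CALLCVT_FASTCALL; and with -mrtd a non-stdarg function with no
   explicit convention is treated as IX86_CALLCVT_STDCALL, as computed
   above.  */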
5318 /* Return 0 if the attributes for two types are incompatible, 1 if they
5319 are compatible, and 2 if they are nearly compatible (which causes a
5320 warning to be generated). */
5322 static int
5323 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5325 unsigned int ccvt1, ccvt2;
5327 if (TREE_CODE (type1) != FUNCTION_TYPE
5328 && TREE_CODE (type1) != METHOD_TYPE)
5329 return 1;
5331 ccvt1 = ix86_get_callcvt (type1);
5332 ccvt2 = ix86_get_callcvt (type2);
5333 if (ccvt1 != ccvt2)
5334 return 0;
5335 if (ix86_function_regparm (type1, NULL)
5336 != ix86_function_regparm (type2, NULL))
5337 return 0;
5339 return 1;
5342 /* Return the regparm value for a function with the indicated TYPE and DECL.
5343 DECL may be NULL when calling function indirectly
5344 or considering a libcall. */
5346 static int
5347 ix86_function_regparm (const_tree type, const_tree decl)
5349 tree attr;
5350 int regparm;
5351 unsigned int ccvt;
5353 if (TARGET_64BIT)
5354 return (ix86_function_type_abi (type) == SYSV_ABI
5355 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5356 ccvt = ix86_get_callcvt (type);
5357 regparm = ix86_regparm;
5359 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5361 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5362 if (attr)
5364 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5365 return regparm;
5368 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5369 return 2;
5370 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5371 return 1;
5373 /* Use register calling convention for local functions when possible. */
5374 if (decl
5375 && TREE_CODE (decl) == FUNCTION_DECL
5376 && optimize
5377 && !(profile_flag && !flag_fentry))
5379 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5380 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5381 if (i && i->local && i->can_change_signature)
5383 int local_regparm, globals = 0, regno;
5385 /* Make sure no regparm register is taken by a
5386 fixed register variable. */
5387 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5388 if (fixed_regs[local_regparm])
5389 break;
5391 /* We don't want to use regparm(3) for nested functions as
5392 these use a static chain pointer in the third argument. */
5393 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5394 local_regparm = 2;
5396 /* In 32-bit mode save a register for the split stack. */
5397 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5398 local_regparm = 2;
5400 /* Each fixed register usage increases register pressure,
5401 so fewer registers should be used for argument passing.
5402 This functionality can be overridden by an explicit
5403 regparm value. */
5404 for (regno = AX_REG; regno <= DI_REG; regno++)
5405 if (fixed_regs[regno])
5406 globals++;
5408 local_regparm
5409 = globals < local_regparm ? local_regparm - globals : 0;
5411 if (local_regparm > regparm)
5412 regparm = local_regparm;
5416 return regparm;
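/* For example (illustrative only): a declaration such as

       int __attribute__((regparm (3))) add3 (int a, int b, int c);

   passes a, b and c in %eax, %edx and %ecx; fastcall yields 2 register
   arguments (ECX, EDX) and thiscall yields 1 (ECX), matching the returns
   above.  */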
5419 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5420 DFmode (2) arguments in SSE registers for a function with the
5421 indicated TYPE and DECL. DECL may be NULL when calling function
5422 indirectly or considering a libcall. Otherwise return 0. */
5424 static int
5425 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5427 gcc_assert (!TARGET_64BIT);
5429 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5430 by the sseregparm attribute. */
5431 if (TARGET_SSEREGPARM
5432 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5434 if (!TARGET_SSE)
5436 if (warn)
5438 if (decl)
5439 error ("calling %qD with attribute sseregparm without "
5440 "SSE/SSE2 enabled", decl);
5441 else
5442 error ("calling %qT with attribute sseregparm without "
5443 "SSE/SSE2 enabled", type);
5445 return 0;
5448 return 2;
5451 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5452 (and DFmode for SSE2) arguments in SSE registers. */
5453 if (decl && TARGET_SSE_MATH && optimize
5454 && !(profile_flag && !flag_fentry))
5456 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5457 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5458 if (i && i->local && i->can_change_signature)
5459 return TARGET_SSE2 ? 2 : 1;
5462 return 0;
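/* For example (illustrative only): with SSE2 enabled,

       double __attribute__((sseregparm)) scale (double x, double y);

   receives x and y in SSE registers instead of on the stack; compiling the
   same declaration with -mno-sse triggers the error issued above.  */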
5465 /* Return true if EAX is live at the start of the function. Used by
5466 ix86_expand_prologue to determine if we need special help before
5467 calling allocate_stack_worker. */
5469 static bool
5470 ix86_eax_live_at_start_p (void)
5472 /* Cheat. Don't bother working forward from ix86_function_regparm
5473 to the function type to whether an actual argument is located in
5474 eax. Instead just look at cfg info, which is still close enough
5475 to correct at this point. This gives false positives for broken
5476 functions that might use uninitialized data that happens to be
5477 allocated in eax, but who cares? */
5478 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5481 static bool
5482 ix86_keep_aggregate_return_pointer (tree fntype)
5484 tree attr;
5486 if (!TARGET_64BIT)
5488 attr = lookup_attribute ("callee_pop_aggregate_return",
5489 TYPE_ATTRIBUTES (fntype));
5490 if (attr)
5491 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5493 /* For 32-bit MS-ABI the default is to keep aggregate
5494 return pointer. */
5495 if (ix86_function_type_abi (fntype) == MS_ABI)
5496 return true;
5498 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5501 /* Value is the number of bytes of arguments automatically
5502 popped when returning from a subroutine call.
5503 FUNDECL is the declaration node of the function (as a tree),
5504 FUNTYPE is the data type of the function (as a tree),
5505 or for a library call it is an identifier node for the subroutine name.
5506 SIZE is the number of bytes of arguments passed on the stack.
5508 On the 80386, the RTD insn may be used to pop them if the number
5509 of args is fixed, but if the number is variable then the caller
5510 must pop them all. RTD can't be used for library calls now
5511 because the library is compiled with the Unix compiler.
5512 Use of RTD is a selectable option, since it is incompatible with
5513 standard Unix calling sequences. If the option is not selected,
5514 the caller must always pop the args.
5516 The attribute stdcall is equivalent to RTD on a per module basis. */
5518 static int
5519 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5521 unsigned int ccvt;
5523 /* None of the 64-bit ABIs pop arguments. */
5524 if (TARGET_64BIT)
5525 return 0;
5527 ccvt = ix86_get_callcvt (funtype);
5529 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5530 | IX86_CALLCVT_THISCALL)) != 0
5531 && ! stdarg_p (funtype))
5532 return size;
5534 /* Lose any fake structure return argument if it is passed on the stack. */
5535 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5536 && !ix86_keep_aggregate_return_pointer (funtype))
5538 int nregs = ix86_function_regparm (funtype, fundecl);
5539 if (nregs == 0)
5540 return GET_MODE_SIZE (Pmode);
5543 return 0;
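/* For example (illustrative only): for

       void __attribute__((stdcall)) cb (int a, int b);

   this returns 8, so the callee pops its own arguments (ret $8), whereas a
   cdecl function returns 0 and leaves the popping to the caller.  */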
5546 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5548 static bool
5549 ix86_legitimate_combined_insn (rtx insn)
5551 /* Check operand constraints in case hard registers were propagated
5552 into insn pattern. This check prevents combine pass from
5553 generating insn patterns with invalid hard register operands.
5554 These invalid insns can eventually confuse reload to error out
5555 with a spill failure. See also PRs 46829 and 46843. */
5556 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5558 int i;
5560 extract_insn (insn);
5561 preprocess_constraints ();
5563 for (i = 0; i < recog_data.n_operands; i++)
5565 rtx op = recog_data.operand[i];
5566 enum machine_mode mode = GET_MODE (op);
5567 struct operand_alternative *op_alt;
5568 int offset = 0;
5569 bool win;
5570 int j;
5572 /* A unary operator may be accepted by the predicate, but it
5573 is irrelevant for matching constraints. */
5574 if (UNARY_P (op))
5575 op = XEXP (op, 0);
5577 if (GET_CODE (op) == SUBREG)
5579 if (REG_P (SUBREG_REG (op))
5580 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5581 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5582 GET_MODE (SUBREG_REG (op)),
5583 SUBREG_BYTE (op),
5584 GET_MODE (op));
5585 op = SUBREG_REG (op);
5588 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5589 continue;
5591 op_alt = recog_op_alt[i];
5593 /* Operand has no constraints, anything is OK. */
5594 win = !recog_data.n_alternatives;
5596 for (j = 0; j < recog_data.n_alternatives; j++)
5598 if (op_alt[j].anything_ok
5599 || (op_alt[j].matches != -1
5600 && operands_match_p
5601 (recog_data.operand[i],
5602 recog_data.operand[op_alt[j].matches]))
5603 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5605 win = true;
5606 break;
5610 if (!win)
5611 return false;
5615 return true;
5618 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5620 static unsigned HOST_WIDE_INT
5621 ix86_asan_shadow_offset (void)
5623 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5624 : HOST_WIDE_INT_C (0x7fff8000))
5625 : (HOST_WIDE_INT_1 << 29);
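/* As a rough sketch of how this offset is used by AddressSanitizer
   instrumentation (illustrative only):

       shadow_addr = (addr >> 3) + ix86_asan_shadow_offset ();

   i.e. each shadow byte describes 8 application bytes, with the shadow
   region placed at the offset returned above.  */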
5628 /* Argument support functions. */
5630 /* Return true when register may be used to pass function parameters. */
5631 bool
5632 ix86_function_arg_regno_p (int regno)
5634 int i;
5635 const int *parm_regs;
5637 if (!TARGET_64BIT)
5639 if (TARGET_MACHO)
5640 return (regno < REGPARM_MAX
5641 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5642 else
5643 return (regno < REGPARM_MAX
5644 || (TARGET_MMX && MMX_REGNO_P (regno)
5645 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5646 || (TARGET_SSE && SSE_REGNO_P (regno)
5647 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5650 if (TARGET_MACHO)
5652 if (SSE_REGNO_P (regno) && TARGET_SSE)
5653 return true;
5655 else
5657 if (TARGET_SSE && SSE_REGNO_P (regno)
5658 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5659 return true;
5662 /* TODO: The function should depend on the current function's ABI, but
5663 builtins.c would need updating for that. Therefore we use the
5664 default ABI. */
5666 /* RAX is used as hidden argument to va_arg functions. */
5667 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5668 return true;
5670 if (ix86_abi == MS_ABI)
5671 parm_regs = x86_64_ms_abi_int_parameter_registers;
5672 else
5673 parm_regs = x86_64_int_parameter_registers;
5674 for (i = 0; i < (ix86_abi == MS_ABI
5675 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5676 if (regno == parm_regs[i])
5677 return true;
5678 return false;
5681 /* Return if we do not know how to pass TYPE solely in registers. */
5683 static bool
5684 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5686 if (must_pass_in_stack_var_size_or_pad (mode, type))
5687 return true;
5689 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5690 The layout_type routine is crafty and tries to trick us into passing
5691 currently unsupported vector types on the stack by using TImode. */
5692 return (!TARGET_64BIT && mode == TImode
5693 && type && TREE_CODE (type) != VECTOR_TYPE);
5696 /* Return the size, in bytes, of the area reserved for arguments passed
5697 in registers for the function represented by FNDECL, depending on the
5698 ABI used. */
5700 ix86_reg_parm_stack_space (const_tree fndecl)
5702 enum calling_abi call_abi = SYSV_ABI;
5703 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5704 call_abi = ix86_function_abi (fndecl);
5705 else
5706 call_abi = ix86_function_type_abi (fndecl);
5707 if (TARGET_64BIT && call_abi == MS_ABI)
5708 return 32;
5709 return 0;
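/* For example (illustrative only): for a 64-bit MS-ABI call f (a, b, c, d),
   where a..d travel in RCX, RDX, R8 and R9, the caller still reserves the
   32 bytes returned above as "home space" for the four register parameters,
   just above the return address on the stack.  */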
5712 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5713 call ABI used. */
5714 enum calling_abi
5715 ix86_function_type_abi (const_tree fntype)
5717 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5719 enum calling_abi abi = ix86_abi;
5720 if (abi == SYSV_ABI)
5722 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5723 abi = MS_ABI;
5725 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5726 abi = SYSV_ABI;
5727 return abi;
5729 return ix86_abi;
5732 static bool
5733 ix86_function_ms_hook_prologue (const_tree fn)
5735 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5737 if (decl_function_context (fn) != NULL_TREE)
5738 error_at (DECL_SOURCE_LOCATION (fn),
5739 "ms_hook_prologue is not compatible with nested function");
5740 else
5741 return true;
5743 return false;
5746 static enum calling_abi
5747 ix86_function_abi (const_tree fndecl)
5749 if (! fndecl)
5750 return ix86_abi;
5751 return ix86_function_type_abi (TREE_TYPE (fndecl));
5754 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5755 call ABI used. */
5756 enum calling_abi
5757 ix86_cfun_abi (void)
5759 if (! cfun)
5760 return ix86_abi;
5761 return cfun->machine->call_abi;
5764 /* Write the extra assembler code needed to declare a function properly. */
5766 void
5767 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5768 tree decl)
5770 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5772 if (is_ms_hook)
5774 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5775 unsigned int filler_cc = 0xcccccccc;
5777 for (i = 0; i < filler_count; i += 4)
5778 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5781 #ifdef SUBTARGET_ASM_UNWIND_INIT
5782 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5783 #endif
5785 ASM_OUTPUT_LABEL (asm_out_file, fname);
5787 /* Output magic byte marker, if hot-patch attribute is set. */
5788 if (is_ms_hook)
5790 if (TARGET_64BIT)
5792 /* leaq [%rsp + 0], %rsp */
5793 asm_fprintf (asm_out_file, ASM_BYTE
5794 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5796 else
5798 /* movl.s %edi, %edi
5799 push %ebp
5800 movl.s %esp, %ebp */
5801 asm_fprintf (asm_out_file, ASM_BYTE
5802 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5807 /* regclass.c */
5808 extern void init_regs (void);
5810 /* Implementation of call abi switching target hook. Specific to FNDECL
5811 the specific call register sets are set. See also
5812 ix86_conditional_register_usage for more details. */
5813 void
5814 ix86_call_abi_override (const_tree fndecl)
5816 if (fndecl == NULL_TREE)
5817 cfun->machine->call_abi = ix86_abi;
5818 else
5819 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5822 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5823 expensive re-initialization of init_regs each time we switch function context
5824 since this is needed only during RTL expansion. */
5825 static void
5826 ix86_maybe_switch_abi (void)
5828 if (TARGET_64BIT &&
5829 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5830 reinit_regs ();
5833 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5834 for a call to a function whose data type is FNTYPE.
5835 For a library call, FNTYPE is 0. */
5837 void
5838 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5839 tree fntype, /* tree ptr for function decl */
5840 rtx libname, /* SYMBOL_REF of library name or 0 */
5841 tree fndecl,
5842 int caller)
5844 struct cgraph_local_info *i;
5846 memset (cum, 0, sizeof (*cum));
5848 if (fndecl)
5850 i = cgraph_local_info (fndecl);
5851 cum->call_abi = ix86_function_abi (fndecl);
5853 else
5855 i = NULL;
5856 cum->call_abi = ix86_function_type_abi (fntype);
5859 cum->caller = caller;
5861 /* Set up the number of registers to use for passing arguments. */
5863 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5864 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5865 "or subtarget optimization implying it");
5866 cum->nregs = ix86_regparm;
5867 if (TARGET_64BIT)
5869 cum->nregs = (cum->call_abi == SYSV_ABI
5870 ? X86_64_REGPARM_MAX
5871 : X86_64_MS_REGPARM_MAX);
5873 if (TARGET_SSE)
5875 cum->sse_nregs = SSE_REGPARM_MAX;
5876 if (TARGET_64BIT)
5878 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5879 ? X86_64_SSE_REGPARM_MAX
5880 : X86_64_MS_SSE_REGPARM_MAX);
5883 if (TARGET_MMX)
5884 cum->mmx_nregs = MMX_REGPARM_MAX;
5885 cum->warn_avx = true;
5886 cum->warn_sse = true;
5887 cum->warn_mmx = true;
5889 /* Because types might mismatch between caller and callee, we need to
5890 use the actual type of the function for local calls.
5891 FIXME: cgraph_analyze can be told to actually record if a function uses
5892 va_start, so for local functions maybe_vaarg can be made more aggressive,
5893 helping K&R code.
5894 FIXME: once the type system is fixed, we won't need this code anymore. */
5895 if (i && i->local && i->can_change_signature)
5896 fntype = TREE_TYPE (fndecl);
5897 cum->maybe_vaarg = (fntype
5898 ? (!prototype_p (fntype) || stdarg_p (fntype))
5899 : !libname);
5901 if (!TARGET_64BIT)
5903 /* If there are variable arguments, then we won't pass anything
5904 in registers in 32-bit mode. */
5905 if (stdarg_p (fntype))
5907 cum->nregs = 0;
5908 cum->sse_nregs = 0;
5909 cum->mmx_nregs = 0;
5910 cum->warn_avx = 0;
5911 cum->warn_sse = 0;
5912 cum->warn_mmx = 0;
5913 return;
5916 /* Use ecx and edx registers if function has fastcall attribute,
5917 else look for regparm information. */
5918 if (fntype)
5920 unsigned int ccvt = ix86_get_callcvt (fntype);
5921 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5923 cum->nregs = 1;
5924 cum->fastcall = 1; /* Same first register as in fastcall. */
5926 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5928 cum->nregs = 2;
5929 cum->fastcall = 1;
5931 else
5932 cum->nregs = ix86_function_regparm (fntype, fndecl);
5935 /* Set up the number of SSE registers used for passing SFmode
5936 and DFmode arguments. Warn for mismatching ABI. */
5937 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5941 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5942 But in the case of vector types, it is some vector mode.
5944 When we have only some of our vector isa extensions enabled, then there
5945 are some modes for which vector_mode_supported_p is false. For these
5946 modes, the generic vector support in gcc will choose some non-vector mode
5947 in order to implement the type. By computing the natural mode, we'll
5948 select the proper ABI location for the operand and not depend on whatever
5949 the middle-end decides to do with these vector types.
5951 The middle-end can't deal with vector types > 16 bytes. In this
5952 case, we return the original mode and warn ABI change if CUM isn't
5953 NULL. */
5955 static enum machine_mode
5956 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5958 enum machine_mode mode = TYPE_MODE (type);
5960 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5962 HOST_WIDE_INT size = int_size_in_bytes (type);
5963 if ((size == 8 || size == 16 || size == 32)
5964 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5965 && TYPE_VECTOR_SUBPARTS (type) > 1)
5967 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5969 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5970 mode = MIN_MODE_VECTOR_FLOAT;
5971 else
5972 mode = MIN_MODE_VECTOR_INT;
5974 /* Get the mode which has this inner mode and number of units. */
5975 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5976 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5977 && GET_MODE_INNER (mode) == innermode)
5979 if (size == 32 && !TARGET_AVX)
5981 static bool warnedavx;
5983 if (cum
5984 && !warnedavx
5985 && cum->warn_avx)
5987 warnedavx = true;
5988 warning (0, "AVX vector argument without AVX "
5989 "enabled changes the ABI");
5991 return TYPE_MODE (type);
5993 else if ((size == 8 || size == 16) && !TARGET_SSE)
5995 static bool warnedsse;
5997 if (cum
5998 && !warnedsse
5999 && cum->warn_sse)
6001 warnedsse = true;
6002 warning (0, "SSE vector argument without SSE "
6003 "enabled changes the ABI");
6005 return mode;
6007 else
6008 return mode;
6011 gcc_unreachable ();
6015 return mode;
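/* For example (illustrative only): for

       typedef int v4si __attribute__((vector_size (16)));

   the natural mode computed above is V4SImode, so a v4si argument is placed
   according to the vector ABI rather than whatever mode the middle end
   picked for the type; when SSE is disabled, the "changes the ABI" warning
   above is emitted.  */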
6018 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6019 this may not agree with the mode that the type system has chosen for the
6020 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6021 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6023 static rtx
6024 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6025 unsigned int regno)
6027 rtx tmp;
6029 if (orig_mode != BLKmode)
6030 tmp = gen_rtx_REG (orig_mode, regno);
6031 else
6033 tmp = gen_rtx_REG (mode, regno);
6034 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6035 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6038 return tmp;
6041 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6042 The goal of this code is to classify each eightbyte of an incoming argument
6043 by register class and assign registers accordingly. */
6045 /* Return the union class of CLASS1 and CLASS2.
6046 See the x86-64 PS ABI for details. */
6048 static enum x86_64_reg_class
6049 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6051 /* Rule #1: If both classes are equal, this is the resulting class. */
6052 if (class1 == class2)
6053 return class1;
6055 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6056 the other class. */
6057 if (class1 == X86_64_NO_CLASS)
6058 return class2;
6059 if (class2 == X86_64_NO_CLASS)
6060 return class1;
6062 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6063 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6064 return X86_64_MEMORY_CLASS;
6066 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6067 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6068 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6069 return X86_64_INTEGERSI_CLASS;
6070 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6071 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6072 return X86_64_INTEGER_CLASS;
6074 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6075 MEMORY is used. */
6076 if (class1 == X86_64_X87_CLASS
6077 || class1 == X86_64_X87UP_CLASS
6078 || class1 == X86_64_COMPLEX_X87_CLASS
6079 || class2 == X86_64_X87_CLASS
6080 || class2 == X86_64_X87UP_CLASS
6081 || class2 == X86_64_COMPLEX_X87_CLASS)
6082 return X86_64_MEMORY_CLASS;
6084 /* Rule #6: Otherwise class SSE is used. */
6085 return X86_64_SSE_CLASS;
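/* For example (illustrative only): for

       struct s { int i; float f; };

   both fields land in the same eightbyte; merging the INTEGERSI class of
   the int with the SSESF class of the float yields INTEGERSI (rule #4), so
   the whole struct is passed in a single integer register.  */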
6088 /* Classify the argument of type TYPE and mode MODE.
6089 CLASSES will be filled by the register class used to pass each word
6090 of the operand. The number of words is returned. In case the parameter
6091 should be passed in memory, 0 is returned. As a special case for zero
6092 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6094 BIT_OFFSET is used internally for handling records and specifies the
6095 offset in bits modulo 256 to avoid overflow cases.
6097 See the x86-64 PS ABI for details.
6100 static int
6101 classify_argument (enum machine_mode mode, const_tree type,
6102 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6104 HOST_WIDE_INT bytes =
6105 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6106 int words
6107 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6109 /* Variable sized entities are always passed/returned in memory. */
6110 if (bytes < 0)
6111 return 0;
6113 if (mode != VOIDmode
6114 && targetm.calls.must_pass_in_stack (mode, type))
6115 return 0;
6117 if (type && AGGREGATE_TYPE_P (type))
6119 int i;
6120 tree field;
6121 enum x86_64_reg_class subclasses[MAX_CLASSES];
6123 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6124 if (bytes > 32)
6125 return 0;
6127 for (i = 0; i < words; i++)
6128 classes[i] = X86_64_NO_CLASS;
6130 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6131 signal the memory class, so handle it as a special case. */
6132 if (!words)
6134 classes[0] = X86_64_NO_CLASS;
6135 return 1;
6138 /* Classify each field of record and merge classes. */
6139 switch (TREE_CODE (type))
6141 case RECORD_TYPE:
6142 /* And now merge the fields of structure. */
6143 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6145 if (TREE_CODE (field) == FIELD_DECL)
6147 int num;
6149 if (TREE_TYPE (field) == error_mark_node)
6150 continue;
6152 /* Bitfields are always classified as integer. Handle them
6153 early, since later code would consider them to be
6154 misaligned integers. */
6155 if (DECL_BIT_FIELD (field))
6157 for (i = (int_bit_position (field)
6158 + (bit_offset % 64)) / 8 / 8;
6159 i < ((int_bit_position (field) + (bit_offset % 64))
6160 + tree_low_cst (DECL_SIZE (field), 0)
6161 + 63) / 8 / 8; i++)
6162 classes[i] =
6163 merge_classes (X86_64_INTEGER_CLASS,
6164 classes[i]);
6166 else
6168 int pos;
6170 type = TREE_TYPE (field);
6172 /* Flexible array member is ignored. */
6173 if (TYPE_MODE (type) == BLKmode
6174 && TREE_CODE (type) == ARRAY_TYPE
6175 && TYPE_SIZE (type) == NULL_TREE
6176 && TYPE_DOMAIN (type) != NULL_TREE
6177 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6178 == NULL_TREE))
6180 static bool warned;
6182 if (!warned && warn_psabi)
6184 warned = true;
6185 inform (input_location,
6186 "the ABI of passing struct with"
6187 " a flexible array member has"
6188 " changed in GCC 4.4");
6190 continue;
6192 num = classify_argument (TYPE_MODE (type), type,
6193 subclasses,
6194 (int_bit_position (field)
6195 + bit_offset) % 256);
6196 if (!num)
6197 return 0;
6198 pos = (int_bit_position (field)
6199 + (bit_offset % 64)) / 8 / 8;
6200 for (i = 0; i < num && (i + pos) < words; i++)
6201 classes[i + pos] =
6202 merge_classes (subclasses[i], classes[i + pos]);
6206 break;
6208 case ARRAY_TYPE:
6209 /* Arrays are handled as small records. */
6211 int num;
6212 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6213 TREE_TYPE (type), subclasses, bit_offset);
6214 if (!num)
6215 return 0;
6217 /* The partial classes are now full classes. */
6218 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6219 subclasses[0] = X86_64_SSE_CLASS;
6220 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6221 && !((bit_offset % 64) == 0 && bytes == 4))
6222 subclasses[0] = X86_64_INTEGER_CLASS;
6224 for (i = 0; i < words; i++)
6225 classes[i] = subclasses[i % num];
6227 break;
6229 case UNION_TYPE:
6230 case QUAL_UNION_TYPE:
6231 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6233 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6235 if (TREE_CODE (field) == FIELD_DECL)
6237 int num;
6239 if (TREE_TYPE (field) == error_mark_node)
6240 continue;
6242 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6243 TREE_TYPE (field), subclasses,
6244 bit_offset);
6245 if (!num)
6246 return 0;
6247 for (i = 0; i < num; i++)
6248 classes[i] = merge_classes (subclasses[i], classes[i]);
6251 break;
6253 default:
6254 gcc_unreachable ();
6257 if (words > 2)
6259 /* When size > 16 bytes, if the first one isn't
6260 X86_64_SSE_CLASS or any other ones aren't
6261 X86_64_SSEUP_CLASS, everything should be passed in
6262 memory. */
6263 if (classes[0] != X86_64_SSE_CLASS)
6264 return 0;
6266 for (i = 1; i < words; i++)
6267 if (classes[i] != X86_64_SSEUP_CLASS)
6268 return 0;
6271 /* Final merger cleanup. */
6272 for (i = 0; i < words; i++)
6274 /* If one class is MEMORY, everything should be passed in
6275 memory. */
6276 if (classes[i] == X86_64_MEMORY_CLASS)
6277 return 0;
6279 /* The X86_64_SSEUP_CLASS should be always preceded by
6280 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6281 if (classes[i] == X86_64_SSEUP_CLASS
6282 && classes[i - 1] != X86_64_SSE_CLASS
6283 && classes[i - 1] != X86_64_SSEUP_CLASS)
6285 /* The first one should never be X86_64_SSEUP_CLASS. */
6286 gcc_assert (i != 0);
6287 classes[i] = X86_64_SSE_CLASS;
6290 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6291 everything should be passed in memory. */
6292 if (classes[i] == X86_64_X87UP_CLASS
6293 && (classes[i - 1] != X86_64_X87_CLASS))
6295 static bool warned;
6297 /* The first one should never be X86_64_X87UP_CLASS. */
6298 gcc_assert (i != 0);
6299 if (!warned && warn_psabi)
6301 warned = true;
6302 inform (input_location,
6303 "the ABI of passing union with long double"
6304 " has changed in GCC 4.4");
6306 return 0;
6309 return words;
6312 /* Compute the alignment needed. We align all types to natural boundaries,
6313 with the exception of XFmode, which is aligned to 64 bits. */
6314 if (mode != VOIDmode && mode != BLKmode)
6316 int mode_alignment = GET_MODE_BITSIZE (mode);
6318 if (mode == XFmode)
6319 mode_alignment = 128;
6320 else if (mode == XCmode)
6321 mode_alignment = 256;
6322 if (COMPLEX_MODE_P (mode))
6323 mode_alignment /= 2;
6324 /* Misaligned fields are always returned in memory. */
6325 if (bit_offset % mode_alignment)
6326 return 0;
6329 /* for V1xx modes, just use the base mode */
6330 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6331 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6332 mode = GET_MODE_INNER (mode);
6334 /* Classification of atomic types. */
6335 switch (mode)
6337 case SDmode:
6338 case DDmode:
6339 classes[0] = X86_64_SSE_CLASS;
6340 return 1;
6341 case TDmode:
6342 classes[0] = X86_64_SSE_CLASS;
6343 classes[1] = X86_64_SSEUP_CLASS;
6344 return 2;
6345 case DImode:
6346 case SImode:
6347 case HImode:
6348 case QImode:
6349 case CSImode:
6350 case CHImode:
6351 case CQImode:
6353 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6355 if (size <= 32)
6357 classes[0] = X86_64_INTEGERSI_CLASS;
6358 return 1;
6360 else if (size <= 64)
6362 classes[0] = X86_64_INTEGER_CLASS;
6363 return 1;
6365 else if (size <= 64+32)
6367 classes[0] = X86_64_INTEGER_CLASS;
6368 classes[1] = X86_64_INTEGERSI_CLASS;
6369 return 2;
6371 else if (size <= 64+64)
6373 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6374 return 2;
6376 else
6377 gcc_unreachable ();
6379 case CDImode:
6380 case TImode:
6381 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6382 return 2;
6383 case COImode:
6384 case OImode:
6385 /* OImode shouldn't be used directly. */
6386 gcc_unreachable ();
6387 case CTImode:
6388 return 0;
6389 case SFmode:
6390 if (!(bit_offset % 64))
6391 classes[0] = X86_64_SSESF_CLASS;
6392 else
6393 classes[0] = X86_64_SSE_CLASS;
6394 return 1;
6395 case DFmode:
6396 classes[0] = X86_64_SSEDF_CLASS;
6397 return 1;
6398 case XFmode:
6399 classes[0] = X86_64_X87_CLASS;
6400 classes[1] = X86_64_X87UP_CLASS;
6401 return 2;
6402 case TFmode:
6403 classes[0] = X86_64_SSE_CLASS;
6404 classes[1] = X86_64_SSEUP_CLASS;
6405 return 2;
6406 case SCmode:
6407 classes[0] = X86_64_SSE_CLASS;
6408 if (!(bit_offset % 64))
6409 return 1;
6410 else
6412 static bool warned;
6414 if (!warned && warn_psabi)
6416 warned = true;
6417 inform (input_location,
6418 "the ABI of passing structure with complex float"
6419 " member has changed in GCC 4.4");
6421 classes[1] = X86_64_SSESF_CLASS;
6422 return 2;
6424 case DCmode:
6425 classes[0] = X86_64_SSEDF_CLASS;
6426 classes[1] = X86_64_SSEDF_CLASS;
6427 return 2;
6428 case XCmode:
6429 classes[0] = X86_64_COMPLEX_X87_CLASS;
6430 return 1;
6431 case TCmode:
6432 /* This mode is larger than 16 bytes. */
6433 return 0;
6434 case V8SFmode:
6435 case V8SImode:
6436 case V32QImode:
6437 case V16HImode:
6438 case V4DFmode:
6439 case V4DImode:
6440 classes[0] = X86_64_SSE_CLASS;
6441 classes[1] = X86_64_SSEUP_CLASS;
6442 classes[2] = X86_64_SSEUP_CLASS;
6443 classes[3] = X86_64_SSEUP_CLASS;
6444 return 4;
6445 case V4SFmode:
6446 case V4SImode:
6447 case V16QImode:
6448 case V8HImode:
6449 case V2DFmode:
6450 case V2DImode:
6451 classes[0] = X86_64_SSE_CLASS;
6452 classes[1] = X86_64_SSEUP_CLASS;
6453 return 2;
6454 case V1TImode:
6455 case V1DImode:
6456 case V2SFmode:
6457 case V2SImode:
6458 case V4HImode:
6459 case V8QImode:
6460 classes[0] = X86_64_SSE_CLASS;
6461 return 1;
6462 case BLKmode:
6463 case VOIDmode:
6464 return 0;
6465 default:
6466 gcc_assert (VECTOR_MODE_P (mode));
6468 if (bytes > 16)
6469 return 0;
6471 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6473 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6474 classes[0] = X86_64_INTEGERSI_CLASS;
6475 else
6476 classes[0] = X86_64_INTEGER_CLASS;
6477 classes[1] = X86_64_INTEGER_CLASS;
6478 return 1 + (bytes > 8);
6482 /* Examine the argument and return the number of registers required in each
6483 class. Return 0 iff the parameter should be passed in memory. */
6484 static int
6485 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6486 int *int_nregs, int *sse_nregs)
6488 enum x86_64_reg_class regclass[MAX_CLASSES];
6489 int n = classify_argument (mode, type, regclass, 0);
6491 *int_nregs = 0;
6492 *sse_nregs = 0;
6493 if (!n)
6494 return 0;
6495 for (n--; n >= 0; n--)
6496 switch (regclass[n])
6498 case X86_64_INTEGER_CLASS:
6499 case X86_64_INTEGERSI_CLASS:
6500 (*int_nregs)++;
6501 break;
6502 case X86_64_SSE_CLASS:
6503 case X86_64_SSESF_CLASS:
6504 case X86_64_SSEDF_CLASS:
6505 (*sse_nregs)++;
6506 break;
6507 case X86_64_NO_CLASS:
6508 case X86_64_SSEUP_CLASS:
6509 break;
6510 case X86_64_X87_CLASS:
6511 case X86_64_X87UP_CLASS:
6512 if (!in_return)
6513 return 0;
6514 break;
6515 case X86_64_COMPLEX_X87_CLASS:
6516 return in_return ? 2 : 0;
6517 case X86_64_MEMORY_CLASS:
6518 gcc_unreachable ();
6520 return 1;
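/* For example (illustrative only): for

       struct s { long l; double d; };

   classify_argument yields INTEGER for the first eightbyte and SSEDF for
   the second, so examine_argument reports one integer and one SSE register;
   passed as the first argument, the struct travels in %rdi and %xmm0.  */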
6523 /* Construct container for the argument used by GCC interface. See
6524 FUNCTION_ARG for the detailed description. */
6526 static rtx
6527 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6528 const_tree type, int in_return, int nintregs, int nsseregs,
6529 const int *intreg, int sse_regno)
6531 /* The following variables hold the static issued_error state. */
6532 static bool issued_sse_arg_error;
6533 static bool issued_sse_ret_error;
6534 static bool issued_x87_ret_error;
6536 enum machine_mode tmpmode;
6537 int bytes =
6538 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6539 enum x86_64_reg_class regclass[MAX_CLASSES];
6540 int n;
6541 int i;
6542 int nexps = 0;
6543 int needed_sseregs, needed_intregs;
6544 rtx exp[MAX_CLASSES];
6545 rtx ret;
6547 n = classify_argument (mode, type, regclass, 0);
6548 if (!n)
6549 return NULL;
6550 if (!examine_argument (mode, type, in_return, &needed_intregs,
6551 &needed_sseregs))
6552 return NULL;
6553 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6554 return NULL;
6556 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6557 some less clueful developer tries to use floating-point anyway. */
6558 if (needed_sseregs && !TARGET_SSE)
6560 if (in_return)
6562 if (!issued_sse_ret_error)
6564 error ("SSE register return with SSE disabled");
6565 issued_sse_ret_error = true;
6568 else if (!issued_sse_arg_error)
6570 error ("SSE register argument with SSE disabled");
6571 issued_sse_arg_error = true;
6573 return NULL;
6576 /* Likewise, error if the ABI requires us to return values in the
6577 x87 registers and the user specified -mno-80387. */
6578 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6579 for (i = 0; i < n; i++)
6580 if (regclass[i] == X86_64_X87_CLASS
6581 || regclass[i] == X86_64_X87UP_CLASS
6582 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6584 if (!issued_x87_ret_error)
6586 error ("x87 register return with x87 disabled");
6587 issued_x87_ret_error = true;
6589 return NULL;
6592 /* First construct simple cases. Avoid SCmode, since we want to use
6593 single register to pass this type. */
6594 if (n == 1 && mode != SCmode)
6595 switch (regclass[0])
6597 case X86_64_INTEGER_CLASS:
6598 case X86_64_INTEGERSI_CLASS:
6599 return gen_rtx_REG (mode, intreg[0]);
6600 case X86_64_SSE_CLASS:
6601 case X86_64_SSESF_CLASS:
6602 case X86_64_SSEDF_CLASS:
6603 if (mode != BLKmode)
6604 return gen_reg_or_parallel (mode, orig_mode,
6605 SSE_REGNO (sse_regno));
6606 break;
6607 case X86_64_X87_CLASS:
6608 case X86_64_COMPLEX_X87_CLASS:
6609 return gen_rtx_REG (mode, FIRST_STACK_REG);
6610 case X86_64_NO_CLASS:
6611 /* Zero sized array, struct or class. */
6612 return NULL;
6613 default:
6614 gcc_unreachable ();
6616 if (n == 2
6617 && regclass[0] == X86_64_SSE_CLASS
6618 && regclass[1] == X86_64_SSEUP_CLASS
6619 && mode != BLKmode)
6620 return gen_reg_or_parallel (mode, orig_mode,
6621 SSE_REGNO (sse_regno));
6622 if (n == 4
6623 && regclass[0] == X86_64_SSE_CLASS
6624 && regclass[1] == X86_64_SSEUP_CLASS
6625 && regclass[2] == X86_64_SSEUP_CLASS
6626 && regclass[3] == X86_64_SSEUP_CLASS
6627 && mode != BLKmode)
6628 return gen_reg_or_parallel (mode, orig_mode,
6629 SSE_REGNO (sse_regno));
6630 if (n == 2
6631 && regclass[0] == X86_64_X87_CLASS
6632 && regclass[1] == X86_64_X87UP_CLASS)
6633 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6635 if (n == 2
6636 && regclass[0] == X86_64_INTEGER_CLASS
6637 && regclass[1] == X86_64_INTEGER_CLASS
6638 && (mode == CDImode || mode == TImode || mode == TFmode)
6639 && intreg[0] + 1 == intreg[1])
6640 return gen_rtx_REG (mode, intreg[0]);
6642 /* Otherwise figure out the entries of the PARALLEL. */
6643 for (i = 0; i < n; i++)
6645 int pos;
6647 switch (regclass[i])
6649 case X86_64_NO_CLASS:
6650 break;
6651 case X86_64_INTEGER_CLASS:
6652 case X86_64_INTEGERSI_CLASS:
6653 /* Merge TImodes on aligned occasions here too. */
6654 if (i * 8 + 8 > bytes)
6655 tmpmode
6656 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6657 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6658 tmpmode = SImode;
6659 else
6660 tmpmode = DImode;
6661 /* We've requested 24 bytes for which we
6662 don't have a mode. Use DImode. */
6663 if (tmpmode == BLKmode)
6664 tmpmode = DImode;
6665 exp [nexps++]
6666 = gen_rtx_EXPR_LIST (VOIDmode,
6667 gen_rtx_REG (tmpmode, *intreg),
6668 GEN_INT (i*8));
6669 intreg++;
6670 break;
6671 case X86_64_SSESF_CLASS:
6672 exp [nexps++]
6673 = gen_rtx_EXPR_LIST (VOIDmode,
6674 gen_rtx_REG (SFmode,
6675 SSE_REGNO (sse_regno)),
6676 GEN_INT (i*8));
6677 sse_regno++;
6678 break;
6679 case X86_64_SSEDF_CLASS:
6680 exp [nexps++]
6681 = gen_rtx_EXPR_LIST (VOIDmode,
6682 gen_rtx_REG (DFmode,
6683 SSE_REGNO (sse_regno)),
6684 GEN_INT (i*8));
6685 sse_regno++;
6686 break;
6687 case X86_64_SSE_CLASS:
6688 pos = i;
6689 switch (n)
6691 case 1:
6692 tmpmode = DImode;
6693 break;
6694 case 2:
6695 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6697 tmpmode = TImode;
6698 i++;
6700 else
6701 tmpmode = DImode;
6702 break;
6703 case 4:
6704 gcc_assert (i == 0
6705 && regclass[1] == X86_64_SSEUP_CLASS
6706 && regclass[2] == X86_64_SSEUP_CLASS
6707 && regclass[3] == X86_64_SSEUP_CLASS);
6708 tmpmode = OImode;
6709 i += 3;
6710 break;
6711 default:
6712 gcc_unreachable ();
6714 exp [nexps++]
6715 = gen_rtx_EXPR_LIST (VOIDmode,
6716 gen_rtx_REG (tmpmode,
6717 SSE_REGNO (sse_regno)),
6718 GEN_INT (pos*8));
6719 sse_regno++;
6720 break;
6721 default:
6722 gcc_unreachable ();
6726 /* Empty aligned struct, union or class. */
6727 if (nexps == 0)
6728 return NULL;
6730 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6731 for (i = 0; i < nexps; i++)
6732 XVECEXP (ret, 0, i) = exp [i];
6733 return ret;
6736 /* Update the data in CUM to advance over an argument of mode MODE
6737 and data type TYPE. (TYPE is null for libcalls where that information
6738 may not be available.) */
6740 static void
6741 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6742 const_tree type, HOST_WIDE_INT bytes,
6743 HOST_WIDE_INT words)
6745 switch (mode)
6747 default:
6748 break;
6750 case BLKmode:
6751 if (bytes < 0)
6752 break;
6753 /* FALLTHRU */
6755 case DImode:
6756 case SImode:
6757 case HImode:
6758 case QImode:
6759 cum->words += words;
6760 cum->nregs -= words;
6761 cum->regno += words;
6763 if (cum->nregs <= 0)
6765 cum->nregs = 0;
6766 cum->regno = 0;
6768 break;
6770 case OImode:
6771 /* OImode shouldn't be used directly. */
6772 gcc_unreachable ();
6774 case DFmode:
6775 if (cum->float_in_sse < 2)
6776 break;
6777 case SFmode:
6778 if (cum->float_in_sse < 1)
6779 break;
6780 /* FALLTHRU */
6782 case V8SFmode:
6783 case V8SImode:
6784 case V32QImode:
6785 case V16HImode:
6786 case V4DFmode:
6787 case V4DImode:
6788 case TImode:
6789 case V16QImode:
6790 case V8HImode:
6791 case V4SImode:
6792 case V2DImode:
6793 case V4SFmode:
6794 case V2DFmode:
6795 if (!type || !AGGREGATE_TYPE_P (type))
6797 cum->sse_words += words;
6798 cum->sse_nregs -= 1;
6799 cum->sse_regno += 1;
6800 if (cum->sse_nregs <= 0)
6802 cum->sse_nregs = 0;
6803 cum->sse_regno = 0;
6806 break;
6808 case V8QImode:
6809 case V4HImode:
6810 case V2SImode:
6811 case V2SFmode:
6812 case V1TImode:
6813 case V1DImode:
6814 if (!type || !AGGREGATE_TYPE_P (type))
6816 cum->mmx_words += words;
6817 cum->mmx_nregs -= 1;
6818 cum->mmx_regno += 1;
6819 if (cum->mmx_nregs <= 0)
6821 cum->mmx_nregs = 0;
6822 cum->mmx_regno = 0;
6825 break;
6829 static void
6830 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6831 const_tree type, HOST_WIDE_INT words, bool named)
6833 int int_nregs, sse_nregs;
6835 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6836 if (!named && VALID_AVX256_REG_MODE (mode))
6837 return;
6839 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6840 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6842 cum->nregs -= int_nregs;
6843 cum->sse_nregs -= sse_nregs;
6844 cum->regno += int_nregs;
6845 cum->sse_regno += sse_nregs;
6847 else
6849 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6850 cum->words = (cum->words + align - 1) & ~(align - 1);
6851 cum->words += words;
6855 static void
6856 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6857 HOST_WIDE_INT words)
6859 /* Otherwise, this should be passed indirect. */
6860 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6862 cum->words += words;
6863 if (cum->nregs > 0)
6865 cum->nregs -= 1;
6866 cum->regno += 1;
6870 /* Update the data in CUM to advance over an argument of mode MODE and
6871 data type TYPE. (TYPE is null for libcalls where that information
6872 may not be available.) */
6874 static void
6875 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6876 const_tree type, bool named)
6878 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6879 HOST_WIDE_INT bytes, words;
6881 if (mode == BLKmode)
6882 bytes = int_size_in_bytes (type);
6883 else
6884 bytes = GET_MODE_SIZE (mode);
6885 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6887 if (type)
6888 mode = type_natural_mode (type, NULL);
6890 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6891 function_arg_advance_ms_64 (cum, bytes, words);
6892 else if (TARGET_64BIT)
6893 function_arg_advance_64 (cum, mode, type, words, named);
6894 else
6895 function_arg_advance_32 (cum, mode, type, bytes, words);
6898 /* Define where to put the arguments to a function.
6899 Value is zero to push the argument on the stack,
6900 or a hard register in which to store the argument.
6902 MODE is the argument's machine mode.
6903 TYPE is the data type of the argument (as a tree).
6904 This is null for libcalls where that information may
6905 not be available.
6906 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6907 the preceding args and about the function being called.
6908 NAMED is nonzero if this argument is a named parameter
6909 (otherwise it is an extra parameter matching an ellipsis). */
6911 static rtx
6912 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6913 enum machine_mode orig_mode, const_tree type,
6914 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6916 static bool warnedsse, warnedmmx;
6918 /* Avoid the AL settings for the Unix64 ABI. */
6919 if (mode == VOIDmode)
6920 return constm1_rtx;
6922 switch (mode)
6924 default:
6925 break;
6927 case BLKmode:
6928 if (bytes < 0)
6929 break;
6930 /* FALLTHRU */
6931 case DImode:
6932 case SImode:
6933 case HImode:
6934 case QImode:
6935 if (words <= cum->nregs)
6937 int regno = cum->regno;
6939 /* Fastcall allocates the first two DWORD (SImode) or
6940 smaller arguments to ECX and EDX if it isn't an
6941 aggregate type. */
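/* For example, a function declared
     int __attribute__((fastcall)) f (int a, int b, int c);
   receives A in %ecx, B in %edx and C on the stack, which is what the
   checks below implement. */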
6942 if (cum->fastcall)
6944 if (mode == BLKmode
6945 || mode == DImode
6946 || (type && AGGREGATE_TYPE_P (type)))
6947 break;
6949 /* ECX, not EAX, is the first allocated register. */
6950 if (regno == AX_REG)
6951 regno = CX_REG;
6953 return gen_rtx_REG (mode, regno);
6955 break;
6957 case DFmode:
6958 if (cum->float_in_sse < 2)
6959 break;
6960 case SFmode:
6961 if (cum->float_in_sse < 1)
6962 break;
6963 /* FALLTHRU */
6964 case TImode:
6965 /* In 32bit, we pass TImode in xmm registers. */
6966 case V16QImode:
6967 case V8HImode:
6968 case V4SImode:
6969 case V2DImode:
6970 case V4SFmode:
6971 case V2DFmode:
6972 if (!type || !AGGREGATE_TYPE_P (type))
6974 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6976 warnedsse = true;
6977 warning (0, "SSE vector argument without SSE enabled "
6978 "changes the ABI");
6980 if (cum->sse_nregs)
6981 return gen_reg_or_parallel (mode, orig_mode,
6982 cum->sse_regno + FIRST_SSE_REG);
6984 break;
6986 case OImode:
6987 /* OImode shouldn't be used directly. */
6988 gcc_unreachable ();
6990 case V8SFmode:
6991 case V8SImode:
6992 case V32QImode:
6993 case V16HImode:
6994 case V4DFmode:
6995 case V4DImode:
6996 if (!type || !AGGREGATE_TYPE_P (type))
6998 if (cum->sse_nregs)
6999 return gen_reg_or_parallel (mode, orig_mode,
7000 cum->sse_regno + FIRST_SSE_REG);
7002 break;
7004 case V8QImode:
7005 case V4HImode:
7006 case V2SImode:
7007 case V2SFmode:
7008 case V1TImode:
7009 case V1DImode:
7010 if (!type || !AGGREGATE_TYPE_P (type))
7012 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7014 warnedmmx = true;
7015 warning (0, "MMX vector argument without MMX enabled "
7016 "changes the ABI");
7018 if (cum->mmx_nregs)
7019 return gen_reg_or_parallel (mode, orig_mode,
7020 cum->mmx_regno + FIRST_MMX_REG);
7022 break;
7025 return NULL_RTX;
7028 static rtx
7029 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7030 enum machine_mode orig_mode, const_tree type, bool named)
7032 /* Handle a hidden AL argument containing number of registers
7033 for varargs x86-64 functions. */
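/* For example, for a call such as printf ("%f\n", x) the double goes in
   %xmm0, so the caller sets %al to 1; a varargs call that uses no SSE
   argument registers sets %al to 0. */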
7034 if (mode == VOIDmode)
7035 return GEN_INT (cum->maybe_vaarg
7036 ? (cum->sse_nregs < 0
7037 ? X86_64_SSE_REGPARM_MAX
7038 : cum->sse_regno)
7039 : -1);
7041 switch (mode)
7043 default:
7044 break;
7046 case V8SFmode:
7047 case V8SImode:
7048 case V32QImode:
7049 case V16HImode:
7050 case V4DFmode:
7051 case V4DImode:
7052 /* Unnamed 256bit vector mode parameters are passed on stack. */
7053 if (!named)
7054 return NULL;
7055 break;
7058 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7059 cum->sse_nregs,
7060 &x86_64_int_parameter_registers [cum->regno],
7061 cum->sse_regno);
7064 static rtx
7065 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7066 enum machine_mode orig_mode, bool named,
7067 HOST_WIDE_INT bytes)
7069 unsigned int regno;
7071 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
7072 We use value of -2 to specify that current function call is MSABI. */
7073 if (mode == VOIDmode)
7074 return GEN_INT (-2);
7076 /* If we've run out of registers, it goes on the stack. */
7077 if (cum->nregs == 0)
7078 return NULL_RTX;
7080 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7082 /* Only floating point modes are passed in anything but integer regs. */
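/* Argument slots are positional in the MS ABI: for
     void f (int a, double b);
   A is passed in %ecx (slot 0) and B in %xmm1 (slot 1), while %rdx, the
   integer register for slot 1, is left unused. */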
7083 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7085 if (named)
7086 regno = cum->regno + FIRST_SSE_REG;
7087 else
7089 rtx t1, t2;
7091 /* Unnamed floating parameters are passed in both the
7092 SSE and integer registers. */
7093 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7094 t2 = gen_rtx_REG (mode, regno);
7095 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7096 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7097 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7100 /* Handle aggregated types passed in register. */
7101 if (orig_mode == BLKmode)
7103 if (bytes > 0 && bytes <= 8)
7104 mode = (bytes > 4 ? DImode : SImode);
7105 if (mode == BLKmode)
7106 mode = DImode;
7109 return gen_reg_or_parallel (mode, orig_mode, regno);
7112 /* Return where to put the arguments to a function.
7113 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7115 MODE is the argument's machine mode. TYPE is the data type of the
7116 argument. It is null for libcalls where that information may not be
7117 available. CUM gives information about the preceding args and about
7118 the function being called. NAMED is nonzero if this argument is a
7119 named parameter (otherwise it is an extra parameter matching an
7120 ellipsis). */
7122 static rtx
7123 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7124 const_tree type, bool named)
7126 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7127 enum machine_mode mode = omode;
7128 HOST_WIDE_INT bytes, words;
7129 rtx arg;
7131 if (mode == BLKmode)
7132 bytes = int_size_in_bytes (type);
7133 else
7134 bytes = GET_MODE_SIZE (mode);
7135 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7137 /* To simplify the code below, represent vector types with a vector mode
7138 even if MMX/SSE are not active. */
7139 if (type && TREE_CODE (type) == VECTOR_TYPE)
7140 mode = type_natural_mode (type, cum);
7142 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7143 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7144 else if (TARGET_64BIT)
7145 arg = function_arg_64 (cum, mode, omode, type, named);
7146 else
7147 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7149 return arg;
7152 /* A C expression that indicates when an argument must be passed by
7153 reference. If nonzero for an argument, a copy of that argument is
7154 made in memory and a pointer to the argument is passed instead of
7155 the argument itself. The pointer is passed in whatever way is
7156 appropriate for passing a pointer to that type. */
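/* For example, under the Windows x64 convention an 8-byte struct travels
   in a register, while a 12-byte struct or a 16-byte __m128 is passed by
   reference (the caller makes a copy and passes its address), which is
   what the size checks below implement. */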
7158 static bool
7159 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7160 enum machine_mode mode ATTRIBUTE_UNUSED,
7161 const_tree type, bool named ATTRIBUTE_UNUSED)
7163 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7165 /* See Windows x64 Software Convention. */
7166 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7168 int msize = (int) GET_MODE_SIZE (mode);
7169 if (type)
7171 /* Arrays are passed by reference. */
7172 if (TREE_CODE (type) == ARRAY_TYPE)
7173 return true;
7175 if (AGGREGATE_TYPE_P (type))
7177 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7178 are passed by reference. */
7179 msize = int_size_in_bytes (type);
7183 /* __m128 is passed by reference. */
7184 switch (msize) {
7185 case 1: case 2: case 4: case 8:
7186 break;
7187 default:
7188 return true;
7191 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7192 return 1;
7194 return 0;
7197 /* Return true when TYPE should be 128bit aligned for 32bit argument
7198 passing ABI. XXX: This function is obsolete and is only used for
7199 checking psABI compatibility with previous versions of GCC. */
7201 static bool
7202 ix86_compat_aligned_value_p (const_tree type)
7204 enum machine_mode mode = TYPE_MODE (type);
7205 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7206 || mode == TDmode
7207 || mode == TFmode
7208 || mode == TCmode)
7209 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7210 return true;
7211 if (TYPE_ALIGN (type) < 128)
7212 return false;
7214 if (AGGREGATE_TYPE_P (type))
7216 /* Walk the aggregates recursively. */
7217 switch (TREE_CODE (type))
7219 case RECORD_TYPE:
7220 case UNION_TYPE:
7221 case QUAL_UNION_TYPE:
7223 tree field;
7225 /* Walk all the structure fields. */
7226 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7228 if (TREE_CODE (field) == FIELD_DECL
7229 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7230 return true;
7232 break;
7235 case ARRAY_TYPE:
7236 /* Just for use if some languages pass arrays by value. */
7237 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7238 return true;
7239 break;
7241 default:
7242 gcc_unreachable ();
7245 return false;
7248 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7249 XXX: This function is obsolete and is only used for checking psABI
7250 compatibility with previous versions of GCC. */
7252 static unsigned int
7253 ix86_compat_function_arg_boundary (enum machine_mode mode,
7254 const_tree type, unsigned int align)
7256 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7257 natural boundaries. */
7258 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7260 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7261 make an exception for SSE modes since these require 128bit
7262 alignment.
7264 The handling here differs from field_alignment. ICC aligns MMX
7265 arguments to 4 byte boundaries, while structure fields are aligned
7266 to 8 byte boundaries. */
7267 if (!type)
7269 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7270 align = PARM_BOUNDARY;
7272 else
7274 if (!ix86_compat_aligned_value_p (type))
7275 align = PARM_BOUNDARY;
7278 if (align > BIGGEST_ALIGNMENT)
7279 align = BIGGEST_ALIGNMENT;
7280 return align;
7283 /* Return true when TYPE should be 128bit aligned for 32bit argument
7284 passing ABI. */
7286 static bool
7287 ix86_contains_aligned_value_p (const_tree type)
7289 enum machine_mode mode = TYPE_MODE (type);
7291 if (mode == XFmode || mode == XCmode)
7292 return false;
7294 if (TYPE_ALIGN (type) < 128)
7295 return false;
7297 if (AGGREGATE_TYPE_P (type))
7299 /* Walk the aggregates recursively. */
7300 switch (TREE_CODE (type))
7302 case RECORD_TYPE:
7303 case UNION_TYPE:
7304 case QUAL_UNION_TYPE:
7306 tree field;
7308 /* Walk all the structure fields. */
7309 for (field = TYPE_FIELDS (type);
7310 field;
7311 field = DECL_CHAIN (field))
7313 if (TREE_CODE (field) == FIELD_DECL
7314 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7315 return true;
7317 break;
7320 case ARRAY_TYPE:
7321 /* Just for use if some languages pass arrays by value. */
7322 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7323 return true;
7324 break;
7326 default:
7327 gcc_unreachable ();
7330 else
7331 return TYPE_ALIGN (type) >= 128;
7333 return false;
7336 /* Gives the alignment boundary, in bits, of an argument with the
7337 specified mode and type. */
7339 static unsigned int
7340 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7342 unsigned int align;
7343 if (type)
7345 /* Since the main variant type is used for the call, convert the
7346 type to its main variant. */
7347 type = TYPE_MAIN_VARIANT (type);
7348 align = TYPE_ALIGN (type);
7350 else
7351 align = GET_MODE_ALIGNMENT (mode);
7352 if (align < PARM_BOUNDARY)
7353 align = PARM_BOUNDARY;
7354 else
7356 static bool warned;
7357 unsigned int saved_align = align;
7359 if (!TARGET_64BIT)
7361 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7362 if (!type)
7364 if (mode == XFmode || mode == XCmode)
7365 align = PARM_BOUNDARY;
7367 else if (!ix86_contains_aligned_value_p (type))
7368 align = PARM_BOUNDARY;
7370 if (align < 128)
7371 align = PARM_BOUNDARY;
7374 if (warn_psabi
7375 && !warned
7376 && align != ix86_compat_function_arg_boundary (mode, type,
7377 saved_align))
7379 warned = true;
7380 inform (input_location,
7381 "The ABI for passing parameters with %d-byte"
7382 " alignment has changed in GCC 4.6",
7383 align / BITS_PER_UNIT);
7387 return align;
7390 /* Return true if N is a possible register number of function value. */
7392 static bool
7393 ix86_function_value_regno_p (const unsigned int regno)
7395 switch (regno)
7397 case AX_REG:
7398 return true;
7400 case FIRST_FLOAT_REG:
7401 /* TODO: The function should depend on current function ABI but
7402 builtins.c would need updating then. Therefore we use the
7403 default ABI. */
7404 if (TARGET_64BIT && ix86_abi == MS_ABI)
7405 return false;
7406 return TARGET_FLOAT_RETURNS_IN_80387;
7408 case FIRST_SSE_REG:
7409 return TARGET_SSE;
7411 case FIRST_MMX_REG:
7412 if (TARGET_MACHO || TARGET_64BIT)
7413 return false;
7414 return TARGET_MMX;
7417 return false;
7420 /* Define how to find the value returned by a function.
7421 VALTYPE is the data type of the value (as a tree).
7422 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7423 otherwise, FUNC is 0. */
7425 static rtx
7426 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7427 const_tree fntype, const_tree fn)
7429 unsigned int regno;
7431 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7432 we normally prevent this case when mmx is not available. However
7433 some ABIs may require the result to be returned like DImode. */
7434 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7435 regno = FIRST_MMX_REG;
7437 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7438 we prevent this case when sse is not available. However some ABIs
7439 may require the result to be returned like integer TImode. */
7440 else if (mode == TImode
7441 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7442 regno = FIRST_SSE_REG;
7444 /* 32-byte vector modes in %ymm0. */
7445 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7446 regno = FIRST_SSE_REG;
7448 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7449 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7450 regno = FIRST_FLOAT_REG;
7451 else
7452 /* Most things go in %eax. */
7453 regno = AX_REG;
7455 /* Override FP return register with %xmm0 for local functions when
7456 SSE math is enabled or for functions with sseregparm attribute. */
7457 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7459 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7460 if ((sse_level >= 1 && mode == SFmode)
7461 || (sse_level == 2 && mode == DFmode))
7462 regno = FIRST_SSE_REG;
7465 /* OImode shouldn't be used directly. */
7466 gcc_assert (mode != OImode);
7468 return gen_rtx_REG (orig_mode, regno);
7471 static rtx
7472 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7473 const_tree valtype)
7475 rtx ret;
7477 /* Handle libcalls, which don't provide a type node. */
7478 if (valtype == NULL)
7480 unsigned int regno;
7482 switch (mode)
7484 case SFmode:
7485 case SCmode:
7486 case DFmode:
7487 case DCmode:
7488 case TFmode:
7489 case SDmode:
7490 case DDmode:
7491 case TDmode:
7492 regno = FIRST_SSE_REG;
7493 break;
7494 case XFmode:
7495 case XCmode:
7496 regno = FIRST_FLOAT_REG;
7497 break;
7498 case TCmode:
7499 return NULL;
7500 default:
7501 regno = AX_REG;
7504 return gen_rtx_REG (mode, regno);
7506 else if (POINTER_TYPE_P (valtype))
7508 /* Pointers are always returned in word_mode. */
7509 mode = word_mode;
7512 ret = construct_container (mode, orig_mode, valtype, 1,
7513 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7514 x86_64_int_return_registers, 0);
7516 /* For zero sized structures, construct_container returns NULL, but we
7517 need to keep rest of compiler happy by returning meaningful value. */
7518 if (!ret)
7519 ret = gen_rtx_REG (orig_mode, AX_REG);
7521 return ret;
7524 static rtx
7525 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7526 const_tree valtype)
7528 unsigned int regno = AX_REG;
7530 if (TARGET_SSE)
7532 switch (GET_MODE_SIZE (mode))
7534 case 16:
7535 if (valtype != NULL_TREE
7536 && !VECTOR_INTEGER_TYPE_P (valtype)
7538 && !INTEGRAL_TYPE_P (valtype)
7539 && !VECTOR_FLOAT_TYPE_P (valtype))
7540 break;
7541 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7542 && !COMPLEX_MODE_P (mode))
7543 regno = FIRST_SSE_REG;
7544 break;
7545 case 8:
7546 case 4:
7547 if (mode == SFmode || mode == DFmode)
7548 regno = FIRST_SSE_REG;
7549 break;
7550 default:
7551 break;
7554 return gen_rtx_REG (orig_mode, regno);
7557 static rtx
7558 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7559 enum machine_mode orig_mode, enum machine_mode mode)
7561 const_tree fn, fntype;
7563 fn = NULL_TREE;
7564 if (fntype_or_decl && DECL_P (fntype_or_decl))
7565 fn = fntype_or_decl;
7566 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7568 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7569 return function_value_ms_64 (orig_mode, mode, valtype);
7570 else if (TARGET_64BIT)
7571 return function_value_64 (orig_mode, mode, valtype);
7572 else
7573 return function_value_32 (orig_mode, mode, fntype, fn);
7576 static rtx
7577 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7578 bool outgoing ATTRIBUTE_UNUSED)
7580 enum machine_mode mode, orig_mode;
7582 orig_mode = TYPE_MODE (valtype);
7583 mode = type_natural_mode (valtype, NULL);
7584 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7587 /* Pointer function arguments and return values are promoted to
7588 word_mode. */
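/* For example, with -mx32 a 32-bit (SImode) pointer argument or return
   value is zero-extended and passed or returned in a 64-bit register,
   since word_mode is DImode and POINTERS_EXTEND_UNSIGNED holds. */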
7590 static enum machine_mode
7591 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7592 int *punsignedp, const_tree fntype,
7593 int for_return)
7595 if (type != NULL_TREE && POINTER_TYPE_P (type))
7597 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7598 return word_mode;
7600 return default_promote_function_mode (type, mode, punsignedp, fntype,
7601 for_return);
7604 /* Return true if a structure, union or array with MODE containing FIELD
7605 should be accessed using BLKmode. */
7607 static bool
7608 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7610 /* Union with XFmode must be in BLKmode. */
7611 return (mode == XFmode
7612 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7613 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7617 ix86_libcall_value (enum machine_mode mode)
7619 return ix86_function_value_1 (NULL, NULL, mode, mode);
7622 /* Return true iff type is returned in memory. */
7624 static bool ATTRIBUTE_UNUSED
7625 return_in_memory_32 (const_tree type, enum machine_mode mode)
7627 HOST_WIDE_INT size;
7629 if (mode == BLKmode)
7630 return true;
7632 size = int_size_in_bytes (type);
7634 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7635 return false;
7637 if (VECTOR_MODE_P (mode) || mode == TImode)
7639 /* User-created vectors small enough to fit in EAX. */
7640 if (size < 8)
7641 return false;
7643 /* MMX/3dNow values are returned in MM0,
7644 except when it doesn't exist or the ABI prescribes otherwise. */
7645 if (size == 8)
7646 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7648 /* SSE values are returned in XMM0, except when it doesn't exist. */
7649 if (size == 16)
7650 return !TARGET_SSE;
7652 /* AVX values are returned in YMM0, except when it doesn't exist. */
7653 if (size == 32)
7654 return !TARGET_AVX;
7657 if (mode == XFmode)
7658 return false;
7660 if (size > 12)
7661 return true;
7663 /* OImode shouldn't be used directly. */
7664 gcc_assert (mode != OImode);
7666 return false;
7669 static bool ATTRIBUTE_UNUSED
7670 return_in_memory_64 (const_tree type, enum machine_mode mode)
7672 int needed_intregs, needed_sseregs;
7673 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7676 static bool ATTRIBUTE_UNUSED
7677 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7679 HOST_WIDE_INT size = int_size_in_bytes (type);
7681 /* __m128 is returned in xmm0. */
7682 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7683 || VECTOR_FLOAT_TYPE_P (type))
7684 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7685 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7686 return false;
7688 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7689 return size != 1 && size != 2 && size != 4 && size != 8;
7692 static bool
7693 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7695 #ifdef SUBTARGET_RETURN_IN_MEMORY
7696 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7697 #else
7698 const enum machine_mode mode = type_natural_mode (type, NULL);
7700 if (TARGET_64BIT)
7702 if (ix86_function_type_abi (fntype) == MS_ABI)
7703 return return_in_memory_ms_64 (type, mode);
7704 else
7705 return return_in_memory_64 (type, mode);
7707 else
7708 return return_in_memory_32 (type, mode);
7709 #endif
7712 /* When returning SSE vector types, we have a choice of either
7713 (1) being abi incompatible with a -march switch, or
7714 (2) generating an error.
7715 Given no good solution, I think the safest thing is one warning.
7716 The user won't be able to use -Werror, but....
7718 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7719 called in response to actually generating a caller or callee that
7720 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7721 via aggregate_value_p for general type probing from tree-ssa. */
7723 static rtx
7724 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7726 static bool warnedsse, warnedmmx;
7728 if (!TARGET_64BIT && type)
7730 /* Look at the return type of the function, not the function type. */
7731 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7733 if (!TARGET_SSE && !warnedsse)
7735 if (mode == TImode
7736 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7738 warnedsse = true;
7739 warning (0, "SSE vector return without SSE enabled "
7740 "changes the ABI");
7744 if (!TARGET_MMX && !warnedmmx)
7746 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7748 warnedmmx = true;
7749 warning (0, "MMX vector return without MMX enabled "
7750 "changes the ABI");
7755 return NULL;
7759 /* Create the va_list data type. */
7761 /* Returns the calling convention specific va_list data type.
7762 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
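/* For the 64-bit SysV ABI the record built below amounts to the familiar
     struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     };
     typedef struct __va_list_tag __builtin_va_list[1];
   i.e. an array of one element, so that va_list decays to a pointer. */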
7764 static tree
7765 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7767 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7769 /* For i386 we use a plain pointer to the argument area. */
7770 if (!TARGET_64BIT || abi == MS_ABI)
7771 return build_pointer_type (char_type_node);
7773 record = lang_hooks.types.make_type (RECORD_TYPE);
7774 type_decl = build_decl (BUILTINS_LOCATION,
7775 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7777 f_gpr = build_decl (BUILTINS_LOCATION,
7778 FIELD_DECL, get_identifier ("gp_offset"),
7779 unsigned_type_node);
7780 f_fpr = build_decl (BUILTINS_LOCATION,
7781 FIELD_DECL, get_identifier ("fp_offset"),
7782 unsigned_type_node);
7783 f_ovf = build_decl (BUILTINS_LOCATION,
7784 FIELD_DECL, get_identifier ("overflow_arg_area"),
7785 ptr_type_node);
7786 f_sav = build_decl (BUILTINS_LOCATION,
7787 FIELD_DECL, get_identifier ("reg_save_area"),
7788 ptr_type_node);
7790 va_list_gpr_counter_field = f_gpr;
7791 va_list_fpr_counter_field = f_fpr;
7793 DECL_FIELD_CONTEXT (f_gpr) = record;
7794 DECL_FIELD_CONTEXT (f_fpr) = record;
7795 DECL_FIELD_CONTEXT (f_ovf) = record;
7796 DECL_FIELD_CONTEXT (f_sav) = record;
7798 TYPE_STUB_DECL (record) = type_decl;
7799 TYPE_NAME (record) = type_decl;
7800 TYPE_FIELDS (record) = f_gpr;
7801 DECL_CHAIN (f_gpr) = f_fpr;
7802 DECL_CHAIN (f_fpr) = f_ovf;
7803 DECL_CHAIN (f_ovf) = f_sav;
7805 layout_type (record);
7807 /* The correct type is an array type of one element. */
7808 return build_array_type (record, build_index_type (size_zero_node));
7811 /* Set up the builtin va_list data type and, for 64-bit, the additional
7812 calling convention specific va_list data types. */
7814 static tree
7815 ix86_build_builtin_va_list (void)
7817 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7819 /* Initialize abi specific va_list builtin types. */
7820 if (TARGET_64BIT)
7822 tree t;
7823 if (ix86_abi == MS_ABI)
7825 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7826 if (TREE_CODE (t) != RECORD_TYPE)
7827 t = build_variant_type_copy (t);
7828 sysv_va_list_type_node = t;
7830 else
7832 t = ret;
7833 if (TREE_CODE (t) != RECORD_TYPE)
7834 t = build_variant_type_copy (t);
7835 sysv_va_list_type_node = t;
7837 if (ix86_abi != MS_ABI)
7839 t = ix86_build_builtin_va_list_abi (MS_ABI);
7840 if (TREE_CODE (t) != RECORD_TYPE)
7841 t = build_variant_type_copy (t);
7842 ms_va_list_type_node = t;
7844 else
7846 t = ret;
7847 if (TREE_CODE (t) != RECORD_TYPE)
7848 t = build_variant_type_copy (t);
7849 ms_va_list_type_node = t;
7853 return ret;
7856 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7858 static void
7859 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7861 rtx save_area, mem;
7862 alias_set_type set;
7863 int i, max;
7865 /* GPR size of varargs save area. */
7866 if (cfun->va_list_gpr_size)
7867 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7868 else
7869 ix86_varargs_gpr_size = 0;
7871 /* FPR size of varargs save area. We don't need it if we don't pass
7872 anything in SSE registers. */
7873 if (TARGET_SSE && cfun->va_list_fpr_size)
7874 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7875 else
7876 ix86_varargs_fpr_size = 0;
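/* With the maximal sizes above, the register save area laid out by the
   prologue is 48 bytes of GPRs (6 registers * 8 bytes) followed by
   128 bytes of SSE slots (8 registers * 16 bytes). */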
7878 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7879 return;
7881 save_area = frame_pointer_rtx;
7882 set = get_varargs_alias_set ();
7884 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7885 if (max > X86_64_REGPARM_MAX)
7886 max = X86_64_REGPARM_MAX;
7888 for (i = cum->regno; i < max; i++)
7890 mem = gen_rtx_MEM (word_mode,
7891 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7892 MEM_NOTRAP_P (mem) = 1;
7893 set_mem_alias_set (mem, set);
7894 emit_move_insn (mem,
7895 gen_rtx_REG (word_mode,
7896 x86_64_int_parameter_registers[i]));
7899 if (ix86_varargs_fpr_size)
7901 enum machine_mode smode;
7902 rtx label, test;
7904 /* Now emit code to save SSE registers. The AX parameter contains number
7905 of SSE parameter registers used to call this function, though all we
7906 actually check here is the zero/non-zero status. */
7908 label = gen_label_rtx ();
7909 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7910 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7911 label));
7913 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7914 we used movdqa (i.e. TImode) instead? Perhaps even better would
7915 be if we could determine the real mode of the data, via a hook
7916 into pass_stdarg. Ignore all that for now. */
7917 smode = V4SFmode;
7918 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7919 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7921 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7922 if (max > X86_64_SSE_REGPARM_MAX)
7923 max = X86_64_SSE_REGPARM_MAX;
7925 for (i = cum->sse_regno; i < max; ++i)
7927 mem = plus_constant (Pmode, save_area,
7928 i * 16 + ix86_varargs_gpr_size);
7929 mem = gen_rtx_MEM (smode, mem);
7930 MEM_NOTRAP_P (mem) = 1;
7931 set_mem_alias_set (mem, set);
7932 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7934 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7937 emit_label (label);
7941 static void
7942 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7944 alias_set_type set = get_varargs_alias_set ();
7945 int i;
7947 /* Reset to zero, as there might be a sysv vaarg used
7948 before. */
7949 ix86_varargs_gpr_size = 0;
7950 ix86_varargs_fpr_size = 0;
7952 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7954 rtx reg, mem;
7956 mem = gen_rtx_MEM (Pmode,
7957 plus_constant (Pmode, virtual_incoming_args_rtx,
7958 i * UNITS_PER_WORD));
7959 MEM_NOTRAP_P (mem) = 1;
7960 set_mem_alias_set (mem, set);
7962 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7963 emit_move_insn (mem, reg);
7967 static void
7968 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7969 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7970 int no_rtl)
7972 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7973 CUMULATIVE_ARGS next_cum;
7974 tree fntype;
7976 /* This argument doesn't appear to be used anymore. Which is good,
7977 because the old code here didn't suppress rtl generation. */
7978 gcc_assert (!no_rtl);
7980 if (!TARGET_64BIT)
7981 return;
7983 fntype = TREE_TYPE (current_function_decl);
7985 /* For varargs, we do not want to skip the dummy va_dcl argument.
7986 For stdargs, we do want to skip the last named argument. */
7987 next_cum = *cum;
7988 if (stdarg_p (fntype))
7989 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7990 true);
7992 if (cum->call_abi == MS_ABI)
7993 setup_incoming_varargs_ms_64 (&next_cum);
7994 else
7995 setup_incoming_varargs_64 (&next_cum);
7998 /* Checks if TYPE is of kind va_list char *. */
8000 static bool
8001 is_va_list_char_pointer (tree type)
8003 tree canonic;
8005 /* For 32-bit it is always true. */
8006 if (!TARGET_64BIT)
8007 return true;
8008 canonic = ix86_canonical_va_list_type (type);
8009 return (canonic == ms_va_list_type_node
8010 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8013 /* Implement va_start. */
8015 static void
8016 ix86_va_start (tree valist, rtx nextarg)
8018 HOST_WIDE_INT words, n_gpr, n_fpr;
8019 tree f_gpr, f_fpr, f_ovf, f_sav;
8020 tree gpr, fpr, ovf, sav, t;
8021 tree type;
8022 rtx ovf_rtx;
8024 if (flag_split_stack
8025 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8027 unsigned int scratch_regno;
8029 /* When we are splitting the stack, we can't refer to the stack
8030 arguments using internal_arg_pointer, because they may be on
8031 the old stack. The split stack prologue will arrange to
8032 leave a pointer to the old stack arguments in a scratch
8033 register, which we here copy to a pseudo-register. The split
8034 stack prologue can't set the pseudo-register directly because
8035 it (the prologue) runs before any registers have been saved. */
8037 scratch_regno = split_stack_prologue_scratch_regno ();
8038 if (scratch_regno != INVALID_REGNUM)
8040 rtx reg, seq;
8042 reg = gen_reg_rtx (Pmode);
8043 cfun->machine->split_stack_varargs_pointer = reg;
8045 start_sequence ();
8046 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8047 seq = get_insns ();
8048 end_sequence ();
8050 push_topmost_sequence ();
8051 emit_insn_after (seq, entry_of_function ());
8052 pop_topmost_sequence ();
8056 /* Only 64bit target needs something special. */
8057 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8059 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8060 std_expand_builtin_va_start (valist, nextarg);
8061 else
8063 rtx va_r, next;
8065 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8066 next = expand_binop (ptr_mode, add_optab,
8067 cfun->machine->split_stack_varargs_pointer,
8068 crtl->args.arg_offset_rtx,
8069 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8070 convert_move (va_r, next, 0);
8072 return;
8075 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8076 f_fpr = DECL_CHAIN (f_gpr);
8077 f_ovf = DECL_CHAIN (f_fpr);
8078 f_sav = DECL_CHAIN (f_ovf);
8080 valist = build_simple_mem_ref (valist);
8081 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8082 /* The following should be folded into the MEM_REF offset. */
8083 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8084 f_gpr, NULL_TREE);
8085 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8086 f_fpr, NULL_TREE);
8087 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8088 f_ovf, NULL_TREE);
8089 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8090 f_sav, NULL_TREE);
8092 /* Count number of gp and fp argument registers used. */
8093 words = crtl->args.info.words;
8094 n_gpr = crtl->args.info.regno;
8095 n_fpr = crtl->args.info.sse_regno;
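/* For example, if the named arguments consumed two GPRs and one SSE
   register, the code below initializes gp_offset to 16 and fp_offset to
   64 (8 * X86_64_REGPARM_MAX + 1 * 16), so the first va_arg fetch starts
   at the next unused register slot in the save area. */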
8097 if (cfun->va_list_gpr_size)
8099 type = TREE_TYPE (gpr);
8100 t = build2 (MODIFY_EXPR, type,
8101 gpr, build_int_cst (type, n_gpr * 8));
8102 TREE_SIDE_EFFECTS (t) = 1;
8103 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8106 if (TARGET_SSE && cfun->va_list_fpr_size)
8108 type = TREE_TYPE (fpr);
8109 t = build2 (MODIFY_EXPR, type, fpr,
8110 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8111 TREE_SIDE_EFFECTS (t) = 1;
8112 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8115 /* Find the overflow area. */
8116 type = TREE_TYPE (ovf);
8117 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8118 ovf_rtx = crtl->args.internal_arg_pointer;
8119 else
8120 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8121 t = make_tree (type, ovf_rtx);
8122 if (words != 0)
8123 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8124 t = build2 (MODIFY_EXPR, type, ovf, t);
8125 TREE_SIDE_EFFECTS (t) = 1;
8126 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8128 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8130 /* Find the register save area.
8131 The function prologue saves it right above the stack frame. */
8132 type = TREE_TYPE (sav);
8133 t = make_tree (type, frame_pointer_rtx);
8134 if (!ix86_varargs_gpr_size)
8135 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8136 t = build2 (MODIFY_EXPR, type, sav, t);
8137 TREE_SIDE_EFFECTS (t) = 1;
8138 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8142 /* Implement va_arg. */
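/* Roughly, for a single-word integer argument the gimple emitted below is
   equivalent to
     if (ap->gp_offset < 48)
       {
         arg = *(TYPE *) ((char *) ap->reg_save_area + ap->gp_offset);
         ap->gp_offset += 8;
       }
     else
       {
         arg = *(TYPE *) ap->overflow_arg_area;
         ap->overflow_arg_area = (char *) ap->overflow_arg_area + 8;
       }
   with analogous fp_offset / 16-byte steps for SSE-class values. */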
8144 static tree
8145 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8146 gimple_seq *post_p)
8148 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8149 tree f_gpr, f_fpr, f_ovf, f_sav;
8150 tree gpr, fpr, ovf, sav, t;
8151 int size, rsize;
8152 tree lab_false, lab_over = NULL_TREE;
8153 tree addr, t2;
8154 rtx container;
8155 int indirect_p = 0;
8156 tree ptrtype;
8157 enum machine_mode nat_mode;
8158 unsigned int arg_boundary;
8160 /* Only 64bit target needs something special. */
8161 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8162 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8164 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8165 f_fpr = DECL_CHAIN (f_gpr);
8166 f_ovf = DECL_CHAIN (f_fpr);
8167 f_sav = DECL_CHAIN (f_ovf);
8169 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8170 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8171 valist = build_va_arg_indirect_ref (valist);
8172 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8173 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8174 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8176 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8177 if (indirect_p)
8178 type = build_pointer_type (type);
8179 size = int_size_in_bytes (type);
8180 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8182 nat_mode = type_natural_mode (type, NULL);
8183 switch (nat_mode)
8185 case V8SFmode:
8186 case V8SImode:
8187 case V32QImode:
8188 case V16HImode:
8189 case V4DFmode:
8190 case V4DImode:
8191 /* Unnamed 256bit vector mode parameters are passed on stack. */
8192 if (!TARGET_64BIT_MS_ABI)
8194 container = NULL;
8195 break;
8198 default:
8199 container = construct_container (nat_mode, TYPE_MODE (type),
8200 type, 0, X86_64_REGPARM_MAX,
8201 X86_64_SSE_REGPARM_MAX, intreg,
8203 break;
8206 /* Pull the value out of the saved registers. */
8208 addr = create_tmp_var (ptr_type_node, "addr");
8210 if (container)
8212 int needed_intregs, needed_sseregs;
8213 bool need_temp;
8214 tree int_addr, sse_addr;
8216 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8217 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8219 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8221 need_temp = (!REG_P (container)
8222 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8223 || TYPE_ALIGN (type) > 128));
8225 /* In case we are passing a structure, verify that it is a consecutive
8226 block in the register save area. If not, we need to do moves. */
8227 if (!need_temp && !REG_P (container))
8229 /* Verify that all registers are strictly consecutive. */
8230 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8232 int i;
8234 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8236 rtx slot = XVECEXP (container, 0, i);
8237 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8238 || INTVAL (XEXP (slot, 1)) != i * 16)
8239 need_temp = 1;
8242 else
8244 int i;
8246 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8248 rtx slot = XVECEXP (container, 0, i);
8249 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8250 || INTVAL (XEXP (slot, 1)) != i * 8)
8251 need_temp = 1;
8255 if (!need_temp)
8257 int_addr = addr;
8258 sse_addr = addr;
8260 else
8262 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8263 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8266 /* First ensure that we fit completely in registers. */
8267 if (needed_intregs)
8269 t = build_int_cst (TREE_TYPE (gpr),
8270 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8271 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8272 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8273 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8274 gimplify_and_add (t, pre_p);
8276 if (needed_sseregs)
8278 t = build_int_cst (TREE_TYPE (fpr),
8279 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8280 + X86_64_REGPARM_MAX * 8);
8281 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8282 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8283 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8284 gimplify_and_add (t, pre_p);
8287 /* Compute index to start of area used for integer regs. */
8288 if (needed_intregs)
8290 /* int_addr = gpr + sav; */
8291 t = fold_build_pointer_plus (sav, gpr);
8292 gimplify_assign (int_addr, t, pre_p);
8294 if (needed_sseregs)
8296 /* sse_addr = fpr + sav; */
8297 t = fold_build_pointer_plus (sav, fpr);
8298 gimplify_assign (sse_addr, t, pre_p);
8300 if (need_temp)
8302 int i, prev_size = 0;
8303 tree temp = create_tmp_var (type, "va_arg_tmp");
8305 /* addr = &temp; */
8306 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8307 gimplify_assign (addr, t, pre_p);
8309 for (i = 0; i < XVECLEN (container, 0); i++)
8311 rtx slot = XVECEXP (container, 0, i);
8312 rtx reg = XEXP (slot, 0);
8313 enum machine_mode mode = GET_MODE (reg);
8314 tree piece_type;
8315 tree addr_type;
8316 tree daddr_type;
8317 tree src_addr, src;
8318 int src_offset;
8319 tree dest_addr, dest;
8320 int cur_size = GET_MODE_SIZE (mode);
8322 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8323 prev_size = INTVAL (XEXP (slot, 1));
8324 if (prev_size + cur_size > size)
8326 cur_size = size - prev_size;
8327 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8328 if (mode == BLKmode)
8329 mode = QImode;
8331 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8332 if (mode == GET_MODE (reg))
8333 addr_type = build_pointer_type (piece_type);
8334 else
8335 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8336 true);
8337 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8338 true);
8340 if (SSE_REGNO_P (REGNO (reg)))
8342 src_addr = sse_addr;
8343 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8345 else
8347 src_addr = int_addr;
8348 src_offset = REGNO (reg) * 8;
8350 src_addr = fold_convert (addr_type, src_addr);
8351 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8353 dest_addr = fold_convert (daddr_type, addr);
8354 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8355 if (cur_size == GET_MODE_SIZE (mode))
8357 src = build_va_arg_indirect_ref (src_addr);
8358 dest = build_va_arg_indirect_ref (dest_addr);
8360 gimplify_assign (dest, src, pre_p);
8362 else
8364 tree copy
8365 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8366 3, dest_addr, src_addr,
8367 size_int (cur_size));
8368 gimplify_and_add (copy, pre_p);
8370 prev_size += cur_size;
8374 if (needed_intregs)
8376 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8377 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8378 gimplify_assign (gpr, t, pre_p);
8381 if (needed_sseregs)
8383 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8384 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8385 gimplify_assign (fpr, t, pre_p);
8388 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8390 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8393 /* ... otherwise out of the overflow area. */
8395 /* When we align a parameter on the stack for the caller, a parameter
8396 alignment beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
8397 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8398 caller. */
8399 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8400 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8401 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8403 /* Care for on-stack alignment if needed. */
8404 if (arg_boundary <= 64 || size == 0)
8405 t = ovf;
8406 else
8408 HOST_WIDE_INT align = arg_boundary / 8;
8409 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8410 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8411 build_int_cst (TREE_TYPE (t), -align));
8414 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8415 gimplify_assign (addr, t, pre_p);
8417 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8418 gimplify_assign (unshare_expr (ovf), t, pre_p);
8420 if (container)
8421 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8423 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8424 addr = fold_convert (ptrtype, addr);
8426 if (indirect_p)
8427 addr = build_va_arg_indirect_ref (addr);
8428 return build_va_arg_indirect_ref (addr);
8431 /* Return true if OPNUM's MEM should be matched
8432 in movabs* patterns. */
8434 bool
8435 ix86_check_movabs (rtx insn, int opnum)
8437 rtx set, mem;
8439 set = PATTERN (insn);
8440 if (GET_CODE (set) == PARALLEL)
8441 set = XVECEXP (set, 0, 0);
8442 gcc_assert (GET_CODE (set) == SET);
8443 mem = XEXP (set, opnum);
8444 while (GET_CODE (mem) == SUBREG)
8445 mem = SUBREG_REG (mem);
8446 gcc_assert (MEM_P (mem));
8447 return volatile_ok || !MEM_VOLATILE_P (mem);
8450 /* Initialize the table of extra 80387 mathematical constants. */
8452 static void
8453 init_ext_80387_constants (void)
8455 static const char * cst[5] =
8457 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8458 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8459 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8460 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8461 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8463 int i;
8465 for (i = 0; i < 5; i++)
8467 real_from_string (&ext_80387_constants_table[i], cst[i]);
8468 /* Ensure each constant is rounded to XFmode precision. */
8469 real_convert (&ext_80387_constants_table[i],
8470 XFmode, &ext_80387_constants_table[i]);
8473 ext_80387_constants_init = 1;
8476 /* Return non-zero if the constant is something that
8477 can be loaded with a special instruction. */
8480 standard_80387_constant_p (rtx x)
8482 enum machine_mode mode = GET_MODE (x);
8484 REAL_VALUE_TYPE r;
8486 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8487 return -1;
8489 if (x == CONST0_RTX (mode))
8490 return 1;
8491 if (x == CONST1_RTX (mode))
8492 return 2;
8494 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8496 /* For XFmode constants, try to find a special 80387 instruction when
8497 optimizing for size or on those CPUs that benefit from them. */
8498 if (mode == XFmode
8499 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8501 int i;
8503 if (! ext_80387_constants_init)
8504 init_ext_80387_constants ();
8506 for (i = 0; i < 5; i++)
8507 if (real_identical (&r, &ext_80387_constants_table[i]))
8508 return i + 3;
8511 /* Load of the constant -0.0 or -1.0 will be split as
8512 fldz;fchs or fld1;fchs sequence. */
8513 if (real_isnegzero (&r))
8514 return 8;
8515 if (real_identical (&r, &dconstm1))
8516 return 9;
8518 return 0;
8521 /* Return the opcode of the special instruction to be used to load
8522 the constant X. */
8524 const char *
8525 standard_80387_constant_opcode (rtx x)
8527 switch (standard_80387_constant_p (x))
8529 case 1:
8530 return "fldz";
8531 case 2:
8532 return "fld1";
8533 case 3:
8534 return "fldlg2";
8535 case 4:
8536 return "fldln2";
8537 case 5:
8538 return "fldl2e";
8539 case 6:
8540 return "fldl2t";
8541 case 7:
8542 return "fldpi";
8543 case 8:
8544 case 9:
8545 return "#";
8546 default:
8547 gcc_unreachable ();
8551 /* Return the CONST_DOUBLE representing the 80387 constant that is
8552 loaded by the specified special instruction. The argument IDX
8553 matches the return value from standard_80387_constant_p. */
8556 standard_80387_constant_rtx (int idx)
8558 int i;
8560 if (! ext_80387_constants_init)
8561 init_ext_80387_constants ();
8563 switch (idx)
8565 case 3:
8566 case 4:
8567 case 5:
8568 case 6:
8569 case 7:
8570 i = idx - 3;
8571 break;
8573 default:
8574 gcc_unreachable ();
8577 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8578 XFmode);
8581 /* Return 1 if X is all 0s and 2 if X is all 1s
8582 in supported SSE/AVX vector mode. */
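/* For example, CONST0_RTX (V4SFmode) yields 1 and is emitted as xorps by
   standard_sse_constant_opcode below, while an all-ones V4SImode vector
   yields 2 (given SSE2) and is emitted as pcmpeqd. */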
8585 standard_sse_constant_p (rtx x)
8587 enum machine_mode mode = GET_MODE (x);
8589 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8590 return 1;
8591 if (vector_all_ones_operand (x, mode))
8592 switch (mode)
8594 case V16QImode:
8595 case V8HImode:
8596 case V4SImode:
8597 case V2DImode:
8598 if (TARGET_SSE2)
8599 return 2;
8600 case V32QImode:
8601 case V16HImode:
8602 case V8SImode:
8603 case V4DImode:
8604 if (TARGET_AVX2)
8605 return 2;
8606 default:
8607 break;
8610 return 0;
8613 /* Return the opcode of the special instruction to be used to load
8614 the constant X. */
8616 const char *
8617 standard_sse_constant_opcode (rtx insn, rtx x)
8619 switch (standard_sse_constant_p (x))
8621 case 1:
8622 switch (get_attr_mode (insn))
8624 case MODE_TI:
8625 return "%vpxor\t%0, %d0";
8626 case MODE_V2DF:
8627 return "%vxorpd\t%0, %d0";
8628 case MODE_V4SF:
8629 return "%vxorps\t%0, %d0";
8631 case MODE_OI:
8632 return "vpxor\t%x0, %x0, %x0";
8633 case MODE_V4DF:
8634 return "vxorpd\t%x0, %x0, %x0";
8635 case MODE_V8SF:
8636 return "vxorps\t%x0, %x0, %x0";
8638 default:
8639 break;
8642 case 2:
8643 if (TARGET_AVX)
8644 return "vpcmpeqd\t%0, %0, %0";
8645 else
8646 return "pcmpeqd\t%0, %0";
8648 default:
8649 break;
8651 gcc_unreachable ();
8654 /* Returns true if OP contains a symbol reference. */
8656 bool
8657 symbolic_reference_mentioned_p (rtx op)
8659 const char *fmt;
8660 int i;
8662 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8663 return true;
8665 fmt = GET_RTX_FORMAT (GET_CODE (op));
8666 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8668 if (fmt[i] == 'E')
8670 int j;
8672 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8673 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8674 return true;
8677 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8678 return true;
8681 return false;
8684 /* Return true if it is appropriate to emit `ret' instructions in the
8685 body of a function. Do this only if the epilogue is simple, needing a
8686 couple of insns. Prior to reloading, we can't tell how many registers
8687 must be saved, so return false then. Return false if there is no frame
8688 marker to de-allocate. */
8690 bool
8691 ix86_can_use_return_insn_p (void)
8693 struct ix86_frame frame;
8695 if (! reload_completed || frame_pointer_needed)
8696 return 0;
8698 /* Don't allow more than 32k pop, since that's all we can do
8699 with one instruction. */
8700 if (crtl->args.pops_args && crtl->args.size >= 32768)
8701 return 0;
8703 ix86_compute_frame_layout (&frame);
8704 return (frame.stack_pointer_offset == UNITS_PER_WORD
8705 && (frame.nregs + frame.nsseregs) == 0);
8708 /* Value should be nonzero if functions must have frame pointers.
8709 Zero means the frame pointer need not be set up (and parms may
8710 be accessed via the stack pointer) in functions that seem suitable. */
8712 static bool
8713 ix86_frame_pointer_required (void)
8715 /* If we accessed previous frames, then the generated code expects
8716 to be able to access the saved ebp value in our frame. */
8717 if (cfun->machine->accesses_prev_frame)
8718 return true;
8720 /* Several x86 OSes need a frame pointer for other reasons,
8721 usually pertaining to setjmp. */
8722 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8723 return true;
8725 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8726 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8727 return true;
8729 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
8730 allocation is 4GB. */
8731 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8732 return true;
8734 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8735 turns off the frame pointer by default. Turn it back on now if
8736 we've not got a leaf function. */
8737 if (TARGET_OMIT_LEAF_FRAME_POINTER
8738 && (!crtl->is_leaf
8739 || ix86_current_function_calls_tls_descriptor))
8740 return true;
8742 if (crtl->profile && !flag_fentry)
8743 return true;
8745 return false;
8748 /* Record that the current function accesses previous call frames. */
8750 void
8751 ix86_setup_frame_addresses (void)
8753 cfun->machine->accesses_prev_frame = 1;
8756 #ifndef USE_HIDDEN_LINKONCE
8757 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8758 # define USE_HIDDEN_LINKONCE 1
8759 # else
8760 # define USE_HIDDEN_LINKONCE 0
8761 # endif
8762 #endif
8764 static int pic_labels_used;
8766 /* Fills in the label name that should be used for a pc thunk for
8767 the given register. */
8769 static void
8770 get_pc_thunk_name (char name[32], unsigned int regno)
8772 gcc_assert (!TARGET_64BIT);
8774 if (USE_HIDDEN_LINKONCE)
8775 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8776 else
8777 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8781 /* This function generates code for -fpic that loads %ebx with
8782 the return address of the caller and then returns. */
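/* For example, when %ebx is the PIC register the thunk emitted below is
   essentially
     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx
             ret
   optionally padded with nops when TARGET_PAD_SHORT_FUNCTION is set. */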
8784 static void
8785 ix86_code_end (void)
8787 rtx xops[2];
8788 int regno;
8790 for (regno = AX_REG; regno <= SP_REG; regno++)
8792 char name[32];
8793 tree decl;
8795 if (!(pic_labels_used & (1 << regno)))
8796 continue;
8798 get_pc_thunk_name (name, regno);
8800 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8801 get_identifier (name),
8802 build_function_type_list (void_type_node, NULL_TREE));
8803 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8804 NULL_TREE, void_type_node);
8805 TREE_PUBLIC (decl) = 1;
8806 TREE_STATIC (decl) = 1;
8807 DECL_IGNORED_P (decl) = 1;
8809 #if TARGET_MACHO
8810 if (TARGET_MACHO)
8812 switch_to_section (darwin_sections[text_coal_section]);
8813 fputs ("\t.weak_definition\t", asm_out_file);
8814 assemble_name (asm_out_file, name);
8815 fputs ("\n\t.private_extern\t", asm_out_file);
8816 assemble_name (asm_out_file, name);
8817 putc ('\n', asm_out_file);
8818 ASM_OUTPUT_LABEL (asm_out_file, name);
8819 DECL_WEAK (decl) = 1;
8821 else
8822 #endif
8823 if (USE_HIDDEN_LINKONCE)
8825 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8827 targetm.asm_out.unique_section (decl, 0);
8828 switch_to_section (get_named_section (decl, NULL, 0));
8830 targetm.asm_out.globalize_label (asm_out_file, name);
8831 fputs ("\t.hidden\t", asm_out_file);
8832 assemble_name (asm_out_file, name);
8833 putc ('\n', asm_out_file);
8834 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8836 else
8838 switch_to_section (text_section);
8839 ASM_OUTPUT_LABEL (asm_out_file, name);
8842 DECL_INITIAL (decl) = make_node (BLOCK);
8843 current_function_decl = decl;
8844 init_function_start (decl);
8845 first_function_block_is_cold = false;
8846 /* Make sure unwind info is emitted for the thunk if needed. */
8847 final_start_function (emit_barrier (), asm_out_file, 1);
8849 /* Pad stack IP move with 4 instructions (two NOPs count
8850 as one instruction). */
8851 if (TARGET_PAD_SHORT_FUNCTION)
8853 int i = 8;
8855 while (i--)
8856 fputs ("\tnop\n", asm_out_file);
8859 xops[0] = gen_rtx_REG (Pmode, regno);
8860 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8861 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8862 fputs ("\tret\n", asm_out_file);
8863 final_end_function ();
8864 init_insn_lengths ();
8865 free_after_compilation (cfun);
8866 set_cfun (NULL);
8867 current_function_decl = NULL;
8870 if (flag_split_stack)
8871 file_end_indicate_split_stack ();
8874 /* Emit code for the SET_GOT patterns. */
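/* For -fpic on ELF targets the typical sequence produced here is roughly
     call    __x86.get_pc_thunk.bx
     addl    $_GLOBAL_OFFSET_TABLE_, %ebx
   which leaves the address of the GOT in the PIC register. */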
8876 const char *
8877 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8879 rtx xops[3];
8881 xops[0] = dest;
8883 if (TARGET_VXWORKS_RTP && flag_pic)
8885 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8886 xops[2] = gen_rtx_MEM (Pmode,
8887 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8888 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8890 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8891 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8892 an unadorned address. */
8893 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8894 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8895 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8896 return "";
8899 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8901 if (!flag_pic)
8903 if (TARGET_MACHO)
8904 /* We don't need a pic base, we're not producing pic. */
8905 gcc_unreachable ();
8907 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8908 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8909 targetm.asm_out.internal_label (asm_out_file, "L",
8910 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8912 else
8914 char name[32];
8915 get_pc_thunk_name (name, REGNO (dest));
8916 pic_labels_used |= 1 << REGNO (dest);
8918 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8919 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8920 output_asm_insn ("call\t%X2", xops);
8922 #if TARGET_MACHO
8923 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8924 This is what will be referenced by the Mach-O PIC subsystem. */
8925 if (machopic_should_output_picbase_label () || !label)
8926 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8928 /* When we are restoring the pic base at the site of a nonlocal label,
8929 and we decided to emit the pic base above, we will still output a
8930 local label used for calculating the correction offset (even though
8931 the offset will be 0 in that case). */
8932 if (label)
8933 targetm.asm_out.internal_label (asm_out_file, "L",
8934 CODE_LABEL_NUMBER (label));
8935 #endif
8938 if (!TARGET_MACHO)
8939 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8941 return "";
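
/* As an illustration (not taken verbatim from this file's output): for the
   usual 32-bit ELF PIC case above, with DEST being %ebx, the emitted
   sequence is roughly

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk, emitted earlier in this file, loads the return address
   from the stack into %ebx and returns; the add then rebases that address
   to the GOT.  The exact thunk label comes from get_pc_thunk_name.  */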
8944 /* Generate a "push" pattern for input ARG.  */
8946 static rtx
8947 gen_push (rtx arg)
8949 struct machine_function *m = cfun->machine;
8951 if (m->fs.cfa_reg == stack_pointer_rtx)
8952 m->fs.cfa_offset += UNITS_PER_WORD;
8953 m->fs.sp_offset += UNITS_PER_WORD;
8955 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8956 arg = gen_rtx_REG (word_mode, REGNO (arg));
8958 return gen_rtx_SET (VOIDmode,
8959 gen_rtx_MEM (word_mode,
8960 gen_rtx_PRE_DEC (Pmode,
8961 stack_pointer_rtx)),
8962 arg);
8965 /* Generate a "pop" pattern for input ARG.  */
8967 static rtx
8968 gen_pop (rtx arg)
8970 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8971 arg = gen_rtx_REG (word_mode, REGNO (arg));
8973 return gen_rtx_SET (VOIDmode,
8974 arg,
8975 gen_rtx_MEM (word_mode,
8976 gen_rtx_POST_INC (Pmode,
8977 stack_pointer_rtx)));
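
/* For reference, the patterns built by gen_push and gen_pop above have the
   shape (shown for 64-bit, where word_mode and Pmode are both DImode):

     gen_push:  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI REGNO))
     gen_pop:   (set (reg:DI REGNO) (mem:DI (post_inc:DI (reg:DI sp))))

   gen_push also advances the cfa_offset/sp_offset bookkeeping in
   cfun->machine->fs; gen_pop leaves that adjustment to its callers.  */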
8980 /* Return >= 0 if there is an unused call-clobbered register available
8981 for the entire function. */
8983 static unsigned int
8984 ix86_select_alt_pic_regnum (void)
8986 if (crtl->is_leaf
8987 && !crtl->profile
8988 && !ix86_current_function_calls_tls_descriptor)
8990 int i, drap;
8991 /* Can't use the same register for both PIC and DRAP. */
8992 if (crtl->drap_reg)
8993 drap = REGNO (crtl->drap_reg);
8994 else
8995 drap = -1;
8996 for (i = 2; i >= 0; --i)
8997 if (i != drap && !df_regs_ever_live_p (i))
8998 return i;
9001 return INVALID_REGNUM;
9004 /* Return TRUE if we need to save REGNO. */
9006 static bool
9007 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9009 if (pic_offset_table_rtx
9010 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9011 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9012 || crtl->profile
9013 || crtl->calls_eh_return
9014 || crtl->uses_const_pool
9015 || cfun->has_nonlocal_label))
9016 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9018 if (crtl->calls_eh_return && maybe_eh_return)
9020 unsigned i;
9021 for (i = 0; ; i++)
9023 unsigned test = EH_RETURN_DATA_REGNO (i);
9024 if (test == INVALID_REGNUM)
9025 break;
9026 if (test == regno)
9027 return true;
9031 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9032 return true;
9034 return (df_regs_ever_live_p (regno)
9035 && !call_used_regs[regno]
9036 && !fixed_regs[regno]
9037 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9040 /* Return number of saved general purpose registers.  */
9042 static int
9043 ix86_nsaved_regs (void)
9045 int nregs = 0;
9046 int regno;
9048 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9049 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9050 nregs ++;
9051 return nregs;
9054 /* Return number of saved SSE registers.  */
9056 static int
9057 ix86_nsaved_sseregs (void)
9059 int nregs = 0;
9060 int regno;
9062 if (!TARGET_64BIT_MS_ABI)
9063 return 0;
9064 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9065 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9066 nregs ++;
9067 return nregs;
9070 /* Given FROM and TO register numbers, say whether this elimination is
9071 allowed. If stack alignment is needed, we can only replace argument
9072 pointer with hard frame pointer, or replace frame pointer with stack
9073 pointer. Otherwise, frame pointer elimination is automatically
9074 handled and all other eliminations are valid. */
9076 static bool
9077 ix86_can_eliminate (const int from, const int to)
9079 if (stack_realign_fp)
9080 return ((from == ARG_POINTER_REGNUM
9081 && to == HARD_FRAME_POINTER_REGNUM)
9082 || (from == FRAME_POINTER_REGNUM
9083 && to == STACK_POINTER_REGNUM));
9084 else
9085 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
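
/* A standalone sketch (not part of the build) of the elimination rule
   above, with the register numbers replaced by a toy enum: under
   stack_realign_fp only argp -> hard fp and fp -> sp are allowed,
   otherwise eliminating to the stack pointer is valid exactly when no
   frame pointer is needed.  */
#if 0
#include <stdbool.h>

enum toy_reg { TOY_ARGP, TOY_FRAMEP, TOY_HARD_FRAMEP, TOY_SP };

static bool
toy_can_eliminate (enum toy_reg from, enum toy_reg to,
		   bool realign_fp, bool fp_needed)
{
  if (realign_fp)
    return ((from == TOY_ARGP && to == TOY_HARD_FRAMEP)
	    || (from == TOY_FRAMEP && to == TOY_SP));

  return to == TOY_SP ? !fp_needed : true;
}
#endif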
9088 /* Return the offset between two registers, one to be eliminated, and the other
9089 its replacement, at the start of a routine. */
9091 HOST_WIDE_INT
9092 ix86_initial_elimination_offset (int from, int to)
9094 struct ix86_frame frame;
9095 ix86_compute_frame_layout (&frame);
9097 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9098 return frame.hard_frame_pointer_offset;
9099 else if (from == FRAME_POINTER_REGNUM
9100 && to == HARD_FRAME_POINTER_REGNUM)
9101 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9102 else
9104 gcc_assert (to == STACK_POINTER_REGNUM);
9106 if (from == ARG_POINTER_REGNUM)
9107 return frame.stack_pointer_offset;
9109 gcc_assert (from == FRAME_POINTER_REGNUM);
9110 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9114 /* In a dynamically-aligned function, we can't know the offset from
9115 stack pointer to frame pointer, so we must ensure that setjmp
9116 eliminates fp against the hard fp (%ebp) rather than trying to
9117 index from %esp up to the top of the frame across a gap that is
9118 of unknown (at compile-time) size. */
9119 static rtx
9120 ix86_builtin_setjmp_frame_value (void)
9122 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9125 /* When using -fsplit-stack, the allocation routines set a field in
9126 the TCB to the bottom of the stack plus this much space, measured
9127 in bytes. */
9129 #define SPLIT_STACK_AVAILABLE 256
9131 /* Fill the structure ix86_frame describing the frame of the currently compiled function.  */
9133 static void
9134 ix86_compute_frame_layout (struct ix86_frame *frame)
9136 unsigned HOST_WIDE_INT stack_alignment_needed;
9137 HOST_WIDE_INT offset;
9138 unsigned HOST_WIDE_INT preferred_alignment;
9139 HOST_WIDE_INT size = get_frame_size ();
9140 HOST_WIDE_INT to_allocate;
9142 frame->nregs = ix86_nsaved_regs ();
9143 frame->nsseregs = ix86_nsaved_sseregs ();
9145 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9146 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9148 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
9149 for function prologues and leaf functions. */
9150 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9151 && (!crtl->is_leaf || cfun->calls_alloca != 0
9152 || ix86_current_function_calls_tls_descriptor))
9154 preferred_alignment = 16;
9155 stack_alignment_needed = 16;
9156 crtl->preferred_stack_boundary = 128;
9157 crtl->stack_alignment_needed = 128;
9160 gcc_assert (!size || stack_alignment_needed);
9161 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9162 gcc_assert (preferred_alignment <= stack_alignment_needed);
9164 /* For SEH we have to limit the amount of code movement into the prologue.
9165 At present we do this via a BLOCKAGE, at which point there's very little
9166 scheduling that can be done, which means that there's very little point
9167 in doing anything except PUSHs. */
9168 if (TARGET_SEH)
9169 cfun->machine->use_fast_prologue_epilogue = false;
9171 /* During reload iteration the number of registers saved can change.
9172 Recompute the value as needed. Do not recompute when the number of registers
9173 didn't change, as reload makes multiple calls to this function and does not
9174 expect the decision to change within a single iteration. */
9175 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9176 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9178 int count = frame->nregs;
9179 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9181 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9183 /* The fast prologue uses move instead of push to save registers. This
9184 is significantly longer, but also executes faster as modern hardware
9185 can execute the moves in parallel, but can't do that for push/pop.
9187 Be careful about choosing which prologue to emit: when the function takes
9188 many instructions to execute, we may use the slow version, and likewise
9189 when the function is known to be outside a hot spot (this is known with
9190 feedback only). Weight the size of the function by the number of registers
9191 to save, as it is cheap to use one or two push instructions but very
9192 slow to use many of them. */
9193 if (count)
9194 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9195 if (node->frequency < NODE_FREQUENCY_NORMAL
9196 || (flag_branch_probabilities
9197 && node->frequency < NODE_FREQUENCY_HOT))
9198 cfun->machine->use_fast_prologue_epilogue = false;
9199 else
9200 cfun->machine->use_fast_prologue_epilogue
9201 = !expensive_function_p (count);
9204 frame->save_regs_using_mov
9205 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9206 /* If static stack checking is enabled and done with probes,
9207 the registers need to be saved before allocating the frame. */
9208 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9210 /* Skip return address. */
9211 offset = UNITS_PER_WORD;
9213 /* Skip pushed static chain. */
9214 if (ix86_static_chain_on_stack)
9215 offset += UNITS_PER_WORD;
9217 /* Skip saved base pointer. */
9218 if (frame_pointer_needed)
9219 offset += UNITS_PER_WORD;
9220 frame->hfp_save_offset = offset;
9222 /* The traditional frame pointer location is at the top of the frame. */
9223 frame->hard_frame_pointer_offset = offset;
9225 /* Register save area */
9226 offset += frame->nregs * UNITS_PER_WORD;
9227 frame->reg_save_offset = offset;
9229 /* On SEH target, registers are pushed just before the frame pointer
9230 location. */
9231 if (TARGET_SEH)
9232 frame->hard_frame_pointer_offset = offset;
9234 /* Align and set SSE register save area. */
9235 if (frame->nsseregs)
9237 /* The only ABI that has saved SSE registers (Win64) also has a
9238 16-byte aligned default stack, and thus we don't need to be
9239 within the re-aligned local stack frame to save them. */
9240 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9241 offset = (offset + 16 - 1) & -16;
9242 offset += frame->nsseregs * 16;
9244 frame->sse_reg_save_offset = offset;
9246 /* The re-aligned stack starts here. Values before this point are not
9247 directly comparable with values below this point. In order to make
9248 sure that no value happens to be the same before and after, force
9249 the alignment computation below to add a non-zero value. */
9250 if (stack_realign_fp)
9251 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9253 /* Va-arg area */
9254 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9255 offset += frame->va_arg_size;
9257 /* Align start of frame for local function. */
9258 if (stack_realign_fp
9259 || offset != frame->sse_reg_save_offset
9260 || size != 0
9261 || !crtl->is_leaf
9262 || cfun->calls_alloca
9263 || ix86_current_function_calls_tls_descriptor)
9264 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9266 /* Frame pointer points here. */
9267 frame->frame_pointer_offset = offset;
9269 offset += size;
9271 /* Add outgoing arguments area. Can be skipped if we eliminated
9272 all the function calls as dead code.
9273 Skipping is however impossible when the function calls alloca: the alloca
9274 expander assumes that the last crtl->outgoing_args_size bytes
9275 of the stack frame are unused. */
9276 if (ACCUMULATE_OUTGOING_ARGS
9277 && (!crtl->is_leaf || cfun->calls_alloca
9278 || ix86_current_function_calls_tls_descriptor))
9280 offset += crtl->outgoing_args_size;
9281 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9283 else
9284 frame->outgoing_arguments_size = 0;
9286 /* Align stack boundary. Only needed if we're calling another function
9287 or using alloca. */
9288 if (!crtl->is_leaf || cfun->calls_alloca
9289 || ix86_current_function_calls_tls_descriptor)
9290 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9292 /* We've reached end of stack frame. */
9293 frame->stack_pointer_offset = offset;
9295 /* Size prologue needs to allocate. */
9296 to_allocate = offset - frame->sse_reg_save_offset;
9298 if ((!to_allocate && frame->nregs <= 1)
9299 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9300 frame->save_regs_using_mov = false;
9302 if (ix86_using_red_zone ()
9303 && crtl->sp_is_unchanging
9304 && crtl->is_leaf
9305 && !ix86_current_function_calls_tls_descriptor)
9307 frame->red_zone_size = to_allocate;
9308 if (frame->save_regs_using_mov)
9309 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9310 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9311 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9313 else
9314 frame->red_zone_size = 0;
9315 frame->stack_pointer_offset -= frame->red_zone_size;
9317 /* The SEH frame pointer location is near the bottom of the frame.
9318 This is enforced by the fact that the difference between the
9319 stack pointer and the frame pointer is limited to 240 bytes in
9320 the unwind data structure. */
9321 if (TARGET_SEH)
9323 HOST_WIDE_INT diff;
9325 /* If we can leave the frame pointer where it is, do so. Also, returns
9326 the establisher frame for __builtin_frame_address (0). */
9327 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9328 if (diff <= SEH_MAX_FRAME_SIZE
9329 && (diff > 240 || (diff & 15) != 0)
9330 && !crtl->accesses_prior_frames)
9332 /* Ideally we'd determine what portion of the local stack frame
9333 (within the constraint of the lowest 240) is most heavily used.
9334 But without that complication, simply bias the frame pointer
9335 by 128 bytes so as to maximize the amount of the local stack
9336 frame that is addressable with 8-bit offsets. */
9337 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
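
/* A standalone sketch (not part of the build) of the rounding idiom used
   throughout ix86_compute_frame_layout: for a power-of-two ALIGN,
   (OFFSET + ALIGN - 1) & -ALIGN rounds OFFSET up to the next multiple of
   ALIGN, and OFFSET & -ALIGN rounds it down.  The offsets in the real
   layout are computed the same way, walking down from the return address
   slot towards the outgoing argument area.  */
#if 0
#include <stdio.h>

typedef long long hwi;		/* stand-in for HOST_WIDE_INT */

static hwi
align_up (hwi offset, hwi align)
{
  return (offset + align - 1) & -align;
}

int
main (void)
{
  /* 12 rounded up to 16 is 16; 17 rounded up to 16 is 32;
     17 rounded down (17 & -16) is 16.  */
  printf ("%lld %lld %lld\n",
	  align_up (12, 16), align_up (17, 16), (hwi) (17 & -16));
  return 0;
}
#endif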
9342 /* This is semi-inlined memory_address_length, but simplified
9343 since we know that we're always dealing with reg+offset, and
9344 to avoid having to create and discard all that rtl. */
9346 static inline int
9347 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9349 int len = 4;
9351 if (offset == 0)
9353 /* EBP and R13 cannot be encoded without an offset. */
9354 len = (regno == BP_REG || regno == R13_REG);
9356 else if (IN_RANGE (offset, -128, 127))
9357 len = 1;
9359 /* ESP and R12 must be encoded with a SIB byte. */
9360 if (regno == SP_REG || regno == R12_REG)
9361 len++;
9363 return len;
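
/* A standalone sketch (not part of the build) mirroring the displacement
   size heuristic above, with the register-number tests replaced by two
   booleans: NEEDS_DISP corresponds to the %ebp/%r13 case (no offset-less
   encoding exists) and NEEDS_SIB to the %esp/%r12 case (a SIB byte is
   always required).  */
#if 0
static int
toy_baseaddr_len (long long offset, int needs_disp, int needs_sib)
{
  int len = 4;			/* disp32 by default */

  if (offset == 0)
    len = needs_disp;		/* 0 bytes, or a forced disp8 */
  else if (offset >= -128 && offset <= 127)
    len = 1;			/* disp8 */

  if (needs_sib)
    len++;			/* extra SIB byte */

  return len;
}
#endif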
9366 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9367 The valid base registers are taken from CFUN->MACHINE->FS. */
9369 static rtx
9370 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9372 const struct machine_function *m = cfun->machine;
9373 rtx base_reg = NULL;
9374 HOST_WIDE_INT base_offset = 0;
9376 if (m->use_fast_prologue_epilogue)
9378 /* Choose the base register most likely to allow the most scheduling
9379 opportunities. Generally FP is valid throughout the function,
9380 while DRAP must be reloaded within the epilogue. But choose either
9381 over the SP due to increased encoding size. */
9383 if (m->fs.fp_valid)
9385 base_reg = hard_frame_pointer_rtx;
9386 base_offset = m->fs.fp_offset - cfa_offset;
9388 else if (m->fs.drap_valid)
9390 base_reg = crtl->drap_reg;
9391 base_offset = 0 - cfa_offset;
9393 else if (m->fs.sp_valid)
9395 base_reg = stack_pointer_rtx;
9396 base_offset = m->fs.sp_offset - cfa_offset;
9399 else
9401 HOST_WIDE_INT toffset;
9402 int len = 16, tlen;
9404 /* Choose the base register with the smallest address encoding.
9405 With a tie, choose FP > DRAP > SP. */
9406 if (m->fs.sp_valid)
9408 base_reg = stack_pointer_rtx;
9409 base_offset = m->fs.sp_offset - cfa_offset;
9410 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9412 if (m->fs.drap_valid)
9414 toffset = 0 - cfa_offset;
9415 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9416 if (tlen <= len)
9418 base_reg = crtl->drap_reg;
9419 base_offset = toffset;
9420 len = tlen;
9423 if (m->fs.fp_valid)
9425 toffset = m->fs.fp_offset - cfa_offset;
9426 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9427 if (tlen <= len)
9429 base_reg = hard_frame_pointer_rtx;
9430 base_offset = toffset;
9431 len = tlen;
9435 gcc_assert (base_reg != NULL);
9437 return plus_constant (Pmode, base_reg, base_offset);
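
/* Worked example for the size-optimized path above: with
   m->fs.sp_offset == 16, m->fs.fp_offset == 8 and CFA_OFFSET == 8, the
   stack pointer candidate has offset 8 (a disp8 plus a SIB byte, length 2)
   while the frame pointer candidate has offset 0 (a single forced disp8,
   length 1), so the frame pointer wins.  Because each later candidate
   replaces an earlier one on "tlen <= len", exact ties resolve as
   FP > DRAP > SP, matching the comment above.  */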
9440 /* Emit code to save registers in the prologue. */
9442 static void
9443 ix86_emit_save_regs (void)
9445 unsigned int regno;
9446 rtx insn;
9448 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9449 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9451 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9452 RTX_FRAME_RELATED_P (insn) = 1;
9456 /* Emit a single register save at CFA - CFA_OFFSET. */
9458 static void
9459 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9460 HOST_WIDE_INT cfa_offset)
9462 struct machine_function *m = cfun->machine;
9463 rtx reg = gen_rtx_REG (mode, regno);
9464 rtx mem, addr, base, insn;
9466 addr = choose_baseaddr (cfa_offset);
9467 mem = gen_frame_mem (mode, addr);
9469 /* For SSE saves, we need to indicate the 128-bit alignment. */
9470 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9472 insn = emit_move_insn (mem, reg);
9473 RTX_FRAME_RELATED_P (insn) = 1;
9475 base = addr;
9476 if (GET_CODE (base) == PLUS)
9477 base = XEXP (base, 0);
9478 gcc_checking_assert (REG_P (base));
9480 /* When saving registers into a re-aligned local stack frame, avoid
9481 any tricky guessing by dwarf2out. */
9482 if (m->fs.realigned)
9484 gcc_checking_assert (stack_realign_drap);
9486 if (regno == REGNO (crtl->drap_reg))
9488 /* A bit of a hack. We force the DRAP register to be saved in
9489 the re-aligned stack frame, which provides us with a copy
9490 of the CFA that will last past the prologue. Install it. */
9491 gcc_checking_assert (cfun->machine->fs.fp_valid);
9492 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9493 cfun->machine->fs.fp_offset - cfa_offset);
9494 mem = gen_rtx_MEM (mode, addr);
9495 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9497 else
9499 /* The frame pointer is a stable reference within the
9500 aligned frame. Use it. */
9501 gcc_checking_assert (cfun->machine->fs.fp_valid);
9502 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9503 cfun->machine->fs.fp_offset - cfa_offset);
9504 mem = gen_rtx_MEM (mode, addr);
9505 add_reg_note (insn, REG_CFA_EXPRESSION,
9506 gen_rtx_SET (VOIDmode, mem, reg));
9510 /* The memory may not be relative to the current CFA register,
9511 which means that we may need to generate a new pattern for
9512 use by the unwind info. */
9513 else if (base != m->fs.cfa_reg)
9515 addr = plus_constant (Pmode, m->fs.cfa_reg,
9516 m->fs.cfa_offset - cfa_offset);
9517 mem = gen_rtx_MEM (mode, addr);
9518 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9522 /* Emit code to save registers using MOV insns.
9523 First register is stored at CFA - CFA_OFFSET. */
9524 static void
9525 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9527 unsigned int regno;
9529 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9530 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9532 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9533 cfa_offset -= UNITS_PER_WORD;
9537 /* Emit code to save SSE registers using MOV insns.
9538 First register is stored at CFA - CFA_OFFSET. */
9539 static void
9540 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9542 unsigned int regno;
9544 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9545 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9547 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9548 cfa_offset -= 16;
9552 static GTY(()) rtx queued_cfa_restores;
9554 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9555 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9556 Don't add the note if the previously saved value will be left untouched
9557 within the stack red zone till return, as unwinders can find the same value
9558 in the register and on the stack. */
9560 static void
9561 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9563 if (!crtl->shrink_wrapped
9564 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9565 return;
9567 if (insn)
9569 add_reg_note (insn, REG_CFA_RESTORE, reg);
9570 RTX_FRAME_RELATED_P (insn) = 1;
9572 else
9573 queued_cfa_restores
9574 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9577 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9579 static void
9580 ix86_add_queued_cfa_restore_notes (rtx insn)
9582 rtx last;
9583 if (!queued_cfa_restores)
9584 return;
9585 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9587 XEXP (last, 1) = REG_NOTES (insn);
9588 REG_NOTES (insn) = queued_cfa_restores;
9589 queued_cfa_restores = NULL_RTX;
9590 RTX_FRAME_RELATED_P (insn) = 1;
9593 /* Expand prologue or epilogue stack adjustment.
9594 The pattern exists to put a dependency on all ebp-based memory accesses.
9595 STYLE should be negative if instructions should be marked as frame related,
9596 zero if %r11 register is live and cannot be freely used and positive
9597 otherwise. */
9599 static void
9600 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9601 int style, bool set_cfa)
9603 struct machine_function *m = cfun->machine;
9604 rtx insn;
9605 bool add_frame_related_expr = false;
9607 if (Pmode == SImode)
9608 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9609 else if (x86_64_immediate_operand (offset, DImode))
9610 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9611 else
9613 rtx tmp;
9614 /* r11 is used by indirect sibcall return as well, set before the
9615 epilogue and used after the epilogue. */
9616 if (style)
9617 tmp = gen_rtx_REG (DImode, R11_REG);
9618 else
9620 gcc_assert (src != hard_frame_pointer_rtx
9621 && dest != hard_frame_pointer_rtx);
9622 tmp = hard_frame_pointer_rtx;
9624 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9625 if (style < 0)
9626 add_frame_related_expr = true;
9628 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9631 insn = emit_insn (insn);
9632 if (style >= 0)
9633 ix86_add_queued_cfa_restore_notes (insn);
9635 if (set_cfa)
9637 rtx r;
9639 gcc_assert (m->fs.cfa_reg == src);
9640 m->fs.cfa_offset += INTVAL (offset);
9641 m->fs.cfa_reg = dest;
9643 r = gen_rtx_PLUS (Pmode, src, offset);
9644 r = gen_rtx_SET (VOIDmode, dest, r);
9645 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9646 RTX_FRAME_RELATED_P (insn) = 1;
9648 else if (style < 0)
9650 RTX_FRAME_RELATED_P (insn) = 1;
9651 if (add_frame_related_expr)
9653 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9654 r = gen_rtx_SET (VOIDmode, dest, r);
9655 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9659 if (dest == stack_pointer_rtx)
9661 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9662 bool valid = m->fs.sp_valid;
9664 if (src == hard_frame_pointer_rtx)
9666 valid = m->fs.fp_valid;
9667 ooffset = m->fs.fp_offset;
9669 else if (src == crtl->drap_reg)
9671 valid = m->fs.drap_valid;
9672 ooffset = 0;
9674 else
9676 /* Else there are two possibilities: SP itself, which we set
9677 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9678 taken care of by hand along the eh_return path. */
9679 gcc_checking_assert (src == stack_pointer_rtx
9680 || offset == const0_rtx);
9683 m->fs.sp_offset = ooffset - INTVAL (offset);
9684 m->fs.sp_valid = valid;
9688 /* Find an available register to be used as the dynamic realign argument
9689 pointer register. Such a register will be written in the prologue and
9690 used at the beginning of the body, so it must not be
9691 1. a parameter passing register.
9692 2. the GOT pointer.
9693 We reuse the static-chain register if it is available. Otherwise, we
9694 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9695 shorter encoding.
9697 Return: the regno of the chosen register. */
9699 static unsigned int
9700 find_drap_reg (void)
9702 tree decl = cfun->decl;
9704 if (TARGET_64BIT)
9706 /* Use R13 for a nested function or a function that needs a static chain.
9707 Since a function with a tail call may use any caller-saved
9708 registers in the epilogue, DRAP must not use a caller-saved
9709 register in such a case. */
9710 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9711 return R13_REG;
9713 return R10_REG;
9715 else
9717 /* Use DI for a nested function or a function that needs a static chain.
9718 Since a function with a tail call may use any caller-saved
9719 registers in the epilogue, DRAP must not use a caller-saved
9720 register in such a case. */
9721 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9722 return DI_REG;
9724 /* Reuse static chain register if it isn't used for parameter
9725 passing. */
9726 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9728 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9729 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9730 return CX_REG;
9732 return DI_REG;
9736 /* Return minimum incoming stack alignment. */
9738 static unsigned int
9739 ix86_minimum_incoming_stack_boundary (bool sibcall)
9741 unsigned int incoming_stack_boundary;
9743 /* Prefer the one specified at command line. */
9744 if (ix86_user_incoming_stack_boundary)
9745 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9746 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9747 when -mstackrealign is used, this isn't a sibcall check, and the
9748 estimated stack alignment is 128 bits. */
9749 else if (!sibcall
9750 && !TARGET_64BIT
9751 && ix86_force_align_arg_pointer
9752 && crtl->stack_alignment_estimated == 128)
9753 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9754 else
9755 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9757 /* Incoming stack alignment can be changed on individual functions
9758 via force_align_arg_pointer attribute. We use the smallest
9759 incoming stack boundary. */
9760 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9761 && lookup_attribute (ix86_force_align_arg_pointer_string,
9762 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9763 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9765 /* The incoming stack frame has to be aligned at least at
9766 parm_stack_boundary. */
9767 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9768 incoming_stack_boundary = crtl->parm_stack_boundary;
9770 /* Stack at entrance of main is aligned by runtime. We use the
9771 smallest incoming stack boundary. */
9772 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9773 && DECL_NAME (current_function_decl)
9774 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9775 && DECL_FILE_SCOPE_P (current_function_decl))
9776 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9778 return incoming_stack_boundary;
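
/* A standalone sketch (not part of the build) of the clamping order
   applied above, omitting the 32-bit -mstackrealign special case.  All
   values are in bits, as in the real function; the parameter names are
   illustrative only.  */
#if 0
static unsigned int
toy_incoming_boundary (unsigned int user, unsigned int dflt,
		       unsigned int min_boundary, unsigned int parm_boundary,
		       unsigned int main_boundary,
		       int force_align_attr, int is_main)
{
  unsigned int b = user ? user : dflt;

  if (b > min_boundary && force_align_attr)
    b = min_boundary;		/* per-function force_align_arg_pointer */
  if (b < parm_boundary)
    b = parm_boundary;		/* parameters dictate a lower bound */
  if (b > main_boundary && is_main)
    b = main_boundary;		/* main's stack is aligned by the runtime */

  return b;
}
#endif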
9781 /* Update incoming stack boundary and estimated stack alignment. */
9783 static void
9784 ix86_update_stack_boundary (void)
9786 ix86_incoming_stack_boundary
9787 = ix86_minimum_incoming_stack_boundary (false);
9789 /* x86_64 vararg needs 16-byte stack alignment for the register save
9790 area. */
9791 if (TARGET_64BIT
9792 && cfun->stdarg
9793 && crtl->stack_alignment_estimated < 128)
9794 crtl->stack_alignment_estimated = 128;
9797 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9798 needed or an rtx for DRAP otherwise. */
9800 static rtx
9801 ix86_get_drap_rtx (void)
9803 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9804 crtl->need_drap = true;
9806 if (stack_realign_drap)
9808 /* Assign DRAP to vDRAP and return vDRAP.  */
9809 unsigned int regno = find_drap_reg ();
9810 rtx drap_vreg;
9811 rtx arg_ptr;
9812 rtx seq, insn;
9814 arg_ptr = gen_rtx_REG (Pmode, regno);
9815 crtl->drap_reg = arg_ptr;
9817 start_sequence ();
9818 drap_vreg = copy_to_reg (arg_ptr);
9819 seq = get_insns ();
9820 end_sequence ();
9822 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9823 if (!optimize)
9825 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9826 RTX_FRAME_RELATED_P (insn) = 1;
9828 return drap_vreg;
9830 else
9831 return NULL;
9834 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9836 static rtx
9837 ix86_internal_arg_pointer (void)
9839 return virtual_incoming_args_rtx;
9842 struct scratch_reg {
9843 rtx reg;
9844 bool saved;
9847 /* Return a short-lived scratch register for use on function entry.
9848 In 32-bit mode, it is valid only after the registers are saved
9849 in the prologue. This register must be released by means of
9850 release_scratch_register_on_entry once it is dead. */
9852 static void
9853 get_scratch_register_on_entry (struct scratch_reg *sr)
9855 int regno;
9857 sr->saved = false;
9859 if (TARGET_64BIT)
9861 /* We always use R11 in 64-bit mode. */
9862 regno = R11_REG;
9864 else
9866 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9867 bool fastcall_p
9868 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9869 bool thiscall_p
9870 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9871 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9872 int regparm = ix86_function_regparm (fntype, decl);
9873 int drap_regno
9874 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9876 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9877 for the static chain register. */
9878 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9879 && drap_regno != AX_REG)
9880 regno = AX_REG;
9881 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9882 for the static chain register. */
9883 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9884 regno = AX_REG;
9885 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9886 regno = DX_REG;
9887 /* ecx is the static chain register. */
9888 else if (regparm < 3 && !fastcall_p && !thiscall_p
9889 && !static_chain_p
9890 && drap_regno != CX_REG)
9891 regno = CX_REG;
9892 else if (ix86_save_reg (BX_REG, true))
9893 regno = BX_REG;
9894 /* esi is the static chain register. */
9895 else if (!(regparm == 3 && static_chain_p)
9896 && ix86_save_reg (SI_REG, true))
9897 regno = SI_REG;
9898 else if (ix86_save_reg (DI_REG, true))
9899 regno = DI_REG;
9900 else
9902 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9903 sr->saved = true;
9907 sr->reg = gen_rtx_REG (Pmode, regno);
9908 if (sr->saved)
9910 rtx insn = emit_insn (gen_push (sr->reg));
9911 RTX_FRAME_RELATED_P (insn) = 1;
9915 /* Release a scratch register obtained from the preceding function. */
9917 static void
9918 release_scratch_register_on_entry (struct scratch_reg *sr)
9920 if (sr->saved)
9922 struct machine_function *m = cfun->machine;
9923 rtx x, insn = emit_insn (gen_pop (sr->reg));
9925 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9926 RTX_FRAME_RELATED_P (insn) = 1;
9927 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9928 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9929 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9930 m->fs.sp_offset -= UNITS_PER_WORD;
9934 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9936 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9938 static void
9939 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9941 /* We skip the probe for the first interval + a small dope of 4 words and
9942 probe that many bytes past the specified size to maintain a protection
9943 area at the bottom of the stack.  */
9944 const int dope = 4 * UNITS_PER_WORD;
9945 rtx size_rtx = GEN_INT (size), last;
9947 /* See if we have a constant small number of probes to generate. If so,
9948 that's the easy case. The run-time loop is made up of 11 insns in the
9949 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9950 for n # of intervals. */
9951 if (size <= 5 * PROBE_INTERVAL)
9953 HOST_WIDE_INT i, adjust;
9954 bool first_probe = true;
9956 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9957 values of N from 1 until it exceeds SIZE. If only one probe is
9958 needed, this will not generate any code. Then adjust and probe
9959 to PROBE_INTERVAL + SIZE. */
9960 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9962 if (first_probe)
9964 adjust = 2 * PROBE_INTERVAL + dope;
9965 first_probe = false;
9967 else
9968 adjust = PROBE_INTERVAL;
9970 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9971 plus_constant (Pmode, stack_pointer_rtx,
9972 -adjust)));
9973 emit_stack_probe (stack_pointer_rtx);
9976 if (first_probe)
9977 adjust = size + PROBE_INTERVAL + dope;
9978 else
9979 adjust = size + PROBE_INTERVAL - i;
9981 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9982 plus_constant (Pmode, stack_pointer_rtx,
9983 -adjust)));
9984 emit_stack_probe (stack_pointer_rtx);
9986 /* Adjust back to account for the additional first interval. */
9987 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9988 plus_constant (Pmode, stack_pointer_rtx,
9989 PROBE_INTERVAL + dope)));
9992 /* Otherwise, do the same as above, but in a loop. Note that we must be
9993 extra careful with variables wrapping around because we might be at
9994 the very top (or the very bottom) of the address space and we have
9995 to be able to handle this case properly; in particular, we use an
9996 equality test for the loop condition. */
9997 else
9999 HOST_WIDE_INT rounded_size;
10000 struct scratch_reg sr;
10002 get_scratch_register_on_entry (&sr);
10005 /* Step 1: round SIZE to the previous multiple of the interval. */
10007 rounded_size = size & -PROBE_INTERVAL;
10010 /* Step 2: compute initial and final value of the loop counter. */
10012 /* SP = SP_0 + PROBE_INTERVAL. */
10013 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10014 plus_constant (Pmode, stack_pointer_rtx,
10015 - (PROBE_INTERVAL + dope))));
10017 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10018 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10019 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10020 gen_rtx_PLUS (Pmode, sr.reg,
10021 stack_pointer_rtx)));
10024 /* Step 3: the loop
10026 while (SP != LAST_ADDR)
10028 SP = SP + PROBE_INTERVAL
10029 probe at SP
10032 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10033 values of N from 1 until it is equal to ROUNDED_SIZE. */
10035 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10038 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10039 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10041 if (size != rounded_size)
10043 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10044 plus_constant (Pmode, stack_pointer_rtx,
10045 rounded_size - size)));
10046 emit_stack_probe (stack_pointer_rtx);
10049 /* Adjust back to account for the additional first interval. */
10050 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10051 plus_constant (Pmode, stack_pointer_rtx,
10052 PROBE_INTERVAL + dope)));
10054 release_scratch_register_on_entry (&sr);
10057 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10059 /* Even if the stack pointer isn't the CFA register, we need to correctly
10060 describe the adjustments made to it, in particular differentiate the
10061 frame-related ones from the frame-unrelated ones. */
10062 if (size > 0)
10064 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10065 XVECEXP (expr, 0, 0)
10066 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10067 plus_constant (Pmode, stack_pointer_rtx, -size));
10068 XVECEXP (expr, 0, 1)
10069 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10070 plus_constant (Pmode, stack_pointer_rtx,
10071 PROBE_INTERVAL + dope + size));
10072 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10073 RTX_FRAME_RELATED_P (last) = 1;
10075 cfun->machine->fs.sp_offset += size;
10078 /* Make sure nothing is scheduled before we are done. */
10079 emit_insn (gen_blockage ());
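
/* A standalone sketch (not part of the build) of the probe-count math
   behind the size <= 5 * PROBE_INTERVAL cutoff above: the unrolled form
   emits one probe per started interval, roughly ceil (size / interval)
   probes in total, while larger sizes fall back to the fixed run-time
   loop.  The interval value used here is only a typical one.  */
#if 0
#include <stdio.h>

#define TOY_PROBE_INTERVAL 4096	/* 1 << STACK_CHECK_PROBE_INTERVAL_EXP, typically */

static long long
toy_unrolled_probes (long long size)
{
  long long i, probes = 0;

  for (i = TOY_PROBE_INTERVAL; i < size; i += TOY_PROBE_INTERVAL)
    probes++;			/* probe after each full interval */

  return probes + 1;		/* final adjust-and-probe */
}

int
main (void)
{
  /* 3 pages -> 3 probes; 5 * 4096 is the largest size still unrolled.  */
  printf ("%lld %lld\n", toy_unrolled_probes (3 * 4096),
	  toy_unrolled_probes (5 * 4096));
  return 0;
}
#endif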
10082 /* Adjust the stack pointer up to REG while probing it. */
10084 const char *
10085 output_adjust_stack_and_probe (rtx reg)
10087 static int labelno = 0;
10088 char loop_lab[32], end_lab[32];
10089 rtx xops[2];
10091 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10092 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10094 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10096 /* Jump to END_LAB if SP == LAST_ADDR. */
10097 xops[0] = stack_pointer_rtx;
10098 xops[1] = reg;
10099 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10100 fputs ("\tje\t", asm_out_file);
10101 assemble_name_raw (asm_out_file, end_lab);
10102 fputc ('\n', asm_out_file);
10104 /* SP = SP + PROBE_INTERVAL. */
10105 xops[1] = GEN_INT (PROBE_INTERVAL);
10106 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10108 /* Probe at SP. */
10109 xops[1] = const0_rtx;
10110 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10112 fprintf (asm_out_file, "\tjmp\t");
10113 assemble_name_raw (asm_out_file, loop_lab);
10114 fputc ('\n', asm_out_file);
10116 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10118 return "";
10121 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10122 inclusive. These are offsets from the current stack pointer. */
10124 static void
10125 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10127 /* See if we have a constant small number of probes to generate. If so,
10128 that's the easy case. The run-time loop is made up of 7 insns in the
10129 generic case while the compile-time loop is made up of n insns for n #
10130 of intervals. */
10131 if (size <= 7 * PROBE_INTERVAL)
10133 HOST_WIDE_INT i;
10135 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10136 it exceeds SIZE. If only one probe is needed, this will not
10137 generate any code. Then probe at FIRST + SIZE. */
10138 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10139 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10140 -(first + i)));
10142 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10143 -(first + size)));
10146 /* Otherwise, do the same as above, but in a loop. Note that we must be
10147 extra careful with variables wrapping around because we might be at
10148 the very top (or the very bottom) of the address space and we have
10149 to be able to handle this case properly; in particular, we use an
10150 equality test for the loop condition. */
10151 else
10153 HOST_WIDE_INT rounded_size, last;
10154 struct scratch_reg sr;
10156 get_scratch_register_on_entry (&sr);
10159 /* Step 1: round SIZE to the previous multiple of the interval. */
10161 rounded_size = size & -PROBE_INTERVAL;
10164 /* Step 2: compute initial and final value of the loop counter. */
10166 /* TEST_OFFSET = FIRST. */
10167 emit_move_insn (sr.reg, GEN_INT (-first));
10169 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10170 last = first + rounded_size;
10173 /* Step 3: the loop
10175 while (TEST_ADDR != LAST_ADDR)
10177 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10178 probe at TEST_ADDR
10181 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10182 until it is equal to ROUNDED_SIZE. */
10184 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10187 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10188 that SIZE is equal to ROUNDED_SIZE. */
10190 if (size != rounded_size)
10191 emit_stack_probe (plus_constant (Pmode,
10192 gen_rtx_PLUS (Pmode,
10193 stack_pointer_rtx,
10194 sr.reg),
10195 rounded_size - size));
10197 release_scratch_register_on_entry (&sr);
10200 /* Make sure nothing is scheduled before we are done. */
10201 emit_insn (gen_blockage ());
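
/* A standalone sketch (not part of the build) of how the loop case above
   splits SIZE: ROUNDED_SIZE = SIZE & -PROBE_INTERVAL is covered by the
   run-time loop, and a single extra probe at FIRST + SIZE covers the
   remainder when SIZE is not an exact multiple of the interval.  The
   FIRST and interval values below are illustrative only.  */
#if 0
#include <stdio.h>

#define TOY_PROBE_INTERVAL 4096

int
main (void)
{
  long long first = 16384;	/* e.g. a protection area below the SP */
  long long size = 3 * 4096 + 512;
  long long rounded_size = size & -TOY_PROBE_INTERVAL;
  long long off;

  /* Loop probes at FIRST + N * PROBE_INTERVAL up to FIRST + ROUNDED_SIZE.  */
  for (off = TOY_PROBE_INTERVAL; off <= rounded_size; off += TOY_PROBE_INTERVAL)
    printf ("probe at sp - %lld\n", first + off);

  /* Residual probe, emitted only when SIZE != ROUNDED_SIZE.  */
  if (size != rounded_size)
    printf ("probe at sp - %lld\n", first + size);

  return 0;
}
#endif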
10204 /* Probe a range of stack addresses from REG to END, inclusive. These are
10205 offsets from the current stack pointer. */
10207 const char *
10208 output_probe_stack_range (rtx reg, rtx end)
10210 static int labelno = 0;
10211 char loop_lab[32], end_lab[32];
10212 rtx xops[3];
10214 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10215 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10217 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10219 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10220 xops[0] = reg;
10221 xops[1] = end;
10222 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10223 fputs ("\tje\t", asm_out_file);
10224 assemble_name_raw (asm_out_file, end_lab);
10225 fputc ('\n', asm_out_file);
10227 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10228 xops[1] = GEN_INT (PROBE_INTERVAL);
10229 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10231 /* Probe at TEST_ADDR. */
10232 xops[0] = stack_pointer_rtx;
10233 xops[1] = reg;
10234 xops[2] = const0_rtx;
10235 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10237 fprintf (asm_out_file, "\tjmp\t");
10238 assemble_name_raw (asm_out_file, loop_lab);
10239 fputc ('\n', asm_out_file);
10241 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10243 return "";
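
/* For reference, with a 64-bit scratch register %r11 the templates above
   produce a loop of roughly this shape in AT&T syntax (the immediates are
   whatever END and PROBE_INTERVAL expand to, and the label names depend
   on ASM_GENERATE_INTERNAL_LABEL):

   .LPSRL0:
	cmpq	$END, %r11
	je	.LPSRE0
	subq	$PROBE_INTERVAL, %r11
	orq	$0, (%rsp,%r11)
	jmp	.LPSRL0
   .LPSRE0:
*/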
10246 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10247 to be generated in correct form. */
10248 static void
10249 ix86_finalize_stack_realign_flags (void)
10251 /* Check if stack realignment is really needed after reload, and
10252 store the result in cfun. */
10253 unsigned int incoming_stack_boundary
10254 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10255 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10256 unsigned int stack_realign = (incoming_stack_boundary
10257 < (crtl->is_leaf
10258 ? crtl->max_used_stack_slot_alignment
10259 : crtl->stack_alignment_needed));
10261 if (crtl->stack_realign_finalized)
10263 /* After stack_realign_needed is finalized, we can no longer
10264 change it. */
10265 gcc_assert (crtl->stack_realign_needed == stack_realign);
10266 return;
10269 /* If the only reason for frame_pointer_needed is that we conservatively
10270 assumed stack realignment might be needed, but in the end nothing that
10271 needed the stack alignment had been spilled, clear frame_pointer_needed
10272 and say we don't need stack realignment. */
10273 if (stack_realign
10274 && !crtl->need_drap
10275 && frame_pointer_needed
10276 && crtl->is_leaf
10277 && flag_omit_frame_pointer
10278 && crtl->sp_is_unchanging
10279 && !ix86_current_function_calls_tls_descriptor
10280 && !crtl->accesses_prior_frames
10281 && !cfun->calls_alloca
10282 && !crtl->calls_eh_return
10283 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10284 && !ix86_frame_pointer_required ()
10285 && get_frame_size () == 0
10286 && ix86_nsaved_sseregs () == 0
10287 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10289 HARD_REG_SET set_up_by_prologue, prologue_used;
10290 basic_block bb;
10292 CLEAR_HARD_REG_SET (prologue_used);
10293 CLEAR_HARD_REG_SET (set_up_by_prologue);
10294 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10295 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10296 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10297 HARD_FRAME_POINTER_REGNUM);
10298 FOR_EACH_BB (bb)
10300 rtx insn;
10301 FOR_BB_INSNS (bb, insn)
10302 if (NONDEBUG_INSN_P (insn)
10303 && requires_stack_frame_p (insn, prologue_used,
10304 set_up_by_prologue))
10306 crtl->stack_realign_needed = stack_realign;
10307 crtl->stack_realign_finalized = true;
10308 return;
10312 frame_pointer_needed = false;
10313 stack_realign = false;
10314 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10315 crtl->stack_alignment_needed = incoming_stack_boundary;
10316 crtl->stack_alignment_estimated = incoming_stack_boundary;
10317 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10318 crtl->preferred_stack_boundary = incoming_stack_boundary;
10319 df_finish_pass (true);
10320 df_scan_alloc (NULL);
10321 df_scan_blocks ();
10322 df_compute_regs_ever_live (true);
10323 df_analyze ();
10326 crtl->stack_realign_needed = stack_realign;
10327 crtl->stack_realign_finalized = true;
10330 /* Expand the prologue into a bunch of separate insns. */
10332 void
10333 ix86_expand_prologue (void)
10335 struct machine_function *m = cfun->machine;
10336 rtx insn, t;
10337 bool pic_reg_used;
10338 struct ix86_frame frame;
10339 HOST_WIDE_INT allocate;
10340 bool int_registers_saved;
10341 bool sse_registers_saved;
10343 ix86_finalize_stack_realign_flags ();
10345 /* DRAP should not coexist with stack_realign_fp */
10346 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10348 memset (&m->fs, 0, sizeof (m->fs));
10350 /* Initialize CFA state for before the prologue. */
10351 m->fs.cfa_reg = stack_pointer_rtx;
10352 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10354 /* Track SP offset to the CFA. We continue tracking this after we've
10355 swapped the CFA register away from SP. In the case of re-alignment
10356 this is fudged; we're interested in offsets within the local frame. */
10357 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10358 m->fs.sp_valid = true;
10360 ix86_compute_frame_layout (&frame);
10362 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10364 /* We should have already generated an error for any use of
10365 ms_hook on a nested function. */
10366 gcc_checking_assert (!ix86_static_chain_on_stack);
10368 /* Check if profiling is active and we shall use the profiling-before-prologue
10369 variant. If so, sorry. */
10370 if (crtl->profile && flag_fentry != 0)
10371 sorry ("ms_hook_prologue attribute isn%'t compatible "
10372 "with -mfentry for 32-bit");
10374 /* In ix86_asm_output_function_label we emitted:
10375 8b ff movl.s %edi,%edi
10376 55 push %ebp
10377 8b ec movl.s %esp,%ebp
10379 This matches the hookable function prologue in Win32 API
10380 functions in Microsoft Windows XP Service Pack 2 and newer.
10381 Wine uses this to enable Windows apps to hook the Win32 API
10382 functions provided by Wine.
10384 What that means is that we've already set up the frame pointer. */
10386 if (frame_pointer_needed
10387 && !(crtl->drap_reg && crtl->stack_realign_needed))
10389 rtx push, mov;
10391 /* We've decided to use the frame pointer already set up.
10392 Describe this to the unwinder by pretending that both
10393 push and mov insns happen right here.
10395 Putting the unwind info here at the end of the ms_hook
10396 is done so that we can make absolutely certain we get
10397 the required byte sequence at the start of the function,
10398 rather than relying on an assembler that can produce
10399 the exact encoding required.
10401 However it does mean (in the unpatched case) that we have
10402 a 1 insn window where the asynchronous unwind info is
10403 incorrect. However, if we placed the unwind info at
10404 its correct location we would have incorrect unwind info
10405 in the patched case. Which is probably all moot since
10406 I don't expect Wine generates dwarf2 unwind info for the
10407 system libraries that use this feature. */
10409 insn = emit_insn (gen_blockage ());
10411 push = gen_push (hard_frame_pointer_rtx);
10412 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10413 stack_pointer_rtx);
10414 RTX_FRAME_RELATED_P (push) = 1;
10415 RTX_FRAME_RELATED_P (mov) = 1;
10417 RTX_FRAME_RELATED_P (insn) = 1;
10418 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10419 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10421 /* Note that gen_push incremented m->fs.cfa_offset, even
10422 though we didn't emit the push insn here. */
10423 m->fs.cfa_reg = hard_frame_pointer_rtx;
10424 m->fs.fp_offset = m->fs.cfa_offset;
10425 m->fs.fp_valid = true;
10427 else
10429 /* The frame pointer is not needed so pop %ebp again.
10430 This leaves us with a pristine state. */
10431 emit_insn (gen_pop (hard_frame_pointer_rtx));
10435 /* The first insn of a function that accepts its static chain on the
10436 stack is to push the register that would be filled in by a direct
10437 call. This insn will be skipped by the trampoline. */
10438 else if (ix86_static_chain_on_stack)
10440 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10441 emit_insn (gen_blockage ());
10443 /* We don't want to interpret this push insn as a register save,
10444 only as a stack adjustment. The real copy of the register as
10445 a save will be done later, if needed. */
10446 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10447 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10448 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10449 RTX_FRAME_RELATED_P (insn) = 1;
10452 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10453 DRAP is needed and stack realignment is really needed after reload. */
10454 if (stack_realign_drap)
10456 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10458 /* Only need to push parameter pointer reg if it is caller saved. */
10459 if (!call_used_regs[REGNO (crtl->drap_reg)])
10461 /* Push arg pointer reg */
10462 insn = emit_insn (gen_push (crtl->drap_reg));
10463 RTX_FRAME_RELATED_P (insn) = 1;
10466 /* Grab the argument pointer. */
10467 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10468 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10469 RTX_FRAME_RELATED_P (insn) = 1;
10470 m->fs.cfa_reg = crtl->drap_reg;
10471 m->fs.cfa_offset = 0;
10473 /* Align the stack. */
10474 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10475 stack_pointer_rtx,
10476 GEN_INT (-align_bytes)));
10477 RTX_FRAME_RELATED_P (insn) = 1;
10479 /* Replicate the return address on the stack so that return
10480 address can be reached via (argp - 1) slot. This is needed
10481 to implement macro RETURN_ADDR_RTX and intrinsic function
10482 expand_builtin_return_addr etc. */
10483 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10484 t = gen_frame_mem (word_mode, t);
10485 insn = emit_insn (gen_push (t));
10486 RTX_FRAME_RELATED_P (insn) = 1;
10488 /* For the purposes of frame and register save area addressing,
10489 we've started over with a new frame. */
10490 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10491 m->fs.realigned = true;
10494 int_registers_saved = (frame.nregs == 0);
10495 sse_registers_saved = (frame.nsseregs == 0);
10497 if (frame_pointer_needed && !m->fs.fp_valid)
10499 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10500 slower on all targets. Also sdb doesn't like it. */
10501 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10502 RTX_FRAME_RELATED_P (insn) = 1;
10504 /* Push registers now, before setting the frame pointer
10505 on SEH target. */
10506 if (!int_registers_saved
10507 && TARGET_SEH
10508 && !frame.save_regs_using_mov)
10510 ix86_emit_save_regs ();
10511 int_registers_saved = true;
10512 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10515 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10517 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10518 RTX_FRAME_RELATED_P (insn) = 1;
10520 if (m->fs.cfa_reg == stack_pointer_rtx)
10521 m->fs.cfa_reg = hard_frame_pointer_rtx;
10522 m->fs.fp_offset = m->fs.sp_offset;
10523 m->fs.fp_valid = true;
10527 if (!int_registers_saved)
10529 /* If saving registers via PUSH, do so now. */
10530 if (!frame.save_regs_using_mov)
10532 ix86_emit_save_regs ();
10533 int_registers_saved = true;
10534 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10537 /* When using red zone we may start register saving before allocating
10538 the stack frame saving one cycle of the prologue. However, avoid
10539 doing this if we have to probe the stack; at least on x86_64 the
10540 stack probe can turn into a call that clobbers a red zone location. */
10541 else if (ix86_using_red_zone ()
10542 && (! TARGET_STACK_PROBE
10543 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10545 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10546 int_registers_saved = true;
10550 if (stack_realign_fp)
10552 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10553 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10555 /* The computation of the size of the re-aligned stack frame means
10556 that we must allocate the size of the register save area before
10557 performing the actual alignment. Otherwise we cannot guarantee
10558 that there's enough storage above the realignment point. */
10559 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10560 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10561 GEN_INT (m->fs.sp_offset
10562 - frame.sse_reg_save_offset),
10563 -1, false);
10565 /* Align the stack. */
10566 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10567 stack_pointer_rtx,
10568 GEN_INT (-align_bytes)));
10570 /* For the purposes of register save area addressing, the stack
10571 pointer is no longer valid. As for the value of sp_offset,
10572 see ix86_compute_frame_layout, which we need to match in order
10573 to pass verification of stack_pointer_offset at the end. */
10574 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10575 m->fs.sp_valid = false;
10578 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10580 if (flag_stack_usage_info)
10582 /* We start to count from ARG_POINTER. */
10583 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10585 /* If it was realigned, take into account the fake frame. */
10586 if (stack_realign_drap)
10588 if (ix86_static_chain_on_stack)
10589 stack_size += UNITS_PER_WORD;
10591 if (!call_used_regs[REGNO (crtl->drap_reg)])
10592 stack_size += UNITS_PER_WORD;
10594 /* This over-estimates by 1 minimal-stack-alignment-unit but
10595 mitigates that by counting in the new return address slot. */
10596 current_function_dynamic_stack_size
10597 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10600 current_function_static_stack_size = stack_size;
10603 /* On SEH target with very large frame size, allocate an area to save
10604 SSE registers (as the very large allocation won't be described). */
10605 if (TARGET_SEH
10606 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10607 && !sse_registers_saved)
10609 HOST_WIDE_INT sse_size =
10610 frame.sse_reg_save_offset - frame.reg_save_offset;
10612 gcc_assert (int_registers_saved);
10614 /* No need to do stack checking as the area will be immediately
10615 written. */
10616 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10617 GEN_INT (-sse_size), -1,
10618 m->fs.cfa_reg == stack_pointer_rtx);
10619 allocate -= sse_size;
10620 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10621 sse_registers_saved = true;
10624 /* The stack has already been decremented by the instruction calling us
10625 so probe if the size is non-negative to preserve the protection area. */
10626 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10628 /* We expect the registers to be saved when probes are used. */
10629 gcc_assert (int_registers_saved);
10631 if (STACK_CHECK_MOVING_SP)
10633 ix86_adjust_stack_and_probe (allocate);
10634 allocate = 0;
10636 else
10638 HOST_WIDE_INT size = allocate;
10640 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10641 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10643 if (TARGET_STACK_PROBE)
10644 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10645 else
10646 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10650 if (allocate == 0)
10652 else if (!ix86_target_stack_probe ()
10653 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10655 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10656 GEN_INT (-allocate), -1,
10657 m->fs.cfa_reg == stack_pointer_rtx);
10659 else
10661 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10662 rtx r10 = NULL;
10663 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10664 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10665 bool eax_live = false;
10666 bool r10_live = false;
10668 if (TARGET_64BIT)
10669 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10670 if (!TARGET_64BIT_MS_ABI)
10671 eax_live = ix86_eax_live_at_start_p ();
10673 /* Note that SEH directives need to continue tracking the stack
10674 pointer even after the frame pointer has been set up. */
10675 if (eax_live)
10677 insn = emit_insn (gen_push (eax));
10678 allocate -= UNITS_PER_WORD;
10679 if (sp_is_cfa_reg || TARGET_SEH)
10681 if (sp_is_cfa_reg)
10682 m->fs.cfa_offset += UNITS_PER_WORD;
10683 RTX_FRAME_RELATED_P (insn) = 1;
10687 if (r10_live)
10689 r10 = gen_rtx_REG (Pmode, R10_REG);
10690 insn = emit_insn (gen_push (r10));
10691 allocate -= UNITS_PER_WORD;
10692 if (sp_is_cfa_reg || TARGET_SEH)
10694 if (sp_is_cfa_reg)
10695 m->fs.cfa_offset += UNITS_PER_WORD;
10696 RTX_FRAME_RELATED_P (insn) = 1;
10700 emit_move_insn (eax, GEN_INT (allocate));
10701 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10703 /* Use the fact that AX still contains ALLOCATE. */
10704 adjust_stack_insn = (Pmode == DImode
10705 ? gen_pro_epilogue_adjust_stack_di_sub
10706 : gen_pro_epilogue_adjust_stack_si_sub);
10708 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10709 stack_pointer_rtx, eax));
10711 if (sp_is_cfa_reg || TARGET_SEH)
10713 if (sp_is_cfa_reg)
10714 m->fs.cfa_offset += allocate;
10715 RTX_FRAME_RELATED_P (insn) = 1;
10716 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10717 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10718 plus_constant (Pmode, stack_pointer_rtx,
10719 -allocate)));
10721 m->fs.sp_offset += allocate;
10723 if (r10_live && eax_live)
10725 t = choose_baseaddr (m->fs.sp_offset - allocate);
10726 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10727 gen_frame_mem (word_mode, t));
10728 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10729 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10730 gen_frame_mem (word_mode, t));
10732 else if (eax_live || r10_live)
10734 t = choose_baseaddr (m->fs.sp_offset - allocate);
10735 emit_move_insn (gen_rtx_REG (word_mode,
10736 (eax_live ? AX_REG : R10_REG)),
10737 gen_frame_mem (word_mode, t));
10740 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10742 /* If we haven't already set up the frame pointer, do so now. */
10743 if (frame_pointer_needed && !m->fs.fp_valid)
10745 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10746 GEN_INT (frame.stack_pointer_offset
10747 - frame.hard_frame_pointer_offset));
10748 insn = emit_insn (insn);
10749 RTX_FRAME_RELATED_P (insn) = 1;
10750 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10752 if (m->fs.cfa_reg == stack_pointer_rtx)
10753 m->fs.cfa_reg = hard_frame_pointer_rtx;
10754 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10755 m->fs.fp_valid = true;
10758 if (!int_registers_saved)
10759 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10760 if (!sse_registers_saved)
10761 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10763 pic_reg_used = false;
10764 /* We don't use pic-register for pe-coff target. */
10765 if (pic_offset_table_rtx
10766 && !TARGET_PECOFF
10767 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10768 || crtl->profile))
10770 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10772 if (alt_pic_reg_used != INVALID_REGNUM)
10773 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10775 pic_reg_used = true;
10778 if (pic_reg_used)
10780 if (TARGET_64BIT)
10782 if (ix86_cmodel == CM_LARGE_PIC)
10784 rtx label, tmp_reg;
10786 gcc_assert (Pmode == DImode);
10787 label = gen_label_rtx ();
10788 emit_label (label);
10789 LABEL_PRESERVE_P (label) = 1;
10790 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10791 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10792 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10793 label));
10794 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10795 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10796 pic_offset_table_rtx, tmp_reg));
10798 else
10799 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10801 else
10803 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10804 RTX_FRAME_RELATED_P (insn) = 1;
10805 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10809 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10810 when mcount needs it.  The blockage to avoid call movement across the
10811 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10812 note. */
10813 if (crtl->profile && !flag_fentry && pic_reg_used)
10814 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10816 if (crtl->drap_reg && !crtl->stack_realign_needed)
10818 /* vDRAP is set up, but after reload it turns out that stack realignment
10819 isn't necessary; here we emit the prologue to set up DRAP
10820 without the stack realignment adjustment. */
10821 t = choose_baseaddr (0);
10822 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10825 /* Prevent instructions from being scheduled into the register save push
10826 sequence when access to the redzone area is done through the frame pointer.
10827 The offset between the frame pointer and the stack pointer is calculated
10828 relative to the value of the stack pointer at the end of the function
10829 prologue, and moving instructions that access the redzone area via the frame
10830 pointer into the push sequence violates this assumption.  */
10831 if (frame_pointer_needed && frame.red_zone_size)
10832 emit_insn (gen_memory_blockage ());
10834 /* Emit cld instruction if stringops are used in the function. */
10835 if (TARGET_CLD && ix86_current_function_needs_cld)
10836 emit_insn (gen_cld ());
10838 /* SEH requires that the prologue end within 256 bytes of the start of
10839 the function. Prevent instruction schedules that would extend that.
10840 Further, prevent alloca modifications to the stack pointer from being
10841 combined with prologue modifications. */
10842 if (TARGET_SEH)
10843 emit_insn (gen_prologue_use (stack_pointer_rtx));
10846 /* Emit code to restore REG using a POP insn. */
10848 static void
10849 ix86_emit_restore_reg_using_pop (rtx reg)
10851 struct machine_function *m = cfun->machine;
10852 rtx insn = emit_insn (gen_pop (reg));
10854 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10855 m->fs.sp_offset -= UNITS_PER_WORD;
10857 if (m->fs.cfa_reg == crtl->drap_reg
10858 && REGNO (reg) == REGNO (crtl->drap_reg))
10860 /* Previously we'd represented the CFA as an expression
10861 like *(%ebp - 8). We've just popped that value from
10862 the stack, which means we need to reset the CFA to
10863 the drap register. This will remain until we restore
10864 the stack pointer. */
10865 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10866 RTX_FRAME_RELATED_P (insn) = 1;
10868 /* This means that the DRAP register is valid for addressing too. */
10869 m->fs.drap_valid = true;
10870 return;
10873 if (m->fs.cfa_reg == stack_pointer_rtx)
10875 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10876 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10877 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10878 RTX_FRAME_RELATED_P (insn) = 1;
10880 m->fs.cfa_offset -= UNITS_PER_WORD;
10883 /* When the frame pointer is the CFA, and we pop it, we are
10884 swapping back to the stack pointer as the CFA. This happens
10885 for stack frames that don't allocate other data, so we assume
10886 the stack pointer is now pointing at the return address, i.e.
10887 the function entry state, which makes the offset be 1 word. */
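/* Illustrative example (not emitted here): in such a frame, "pop %ebp"
   leaves %esp pointing at the saved return address, so the new CFA is
   the stack pointer plus one word, e.g. %esp + 4 on 32-bit targets.  */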
10888 if (reg == hard_frame_pointer_rtx)
10890 m->fs.fp_valid = false;
10891 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10893 m->fs.cfa_reg = stack_pointer_rtx;
10894 m->fs.cfa_offset -= UNITS_PER_WORD;
10896 add_reg_note (insn, REG_CFA_DEF_CFA,
10897 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10898 GEN_INT (m->fs.cfa_offset)));
10899 RTX_FRAME_RELATED_P (insn) = 1;
10904 /* Emit code to restore saved registers using POP insns. */
10906 static void
10907 ix86_emit_restore_regs_using_pop (void)
10909 unsigned int regno;
10911 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10912 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10913 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10916 /* Emit code and notes for the LEAVE instruction. */
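/* (Illustrative note: "leave" is equivalent to moving the frame pointer
   into the stack pointer and then popping the frame pointer, which is why
   the new sp_offset below is the old fp_offset minus one word and
   fp_valid becomes false.)  */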
10918 static void
10919 ix86_emit_leave (void)
10921 struct machine_function *m = cfun->machine;
10922 rtx insn = emit_insn (ix86_gen_leave ());
10924 ix86_add_queued_cfa_restore_notes (insn);
10926 gcc_assert (m->fs.fp_valid);
10927 m->fs.sp_valid = true;
10928 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10929 m->fs.fp_valid = false;
10931 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10933 m->fs.cfa_reg = stack_pointer_rtx;
10934 m->fs.cfa_offset = m->fs.sp_offset;
10936 add_reg_note (insn, REG_CFA_DEF_CFA,
10937 plus_constant (Pmode, stack_pointer_rtx,
10938 m->fs.sp_offset));
10939 RTX_FRAME_RELATED_P (insn) = 1;
10941 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10942 m->fs.fp_offset);
10945 /* Emit code to restore saved registers using MOV insns.
10946 First register is restored from CFA - CFA_OFFSET. */
10947 static void
10948 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10949 bool maybe_eh_return)
10951 struct machine_function *m = cfun->machine;
10952 unsigned int regno;
10954 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10955 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10957 rtx reg = gen_rtx_REG (word_mode, regno);
10958 rtx insn, mem;
10960 mem = choose_baseaddr (cfa_offset);
10961 mem = gen_frame_mem (word_mode, mem);
10962 insn = emit_move_insn (reg, mem);
10964 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10966 /* Previously we'd represented the CFA as an expression
10967 like *(%ebp - 8).  We've just restored that value from
10968 the stack, which means we need to reset the CFA to
10969 the drap register. This will remain until we restore
10970 the stack pointer. */
10971 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10972 RTX_FRAME_RELATED_P (insn) = 1;
10974 /* This means that the DRAP register is valid for addressing. */
10975 m->fs.drap_valid = true;
10977 else
10978 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10980 cfa_offset -= UNITS_PER_WORD;
10984 /* Emit code to restore saved SSE registers using MOV insns.
10985 First register is restored from CFA - CFA_OFFSET. */
10986 static void
10987 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10988 bool maybe_eh_return)
10990 unsigned int regno;
10992 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10993 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10995 rtx reg = gen_rtx_REG (V4SFmode, regno);
10996 rtx mem;
10998 mem = choose_baseaddr (cfa_offset);
10999 mem = gen_rtx_MEM (V4SFmode, mem);
11000 set_mem_align (mem, 128);
11001 emit_move_insn (reg, mem);
11003 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11005 cfa_offset -= 16;
11009 /* Restore function stack, frame, and registers. */
11011 void
11012 ix86_expand_epilogue (int style)
11014 struct machine_function *m = cfun->machine;
11015 struct machine_frame_state frame_state_save = m->fs;
11016 struct ix86_frame frame;
11017 bool restore_regs_via_mov;
11018 bool using_drap;
11020 ix86_finalize_stack_realign_flags ();
11021 ix86_compute_frame_layout (&frame);
11023 m->fs.sp_valid = (!frame_pointer_needed
11024 || (crtl->sp_is_unchanging
11025 && !stack_realign_fp));
11026 gcc_assert (!m->fs.sp_valid
11027 || m->fs.sp_offset == frame.stack_pointer_offset);
11029 /* The FP must be valid if the frame pointer is present. */
11030 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11031 gcc_assert (!m->fs.fp_valid
11032 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11034 /* We must have *some* valid pointer to the stack frame. */
11035 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11037 /* The DRAP is never valid at this point. */
11038 gcc_assert (!m->fs.drap_valid);
11040 /* See the comment about red zone and frame
11041 pointer usage in ix86_expand_prologue. */
11042 if (frame_pointer_needed && frame.red_zone_size)
11043 emit_insn (gen_memory_blockage ());
11045 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11046 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11048 /* Determine the CFA offset of the end of the red-zone. */
11049 m->fs.red_zone_offset = 0;
11050 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11052 /* The red-zone begins below the return address. */
11053 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11055 /* When the register save area is in the aligned portion of
11056 the stack, determine the maximum runtime displacement that
11057 matches up with the aligned frame. */
11058 if (stack_realign_drap)
11059 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11060 + UNITS_PER_WORD);
11063 /* Special care must be taken for the normal return case of a function
11064 using eh_return: the eax and edx registers are marked as saved, but
11065 not restored along this path. Adjust the save location to match. */
11066 if (crtl->calls_eh_return && style != 2)
11067 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11069 /* EH_RETURN requires the use of moves to function properly. */
11070 if (crtl->calls_eh_return)
11071 restore_regs_via_mov = true;
11072 /* SEH requires the use of pops to identify the epilogue. */
11073 else if (TARGET_SEH)
11074 restore_regs_via_mov = false;
11075 /* If we're only restoring one register and sp is not valid, then
11076 use a move instruction to restore the register, since it's
11077 less work than reloading sp and popping the register. */
11078 else if (!m->fs.sp_valid && frame.nregs <= 1)
11079 restore_regs_via_mov = true;
11080 else if (TARGET_EPILOGUE_USING_MOVE
11081 && cfun->machine->use_fast_prologue_epilogue
11082 && (frame.nregs > 1
11083 || m->fs.sp_offset != frame.reg_save_offset))
11084 restore_regs_via_mov = true;
11085 else if (frame_pointer_needed
11086 && !frame.nregs
11087 && m->fs.sp_offset != frame.reg_save_offset)
11088 restore_regs_via_mov = true;
11089 else if (frame_pointer_needed
11090 && TARGET_USE_LEAVE
11091 && cfun->machine->use_fast_prologue_epilogue
11092 && frame.nregs == 1)
11093 restore_regs_via_mov = true;
11094 else
11095 restore_regs_via_mov = false;
11097 if (restore_regs_via_mov || frame.nsseregs)
11099 /* Ensure that the entire register save area is addressable via
11100 the stack pointer, if we will restore via sp. */
11101 if (TARGET_64BIT
11102 && m->fs.sp_offset > 0x7fffffff
11103 && !(m->fs.fp_valid || m->fs.drap_valid)
11104 && (frame.nsseregs + frame.nregs) != 0)
11106 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11107 GEN_INT (m->fs.sp_offset
11108 - frame.sse_reg_save_offset),
11109 style,
11110 m->fs.cfa_reg == stack_pointer_rtx);
11114 /* If there are any SSE registers to restore, then we have to do it
11115 via moves, since there's obviously no pop for SSE regs. */
11116 if (frame.nsseregs)
11117 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11118 style == 2);
11120 if (restore_regs_via_mov)
11122 rtx t;
11124 if (frame.nregs)
11125 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11127 /* eh_return epilogues need %ecx added to the stack pointer. */
11128 if (style == 2)
11130 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11132 /* Stack align doesn't work with eh_return. */
11133 gcc_assert (!stack_realign_drap);
11134 /* Neither do regparm nested functions. */
11135 gcc_assert (!ix86_static_chain_on_stack);
11137 if (frame_pointer_needed)
11139 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11140 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11141 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11143 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11144 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11146 /* Note that we use SA as a temporary CFA, as the return
11147 address is at the proper place relative to it. We
11148 pretend this happens at the FP restore insn because
11149 prior to this insn the FP would be stored at the wrong
11150 offset relative to SA, and after this insn we have no
11151 other reasonable register to use for the CFA. We don't
11152 bother resetting the CFA to the SP for the duration of
11153 the return insn. */
11154 add_reg_note (insn, REG_CFA_DEF_CFA,
11155 plus_constant (Pmode, sa, UNITS_PER_WORD));
11156 ix86_add_queued_cfa_restore_notes (insn);
11157 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11158 RTX_FRAME_RELATED_P (insn) = 1;
11160 m->fs.cfa_reg = sa;
11161 m->fs.cfa_offset = UNITS_PER_WORD;
11162 m->fs.fp_valid = false;
11164 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11165 const0_rtx, style, false);
11167 else
11169 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11170 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11171 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11172 ix86_add_queued_cfa_restore_notes (insn);
11174 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11175 if (m->fs.cfa_offset != UNITS_PER_WORD)
11177 m->fs.cfa_offset = UNITS_PER_WORD;
11178 add_reg_note (insn, REG_CFA_DEF_CFA,
11179 plus_constant (Pmode, stack_pointer_rtx,
11180 UNITS_PER_WORD));
11181 RTX_FRAME_RELATED_P (insn) = 1;
11184 m->fs.sp_offset = UNITS_PER_WORD;
11185 m->fs.sp_valid = true;
11188 else
11190 /* SEH requires that the function end with (1) a stack adjustment
11191 if necessary, (2) a sequence of pops, and (3) a return or
11192 jump instruction. Prevent insns from the function body from
11193 being scheduled into this sequence. */
11194 if (TARGET_SEH)
11196 /* Prevent a catch region from being adjacent to the standard
11197 epilogue sequence.  Unfortunately neither crtl->uses_eh_lsda nor
11198 several other flags that would be interesting to test are
11199 set up yet.  */
11200 if (flag_non_call_exceptions)
11201 emit_insn (gen_nops (const1_rtx));
11202 else
11203 emit_insn (gen_blockage ());
11206 /* The first step is to deallocate the stack frame so that we can
11207 pop the registers.  Also do it on SEH targets for very large
11208 frames, as the emitted instructions aren't allowed by the ABI in
11209 epilogues. */
11210 if (!m->fs.sp_valid
11211 || (TARGET_SEH
11212 && (m->fs.sp_offset - frame.reg_save_offset
11213 >= SEH_MAX_FRAME_SIZE)))
11215 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11216 GEN_INT (m->fs.fp_offset
11217 - frame.reg_save_offset),
11218 style, false);
11220 else if (m->fs.sp_offset != frame.reg_save_offset)
11222 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11223 GEN_INT (m->fs.sp_offset
11224 - frame.reg_save_offset),
11225 style,
11226 m->fs.cfa_reg == stack_pointer_rtx);
11229 ix86_emit_restore_regs_using_pop ();
11232 /* If we used a frame pointer and haven't already got rid of it,
11233 then do so now. */
11234 if (m->fs.fp_valid)
11236 /* If the stack pointer is valid and pointing at the frame
11237 pointer store address, then we only need a pop. */
11238 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11239 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11240 /* Leave results in shorter dependency chains on CPUs that are
11241 able to grok it fast. */
11242 else if (TARGET_USE_LEAVE
11243 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11244 || !cfun->machine->use_fast_prologue_epilogue)
11245 ix86_emit_leave ();
11246 else
11248 pro_epilogue_adjust_stack (stack_pointer_rtx,
11249 hard_frame_pointer_rtx,
11250 const0_rtx, style, !using_drap);
11251 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11255 if (using_drap)
11257 int param_ptr_offset = UNITS_PER_WORD;
11258 rtx insn;
11260 gcc_assert (stack_realign_drap);
11262 if (ix86_static_chain_on_stack)
11263 param_ptr_offset += UNITS_PER_WORD;
11264 if (!call_used_regs[REGNO (crtl->drap_reg)])
11265 param_ptr_offset += UNITS_PER_WORD;
11267 insn = emit_insn (gen_rtx_SET
11268 (VOIDmode, stack_pointer_rtx,
11269 gen_rtx_PLUS (Pmode,
11270 crtl->drap_reg,
11271 GEN_INT (-param_ptr_offset))));
11272 m->fs.cfa_reg = stack_pointer_rtx;
11273 m->fs.cfa_offset = param_ptr_offset;
11274 m->fs.sp_offset = param_ptr_offset;
11275 m->fs.realigned = false;
11277 add_reg_note (insn, REG_CFA_DEF_CFA,
11278 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11279 GEN_INT (param_ptr_offset)));
11280 RTX_FRAME_RELATED_P (insn) = 1;
11282 if (!call_used_regs[REGNO (crtl->drap_reg)])
11283 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11286 /* At this point the stack pointer must be valid, and we must have
11287 restored all of the registers. We may not have deallocated the
11288 entire stack frame. We've delayed this until now because it may
11289 be possible to merge the local stack deallocation with the
11290 deallocation forced by ix86_static_chain_on_stack. */
11291 gcc_assert (m->fs.sp_valid);
11292 gcc_assert (!m->fs.fp_valid);
11293 gcc_assert (!m->fs.realigned);
11294 if (m->fs.sp_offset != UNITS_PER_WORD)
11296 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11297 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11298 style, true);
11300 else
11301 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11303 /* Sibcall epilogues don't want a return instruction. */
11304 if (style == 0)
11306 m->fs = frame_state_save;
11307 return;
11310 if (crtl->args.pops_args && crtl->args.size)
11312 rtx popc = GEN_INT (crtl->args.pops_args);
11314 /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11315 address, do an explicit add, and jump indirectly to the caller. */
11317 if (crtl->args.pops_args >= 65536)
11319 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11320 rtx insn;
11322 /* There is no "pascal" calling convention in any 64bit ABI. */
11323 gcc_assert (!TARGET_64BIT);
11325 insn = emit_insn (gen_pop (ecx));
11326 m->fs.cfa_offset -= UNITS_PER_WORD;
11327 m->fs.sp_offset -= UNITS_PER_WORD;
11329 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11330 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11331 add_reg_note (insn, REG_CFA_REGISTER,
11332 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11333 RTX_FRAME_RELATED_P (insn) = 1;
11335 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11336 popc, -1, true);
11337 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11339 else
11340 emit_jump_insn (gen_simple_return_pop_internal (popc));
11342 else
11343 emit_jump_insn (gen_simple_return_internal ());
11345 /* Restore the state back to the state from the prologue,
11346 so that it's correct for the next epilogue. */
11347 m->fs = frame_state_save;
11350 /* Reset from the function's potential modifications. */
11352 static void
11353 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11354 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11356 if (pic_offset_table_rtx)
11357 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11358 #if TARGET_MACHO
11359 /* Mach-O doesn't support labels at the end of objects, so if
11360 it looks like we might want one, insert a NOP. */
11362 rtx insn = get_last_insn ();
11363 rtx deleted_debug_label = NULL_RTX;
11364 while (insn
11365 && NOTE_P (insn)
11366 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11368 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
11369 a nop; instead set their CODE_LABEL_NUMBER to -1,
11370 otherwise there would be code generation differences
11371 between -g and -g0.  */
11372 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11373 deleted_debug_label = insn;
11374 insn = PREV_INSN (insn);
11376 if (insn
11377 && (LABEL_P (insn)
11378 || (NOTE_P (insn)
11379 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11380 fputs ("\tnop\n", file);
11381 else if (deleted_debug_label)
11382 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11383 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11384 CODE_LABEL_NUMBER (insn) = -1;
11386 #endif
11390 /* Return a scratch register to use in the split stack prologue. The
11391 split stack prologue is used for -fsplit-stack.  It consists of the first
11392 instructions in the function, even before the regular prologue.
11393 The scratch register can be any caller-saved register which is not
11394 used for parameters or for the static chain. */
11396 static unsigned int
11397 split_stack_prologue_scratch_regno (void)
11399 if (TARGET_64BIT)
11400 return R11_REG;
11401 else
11403 bool is_fastcall, is_thiscall;
11404 int regparm;
11406 is_fastcall = (lookup_attribute ("fastcall",
11407 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11408 != NULL);
11409 is_thiscall = (lookup_attribute ("thiscall",
11410 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11411 != NULL);
11412 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11414 if (is_fastcall)
11416 if (DECL_STATIC_CHAIN (cfun->decl))
11418 sorry ("-fsplit-stack does not support fastcall with "
11419 "nested function");
11420 return INVALID_REGNUM;
11422 return AX_REG;
11424 else if (is_thiscall)
11426 if (!DECL_STATIC_CHAIN (cfun->decl))
11427 return DX_REG;
11428 return AX_REG;
11430 else if (regparm < 3)
11432 if (!DECL_STATIC_CHAIN (cfun->decl))
11433 return CX_REG;
11434 else
11436 if (regparm >= 2)
11438 sorry ("-fsplit-stack does not support 2 register "
11439 " parameters for a nested function");
11440 return INVALID_REGNUM;
11442 return DX_REG;
11445 else
11447 /* FIXME: We could make this work by pushing a register
11448 around the addition and comparison. */
11449 sorry ("-fsplit-stack does not support 3 register parameters");
11450 return INVALID_REGNUM;
11455 /* A SYMBOL_REF for the function which allocates new stack space for
11456 -fsplit-stack. */
11458 static GTY(()) rtx split_stack_fn;
11460 /* A SYMBOL_REF for the more stack function when using the large
11461 model. */
11463 static GTY(()) rtx split_stack_fn_large;
11465 /* Handle -fsplit-stack. These are the first instructions in the
11466 function, even before the regular prologue. */
11468 void
11469 ix86_expand_split_stack_prologue (void)
11471 struct ix86_frame frame;
11472 HOST_WIDE_INT allocate;
11473 unsigned HOST_WIDE_INT args_size;
11474 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11475 rtx scratch_reg = NULL_RTX;
11476 rtx varargs_label = NULL_RTX;
11477 rtx fn;
11479 gcc_assert (flag_split_stack && reload_completed);
11481 ix86_finalize_stack_realign_flags ();
11482 ix86_compute_frame_layout (&frame);
11483 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11485 /* This is the label we will branch to if we have enough stack
11486 space. We expect the basic block reordering pass to reverse this
11487 branch if optimizing, so that we branch in the unlikely case. */
11488 label = gen_label_rtx ();
11490 /* We need to compare the stack pointer minus the frame size with
11491 the stack boundary in the TCB. The stack boundary always gives
11492 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11493 can compare directly. Otherwise we need to do an addition. */
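/* An illustrative sketch of the comparison emitted below (assuming a
   64-bit target where the limit lives at some %fs-relative slot; the
   actual segment and offset are target-specific):

	cmp	%fs:<split_stack_guard>, %rsp	# small frame
	jae	.Lhave_enough_stack

   or, for a frame larger than SPLIT_STACK_AVAILABLE, with %r11 as the
   scratch register chosen below:

	lea	-<frame_size>(%rsp), %r11
	cmp	%fs:<split_stack_guard>, %r11
	jae	.Lhave_enough_stack

   <split_stack_guard> and the label name are placeholders.  */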
11495 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11496 UNSPEC_STACK_CHECK);
11497 limit = gen_rtx_CONST (Pmode, limit);
11498 limit = gen_rtx_MEM (Pmode, limit);
11499 if (allocate < SPLIT_STACK_AVAILABLE)
11500 current = stack_pointer_rtx;
11501 else
11503 unsigned int scratch_regno;
11504 rtx offset;
11506 /* We need a scratch register to hold the stack pointer minus
11507 the required frame size. Since this is the very start of the
11508 function, the scratch register can be any caller-saved
11509 register which is not used for parameters. */
11510 offset = GEN_INT (- allocate);
11511 scratch_regno = split_stack_prologue_scratch_regno ();
11512 if (scratch_regno == INVALID_REGNUM)
11513 return;
11514 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11515 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11517 /* We don't use ix86_gen_add3 in this case because it will
11518 want to split to lea, but when not optimizing the insn
11519 will not be split after this point. */
11520 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11521 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11522 offset)));
11524 else
11526 emit_move_insn (scratch_reg, offset);
11527 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11528 stack_pointer_rtx));
11530 current = scratch_reg;
11533 ix86_expand_branch (GEU, current, limit, label);
11534 jump_insn = get_last_insn ();
11535 JUMP_LABEL (jump_insn) = label;
11537 /* Mark the jump as very likely to be taken. */
11538 add_reg_note (jump_insn, REG_BR_PROB,
11539 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11541 if (split_stack_fn == NULL_RTX)
11542 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11543 fn = split_stack_fn;
11545 /* Get more stack space. We pass in the desired stack space and the
11546 size of the arguments to copy to the new stack. In 32-bit mode
11547 we push the parameters; __morestack will return on a new stack
11548 anyhow. In 64-bit mode we pass the parameters in r10 and
11549 r11. */
11550 allocate_rtx = GEN_INT (allocate);
11551 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11552 call_fusage = NULL_RTX;
11553 if (TARGET_64BIT)
11555 rtx reg10, reg11;
11557 reg10 = gen_rtx_REG (Pmode, R10_REG);
11558 reg11 = gen_rtx_REG (Pmode, R11_REG);
11560 /* If this function uses a static chain, it will be in %r10.
11561 Preserve it across the call to __morestack. */
11562 if (DECL_STATIC_CHAIN (cfun->decl))
11564 rtx rax;
11566 rax = gen_rtx_REG (word_mode, AX_REG);
11567 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11568 use_reg (&call_fusage, rax);
11571 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11572 && !TARGET_PECOFF)
11574 HOST_WIDE_INT argval;
11576 gcc_assert (Pmode == DImode);
11577 /* When using the large model we need to load the address
11578 into a register, and we've run out of registers. So we
11579 switch to a different calling convention, and we call a
11580 different function: __morestack_large_model.  We pass the
11581 argument size in the upper 32 bits of r10 and pass the
11582 frame size in the lower 32 bits. */
11583 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11584 gcc_assert ((args_size & 0xffffffff) == args_size);
11586 if (split_stack_fn_large == NULL_RTX)
11587 split_stack_fn_large =
11588 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11590 if (ix86_cmodel == CM_LARGE_PIC)
11592 rtx label, x;
11594 label = gen_label_rtx ();
11595 emit_label (label);
11596 LABEL_PRESERVE_P (label) = 1;
11597 emit_insn (gen_set_rip_rex64 (reg10, label));
11598 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11599 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11600 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11601 UNSPEC_GOT);
11602 x = gen_rtx_CONST (Pmode, x);
11603 emit_move_insn (reg11, x);
11604 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11605 x = gen_const_mem (Pmode, x);
11606 emit_move_insn (reg11, x);
11608 else
11609 emit_move_insn (reg11, split_stack_fn_large);
11611 fn = reg11;
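/* Illustrative example of the packing below: with args_size == 0x20 and
   allocate == 0x1000, r10 is loaded with 0x0000002000001000, i.e. the
   argument size in the upper 32 bits and the frame size in the lower
   32 bits.  */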
11613 argval = ((args_size << 16) << 16) + allocate;
11614 emit_move_insn (reg10, GEN_INT (argval));
11616 else
11618 emit_move_insn (reg10, allocate_rtx);
11619 emit_move_insn (reg11, GEN_INT (args_size));
11620 use_reg (&call_fusage, reg11);
11623 use_reg (&call_fusage, reg10);
11625 else
11627 emit_insn (gen_push (GEN_INT (args_size)));
11628 emit_insn (gen_push (allocate_rtx));
11630 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11631 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11632 NULL_RTX, false);
11633 add_function_usage_to (call_insn, call_fusage);
11635 /* In order to make call/return prediction work right, we now need
11636 to execute a return instruction. See
11637 libgcc/config/i386/morestack.S for the details on how this works.
11639 For flow purposes gcc must not see this as a return
11640 instruction--we need control flow to continue at the subsequent
11641 label. Therefore, we use an unspec. */
11642 gcc_assert (crtl->args.pops_args < 65536);
11643 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11645 /* If we are in 64-bit mode and this function uses a static chain,
11646 we saved %r10 in %rax before calling __morestack. */
11647 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11648 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11649 gen_rtx_REG (word_mode, AX_REG));
11651 /* If this function calls va_start, we need to store a pointer to
11652 the arguments on the old stack, because they may not have been
11653 all copied to the new stack. At this point the old stack can be
11654 found at the frame pointer value used by __morestack, because
11655 __morestack has set that up before calling back to us. Here we
11656 store that pointer in a scratch register, and in
11657 ix86_expand_prologue we store the scratch register in a stack
11658 slot. */
11659 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11661 unsigned int scratch_regno;
11662 rtx frame_reg;
11663 int words;
11665 scratch_regno = split_stack_prologue_scratch_regno ();
11666 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11667 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11669 /* 64-bit:
11670 fp -> old fp value
11671 return address within this function
11672 return address of caller of this function
11673 stack arguments
11674 So we add three words to get to the stack arguments.
11676 32-bit:
11677 fp -> old fp value
11678 return address within this function
11679 first argument to __morestack
11680 second argument to __morestack
11681 return address of caller of this function
11682 stack arguments
11683 So we add five words to get to the stack arguments.
11685 words = TARGET_64BIT ? 3 : 5;
11686 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11687 gen_rtx_PLUS (Pmode, frame_reg,
11688 GEN_INT (words * UNITS_PER_WORD))));
11690 varargs_label = gen_label_rtx ();
11691 emit_jump_insn (gen_jump (varargs_label));
11692 JUMP_LABEL (get_last_insn ()) = varargs_label;
11694 emit_barrier ();
11697 emit_label (label);
11698 LABEL_NUSES (label) = 1;
11700 /* If this function calls va_start, we now have to set the scratch
11701 register for the case where we do not call __morestack. In this
11702 case we need to set it based on the stack pointer. */
11703 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11705 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11706 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11707 GEN_INT (UNITS_PER_WORD))));
11709 emit_label (varargs_label);
11710 LABEL_NUSES (varargs_label) = 1;
11714 /* We may have to tell the dataflow pass that the split stack prologue
11715 is initializing a scratch register. */
11717 static void
11718 ix86_live_on_entry (bitmap regs)
11720 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11722 gcc_assert (flag_split_stack);
11723 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11727 /* Determine if OP is a suitable SUBREG RTX for an address. */
11729 static bool
11730 ix86_address_subreg_operand (rtx op)
11732 enum machine_mode mode;
11734 if (!REG_P (op))
11735 return false;
11737 mode = GET_MODE (op);
11739 if (GET_MODE_CLASS (mode) != MODE_INT)
11740 return false;
11742 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11743 failures when the register is one word out of a two word structure. */
11744 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11745 return false;
11747 /* Allow only SUBREGs of non-eliminable hard registers. */
11748 return register_no_elim_operand (op, mode);
11751 /* Extract the parts of an RTL expression that is a valid memory address
11752 for an instruction. Return 0 if the structure of the address is
11753 grossly off.  Return -1 if the address contains ASHIFT, so it is not
11754 strictly valid, but is still used for computing the length of an lea instruction. */
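/* For example (illustrative only), the address

     (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 12))

   decomposes into base = (reg B), index = (reg A), scale = 4, disp = 12,
   seg = SEG_DEFAULT, and the function returns 1.  */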
11757 ix86_decompose_address (rtx addr, struct ix86_address *out)
11759 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11760 rtx base_reg, index_reg;
11761 HOST_WIDE_INT scale = 1;
11762 rtx scale_rtx = NULL_RTX;
11763 rtx tmp;
11764 int retval = 1;
11765 enum ix86_address_seg seg = SEG_DEFAULT;
11767 /* Allow zero-extended SImode addresses,
11768 they will be emitted with addr32 prefix. */
11769 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11771 if (GET_CODE (addr) == ZERO_EXTEND
11772 && GET_MODE (XEXP (addr, 0)) == SImode)
11774 addr = XEXP (addr, 0);
11775 if (CONST_INT_P (addr))
11776 return 0;
11778 else if (GET_CODE (addr) == AND
11779 && const_32bit_mask (XEXP (addr, 1), DImode))
11781 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11782 if (addr == NULL_RTX)
11783 return 0;
11785 if (CONST_INT_P (addr))
11786 return 0;
11790 /* Allow SImode subregs of DImode addresses,
11791 they will be emitted with addr32 prefix. */
11792 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11794 if (GET_CODE (addr) == SUBREG
11795 && GET_MODE (SUBREG_REG (addr)) == DImode)
11797 addr = SUBREG_REG (addr);
11798 if (CONST_INT_P (addr))
11799 return 0;
11803 if (REG_P (addr))
11804 base = addr;
11805 else if (GET_CODE (addr) == SUBREG)
11807 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11808 base = addr;
11809 else
11810 return 0;
11812 else if (GET_CODE (addr) == PLUS)
11814 rtx addends[4], op;
11815 int n = 0, i;
11817 op = addr;
11820 if (n >= 4)
11821 return 0;
11822 addends[n++] = XEXP (op, 1);
11823 op = XEXP (op, 0);
11825 while (GET_CODE (op) == PLUS);
11826 if (n >= 4)
11827 return 0;
11828 addends[n] = op;
11830 for (i = n; i >= 0; --i)
11832 op = addends[i];
11833 switch (GET_CODE (op))
11835 case MULT:
11836 if (index)
11837 return 0;
11838 index = XEXP (op, 0);
11839 scale_rtx = XEXP (op, 1);
11840 break;
11842 case ASHIFT:
11843 if (index)
11844 return 0;
11845 index = XEXP (op, 0);
11846 tmp = XEXP (op, 1);
11847 if (!CONST_INT_P (tmp))
11848 return 0;
11849 scale = INTVAL (tmp);
11850 if ((unsigned HOST_WIDE_INT) scale > 3)
11851 return 0;
11852 scale = 1 << scale;
11853 break;
11855 case ZERO_EXTEND:
11856 op = XEXP (op, 0);
11857 if (GET_CODE (op) != UNSPEC)
11858 return 0;
11859 /* FALLTHRU */
11861 case UNSPEC:
11862 if (XINT (op, 1) == UNSPEC_TP
11863 && TARGET_TLS_DIRECT_SEG_REFS
11864 && seg == SEG_DEFAULT)
11865 seg = DEFAULT_TLS_SEG_REG;
11866 else
11867 return 0;
11868 break;
11870 case SUBREG:
11871 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11872 return 0;
11873 /* FALLTHRU */
11875 case REG:
11876 if (!base)
11877 base = op;
11878 else if (!index)
11879 index = op;
11880 else
11881 return 0;
11882 break;
11884 case CONST:
11885 case CONST_INT:
11886 case SYMBOL_REF:
11887 case LABEL_REF:
11888 if (disp)
11889 return 0;
11890 disp = op;
11891 break;
11893 default:
11894 return 0;
11898 else if (GET_CODE (addr) == MULT)
11900 index = XEXP (addr, 0); /* index*scale */
11901 scale_rtx = XEXP (addr, 1);
11903 else if (GET_CODE (addr) == ASHIFT)
11905 /* We're called for lea too, which implements ashift on occasion. */
11906 index = XEXP (addr, 0);
11907 tmp = XEXP (addr, 1);
11908 if (!CONST_INT_P (tmp))
11909 return 0;
11910 scale = INTVAL (tmp);
11911 if ((unsigned HOST_WIDE_INT) scale > 3)
11912 return 0;
11913 scale = 1 << scale;
11914 retval = -1;
11916 else if (CONST_INT_P (addr))
11918 if (!x86_64_immediate_operand (addr, VOIDmode))
11919 return 0;
11921 /* Constant addresses are sign extended to 64 bits, so we have to
11922 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
11923 if (TARGET_X32
11924 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11925 return 0;
11927 disp = addr;
11929 else
11930 disp = addr; /* displacement */
11932 if (index)
11934 if (REG_P (index))
11936 else if (GET_CODE (index) == SUBREG
11937 && ix86_address_subreg_operand (SUBREG_REG (index)))
11939 else
11940 return 0;
11943 /* Address override works only on the (%reg) part of %fs:(%reg). */
11944 if (seg != SEG_DEFAULT
11945 && ((base && GET_MODE (base) != word_mode)
11946 || (index && GET_MODE (index) != word_mode)))
11947 return 0;
11949 /* Extract the integral value of scale. */
11950 if (scale_rtx)
11952 if (!CONST_INT_P (scale_rtx))
11953 return 0;
11954 scale = INTVAL (scale_rtx);
11957 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11958 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11960 /* Avoid useless 0 displacement. */
11961 if (disp == const0_rtx && (base || index))
11962 disp = NULL_RTX;
11964 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
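/* (Illustrative: the stack pointer cannot be encoded as an index register,
   so an address such as (%eax,%esp) must instead be emitted as (%esp,%eax);
   the swap below moves these registers into the base position.)  */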
11965 if (base_reg && index_reg && scale == 1
11966 && (index_reg == arg_pointer_rtx
11967 || index_reg == frame_pointer_rtx
11968 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11970 rtx tmp;
11971 tmp = base, base = index, index = tmp;
11972 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11975 /* Special case: %ebp cannot be encoded as a base without a displacement.
11976 Similarly %r13. */
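/* (Illustrative: with ModRM mod=00, the encodings that would name %ebp/%r13
   as the base instead mean "disp32 with no base", so a zero displacement is
   added here to force the disp8/disp32 forms of the address.)  */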
11977 if (!disp
11978 && base_reg
11979 && (base_reg == hard_frame_pointer_rtx
11980 || base_reg == frame_pointer_rtx
11981 || base_reg == arg_pointer_rtx
11982 || (REG_P (base_reg)
11983 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11984 || REGNO (base_reg) == R13_REG))))
11985 disp = const0_rtx;
11987 /* Special case: on the K6, [%esi] forces the instruction to be vector decoded.
11988 Avoid this by transforming it to [%esi+0].
11989 Reload calls address legitimization without cfun defined, so we need
11990 to test cfun for being non-NULL. */
11991 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11992 && base_reg && !index_reg && !disp
11993 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11994 disp = const0_rtx;
11996 /* Special case: encode reg+reg instead of reg*2. */
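/* (Illustrative: with no base, an index scaled by 2 such as (,%eax,2) needs
   the SIB form without a base, which requires a 32-bit displacement;
   encoding it as (%eax,%eax) avoids that and is shorter.)  */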
11997 if (!base && index && scale == 2)
11998 base = index, base_reg = index_reg, scale = 1;
12000 /* Special case: scaling cannot be encoded without base or displacement. */
12001 if (!base && !disp && index && scale != 1)
12002 disp = const0_rtx;
12004 out->base = base;
12005 out->index = index;
12006 out->disp = disp;
12007 out->scale = scale;
12008 out->seg = seg;
12010 return retval;
12013 /* Return cost of the memory address x.
12014 For i386, it is better to use a complex address than let gcc copy
12015 the address into a reg and make a new pseudo.  But not if the address
12016 requires two regs - that would mean more pseudos with longer
12017 lifetimes. */
12018 static int
12019 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12020 addr_space_t as ATTRIBUTE_UNUSED,
12021 bool speed ATTRIBUTE_UNUSED)
12023 struct ix86_address parts;
12024 int cost = 1;
12025 int ok = ix86_decompose_address (x, &parts);
12027 gcc_assert (ok);
12029 if (parts.base && GET_CODE (parts.base) == SUBREG)
12030 parts.base = SUBREG_REG (parts.base);
12031 if (parts.index && GET_CODE (parts.index) == SUBREG)
12032 parts.index = SUBREG_REG (parts.index);
12034 /* Attempt to minimize number of registers in the address. */
12035 if ((parts.base
12036 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12037 || (parts.index
12038 && (!REG_P (parts.index)
12039 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12040 cost++;
12042 if (parts.base
12043 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12044 && parts.index
12045 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12046 && parts.base != parts.index)
12047 cost++;
12049 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12050 since its predecode logic can't detect the length of such instructions
12051 and decoding degenerates to vector decoding.  Increase the cost of such
12052 addresses here.  The penalty is at least 2 cycles.  It may be worthwhile
12053 to split such addresses or even refuse such addresses at all.
12055 The following addressing modes are affected:
12056 [base+scale*index]
12057 [scale*index+disp]
12058 [base+index]
12060 The first and last case may be avoidable by explicitly coding the zero in
12061 the memory address, but I don't have an AMD-K6 machine handy to check this
12062 theory. */
12064 if (TARGET_K6
12065 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12066 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12067 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12068 cost += 10;
12070 return cost;
12073 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12074 this is used to form addresses to local data when -fPIC is in
12075 use. */
12077 static bool
12078 darwin_local_data_pic (rtx disp)
12080 return (GET_CODE (disp) == UNSPEC
12081 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12084 /* Determine if a given RTX is a valid constant. We already know this
12085 satisfies CONSTANT_P. */
12087 static bool
12088 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12090 switch (GET_CODE (x))
12092 case CONST:
12093 x = XEXP (x, 0);
12095 if (GET_CODE (x) == PLUS)
12097 if (!CONST_INT_P (XEXP (x, 1)))
12098 return false;
12099 x = XEXP (x, 0);
12102 if (TARGET_MACHO && darwin_local_data_pic (x))
12103 return true;
12105 /* Only some unspecs are valid as "constants". */
12106 if (GET_CODE (x) == UNSPEC)
12107 switch (XINT (x, 1))
12109 case UNSPEC_GOT:
12110 case UNSPEC_GOTOFF:
12111 case UNSPEC_PLTOFF:
12112 return TARGET_64BIT;
12113 case UNSPEC_TPOFF:
12114 case UNSPEC_NTPOFF:
12115 x = XVECEXP (x, 0, 0);
12116 return (GET_CODE (x) == SYMBOL_REF
12117 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12118 case UNSPEC_DTPOFF:
12119 x = XVECEXP (x, 0, 0);
12120 return (GET_CODE (x) == SYMBOL_REF
12121 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12122 default:
12123 return false;
12126 /* We must have drilled down to a symbol. */
12127 if (GET_CODE (x) == LABEL_REF)
12128 return true;
12129 if (GET_CODE (x) != SYMBOL_REF)
12130 return false;
12131 /* FALLTHRU */
12133 case SYMBOL_REF:
12134 /* TLS symbols are never valid. */
12135 if (SYMBOL_REF_TLS_MODEL (x))
12136 return false;
12138 /* DLLIMPORT symbols are never valid. */
12139 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12140 && SYMBOL_REF_DLLIMPORT_P (x))
12141 return false;
12143 #if TARGET_MACHO
12144 /* mdynamic-no-pic */
12145 if (MACHO_DYNAMIC_NO_PIC_P)
12146 return machopic_symbol_defined_p (x);
12147 #endif
12148 break;
12150 case CONST_DOUBLE:
12151 if (GET_MODE (x) == TImode
12152 && x != CONST0_RTX (TImode)
12153 && !TARGET_64BIT)
12154 return false;
12155 break;
12157 case CONST_VECTOR:
12158 if (!standard_sse_constant_p (x))
12159 return false;
12161 default:
12162 break;
12165 /* Otherwise we handle everything else in the move patterns. */
12166 return true;
12169 /* Determine if it's legal to put X into the constant pool. This
12170 is not possible for the address of thread-local symbols, which
12171 is checked above. */
12173 static bool
12174 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12176 /* We can always put integral constants and vectors in memory. */
12177 switch (GET_CODE (x))
12179 case CONST_INT:
12180 case CONST_DOUBLE:
12181 case CONST_VECTOR:
12182 return false;
12184 default:
12185 break;
12187 return !ix86_legitimate_constant_p (mode, x);
12190 /* Return true if the symbol is marked as dllimport or as a stub variable,
12191 otherwise false. */
12193 static bool
12194 is_imported_p (rtx x)
12196 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12197 || GET_CODE (x) != SYMBOL_REF)
12198 return false;
12200 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12204 /* Nonzero if the constant value X is a legitimate general operand
12205 when generating PIC code. It is given that flag_pic is on and
12206 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12208 bool
12209 legitimate_pic_operand_p (rtx x)
12211 rtx inner;
12213 switch (GET_CODE (x))
12215 case CONST:
12216 inner = XEXP (x, 0);
12217 if (GET_CODE (inner) == PLUS
12218 && CONST_INT_P (XEXP (inner, 1)))
12219 inner = XEXP (inner, 0);
12221 /* Only some unspecs are valid as "constants". */
12222 if (GET_CODE (inner) == UNSPEC)
12223 switch (XINT (inner, 1))
12225 case UNSPEC_GOT:
12226 case UNSPEC_GOTOFF:
12227 case UNSPEC_PLTOFF:
12228 return TARGET_64BIT;
12229 case UNSPEC_TPOFF:
12230 x = XVECEXP (inner, 0, 0);
12231 return (GET_CODE (x) == SYMBOL_REF
12232 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12233 case UNSPEC_MACHOPIC_OFFSET:
12234 return legitimate_pic_address_disp_p (x);
12235 default:
12236 return false;
12238 /* FALLTHRU */
12240 case SYMBOL_REF:
12241 case LABEL_REF:
12242 return legitimate_pic_address_disp_p (x);
12244 default:
12245 return true;
12249 /* Determine if a given CONST RTX is a valid memory displacement
12250 in PIC mode. */
12252 bool
12253 legitimate_pic_address_disp_p (rtx disp)
12255 bool saw_plus;
12257 /* In 64bit mode we can allow direct addresses of symbols and labels
12258 when they are not dynamic symbols. */
12259 if (TARGET_64BIT)
12261 rtx op0 = disp, op1;
12263 switch (GET_CODE (disp))
12265 case LABEL_REF:
12266 return true;
12268 case CONST:
12269 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12270 break;
12271 op0 = XEXP (XEXP (disp, 0), 0);
12272 op1 = XEXP (XEXP (disp, 0), 1);
12273 if (!CONST_INT_P (op1)
12274 || INTVAL (op1) >= 16*1024*1024
12275 || INTVAL (op1) < -16*1024*1024)
12276 break;
12277 if (GET_CODE (op0) == LABEL_REF)
12278 return true;
12279 if (GET_CODE (op0) == CONST
12280 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12281 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12282 return true;
12283 if (GET_CODE (op0) == UNSPEC
12284 && XINT (op0, 1) == UNSPEC_PCREL)
12285 return true;
12286 if (GET_CODE (op0) != SYMBOL_REF)
12287 break;
12288 /* FALLTHRU */
12290 case SYMBOL_REF:
12291 /* TLS references should always be enclosed in UNSPEC.
12292 A dllimported symbol always needs to be resolved. */
12293 if (SYMBOL_REF_TLS_MODEL (op0)
12294 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12295 return false;
12297 if (TARGET_PECOFF)
12299 if (is_imported_p (op0))
12300 return true;
12302 if (SYMBOL_REF_FAR_ADDR_P (op0)
12303 || !SYMBOL_REF_LOCAL_P (op0))
12304 break;
12306 /* Function symbols need to be resolved only for
12307 the large model.
12308 For the small model we don't need to resolve anything
12309 here.  */
12310 if ((ix86_cmodel != CM_LARGE_PIC
12311 && SYMBOL_REF_FUNCTION_P (op0))
12312 || ix86_cmodel == CM_SMALL_PIC)
12313 return true;
12314 /* Non-external symbols don't need to be resolved for
12315 the large and medium models.  */
12316 if ((ix86_cmodel == CM_LARGE_PIC
12317 || ix86_cmodel == CM_MEDIUM_PIC)
12318 && !SYMBOL_REF_EXTERNAL_P (op0))
12319 return true;
12321 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12322 && SYMBOL_REF_LOCAL_P (op0)
12323 && ix86_cmodel != CM_LARGE_PIC)
12324 return true;
12325 break;
12327 default:
12328 break;
12331 if (GET_CODE (disp) != CONST)
12332 return false;
12333 disp = XEXP (disp, 0);
12335 if (TARGET_64BIT)
12337 /* It is unsafe to allow PLUS expressions here, since that would limit the
12338 allowed distance of GOT tables.  We should not need these anyway. */
12339 if (GET_CODE (disp) != UNSPEC
12340 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12341 && XINT (disp, 1) != UNSPEC_GOTOFF
12342 && XINT (disp, 1) != UNSPEC_PCREL
12343 && XINT (disp, 1) != UNSPEC_PLTOFF))
12344 return false;
12346 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12347 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12348 return false;
12349 return true;
12352 saw_plus = false;
12353 if (GET_CODE (disp) == PLUS)
12355 if (!CONST_INT_P (XEXP (disp, 1)))
12356 return false;
12357 disp = XEXP (disp, 0);
12358 saw_plus = true;
12361 if (TARGET_MACHO && darwin_local_data_pic (disp))
12362 return true;
12364 if (GET_CODE (disp) != UNSPEC)
12365 return false;
12367 switch (XINT (disp, 1))
12369 case UNSPEC_GOT:
12370 if (saw_plus)
12371 return false;
12372 /* We need to check for both symbols and labels because VxWorks loads
12373 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12374 details. */
12375 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12376 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12377 case UNSPEC_GOTOFF:
12378 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12379 While the ABI also specifies a 32bit relocation, we don't produce it in
12380 the small PIC model at all. */
12381 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12382 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12383 && !TARGET_64BIT)
12384 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12385 return false;
12386 case UNSPEC_GOTTPOFF:
12387 case UNSPEC_GOTNTPOFF:
12388 case UNSPEC_INDNTPOFF:
12389 if (saw_plus)
12390 return false;
12391 disp = XVECEXP (disp, 0, 0);
12392 return (GET_CODE (disp) == SYMBOL_REF
12393 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12394 case UNSPEC_NTPOFF:
12395 disp = XVECEXP (disp, 0, 0);
12396 return (GET_CODE (disp) == SYMBOL_REF
12397 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12398 case UNSPEC_DTPOFF:
12399 disp = XVECEXP (disp, 0, 0);
12400 return (GET_CODE (disp) == SYMBOL_REF
12401 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12404 return false;
12407 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12408 replace the input X, or the original X if no replacement is called for.
12409 The output parameter *WIN is 1 if the calling macro should goto WIN,
12410 0 if it should not. */
12412 bool
12413 ix86_legitimize_reload_address (rtx x,
12414 enum machine_mode mode ATTRIBUTE_UNUSED,
12415 int opnum, int type,
12416 int ind_levels ATTRIBUTE_UNUSED)
12418 /* Reload can generate:
12420 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12421 (reg:DI 97))
12422 (reg:DI 2 cx))
12424 This RTX is rejected from ix86_legitimate_address_p due to
12425 non-strictness of base register 97. Following this rejection,
12426 reload pushes all three components into separate registers,
12427 creating invalid memory address RTX.
12429 Following code reloads only the invalid part of the
12430 memory address RTX. */
12432 if (GET_CODE (x) == PLUS
12433 && REG_P (XEXP (x, 1))
12434 && GET_CODE (XEXP (x, 0)) == PLUS
12435 && REG_P (XEXP (XEXP (x, 0), 1)))
12437 rtx base, index;
12438 bool something_reloaded = false;
12440 base = XEXP (XEXP (x, 0), 1);
12441 if (!REG_OK_FOR_BASE_STRICT_P (base))
12443 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12444 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12445 opnum, (enum reload_type) type);
12446 something_reloaded = true;
12449 index = XEXP (x, 1);
12450 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12452 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12453 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12454 opnum, (enum reload_type) type);
12455 something_reloaded = true;
12458 gcc_assert (something_reloaded);
12459 return true;
12462 return false;
12465 /* Recognizes RTL expressions that are valid memory addresses for an
12466 instruction. The MODE argument is the machine mode for the MEM
12467 expression that wants to use this address.
12469 It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
12470 convert common non-canonical forms to canonical form so that they will
12471 be recognized. */
12473 static bool
12474 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12475 rtx addr, bool strict)
12477 struct ix86_address parts;
12478 rtx base, index, disp;
12479 HOST_WIDE_INT scale;
12481 if (ix86_decompose_address (addr, &parts) <= 0)
12482 /* Decomposition failed. */
12483 return false;
12485 base = parts.base;
12486 index = parts.index;
12487 disp = parts.disp;
12488 scale = parts.scale;
12490 /* Validate base register. */
12491 if (base)
12493 rtx reg;
12495 if (REG_P (base))
12496 reg = base;
12497 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12498 reg = SUBREG_REG (base);
12499 else
12500 /* Base is not a register. */
12501 return false;
12503 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12504 return false;
12506 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12507 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12508 /* Base is not valid. */
12509 return false;
12512 /* Validate index register. */
12513 if (index)
12515 rtx reg;
12517 if (REG_P (index))
12518 reg = index;
12519 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12520 reg = SUBREG_REG (index);
12521 else
12522 /* Index is not a register. */
12523 return false;
12525 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12526 return false;
12528 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12529 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12530 /* Index is not valid. */
12531 return false;
12534 /* Index and base should have the same mode. */
12535 if (base && index
12536 && GET_MODE (base) != GET_MODE (index))
12537 return false;
12539 /* Validate scale factor. */
12540 if (scale != 1)
12542 if (!index)
12543 /* Scale without index. */
12544 return false;
12546 if (scale != 2 && scale != 4 && scale != 8)
12547 /* Scale is not a valid multiplier. */
12548 return false;
12551 /* Validate displacement. */
12552 if (disp)
12554 if (GET_CODE (disp) == CONST
12555 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12556 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12557 switch (XINT (XEXP (disp, 0), 1))
12559 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12560 used.  While the ABI also specifies 32bit relocations, we don't produce
12561 them at all and use IP-relative addressing instead. */
12562 case UNSPEC_GOT:
12563 case UNSPEC_GOTOFF:
12564 gcc_assert (flag_pic);
12565 if (!TARGET_64BIT)
12566 goto is_legitimate_pic;
12568 /* 64bit address unspec. */
12569 return false;
12571 case UNSPEC_GOTPCREL:
12572 case UNSPEC_PCREL:
12573 gcc_assert (flag_pic);
12574 goto is_legitimate_pic;
12576 case UNSPEC_GOTTPOFF:
12577 case UNSPEC_GOTNTPOFF:
12578 case UNSPEC_INDNTPOFF:
12579 case UNSPEC_NTPOFF:
12580 case UNSPEC_DTPOFF:
12581 break;
12583 case UNSPEC_STACK_CHECK:
12584 gcc_assert (flag_split_stack);
12585 break;
12587 default:
12588 /* Invalid address unspec. */
12589 return false;
12592 else if (SYMBOLIC_CONST (disp)
12593 && (flag_pic
12594 || (TARGET_MACHO
12595 #if TARGET_MACHO
12596 && MACHOPIC_INDIRECT
12597 && !machopic_operand_p (disp)
12598 #endif
12602 is_legitimate_pic:
12603 if (TARGET_64BIT && (index || base))
12605 /* foo@dtpoff(%rX) is ok. */
12606 if (GET_CODE (disp) != CONST
12607 || GET_CODE (XEXP (disp, 0)) != PLUS
12608 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12609 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12610 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12611 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12612 /* Non-constant pic memory reference. */
12613 return false;
12615 else if ((!TARGET_MACHO || flag_pic)
12616 && ! legitimate_pic_address_disp_p (disp))
12617 /* Displacement is an invalid pic construct. */
12618 return false;
12619 #if TARGET_MACHO
12620 else if (MACHO_DYNAMIC_NO_PIC_P
12621 && !ix86_legitimate_constant_p (Pmode, disp))
12622 /* Displacement must be referenced via non_lazy_pointer. */
12623 return false;
12624 #endif
12626 /* This code used to verify that a symbolic pic displacement
12627 includes the pic_offset_table_rtx register.
12629 While this is a good idea, unfortunately these constructs may
12630 be created by the "adds using lea" optimization for incorrect
12631 code like:
12633 int a;
12634 int foo(int i)
12636 return *(&a+i);
12639 This code is nonsensical, but it results in addressing the
12640 GOT table with a pic_offset_table_rtx base. We can't
12641 just refuse it easily, since it gets matched by the
12642 "addsi3" pattern, which later gets split to lea when the
12643 output register differs from the input. While this
12644 could be handled by a separate addsi pattern for this case
12645 that never results in lea, disabling this test seems to be
12646 the easier and correct fix for the crash. */
12648 else if (GET_CODE (disp) != LABEL_REF
12649 && !CONST_INT_P (disp)
12650 && (GET_CODE (disp) != CONST
12651 || !ix86_legitimate_constant_p (Pmode, disp))
12652 && (GET_CODE (disp) != SYMBOL_REF
12653 || !ix86_legitimate_constant_p (Pmode, disp)))
12654 /* Displacement is not constant. */
12655 return false;
12656 else if (TARGET_64BIT
12657 && !x86_64_immediate_operand (disp, VOIDmode))
12658 /* Displacement is out of range. */
12659 return false;
12662 /* Everything looks valid. */
12663 return true;
12666 /* Determine if a given RTX is a valid constant address. */
12668 bool
12669 constant_address_p (rtx x)
12671 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12674 /* Return a unique alias set for the GOT. */
12676 static alias_set_type
12677 ix86_GOT_alias_set (void)
12679 static alias_set_type set = -1;
12680 if (set == -1)
12681 set = new_alias_set ();
12682 return set;
12685 /* Return a legitimate reference for ORIG (an address) using the
12686 register REG. If REG is 0, a new pseudo is generated.
12688 There are two types of references that must be handled:
12690 1. Global data references must load the address from the GOT, via
12691 the PIC reg. An insn is emitted to do this load, and the reg is
12692 returned.
12694 2. Static data references, constant pool addresses, and code labels
12695 compute the address as an offset from the GOT, whose base is in
12696 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12697 differentiate them from global data objects. The returned
12698 address is the PIC reg + an unspec constant.
12700 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12701 reg also appears in the address. */
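/* Illustrative example, not from the original sources.  With -fpic on
   ia32, assuming %ebx holds pic_offset_table_rtx, case 1 (global data)
   loads the address from the GOT:

       movl foo@GOT(%ebx), %reg

   while case 2 (static data, constant pool, code labels) computes it
   as PIC reg + unspec constant:

       leal foo@GOTOFF(%ebx), %reg  */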
12703 static rtx
12704 legitimize_pic_address (rtx orig, rtx reg)
12706 rtx addr = orig;
12707 rtx new_rtx = orig;
12709 #if TARGET_MACHO
12710 if (TARGET_MACHO && !TARGET_64BIT)
12712 if (reg == 0)
12713 reg = gen_reg_rtx (Pmode);
12714 /* Use the generic Mach-O PIC machinery. */
12715 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12717 #endif
12719 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12721 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12722 if (tmp)
12723 return tmp;
12726 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12727 new_rtx = addr;
12728 else if (TARGET_64BIT && !TARGET_PECOFF
12729 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12731 rtx tmpreg;
12732 /* This symbol may be referenced via a displacement from the PIC
12733 base address (@GOTOFF). */
12735 if (reload_in_progress)
12736 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12737 if (GET_CODE (addr) == CONST)
12738 addr = XEXP (addr, 0);
12739 if (GET_CODE (addr) == PLUS)
12741 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12742 UNSPEC_GOTOFF);
12743 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12745 else
12746 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12747 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12748 if (!reg)
12749 tmpreg = gen_reg_rtx (Pmode);
12750 else
12751 tmpreg = reg;
12752 emit_move_insn (tmpreg, new_rtx);
12754 if (reg != 0)
12756 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12757 tmpreg, 1, OPTAB_DIRECT);
12758 new_rtx = reg;
12760 else
12761 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12763 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12765 /* This symbol may be referenced via a displacement from the PIC
12766 base address (@GOTOFF). */
12768 if (reload_in_progress)
12769 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12770 if (GET_CODE (addr) == CONST)
12771 addr = XEXP (addr, 0);
12772 if (GET_CODE (addr) == PLUS)
12774 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12775 UNSPEC_GOTOFF);
12776 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12778 else
12779 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12780 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12781 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12783 if (reg != 0)
12785 emit_move_insn (reg, new_rtx);
12786 new_rtx = reg;
12789 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12790 /* We can't use @GOTOFF for text labels on VxWorks;
12791 see gotoff_operand. */
12792 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12794 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12795 if (tmp)
12796 return tmp;
12798 /* For x64 PE-COFF there is no GOT table, so we use the address
12799 directly. */
12800 if (TARGET_64BIT && TARGET_PECOFF)
12802 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12803 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12805 if (reg == 0)
12806 reg = gen_reg_rtx (Pmode);
12807 emit_move_insn (reg, new_rtx);
12808 new_rtx = reg;
12810 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12812 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12813 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12814 new_rtx = gen_const_mem (Pmode, new_rtx);
12815 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12817 if (reg == 0)
12818 reg = gen_reg_rtx (Pmode);
12819 /* Use gen_movsi directly, otherwise the address is loaded
12820 into a register for CSE. We don't want to CSE these addresses;
12821 instead we CSE addresses from the GOT table, so skip this. */
12822 emit_insn (gen_movsi (reg, new_rtx));
12823 new_rtx = reg;
12825 else
12827 /* This symbol must be referenced via a load from the
12828 Global Offset Table (@GOT). */
12830 if (reload_in_progress)
12831 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12832 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12833 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12834 if (TARGET_64BIT)
12835 new_rtx = force_reg (Pmode, new_rtx);
12836 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12837 new_rtx = gen_const_mem (Pmode, new_rtx);
12838 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12840 if (reg == 0)
12841 reg = gen_reg_rtx (Pmode);
12842 emit_move_insn (reg, new_rtx);
12843 new_rtx = reg;
12846 else
12848 if (CONST_INT_P (addr)
12849 && !x86_64_immediate_operand (addr, VOIDmode))
12851 if (reg)
12853 emit_move_insn (reg, addr);
12854 new_rtx = reg;
12856 else
12857 new_rtx = force_reg (Pmode, addr);
12859 else if (GET_CODE (addr) == CONST)
12861 addr = XEXP (addr, 0);
12863 /* We must match stuff we generate before. Assume the only
12864 unspecs that can get here are ours. Not that we could do
12865 anything with them anyway.... */
12866 if (GET_CODE (addr) == UNSPEC
12867 || (GET_CODE (addr) == PLUS
12868 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12869 return orig;
12870 gcc_assert (GET_CODE (addr) == PLUS);
12872 if (GET_CODE (addr) == PLUS)
12874 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12876 /* Check first to see if this is a constant offset from a @GOTOFF
12877 symbol reference. */
12878 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12879 && CONST_INT_P (op1))
12881 if (!TARGET_64BIT)
12883 if (reload_in_progress)
12884 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12885 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12886 UNSPEC_GOTOFF);
12887 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12888 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12889 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12891 if (reg != 0)
12893 emit_move_insn (reg, new_rtx);
12894 new_rtx = reg;
12897 else
12899 if (INTVAL (op1) < -16*1024*1024
12900 || INTVAL (op1) >= 16*1024*1024)
12902 if (!x86_64_immediate_operand (op1, Pmode))
12903 op1 = force_reg (Pmode, op1);
12904 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12908 else
12910 rtx base = legitimize_pic_address (op0, reg);
12911 enum machine_mode mode = GET_MODE (base);
12912 new_rtx
12913 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12915 if (CONST_INT_P (new_rtx))
12917 if (INTVAL (new_rtx) < -16*1024*1024
12918 || INTVAL (new_rtx) >= 16*1024*1024)
12920 if (!x86_64_immediate_operand (new_rtx, mode))
12921 new_rtx = force_reg (mode, new_rtx);
12922 new_rtx
12923 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12925 else
12926 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12928 else
12930 if (GET_CODE (new_rtx) == PLUS
12931 && CONSTANT_P (XEXP (new_rtx, 1)))
12933 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12934 new_rtx = XEXP (new_rtx, 1);
12936 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12941 return new_rtx;
12944 /* Load the thread pointer. If TO_REG is true, force it into a register. */
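/* Illustrative note, not from the original sources: the UNSPEC_TP built
   below stands for the TLS thread pointer, which on GNU/Linux targets is
   the %gs segment base on ia32 and the %fs segment base on x86-64;
   forcing it into a register corresponds to something like
   "movl %gs:0, %reg".  */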
12946 static rtx
12947 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12949 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12951 if (GET_MODE (tp) != tp_mode)
12953 gcc_assert (GET_MODE (tp) == SImode);
12954 gcc_assert (tp_mode == DImode);
12956 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12959 if (to_reg)
12960 tp = copy_to_mode_reg (tp_mode, tp);
12962 return tp;
12965 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12967 static GTY(()) rtx ix86_tls_symbol;
12969 static rtx
12970 ix86_tls_get_addr (void)
12972 if (!ix86_tls_symbol)
12974 const char *sym
12975 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12976 ? "___tls_get_addr" : "__tls_get_addr");
12978 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12981 return ix86_tls_symbol;
12984 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12986 static GTY(()) rtx ix86_tls_module_base_symbol;
12989 ix86_tls_module_base (void)
12991 if (!ix86_tls_module_base_symbol)
12993 ix86_tls_module_base_symbol
12994 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12996 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12997 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13000 return ix86_tls_module_base_symbol;
13003 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13004 false if we expect this to be used for a memory address and true if
13005 we expect to load the address into a register. */
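/* Rough sketch, not from the original sources, of the sequences the
   models handled below expand to with GNU TLS on x86-64 (details vary
   with TARGET_GNU2_TLS and TARGET_TLS_DIRECT_SEG_REFS):

     global dynamic:  leaq x@tlsgd(%rip), %rdi; call __tls_get_addr
     local dynamic:   leaq x@tlsld(%rip), %rdi; call __tls_get_addr,
                      then add x@dtpoff to the returned base
     initial exec:    movq x@gottpoff(%rip), %reg, then %fs:(%reg)
     local exec:      %fs:x@tpoff  */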
13007 static rtx
13008 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13010 rtx dest, base, off;
13011 rtx pic = NULL_RTX, tp = NULL_RTX;
13012 enum machine_mode tp_mode = Pmode;
13013 int type;
13015 switch (model)
13017 case TLS_MODEL_GLOBAL_DYNAMIC:
13018 dest = gen_reg_rtx (Pmode);
13020 if (!TARGET_64BIT)
13022 if (flag_pic && !TARGET_PECOFF)
13023 pic = pic_offset_table_rtx;
13024 else
13026 pic = gen_reg_rtx (Pmode);
13027 emit_insn (gen_set_got (pic));
13031 if (TARGET_GNU2_TLS)
13033 if (TARGET_64BIT)
13034 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13035 else
13036 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13038 tp = get_thread_pointer (Pmode, true);
13039 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13041 if (GET_MODE (x) != Pmode)
13042 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13044 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13046 else
13048 rtx caddr = ix86_tls_get_addr ();
13050 if (TARGET_64BIT)
13052 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13053 rtx insns;
13055 start_sequence ();
13056 emit_call_insn
13057 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13058 insns = get_insns ();
13059 end_sequence ();
13061 if (GET_MODE (x) != Pmode)
13062 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13064 RTL_CONST_CALL_P (insns) = 1;
13065 emit_libcall_block (insns, dest, rax, x);
13067 else
13068 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13070 break;
13072 case TLS_MODEL_LOCAL_DYNAMIC:
13073 base = gen_reg_rtx (Pmode);
13075 if (!TARGET_64BIT)
13077 if (flag_pic)
13078 pic = pic_offset_table_rtx;
13079 else
13081 pic = gen_reg_rtx (Pmode);
13082 emit_insn (gen_set_got (pic));
13086 if (TARGET_GNU2_TLS)
13088 rtx tmp = ix86_tls_module_base ();
13090 if (TARGET_64BIT)
13091 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13092 else
13093 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13095 tp = get_thread_pointer (Pmode, true);
13096 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13097 gen_rtx_MINUS (Pmode, tmp, tp));
13099 else
13101 rtx caddr = ix86_tls_get_addr ();
13103 if (TARGET_64BIT)
13105 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13106 rtx insns, eqv;
13108 start_sequence ();
13109 emit_call_insn
13110 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13111 insns = get_insns ();
13112 end_sequence ();
13114 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13115 share the LD_BASE result with other LD model accesses. */
13116 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13117 UNSPEC_TLS_LD_BASE);
13119 RTL_CONST_CALL_P (insns) = 1;
13120 emit_libcall_block (insns, base, rax, eqv);
13122 else
13123 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13126 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13127 off = gen_rtx_CONST (Pmode, off);
13129 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13131 if (TARGET_GNU2_TLS)
13133 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13135 if (GET_MODE (x) != Pmode)
13136 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13138 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13140 break;
13142 case TLS_MODEL_INITIAL_EXEC:
13143 if (TARGET_64BIT)
13145 if (TARGET_SUN_TLS && !TARGET_X32)
13147 /* The Sun linker took the AMD64 TLS spec literally
13148 and can only handle %rax as destination of the
13149 initial executable code sequence. */
13151 dest = gen_reg_rtx (DImode);
13152 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13153 return dest;
13156 /* Generate DImode references to avoid %fs:(%reg32)
13157 problems and a linker IE->LE relaxation bug. */
13158 tp_mode = DImode;
13159 pic = NULL;
13160 type = UNSPEC_GOTNTPOFF;
13162 else if (flag_pic)
13164 if (reload_in_progress)
13165 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13166 pic = pic_offset_table_rtx;
13167 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13169 else if (!TARGET_ANY_GNU_TLS)
13171 pic = gen_reg_rtx (Pmode);
13172 emit_insn (gen_set_got (pic));
13173 type = UNSPEC_GOTTPOFF;
13175 else
13177 pic = NULL;
13178 type = UNSPEC_INDNTPOFF;
13181 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13182 off = gen_rtx_CONST (tp_mode, off);
13183 if (pic)
13184 off = gen_rtx_PLUS (tp_mode, pic, off);
13185 off = gen_const_mem (tp_mode, off);
13186 set_mem_alias_set (off, ix86_GOT_alias_set ());
13188 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13190 base = get_thread_pointer (tp_mode,
13191 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13192 off = force_reg (tp_mode, off);
13193 return gen_rtx_PLUS (tp_mode, base, off);
13195 else
13197 base = get_thread_pointer (Pmode, true);
13198 dest = gen_reg_rtx (Pmode);
13199 emit_insn (ix86_gen_sub3 (dest, base, off));
13201 break;
13203 case TLS_MODEL_LOCAL_EXEC:
13204 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13205 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13206 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13207 off = gen_rtx_CONST (Pmode, off);
13209 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13211 base = get_thread_pointer (Pmode,
13212 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13213 return gen_rtx_PLUS (Pmode, base, off);
13215 else
13217 base = get_thread_pointer (Pmode, true);
13218 dest = gen_reg_rtx (Pmode);
13219 emit_insn (ix86_gen_sub3 (dest, base, off));
13221 break;
13223 default:
13224 gcc_unreachable ();
13227 return dest;
13230 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13231 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13232 unique refptr-DECL symbol corresponding to symbol DECL. */
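/* Illustrative note, not from the original sources: for a dllimported
   symbol "foo" the import library provides a pointer-sized variable
   "__imp_foo" (or "__imp__foo", depending on the user label prefix)
   that holds foo's address, so references to foo become loads through
   that pointer; the refptr-DECL case plays the same role for the
   medium/large code models.  */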
13234 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13235 htab_t dllimport_map;
13237 static tree
13238 get_dllimport_decl (tree decl, bool beimport)
13240 struct tree_map *h, in;
13241 void **loc;
13242 const char *name;
13243 const char *prefix;
13244 size_t namelen, prefixlen;
13245 char *imp_name;
13246 tree to;
13247 rtx rtl;
13249 if (!dllimport_map)
13250 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13252 in.hash = htab_hash_pointer (decl);
13253 in.base.from = decl;
13254 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13255 h = (struct tree_map *) *loc;
13256 if (h)
13257 return h->to;
13259 *loc = h = ggc_alloc_tree_map ();
13260 h->hash = in.hash;
13261 h->base.from = decl;
13262 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13263 VAR_DECL, NULL, ptr_type_node);
13264 DECL_ARTIFICIAL (to) = 1;
13265 DECL_IGNORED_P (to) = 1;
13266 DECL_EXTERNAL (to) = 1;
13267 TREE_READONLY (to) = 1;
13269 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13270 name = targetm.strip_name_encoding (name);
13271 if (beimport)
13272 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13273 ? "*__imp_" : "*__imp__";
13274 else
13275 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13276 namelen = strlen (name);
13277 prefixlen = strlen (prefix);
13278 imp_name = (char *) alloca (namelen + prefixlen + 1);
13279 memcpy (imp_name, prefix, prefixlen);
13280 memcpy (imp_name + prefixlen, name, namelen + 1);
13282 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13283 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13284 SET_SYMBOL_REF_DECL (rtl, to);
13285 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13286 if (!beimport)
13288 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13289 #ifdef SUB_TARGET_RECORD_STUB
13290 SUB_TARGET_RECORD_STUB (name);
13291 #endif
13294 rtl = gen_const_mem (Pmode, rtl);
13295 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13297 SET_DECL_RTL (to, rtl);
13298 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13300 return to;
13303 /* Expand SYMBOL into its corresponding far-address symbol.
13304 WANT_REG is true if we require the result be a register. */
13306 static rtx
13307 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13309 tree imp_decl;
13310 rtx x;
13312 gcc_assert (SYMBOL_REF_DECL (symbol));
13313 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13315 x = DECL_RTL (imp_decl);
13316 if (want_reg)
13317 x = force_reg (Pmode, x);
13318 return x;
13321 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13322 true if we require the result be a register. */
13324 static rtx
13325 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13327 tree imp_decl;
13328 rtx x;
13330 gcc_assert (SYMBOL_REF_DECL (symbol));
13331 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13333 x = DECL_RTL (imp_decl);
13334 if (want_reg)
13335 x = force_reg (Pmode, x);
13336 return x;
13339 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13340 is true if we require the result be a register. */
13342 static rtx
13343 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13345 if (!TARGET_PECOFF)
13346 return NULL_RTX;
13348 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13350 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13351 return legitimize_dllimport_symbol (addr, inreg);
13352 if (GET_CODE (addr) == CONST
13353 && GET_CODE (XEXP (addr, 0)) == PLUS
13354 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13355 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13357 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13358 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13362 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13363 return NULL_RTX;
13364 if (GET_CODE (addr) == SYMBOL_REF
13365 && !is_imported_p (addr)
13366 && SYMBOL_REF_EXTERNAL_P (addr)
13367 && SYMBOL_REF_DECL (addr))
13368 return legitimize_pe_coff_extern_decl (addr, inreg);
13370 if (GET_CODE (addr) == CONST
13371 && GET_CODE (XEXP (addr, 0)) == PLUS
13372 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13373 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13374 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13375 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13377 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13378 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13380 return NULL_RTX;
13383 /* Try machine-dependent ways of modifying an illegitimate address
13384 to be legitimate. If we find one, return the new, valid address.
13385 This macro is used in only one place: `memory_address' in explow.c.
13387 OLDX is the address as it was before break_out_memory_refs was called.
13388 In some cases it is useful to look at this to decide what needs to be done.
13390 It is always safe for this macro to do nothing. It exists to recognize
13391 opportunities to optimize the output.
13393 For the 80386, we handle X+REG by loading X into a register R and
13394 using R+REG. R will go in a general reg and indexing will be used.
13395 However, if REG is a broken-out memory address or multiplication,
13396 nothing needs to be done because REG can certainly go in a general reg.
13398 When -fpic is used, special handling is needed for symbolic references.
13399 See comments by legitimize_pic_address in i386.c for details. */
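/* Illustrative example, not from the original sources: an address such
   as

       (plus (mult (reg) (const_int 4)) (plus (reg) (const_int 8)))

   is re-associated below into the canonical

       (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8))

   i.e. index*scale + base + disp, which ix86_legitimate_address_p
   accepts.  */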
13401 static rtx
13402 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13403 enum machine_mode mode)
13405 int changed = 0;
13406 unsigned log;
13408 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13409 if (log)
13410 return legitimize_tls_address (x, (enum tls_model) log, false);
13411 if (GET_CODE (x) == CONST
13412 && GET_CODE (XEXP (x, 0)) == PLUS
13413 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13414 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13416 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13417 (enum tls_model) log, false);
13418 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13421 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13423 rtx tmp = legitimize_pe_coff_symbol (x, true);
13424 if (tmp)
13425 return tmp;
13428 if (flag_pic && SYMBOLIC_CONST (x))
13429 return legitimize_pic_address (x, 0);
13431 #if TARGET_MACHO
13432 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13433 return machopic_indirect_data_reference (x, 0);
13434 #endif
13436 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13437 if (GET_CODE (x) == ASHIFT
13438 && CONST_INT_P (XEXP (x, 1))
13439 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13441 changed = 1;
13442 log = INTVAL (XEXP (x, 1));
13443 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13444 GEN_INT (1 << log));
13447 if (GET_CODE (x) == PLUS)
13449 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13451 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13452 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13453 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13455 changed = 1;
13456 log = INTVAL (XEXP (XEXP (x, 0), 1));
13457 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13458 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13459 GEN_INT (1 << log));
13462 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13463 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13464 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13466 changed = 1;
13467 log = INTVAL (XEXP (XEXP (x, 1), 1));
13468 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13469 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13470 GEN_INT (1 << log));
13473 /* Put multiply first if it isn't already. */
13474 if (GET_CODE (XEXP (x, 1)) == MULT)
13476 rtx tmp = XEXP (x, 0);
13477 XEXP (x, 0) = XEXP (x, 1);
13478 XEXP (x, 1) = tmp;
13479 changed = 1;
13482 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13483 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13484 created by virtual register instantiation, register elimination, and
13485 similar optimizations. */
13486 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13488 changed = 1;
13489 x = gen_rtx_PLUS (Pmode,
13490 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13491 XEXP (XEXP (x, 1), 0)),
13492 XEXP (XEXP (x, 1), 1));
13495 /* Canonicalize
13496 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13497 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13498 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13499 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13500 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13501 && CONSTANT_P (XEXP (x, 1)))
13503 rtx constant;
13504 rtx other = NULL_RTX;
13506 if (CONST_INT_P (XEXP (x, 1)))
13508 constant = XEXP (x, 1);
13509 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13511 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13513 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13514 other = XEXP (x, 1);
13516 else
13517 constant = 0;
13519 if (constant)
13521 changed = 1;
13522 x = gen_rtx_PLUS (Pmode,
13523 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13524 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13525 plus_constant (Pmode, other,
13526 INTVAL (constant)));
13530 if (changed && ix86_legitimate_address_p (mode, x, false))
13531 return x;
13533 if (GET_CODE (XEXP (x, 0)) == MULT)
13535 changed = 1;
13536 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13539 if (GET_CODE (XEXP (x, 1)) == MULT)
13541 changed = 1;
13542 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13545 if (changed
13546 && REG_P (XEXP (x, 1))
13547 && REG_P (XEXP (x, 0)))
13548 return x;
13550 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13552 changed = 1;
13553 x = legitimize_pic_address (x, 0);
13556 if (changed && ix86_legitimate_address_p (mode, x, false))
13557 return x;
13559 if (REG_P (XEXP (x, 0)))
13561 rtx temp = gen_reg_rtx (Pmode);
13562 rtx val = force_operand (XEXP (x, 1), temp);
13563 if (val != temp)
13565 val = convert_to_mode (Pmode, val, 1);
13566 emit_move_insn (temp, val);
13569 XEXP (x, 1) = temp;
13570 return x;
13573 else if (REG_P (XEXP (x, 1)))
13575 rtx temp = gen_reg_rtx (Pmode);
13576 rtx val = force_operand (XEXP (x, 0), temp);
13577 if (val != temp)
13579 val = convert_to_mode (Pmode, val, 1);
13580 emit_move_insn (temp, val);
13583 XEXP (x, 0) = temp;
13584 return x;
13588 return x;
13591 /* Print an integer constant expression in assembler syntax. Addition
13592 and subtraction are the only arithmetic that may appear in these
13593 expressions. FILE is the stdio stream to write to, X is the rtx, and
13594 CODE is the operand print code from the output string. */
13596 static void
13597 output_pic_addr_const (FILE *file, rtx x, int code)
13599 char buf[256];
13601 switch (GET_CODE (x))
13603 case PC:
13604 gcc_assert (flag_pic);
13605 putc ('.', file);
13606 break;
13608 case SYMBOL_REF:
13609 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13610 output_addr_const (file, x);
13611 else
13613 const char *name = XSTR (x, 0);
13615 /* Mark the decl as referenced so that cgraph will
13616 output the function. */
13617 if (SYMBOL_REF_DECL (x))
13618 mark_decl_referenced (SYMBOL_REF_DECL (x));
13620 #if TARGET_MACHO
13621 if (MACHOPIC_INDIRECT
13622 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13623 name = machopic_indirection_name (x, /*stub_p=*/true);
13624 #endif
13625 assemble_name (file, name);
13627 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13628 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13629 fputs ("@PLT", file);
13630 break;
13632 case LABEL_REF:
13633 x = XEXP (x, 0);
13634 /* FALLTHRU */
13635 case CODE_LABEL:
13636 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13637 assemble_name (asm_out_file, buf);
13638 break;
13640 case CONST_INT:
13641 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13642 break;
13644 case CONST:
13645 /* This used to output parentheses around the expression,
13646 but that does not work on the 386 (either ATT or BSD assembler). */
13647 output_pic_addr_const (file, XEXP (x, 0), code);
13648 break;
13650 case CONST_DOUBLE:
13651 if (GET_MODE (x) == VOIDmode)
13653 /* We can use %d if the number is <32 bits and positive. */
13654 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13655 fprintf (file, "0x%lx%08lx",
13656 (unsigned long) CONST_DOUBLE_HIGH (x),
13657 (unsigned long) CONST_DOUBLE_LOW (x));
13658 else
13659 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13661 else
13662 /* We can't handle floating point constants;
13663 TARGET_PRINT_OPERAND must handle them. */
13664 output_operand_lossage ("floating constant misused");
13665 break;
13667 case PLUS:
13668 /* Some assemblers need integer constants to appear first. */
13669 if (CONST_INT_P (XEXP (x, 0)))
13671 output_pic_addr_const (file, XEXP (x, 0), code);
13672 putc ('+', file);
13673 output_pic_addr_const (file, XEXP (x, 1), code);
13675 else
13677 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13678 output_pic_addr_const (file, XEXP (x, 1), code);
13679 putc ('+', file);
13680 output_pic_addr_const (file, XEXP (x, 0), code);
13682 break;
13684 case MINUS:
13685 if (!TARGET_MACHO)
13686 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13687 output_pic_addr_const (file, XEXP (x, 0), code);
13688 putc ('-', file);
13689 output_pic_addr_const (file, XEXP (x, 1), code);
13690 if (!TARGET_MACHO)
13691 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13692 break;
13694 case UNSPEC:
13695 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13697 bool f = i386_asm_output_addr_const_extra (file, x);
13698 gcc_assert (f);
13699 break;
13702 gcc_assert (XVECLEN (x, 0) == 1);
13703 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13704 switch (XINT (x, 1))
13706 case UNSPEC_GOT:
13707 fputs ("@GOT", file);
13708 break;
13709 case UNSPEC_GOTOFF:
13710 fputs ("@GOTOFF", file);
13711 break;
13712 case UNSPEC_PLTOFF:
13713 fputs ("@PLTOFF", file);
13714 break;
13715 case UNSPEC_PCREL:
13716 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13717 "(%rip)" : "[rip]", file);
13718 break;
13719 case UNSPEC_GOTPCREL:
13720 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13721 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13722 break;
13723 case UNSPEC_GOTTPOFF:
13724 /* FIXME: This might be @TPOFF in Sun ld too. */
13725 fputs ("@gottpoff", file);
13726 break;
13727 case UNSPEC_TPOFF:
13728 fputs ("@tpoff", file);
13729 break;
13730 case UNSPEC_NTPOFF:
13731 if (TARGET_64BIT)
13732 fputs ("@tpoff", file);
13733 else
13734 fputs ("@ntpoff", file);
13735 break;
13736 case UNSPEC_DTPOFF:
13737 fputs ("@dtpoff", file);
13738 break;
13739 case UNSPEC_GOTNTPOFF:
13740 if (TARGET_64BIT)
13741 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13742 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13743 else
13744 fputs ("@gotntpoff", file);
13745 break;
13746 case UNSPEC_INDNTPOFF:
13747 fputs ("@indntpoff", file);
13748 break;
13749 #if TARGET_MACHO
13750 case UNSPEC_MACHOPIC_OFFSET:
13751 putc ('-', file);
13752 machopic_output_function_base_name (file);
13753 break;
13754 #endif
13755 default:
13756 output_operand_lossage ("invalid UNSPEC as operand");
13757 break;
13759 break;
13761 default:
13762 output_operand_lossage ("invalid expression as operand");
13766 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13767 We need to emit DTP-relative relocations. */
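/* Illustrative note, not from the original sources: assuming ASM_LONG
   expands to ".long", a 4-byte request emits ".long x@dtpoff" and an
   8-byte request emits ".long x@dtpoff, 0", matching the switch on
   SIZE below.  */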
13769 static void ATTRIBUTE_UNUSED
13770 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13772 fputs (ASM_LONG, file);
13773 output_addr_const (file, x);
13774 fputs ("@dtpoff", file);
13775 switch (size)
13777 case 4:
13778 break;
13779 case 8:
13780 fputs (", 0", file);
13781 break;
13782 default:
13783 gcc_unreachable ();
13787 /* Return true if X is a representation of the PIC register. This copes
13788 with calls from ix86_find_base_term, where the register might have
13789 been replaced by a cselib value. */
13791 static bool
13792 ix86_pic_register_p (rtx x)
13794 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13795 return (pic_offset_table_rtx
13796 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13797 else
13798 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13801 /* Helper function for ix86_delegitimize_address.
13802 Attempt to delegitimize TLS local-exec accesses. */
13804 static rtx
13805 ix86_delegitimize_tls_address (rtx orig_x)
13807 rtx x = orig_x, unspec;
13808 struct ix86_address addr;
13810 if (!TARGET_TLS_DIRECT_SEG_REFS)
13811 return orig_x;
13812 if (MEM_P (x))
13813 x = XEXP (x, 0);
13814 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13815 return orig_x;
13816 if (ix86_decompose_address (x, &addr) == 0
13817 || addr.seg != DEFAULT_TLS_SEG_REG
13818 || addr.disp == NULL_RTX
13819 || GET_CODE (addr.disp) != CONST)
13820 return orig_x;
13821 unspec = XEXP (addr.disp, 0);
13822 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13823 unspec = XEXP (unspec, 0);
13824 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13825 return orig_x;
13826 x = XVECEXP (unspec, 0, 0);
13827 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13828 if (unspec != XEXP (addr.disp, 0))
13829 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13830 if (addr.index)
13832 rtx idx = addr.index;
13833 if (addr.scale != 1)
13834 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13835 x = gen_rtx_PLUS (Pmode, idx, x);
13837 if (addr.base)
13838 x = gen_rtx_PLUS (Pmode, addr.base, x);
13839 if (MEM_P (orig_x))
13840 x = replace_equiv_address_nv (orig_x, x);
13841 return x;
13844 /* In the name of slightly smaller debug output, and to cater to
13845 general assembler lossage, recognize PIC+GOTOFF and turn it back
13846 into a direct symbol reference.
13848 On Darwin, this is necessary to avoid a crash, because Darwin
13849 has a different PIC label for each routine but the DWARF debugging
13850 information is not associated with any particular routine, so it's
13851 necessary to remove references to the PIC label from RTL stored by
13852 the DWARF output code. */
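/* Illustrative example, not from the original sources: a -fpic ia32
   reference like (plus %ebx (const (unspec [foo] UNSPEC_GOTOFF))) is
   turned back into the bare SYMBOL_REF "foo" here, so debug info and
   alias analysis see the symbol instead of the PIC machinery.  */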
13854 static rtx
13855 ix86_delegitimize_address (rtx x)
13857 rtx orig_x = delegitimize_mem_from_attrs (x);
13858 /* addend is NULL or some rtx if x is something+GOTOFF where
13859 something doesn't include the PIC register. */
13860 rtx addend = NULL_RTX;
13861 /* reg_addend is NULL or a multiple of some register. */
13862 rtx reg_addend = NULL_RTX;
13863 /* const_addend is NULL or a const_int. */
13864 rtx const_addend = NULL_RTX;
13865 /* This is the result, or NULL. */
13866 rtx result = NULL_RTX;
13868 x = orig_x;
13870 if (MEM_P (x))
13871 x = XEXP (x, 0);
13873 if (TARGET_64BIT)
13875 if (GET_CODE (x) == CONST
13876 && GET_CODE (XEXP (x, 0)) == PLUS
13877 && GET_MODE (XEXP (x, 0)) == Pmode
13878 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13879 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13880 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13882 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13883 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13884 if (MEM_P (orig_x))
13885 x = replace_equiv_address_nv (orig_x, x);
13886 return x;
13889 if (GET_CODE (x) == CONST
13890 && GET_CODE (XEXP (x, 0)) == UNSPEC
13891 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13892 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13893 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13895 x = XVECEXP (XEXP (x, 0), 0, 0);
13896 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13898 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13899 GET_MODE (x), 0);
13900 if (x == NULL_RTX)
13901 return orig_x;
13903 return x;
13906 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13907 return ix86_delegitimize_tls_address (orig_x);
13909 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13910 and -mcmodel=medium -fpic. */
13913 if (GET_CODE (x) != PLUS
13914 || GET_CODE (XEXP (x, 1)) != CONST)
13915 return ix86_delegitimize_tls_address (orig_x);
13917 if (ix86_pic_register_p (XEXP (x, 0)))
13918 /* %ebx + GOT/GOTOFF */
13920 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13922 /* %ebx + %reg * scale + GOT/GOTOFF */
13923 reg_addend = XEXP (x, 0);
13924 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13925 reg_addend = XEXP (reg_addend, 1);
13926 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13927 reg_addend = XEXP (reg_addend, 0);
13928 else
13930 reg_addend = NULL_RTX;
13931 addend = XEXP (x, 0);
13934 else
13935 addend = XEXP (x, 0);
13937 x = XEXP (XEXP (x, 1), 0);
13938 if (GET_CODE (x) == PLUS
13939 && CONST_INT_P (XEXP (x, 1)))
13941 const_addend = XEXP (x, 1);
13942 x = XEXP (x, 0);
13945 if (GET_CODE (x) == UNSPEC
13946 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13947 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13948 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13949 && !MEM_P (orig_x) && !addend)))
13950 result = XVECEXP (x, 0, 0);
13952 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13953 && !MEM_P (orig_x))
13954 result = XVECEXP (x, 0, 0);
13956 if (! result)
13957 return ix86_delegitimize_tls_address (orig_x);
13959 if (const_addend)
13960 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13961 if (reg_addend)
13962 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13963 if (addend)
13965 /* If the rest of original X doesn't involve the PIC register, add
13966 addend and subtract pic_offset_table_rtx. This can happen e.g.
13967 for code like:
13968 leal (%ebx, %ecx, 4), %ecx
13970 movl foo@GOTOFF(%ecx), %edx
13971 in which case we return (%ecx - %ebx) + foo. */
13972 if (pic_offset_table_rtx)
13973 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13974 pic_offset_table_rtx),
13975 result);
13976 else
13977 return orig_x;
13979 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13981 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13982 if (result == NULL_RTX)
13983 return orig_x;
13985 return result;
13988 /* If X is a machine specific address (i.e. a symbol or label being
13989 referenced as a displacement from the GOT implemented using an
13990 UNSPEC), then return the base term. Otherwise return X. */
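/* Illustrative example, not from the original sources: on x86-64, for
   (const (plus (unspec [foo] UNSPEC_GOTPCREL) (const_int 8))) the base
   term returned is the SYMBOL_REF "foo".  */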
13993 ix86_find_base_term (rtx x)
13995 rtx term;
13997 if (TARGET_64BIT)
13999 if (GET_CODE (x) != CONST)
14000 return x;
14001 term = XEXP (x, 0);
14002 if (GET_CODE (term) == PLUS
14003 && (CONST_INT_P (XEXP (term, 1))
14004 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14005 term = XEXP (term, 0);
14006 if (GET_CODE (term) != UNSPEC
14007 || (XINT (term, 1) != UNSPEC_GOTPCREL
14008 && XINT (term, 1) != UNSPEC_PCREL))
14009 return x;
14011 return XVECEXP (term, 0, 0);
14014 return ix86_delegitimize_address (x);
14017 static void
14018 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14019 bool fp, FILE *file)
14021 const char *suffix;
14023 if (mode == CCFPmode || mode == CCFPUmode)
14025 code = ix86_fp_compare_code_to_integer (code);
14026 mode = CCmode;
14028 if (reverse)
14029 code = reverse_condition (code);
14031 switch (code)
14033 case EQ:
14034 switch (mode)
14036 case CCAmode:
14037 suffix = "a";
14038 break;
14040 case CCCmode:
14041 suffix = "c";
14042 break;
14044 case CCOmode:
14045 suffix = "o";
14046 break;
14048 case CCSmode:
14049 suffix = "s";
14050 break;
14052 default:
14053 suffix = "e";
14055 break;
14056 case NE:
14057 switch (mode)
14059 case CCAmode:
14060 suffix = "na";
14061 break;
14063 case CCCmode:
14064 suffix = "nc";
14065 break;
14067 case CCOmode:
14068 suffix = "no";
14069 break;
14071 case CCSmode:
14072 suffix = "ns";
14073 break;
14075 default:
14076 suffix = "ne";
14078 break;
14079 case GT:
14080 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14081 suffix = "g";
14082 break;
14083 case GTU:
14084 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14085 Those same assemblers have the same but opposite lossage on cmov. */
14086 if (mode == CCmode)
14087 suffix = fp ? "nbe" : "a";
14088 else if (mode == CCCmode)
14089 suffix = "b";
14090 else
14091 gcc_unreachable ();
14092 break;
14093 case LT:
14094 switch (mode)
14096 case CCNOmode:
14097 case CCGOCmode:
14098 suffix = "s";
14099 break;
14101 case CCmode:
14102 case CCGCmode:
14103 suffix = "l";
14104 break;
14106 default:
14107 gcc_unreachable ();
14109 break;
14110 case LTU:
14111 gcc_assert (mode == CCmode || mode == CCCmode);
14112 suffix = "b";
14113 break;
14114 case GE:
14115 switch (mode)
14117 case CCNOmode:
14118 case CCGOCmode:
14119 suffix = "ns";
14120 break;
14122 case CCmode:
14123 case CCGCmode:
14124 suffix = "ge";
14125 break;
14127 default:
14128 gcc_unreachable ();
14130 break;
14131 case GEU:
14132 /* ??? As above. */
14133 gcc_assert (mode == CCmode || mode == CCCmode);
14134 suffix = fp ? "nb" : "ae";
14135 break;
14136 case LE:
14137 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14138 suffix = "le";
14139 break;
14140 case LEU:
14141 /* ??? As above. */
14142 if (mode == CCmode)
14143 suffix = "be";
14144 else if (mode == CCCmode)
14145 suffix = fp ? "nb" : "ae";
14146 else
14147 gcc_unreachable ();
14148 break;
14149 case UNORDERED:
14150 suffix = fp ? "u" : "p";
14151 break;
14152 case ORDERED:
14153 suffix = fp ? "nu" : "np";
14154 break;
14155 default:
14156 gcc_unreachable ();
14158 fputs (suffix, file);
14161 /* Print the name of register X to FILE based on its machine mode and number.
14162 If CODE is 'w', pretend the mode is HImode.
14163 If CODE is 'b', pretend the mode is QImode.
14164 If CODE is 'k', pretend the mode is SImode.
14165 If CODE is 'q', pretend the mode is DImode.
14166 If CODE is 'x', pretend the mode is V4SFmode.
14167 If CODE is 't', pretend the mode is V8SFmode.
14168 If CODE is 'h', pretend the reg is the 'high' byte register.
14169 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14170 If CODE is 'd', duplicate the operand for AVX instruction.
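/* For illustration, not from the original sources: with register number
   0 (the "ax" register) in AT&T syntax, code 'b' prints "%al", 'h'
   prints "%ah", 'w' prints "%ax", 'k' prints "%eax" and 'q' prints
   "%rax" (the latter only on 64-bit targets).  */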
14173 void
14174 print_reg (rtx x, int code, FILE *file)
14176 const char *reg;
14177 unsigned int regno;
14178 bool duplicated = code == 'd' && TARGET_AVX;
14180 if (ASSEMBLER_DIALECT == ASM_ATT)
14181 putc ('%', file);
14183 if (x == pc_rtx)
14185 gcc_assert (TARGET_64BIT);
14186 fputs ("rip", file);
14187 return;
14190 regno = true_regnum (x);
14191 gcc_assert (regno != ARG_POINTER_REGNUM
14192 && regno != FRAME_POINTER_REGNUM
14193 && regno != FLAGS_REG
14194 && regno != FPSR_REG
14195 && regno != FPCR_REG);
14197 if (code == 'w' || MMX_REG_P (x))
14198 code = 2;
14199 else if (code == 'b')
14200 code = 1;
14201 else if (code == 'k')
14202 code = 4;
14203 else if (code == 'q')
14204 code = 8;
14205 else if (code == 'y')
14206 code = 3;
14207 else if (code == 'h')
14208 code = 0;
14209 else if (code == 'x')
14210 code = 16;
14211 else if (code == 't')
14212 code = 32;
14213 else
14214 code = GET_MODE_SIZE (GET_MODE (x));
14216 /* Irritatingly, AMD extended registers use a different naming convention
14217 from the normal registers: "r%d[bwd]" */
14218 if (REX_INT_REGNO_P (regno))
14220 gcc_assert (TARGET_64BIT);
14221 putc ('r', file);
14222 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14223 switch (code)
14225 case 0:
14226 error ("extended registers have no high halves");
14227 break;
14228 case 1:
14229 putc ('b', file);
14230 break;
14231 case 2:
14232 putc ('w', file);
14233 break;
14234 case 4:
14235 putc ('d', file);
14236 break;
14237 case 8:
14238 /* no suffix */
14239 break;
14240 default:
14241 error ("unsupported operand size for extended register");
14242 break;
14244 return;
14247 reg = NULL;
14248 switch (code)
14250 case 3:
14251 if (STACK_TOP_P (x))
14253 reg = "st(0)";
14254 break;
14256 /* FALLTHRU */
14257 case 8:
14258 case 4:
14259 case 12:
14260 if (! ANY_FP_REG_P (x))
14261 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14262 /* FALLTHRU */
14263 case 16:
14264 case 2:
14265 normal:
14266 reg = hi_reg_name[regno];
14267 break;
14268 case 1:
14269 if (regno >= ARRAY_SIZE (qi_reg_name))
14270 goto normal;
14271 reg = qi_reg_name[regno];
14272 break;
14273 case 0:
14274 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14275 goto normal;
14276 reg = qi_high_reg_name[regno];
14277 break;
14278 case 32:
14279 if (SSE_REG_P (x))
14281 gcc_assert (!duplicated);
14282 putc ('y', file);
14283 fputs (hi_reg_name[regno] + 1, file);
14284 return;
14286 break;
14287 default:
14288 gcc_unreachable ();
14291 fputs (reg, file);
14292 if (duplicated)
14294 if (ASSEMBLER_DIALECT == ASM_ATT)
14295 fprintf (file, ", %%%s", reg);
14296 else
14297 fprintf (file, ", %s", reg);
14301 /* Locate some local-dynamic symbol still in use by this function
14302 so that we can print its name in some tls_local_dynamic_base
14303 pattern. */
14305 static int
14306 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14308 rtx x = *px;
14310 if (GET_CODE (x) == SYMBOL_REF
14311 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14313 cfun->machine->some_ld_name = XSTR (x, 0);
14314 return 1;
14317 return 0;
14320 static const char *
14321 get_some_local_dynamic_name (void)
14323 rtx insn;
14325 if (cfun->machine->some_ld_name)
14326 return cfun->machine->some_ld_name;
14328 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14329 if (NONDEBUG_INSN_P (insn)
14330 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14331 return cfun->machine->some_ld_name;
14333 return NULL;
14336 /* Meaning of CODE:
14337 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14338 C -- print opcode suffix for set/cmov insn.
14339 c -- like C, but print reversed condition
14340 F,f -- likewise, but for floating-point.
14341 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14342 otherwise nothing
14343 R -- print the prefix for register names.
14344 z -- print the opcode suffix for the size of the current operand.
14345 Z -- likewise, with special suffixes for x87 instructions.
14346 * -- print a star (in certain assembler syntax)
14347 A -- print an absolute memory reference.
14348 E -- print address with DImode register names if TARGET_64BIT.
14349 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14350 s -- print a shift double count, followed by the assembler's argument
14351 delimiter.
14352 b -- print the QImode name of the register for the indicated operand.
14353 %b0 would print %al if operands[0] is reg 0.
14354 w -- likewise, print the HImode name of the register.
14355 k -- likewise, print the SImode name of the register.
14356 q -- likewise, print the DImode name of the register.
14357 x -- likewise, print the V4SFmode name of the register.
14358 t -- likewise, print the V8SFmode name of the register.
14359 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14360 y -- print "st(0)" instead of "st" as a register.
14361 d -- print duplicated register operand for AVX instruction.
14362 D -- print condition for SSE cmp instruction.
14363 P -- if PIC, print an @PLT suffix.
14364 p -- print raw symbol name.
14365 X -- don't print any sort of PIC '@' suffix for a symbol.
14366 & -- print some in-use local-dynamic symbol name.
14367 H -- print a memory address offset by 8; used for sse high-parts
14368 Y -- print condition for XOP pcom* instruction.
14369 + -- print a branch hint as 'cs' or 'ds' prefix
14370 ; -- print a semicolon (after prefixes due to bug in older gas).
14371 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14372 @ -- print a segment register of thread base pointer load
14373 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
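/* Illustrative example, not from the original sources: a machine
   description template such as

       "add%z0\t{%1, %0|%0, %1}"

   uses the 'z' code handled below to pick the b/w/l/q suffix from
   operand 0's mode, while "%b1" or "%k1" force the QImode or SImode
   name of the register in operand 1.  */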
14376 void
14377 ix86_print_operand (FILE *file, rtx x, int code)
14379 if (code)
14381 switch (code)
14383 case 'A':
14384 switch (ASSEMBLER_DIALECT)
14386 case ASM_ATT:
14387 putc ('*', file);
14388 break;
14390 case ASM_INTEL:
14391 /* Intel syntax. For absolute addresses, registers should not
14392 be surrounded by brackets. */
14393 if (!REG_P (x))
14395 putc ('[', file);
14396 ix86_print_operand (file, x, 0);
14397 putc (']', file);
14398 return;
14400 break;
14402 default:
14403 gcc_unreachable ();
14406 ix86_print_operand (file, x, 0);
14407 return;
14409 case 'E':
14410 /* Wrap address in an UNSPEC to declare special handling. */
14411 if (TARGET_64BIT)
14412 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14414 output_address (x);
14415 return;
14417 case 'L':
14418 if (ASSEMBLER_DIALECT == ASM_ATT)
14419 putc ('l', file);
14420 return;
14422 case 'W':
14423 if (ASSEMBLER_DIALECT == ASM_ATT)
14424 putc ('w', file);
14425 return;
14427 case 'B':
14428 if (ASSEMBLER_DIALECT == ASM_ATT)
14429 putc ('b', file);
14430 return;
14432 case 'Q':
14433 if (ASSEMBLER_DIALECT == ASM_ATT)
14434 putc ('l', file);
14435 return;
14437 case 'S':
14438 if (ASSEMBLER_DIALECT == ASM_ATT)
14439 putc ('s', file);
14440 return;
14442 case 'T':
14443 if (ASSEMBLER_DIALECT == ASM_ATT)
14444 putc ('t', file);
14445 return;
14447 case 'O':
14448 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14449 if (ASSEMBLER_DIALECT != ASM_ATT)
14450 return;
14452 switch (GET_MODE_SIZE (GET_MODE (x)))
14454 case 2:
14455 putc ('w', file);
14456 break;
14458 case 4:
14459 putc ('l', file);
14460 break;
14462 case 8:
14463 putc ('q', file);
14464 break;
14466 default:
14467 output_operand_lossage
14468 ("invalid operand size for operand code 'O'");
14469 return;
14472 putc ('.', file);
14473 #endif
14474 return;
14476 case 'z':
14477 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14479 /* Opcodes don't get size suffixes if using Intel syntax. */
14480 if (ASSEMBLER_DIALECT == ASM_INTEL)
14481 return;
14483 switch (GET_MODE_SIZE (GET_MODE (x)))
14485 case 1:
14486 putc ('b', file);
14487 return;
14489 case 2:
14490 putc ('w', file);
14491 return;
14493 case 4:
14494 putc ('l', file);
14495 return;
14497 case 8:
14498 putc ('q', file);
14499 return;
14501 default:
14502 output_operand_lossage
14503 ("invalid operand size for operand code 'z'");
14504 return;
14508 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14509 warning
14510 (0, "non-integer operand used with operand code 'z'");
14511 /* FALLTHRU */
14513 case 'Z':
14514 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14515 if (ASSEMBLER_DIALECT == ASM_INTEL)
14516 return;
14518 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14520 switch (GET_MODE_SIZE (GET_MODE (x)))
14522 case 2:
14523 #ifdef HAVE_AS_IX86_FILDS
14524 putc ('s', file);
14525 #endif
14526 return;
14528 case 4:
14529 putc ('l', file);
14530 return;
14532 case 8:
14533 #ifdef HAVE_AS_IX86_FILDQ
14534 putc ('q', file);
14535 #else
14536 fputs ("ll", file);
14537 #endif
14538 return;
14540 default:
14541 break;
14544 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14546 /* 387 opcodes don't get size suffixes
14547 if the operands are registers. */
14548 if (STACK_REG_P (x))
14549 return;
14551 switch (GET_MODE_SIZE (GET_MODE (x)))
14553 case 4:
14554 putc ('s', file);
14555 return;
14557 case 8:
14558 putc ('l', file);
14559 return;
14561 case 12:
14562 case 16:
14563 putc ('t', file);
14564 return;
14566 default:
14567 break;
14570 else
14572 output_operand_lossage
14573 ("invalid operand type used with operand code 'Z'");
14574 return;
14577 output_operand_lossage
14578 ("invalid operand size for operand code 'Z'");
14579 return;
14581 case 'd':
14582 case 'b':
14583 case 'w':
14584 case 'k':
14585 case 'q':
14586 case 'h':
14587 case 't':
14588 case 'y':
14589 case 'x':
14590 case 'X':
14591 case 'P':
14592 case 'p':
14593 break;
14595 case 's':
14596 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14598 ix86_print_operand (file, x, 0);
14599 fputs (", ", file);
14601 return;
14603 case 'Y':
14604 switch (GET_CODE (x))
14606 case NE:
14607 fputs ("neq", file);
14608 break;
14609 case EQ:
14610 fputs ("eq", file);
14611 break;
14612 case GE:
14613 case GEU:
14614 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14615 break;
14616 case GT:
14617 case GTU:
14618 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14619 break;
14620 case LE:
14621 case LEU:
14622 fputs ("le", file);
14623 break;
14624 case LT:
14625 case LTU:
14626 fputs ("lt", file);
14627 break;
14628 case UNORDERED:
14629 fputs ("unord", file);
14630 break;
14631 case ORDERED:
14632 fputs ("ord", file);
14633 break;
14634 case UNEQ:
14635 fputs ("ueq", file);
14636 break;
14637 case UNGE:
14638 fputs ("nlt", file);
14639 break;
14640 case UNGT:
14641 fputs ("nle", file);
14642 break;
14643 case UNLE:
14644 fputs ("ule", file);
14645 break;
14646 case UNLT:
14647 fputs ("ult", file);
14648 break;
14649 case LTGT:
14650 fputs ("une", file);
14651 break;
14652 default:
14653 output_operand_lossage ("operand is not a condition code, "
14654 "invalid operand code 'Y'");
14655 return;
14657 return;
14659 case 'D':
14660 /* Little bit of braindamage here. The SSE compare instructions
14661 use completely different names for the comparisons than the
14662 fp conditional moves do. */
14663 switch (GET_CODE (x))
14665 case UNEQ:
14666 if (TARGET_AVX)
14668 fputs ("eq_us", file);
14669 break;
14671 case EQ:
14672 fputs ("eq", file);
14673 break;
14674 case UNLT:
14675 if (TARGET_AVX)
14677 fputs ("nge", file);
14678 break;
14680 case LT:
14681 fputs ("lt", file);
14682 break;
14683 case UNLE:
14684 if (TARGET_AVX)
14686 fputs ("ngt", file);
14687 break;
14689 case LE:
14690 fputs ("le", file);
14691 break;
14692 case UNORDERED:
14693 fputs ("unord", file);
14694 break;
14695 case LTGT:
14696 if (TARGET_AVX)
14698 fputs ("neq_oq", file);
14699 break;
14701 case NE:
14702 fputs ("neq", file);
14703 break;
14704 case GE:
14705 if (TARGET_AVX)
14707 fputs ("ge", file);
14708 break;
14710 case UNGE:
14711 fputs ("nlt", file);
14712 break;
14713 case GT:
14714 if (TARGET_AVX)
14716 fputs ("gt", file);
14717 break;
14719 case UNGT:
14720 fputs ("nle", file);
14721 break;
14722 case ORDERED:
14723 fputs ("ord", file);
14724 break;
14725 default:
14726 output_operand_lossage ("operand is not a condition code, "
14727 "invalid operand code 'D'");
14728 return;
14730 return;
14732 case 'F':
14733 case 'f':
14734 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14735 if (ASSEMBLER_DIALECT == ASM_ATT)
14736 putc ('.', file);
14737 #endif
14739 case 'C':
14740 case 'c':
14741 if (!COMPARISON_P (x))
14743 output_operand_lossage ("operand is not a condition code, "
14744 "invalid operand code '%c'", code);
14745 return;
14747 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14748 code == 'c' || code == 'f',
14749 code == 'F' || code == 'f',
14750 file);
14751 return;
14753 case 'H':
14754 if (!offsettable_memref_p (x))
14756 output_operand_lossage ("operand is not an offsettable memory "
14757 "reference, invalid operand code 'H'");
14758 return;
14760 /* It doesn't actually matter what mode we use here, as we're
14761 only going to use this for printing. */
14762 x = adjust_address_nv (x, DImode, 8);
14763 /* Output 'qword ptr' for intel assembler dialect. */
14764 if (ASSEMBLER_DIALECT == ASM_INTEL)
14765 code = 'q';
14766 break;
14768 case 'K':
14769 gcc_assert (CONST_INT_P (x));
14771 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14772 #ifdef HAVE_AS_IX86_HLE
14773 fputs ("xacquire ", file);
14774 #else
14775 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14776 #endif
14777 else if (INTVAL (x) & IX86_HLE_RELEASE)
14778 #ifdef HAVE_AS_IX86_HLE
14779 fputs ("xrelease ", file);
14780 #else
14781 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14782 #endif
14783 /* We do not want to print the value of the operand. */
14784 return;
14786 case '*':
14787 if (ASSEMBLER_DIALECT == ASM_ATT)
14788 putc ('*', file);
14789 return;
14791 case '&':
14793 const char *name = get_some_local_dynamic_name ();
14794 if (name == NULL)
14795 output_operand_lossage ("'%%&' used without any "
14796 "local dynamic TLS references");
14797 else
14798 assemble_name (file, name);
14799 return;
14802 case '+':
14804 rtx x;
14806 if (!optimize
14807 || optimize_function_for_size_p (cfun)
14808 || !TARGET_BRANCH_PREDICTION_HINTS)
14809 return;
14811 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14812 if (x)
14814 int pred_val = INTVAL (XEXP (x, 0));
14816 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14817 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14819 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14820 bool cputaken
14821 = final_forward_branch_p (current_output_insn) == 0;
14823 /* Emit hints only when the default branch prediction
14824 heuristics would fail. */
14825 if (taken != cputaken)
14827 /* We use 3e (DS) prefix for taken branches and
14828 2e (CS) prefix for not taken branches. */
14829 if (taken)
14830 fputs ("ds ; ", file);
14831 else
14832 fputs ("cs ; ", file);
14836 return;
14839 case ';':
14840 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14841 putc (';', file);
14842 #endif
14843 return;
14845 case '@':
14846 if (ASSEMBLER_DIALECT == ASM_ATT)
14847 putc ('%', file);
14849 /* The kernel uses a different segment register for performance
14850 reasons; a system call would not have to trash the userspace
14851 segment register, which would be expensive. */
14852 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14853 fputs ("fs", file);
14854 else
14855 fputs ("gs", file);
14856 return;
14858 case '~':
14859 putc (TARGET_AVX2 ? 'i' : 'f', file);
14860 return;
14862 case '^':
14863 if (TARGET_64BIT && Pmode != word_mode)
14864 fputs ("addr32 ", file);
14865 return;
14867 default:
14868 output_operand_lossage ("invalid operand code '%c'", code);
14872 if (REG_P (x))
14873 print_reg (x, code, file);
14875 else if (MEM_P (x))
14877 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14878 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14879 && GET_MODE (x) != BLKmode)
14881 const char * size;
14882 switch (GET_MODE_SIZE (GET_MODE (x)))
14884 case 1: size = "BYTE"; break;
14885 case 2: size = "WORD"; break;
14886 case 4: size = "DWORD"; break;
14887 case 8: size = "QWORD"; break;
14888 case 12: size = "TBYTE"; break;
14889 case 16:
14890 if (GET_MODE (x) == XFmode)
14891 size = "TBYTE";
14892 else
14893 size = "XMMWORD";
14894 break;
14895 case 32: size = "YMMWORD"; break;
14896 default:
14897 gcc_unreachable ();
14900 /* Check for explicit size override (codes 'b', 'w', 'k',
14901 'q' and 'x') */
14902 if (code == 'b')
14903 size = "BYTE";
14904 else if (code == 'w')
14905 size = "WORD";
14906 else if (code == 'k')
14907 size = "DWORD";
14908 else if (code == 'q')
14909 size = "QWORD";
14910 else if (code == 'x')
14911 size = "XMMWORD";
14913 fputs (size, file);
14914 fputs (" PTR ", file);
14917 x = XEXP (x, 0);
14918 /* Avoid (%rip) for call operands. */
14919 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14920 && !CONST_INT_P (x))
14921 output_addr_const (file, x);
14922 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14923 output_operand_lossage ("invalid constraints for operand");
14924 else
14925 output_address (x);
14928 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14930 REAL_VALUE_TYPE r;
14931 long l;
14933 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14934 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14936 if (ASSEMBLER_DIALECT == ASM_ATT)
14937 putc ('$', file);
14938 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14939 if (code == 'q')
14940 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14941 (unsigned long long) (int) l);
14942 else
14943 fprintf (file, "0x%08x", (unsigned int) l);
14946 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14948 REAL_VALUE_TYPE r;
14949 long l[2];
14951 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14952 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14954 if (ASSEMBLER_DIALECT == ASM_ATT)
14955 putc ('$', file);
14956 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14959 /* These float cases don't actually occur as immediate operands. */
14960 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14962 char dstr[30];
14964 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14965 fputs (dstr, file);
14968 else
14970 /* We have patterns that allow zero sets of memory, for instance.
14971 In 64-bit mode, we should probably support all 8-byte vectors,
14972 since we can in fact encode that into an immediate. */
14973 if (GET_CODE (x) == CONST_VECTOR)
14975 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14976 x = const0_rtx;
14979 if (code != 'P' && code != 'p')
14981 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14983 if (ASSEMBLER_DIALECT == ASM_ATT)
14984 putc ('$', file);
14986 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14987 || GET_CODE (x) == LABEL_REF)
14989 if (ASSEMBLER_DIALECT == ASM_ATT)
14990 putc ('$', file);
14991 else
14992 fputs ("OFFSET FLAT:", file);
14995 if (CONST_INT_P (x))
14996 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14997 else if (flag_pic || MACHOPIC_INDIRECT)
14998 output_pic_addr_const (file, x, code);
14999 else
15000 output_addr_const (file, x);
15004 static bool
15005 ix86_print_operand_punct_valid_p (unsigned char code)
15007 return (code == '@' || code == '*' || code == '+' || code == '&'
15008 || code == ';' || code == '~' || code == '^');
15011 /* Print a memory operand whose address is ADDR. */
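/* Roughly, the AT&T branch below prints an address as
   disp(base,index,scale), e.g. "-8(%rbp,%rax,4)", while the Intel
   branch prints [base+disp+index*scale], e.g. "[rbp-8+rax*4]";
   segment overrides, VSIB addresses and RIP-relative forms are
   handled as special cases first.  */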
15013 static void
15014 ix86_print_operand_address (FILE *file, rtx addr)
15016 struct ix86_address parts;
15017 rtx base, index, disp;
15018 int scale;
15019 int ok;
15020 bool vsib = false;
15021 int code = 0;
15023 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15025 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15026 gcc_assert (parts.index == NULL_RTX);
15027 parts.index = XVECEXP (addr, 0, 1);
15028 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15029 addr = XVECEXP (addr, 0, 0);
15030 vsib = true;
15032 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15034 gcc_assert (TARGET_64BIT);
15035 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15036 code = 'q';
15038 else
15039 ok = ix86_decompose_address (addr, &parts);
15041 gcc_assert (ok);
15043 base = parts.base;
15044 index = parts.index;
15045 disp = parts.disp;
15046 scale = parts.scale;
15048 switch (parts.seg)
15050 case SEG_DEFAULT:
15051 break;
15052 case SEG_FS:
15053 case SEG_GS:
15054 if (ASSEMBLER_DIALECT == ASM_ATT)
15055 putc ('%', file);
15056 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15057 break;
15058 default:
15059 gcc_unreachable ();
15062 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15063 if (TARGET_64BIT && !base && !index)
15065 rtx symbol = disp;
15067 if (GET_CODE (disp) == CONST
15068 && GET_CODE (XEXP (disp, 0)) == PLUS
15069 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15070 symbol = XEXP (XEXP (disp, 0), 0);
15072 if (GET_CODE (symbol) == LABEL_REF
15073 || (GET_CODE (symbol) == SYMBOL_REF
15074 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15075 base = pc_rtx;
15077 if (!base && !index)
15079 /* A displacement-only address requires special attention. */
15081 if (CONST_INT_P (disp))
15083 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15084 fputs ("ds:", file);
15085 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15087 else if (flag_pic)
15088 output_pic_addr_const (file, disp, 0);
15089 else
15090 output_addr_const (file, disp);
15092 else
15094 /* Print SImode register names to force addr32 prefix. */
15095 if (SImode_address_operand (addr, VOIDmode))
15097 #ifdef ENABLE_CHECKING
15098 gcc_assert (TARGET_64BIT);
15099 switch (GET_CODE (addr))
15101 case SUBREG:
15102 gcc_assert (GET_MODE (addr) == SImode);
15103 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15104 break;
15105 case ZERO_EXTEND:
15106 case AND:
15107 gcc_assert (GET_MODE (addr) == DImode);
15108 break;
15109 default:
15110 gcc_unreachable ();
15112 #endif
15113 gcc_assert (!code);
15114 code = 'k';
15116 else if (code == 0
15117 && TARGET_X32
15118 && disp
15119 && CONST_INT_P (disp)
15120 && INTVAL (disp) < -16*1024*1024)
15122 /* X32 runs in 64-bit mode, where the displacement, DISP, in
15123 the address DISP(%r64) is encoded as a 32-bit immediate
15124 sign-extended to 64 bits. For -0x40000300(%r64), the
15125 address is %r64 + 0xffffffffbffffd00. When %r64 <
15126 0x40000300, like 0x37ffe064, the address is 0xfffffffff7ffdd64,
15127 which is invalid for x32. The correct address is %r64
15128 - 0x40000300 == 0xf7ffdd64. To properly encode
15129 -0x40000300(%r64) for x32, we zero-extend the negative
15130 displacement by forcing the addr32 prefix, which truncates
15131 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15132 zero-extend all negative displacements, including -1(%rsp).
15133 However, for small negative displacements, sign-extension
15134 won't cause overflow. We only zero-extend negative
15135 displacements if they are < -16*1024*1024, which is also the
15136 bound used to check legitimate address displacements for PIC. */
15137 code = 'k';
15140 if (ASSEMBLER_DIALECT == ASM_ATT)
15142 if (disp)
15144 if (flag_pic)
15145 output_pic_addr_const (file, disp, 0);
15146 else if (GET_CODE (disp) == LABEL_REF)
15147 output_asm_label (disp);
15148 else
15149 output_addr_const (file, disp);
15152 putc ('(', file);
15153 if (base)
15154 print_reg (base, code, file);
15155 if (index)
15157 putc (',', file);
15158 print_reg (index, vsib ? 0 : code, file);
15159 if (scale != 1 || vsib)
15160 fprintf (file, ",%d", scale);
15162 putc (')', file);
15164 else
15166 rtx offset = NULL_RTX;
15168 if (disp)
15170 /* Pull out the offset of a symbol; print any symbol itself. */
15171 if (GET_CODE (disp) == CONST
15172 && GET_CODE (XEXP (disp, 0)) == PLUS
15173 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15175 offset = XEXP (XEXP (disp, 0), 1);
15176 disp = gen_rtx_CONST (VOIDmode,
15177 XEXP (XEXP (disp, 0), 0));
15180 if (flag_pic)
15181 output_pic_addr_const (file, disp, 0);
15182 else if (GET_CODE (disp) == LABEL_REF)
15183 output_asm_label (disp);
15184 else if (CONST_INT_P (disp))
15185 offset = disp;
15186 else
15187 output_addr_const (file, disp);
15190 putc ('[', file);
15191 if (base)
15193 print_reg (base, code, file);
15194 if (offset)
15196 if (INTVAL (offset) >= 0)
15197 putc ('+', file);
15198 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15201 else if (offset)
15202 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15203 else
15204 putc ('0', file);
15206 if (index)
15208 putc ('+', file);
15209 print_reg (index, vsib ? 0 : code, file);
15210 if (scale != 1 || vsib)
15211 fprintf (file, "*%d", scale);
15213 putc (']', file);
15218 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15220 static bool
15221 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15223 rtx op;
15225 if (GET_CODE (x) != UNSPEC)
15226 return false;
15228 op = XVECEXP (x, 0, 0);
15229 switch (XINT (x, 1))
15231 case UNSPEC_GOTTPOFF:
15232 output_addr_const (file, op);
15233 /* FIXME: This might be @TPOFF in Sun ld. */
15234 fputs ("@gottpoff", file);
15235 break;
15236 case UNSPEC_TPOFF:
15237 output_addr_const (file, op);
15238 fputs ("@tpoff", file);
15239 break;
15240 case UNSPEC_NTPOFF:
15241 output_addr_const (file, op);
15242 if (TARGET_64BIT)
15243 fputs ("@tpoff", file);
15244 else
15245 fputs ("@ntpoff", file);
15246 break;
15247 case UNSPEC_DTPOFF:
15248 output_addr_const (file, op);
15249 fputs ("@dtpoff", file);
15250 break;
15251 case UNSPEC_GOTNTPOFF:
15252 output_addr_const (file, op);
15253 if (TARGET_64BIT)
15254 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15255 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15256 else
15257 fputs ("@gotntpoff", file);
15258 break;
15259 case UNSPEC_INDNTPOFF:
15260 output_addr_const (file, op);
15261 fputs ("@indntpoff", file);
15262 break;
15263 #if TARGET_MACHO
15264 case UNSPEC_MACHOPIC_OFFSET:
15265 output_addr_const (file, op);
15266 putc ('-', file);
15267 machopic_output_function_base_name (file);
15268 break;
15269 #endif
15271 case UNSPEC_STACK_CHECK:
15273 int offset;
15275 gcc_assert (flag_split_stack);
15277 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15278 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15279 #else
15280 gcc_unreachable ();
15281 #endif
15283 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15285 break;
15287 default:
15288 return false;
15291 return true;
15294 /* Split one or more double-mode RTL references into pairs of half-mode
15295 references. The RTL can be REG, offsettable MEM, integer constant, or
15296 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15297 split and "num" is its length. lo_half and hi_half are output arrays
15298 that parallel "operands". */
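/* For example, splitting a DImode value yields its two SImode halves:
   an offsettable MEM is split with adjust_address at byte offsets 0 and
   4, anything else with simplify_gen_subreg at the same offsets.  */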
15300 void
15301 split_double_mode (enum machine_mode mode, rtx operands[],
15302 int num, rtx lo_half[], rtx hi_half[])
15304 enum machine_mode half_mode;
15305 unsigned int byte;
15307 switch (mode)
15309 case TImode:
15310 half_mode = DImode;
15311 break;
15312 case DImode:
15313 half_mode = SImode;
15314 break;
15315 default:
15316 gcc_unreachable ();
15319 byte = GET_MODE_SIZE (half_mode);
15321 while (num--)
15323 rtx op = operands[num];
15325 /* simplify_subreg refuses to split volatile memory addresses,
15326 but we still have to handle them. */
15327 if (MEM_P (op))
15329 lo_half[num] = adjust_address (op, half_mode, 0);
15330 hi_half[num] = adjust_address (op, half_mode, byte);
15332 else
15334 lo_half[num] = simplify_gen_subreg (half_mode, op,
15335 GET_MODE (op) == VOIDmode
15336 ? mode : GET_MODE (op), 0);
15337 hi_half[num] = simplify_gen_subreg (half_mode, op,
15338 GET_MODE (op) == VOIDmode
15339 ? mode : GET_MODE (op), byte);
15344 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15345 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15346 is the expression of the binary operation. The output may either be
15347 emitted here, or returned to the caller, like all output_* functions.
15349 There is no guarantee that the operands are the same mode, as they
15350 might be within FLOAT or FLOAT_EXTEND expressions. */
15352 #ifndef SYSV386_COMPAT
15353 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15354 wants to fix the assemblers because that causes incompatibility
15355 with gcc. No-one wants to fix gcc because that causes
15356 incompatibility with assemblers... You can use the option of
15357 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15358 #define SYSV386_COMPAT 1
15359 #endif
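/* For SSE operands the output is, e.g., "addss"/"addsd" (or the
   three-operand "vaddss"/"vaddsd" forms when AVX is enabled); for x87
   operands it is an fadd/fsub/fmul/fdiv variant, with the popping and
   reversed forms chosen below according to which stack registers die.  */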
15361 const char *
15362 output_387_binary_op (rtx insn, rtx *operands)
15364 static char buf[40];
15365 const char *p;
15366 const char *ssep;
15367 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15369 #ifdef ENABLE_CHECKING
15370 /* Even if we do not want to check the inputs, this documents the input
15371 constraints, which helps in understanding the following code. */
15372 if (STACK_REG_P (operands[0])
15373 && ((REG_P (operands[1])
15374 && REGNO (operands[0]) == REGNO (operands[1])
15375 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15376 || (REG_P (operands[2])
15377 && REGNO (operands[0]) == REGNO (operands[2])
15378 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15379 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15380 ; /* ok */
15381 else
15382 gcc_assert (is_sse);
15383 #endif
15385 switch (GET_CODE (operands[3]))
15387 case PLUS:
15388 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15389 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15390 p = "fiadd";
15391 else
15392 p = "fadd";
15393 ssep = "vadd";
15394 break;
15396 case MINUS:
15397 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15398 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15399 p = "fisub";
15400 else
15401 p = "fsub";
15402 ssep = "vsub";
15403 break;
15405 case MULT:
15406 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15407 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15408 p = "fimul";
15409 else
15410 p = "fmul";
15411 ssep = "vmul";
15412 break;
15414 case DIV:
15415 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15416 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15417 p = "fidiv";
15418 else
15419 p = "fdiv";
15420 ssep = "vdiv";
15421 break;
15423 default:
15424 gcc_unreachable ();
15427 if (is_sse)
15429 if (TARGET_AVX)
15431 strcpy (buf, ssep);
15432 if (GET_MODE (operands[0]) == SFmode)
15433 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15434 else
15435 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15437 else
15439 strcpy (buf, ssep + 1);
15440 if (GET_MODE (operands[0]) == SFmode)
15441 strcat (buf, "ss\t{%2, %0|%0, %2}");
15442 else
15443 strcat (buf, "sd\t{%2, %0|%0, %2}");
15445 return buf;
15447 strcpy (buf, p);
15449 switch (GET_CODE (operands[3]))
15451 case MULT:
15452 case PLUS:
15453 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15455 rtx temp = operands[2];
15456 operands[2] = operands[1];
15457 operands[1] = temp;
15460 /* We know operands[0] == operands[1]. */
15462 if (MEM_P (operands[2]))
15464 p = "%Z2\t%2";
15465 break;
15468 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15470 if (STACK_TOP_P (operands[0]))
15471 /* How is it that we are storing to a dead operand[2]?
15472 Well, presumably operands[1] is dead too. We can't
15473 store the result to st(0) as st(0) gets popped on this
15474 instruction. Instead store to operands[2] (which I
15475 think has to be st(1)). st(1) will be popped later.
15476 gcc <= 2.8.1 didn't have this check and generated
15477 assembly code that the Unixware assembler rejected. */
15478 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15479 else
15480 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15481 break;
15484 if (STACK_TOP_P (operands[0]))
15485 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15486 else
15487 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15488 break;
15490 case MINUS:
15491 case DIV:
15492 if (MEM_P (operands[1]))
15494 p = "r%Z1\t%1";
15495 break;
15498 if (MEM_P (operands[2]))
15500 p = "%Z2\t%2";
15501 break;
15504 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15506 #if SYSV386_COMPAT
15507 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15508 derived assemblers, confusingly reverse the direction of
15509 the operation for fsub{r} and fdiv{r} when the
15510 destination register is not st(0). The Intel assembler
15511 doesn't have this brain damage. Read !SYSV386_COMPAT to
15512 figure out what the hardware really does. */
15513 if (STACK_TOP_P (operands[0]))
15514 p = "{p\t%0, %2|rp\t%2, %0}";
15515 else
15516 p = "{rp\t%2, %0|p\t%0, %2}";
15517 #else
15518 if (STACK_TOP_P (operands[0]))
15519 /* As above for fmul/fadd, we can't store to st(0). */
15520 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15521 else
15522 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15523 #endif
15524 break;
15527 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15529 #if SYSV386_COMPAT
15530 if (STACK_TOP_P (operands[0]))
15531 p = "{rp\t%0, %1|p\t%1, %0}";
15532 else
15533 p = "{p\t%1, %0|rp\t%0, %1}";
15534 #else
15535 if (STACK_TOP_P (operands[0]))
15536 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15537 else
15538 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15539 #endif
15540 break;
15543 if (STACK_TOP_P (operands[0]))
15545 if (STACK_TOP_P (operands[1]))
15546 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15547 else
15548 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15549 break;
15551 else if (STACK_TOP_P (operands[1]))
15553 #if SYSV386_COMPAT
15554 p = "{\t%1, %0|r\t%0, %1}";
15555 #else
15556 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15557 #endif
15559 else
15561 #if SYSV386_COMPAT
15562 p = "{r\t%2, %0|\t%0, %2}";
15563 #else
15564 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15565 #endif
15567 break;
15569 default:
15570 gcc_unreachable ();
15573 strcat (buf, p);
15574 return buf;
15577 /* Check if a 256bit AVX register is referenced inside of EXP. */
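/* Used both directly and as a for_each_rtx callback when scanning insn
   patterns, call arguments and return values for 256-bit AVX uses.  */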
15579 static int
15580 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15582 rtx exp = *pexp;
15584 if (GET_CODE (exp) == SUBREG)
15585 exp = SUBREG_REG (exp);
15587 if (REG_P (exp)
15588 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15589 return 1;
15591 return 0;
15594 /* Return needed mode for entity in optimize_mode_switching pass. */
15596 static int
15597 ix86_avx_u128_mode_needed (rtx insn)
15599 if (CALL_P (insn))
15601 rtx link;
15603 /* Needed mode is set to AVX_U128_CLEAN if there are
15604 no 256bit modes used in function arguments. */
15605 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15606 link;
15607 link = XEXP (link, 1))
15609 if (GET_CODE (XEXP (link, 0)) == USE)
15611 rtx arg = XEXP (XEXP (link, 0), 0);
15613 if (ix86_check_avx256_register (&arg, NULL))
15614 return AVX_U128_ANY;
15618 return AVX_U128_CLEAN;
15621 /* Require DIRTY mode if a 256bit AVX register is referenced. The
15622 hardware changes state only when a 256bit register is written to, but
15623 we need to prevent the compiler from moving the optimal insertion
15624 point above an eventual read from a 256bit register. */
15625 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15626 return AVX_U128_DIRTY;
15628 return AVX_U128_ANY;
15631 /* Return mode that i387 must be switched into
15632 prior to the execution of insn. */
15634 static int
15635 ix86_i387_mode_needed (int entity, rtx insn)
15637 enum attr_i387_cw mode;
15639 /* The mode UNINITIALIZED is used to store the control word after a
15640 function call or ASM pattern. The mode ANY specifies that the function
15641 has no requirements on the control word and makes no changes in the
15642 bits we are interested in. */
15644 if (CALL_P (insn)
15645 || (NONJUMP_INSN_P (insn)
15646 && (asm_noperands (PATTERN (insn)) >= 0
15647 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15648 return I387_CW_UNINITIALIZED;
15650 if (recog_memoized (insn) < 0)
15651 return I387_CW_ANY;
15653 mode = get_attr_i387_cw (insn);
15655 switch (entity)
15657 case I387_TRUNC:
15658 if (mode == I387_CW_TRUNC)
15659 return mode;
15660 break;
15662 case I387_FLOOR:
15663 if (mode == I387_CW_FLOOR)
15664 return mode;
15665 break;
15667 case I387_CEIL:
15668 if (mode == I387_CW_CEIL)
15669 return mode;
15670 break;
15672 case I387_MASK_PM:
15673 if (mode == I387_CW_MASK_PM)
15674 return mode;
15675 break;
15677 default:
15678 gcc_unreachable ();
15681 return I387_CW_ANY;
15684 /* Return mode that entity must be switched into
15685 prior to the execution of insn. */
15688 ix86_mode_needed (int entity, rtx insn)
15690 switch (entity)
15692 case AVX_U128:
15693 return ix86_avx_u128_mode_needed (insn);
15694 case I387_TRUNC:
15695 case I387_FLOOR:
15696 case I387_CEIL:
15697 case I387_MASK_PM:
15698 return ix86_i387_mode_needed (entity, insn);
15699 default:
15700 gcc_unreachable ();
15702 return 0;
15705 /* Check if a 256bit AVX register is referenced in stores. */
15707 static void
15708 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15710 if (ix86_check_avx256_register (&dest, NULL))
15712 bool *used = (bool *) data;
15713 *used = true;
15717 /* Calculate mode of upper 128bit AVX registers after the insn. */
15719 static int
15720 ix86_avx_u128_mode_after (int mode, rtx insn)
15722 rtx pat = PATTERN (insn);
15724 if (vzeroupper_operation (pat, VOIDmode)
15725 || vzeroall_operation (pat, VOIDmode))
15726 return AVX_U128_CLEAN;
15728 /* We know that the state is clean after a CALL insn if no 256bit
15729 register is used for the function return value. */
15730 if (CALL_P (insn))
15732 bool avx_reg256_found = false;
15733 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15734 if (!avx_reg256_found)
15735 return AVX_U128_CLEAN;
15738 /* Otherwise, return the current mode. Remember that if the insn
15739 references AVX 256bit registers, the mode was already changed
15740 to DIRTY from MODE_NEEDED. */
15741 return mode;
15744 /* Return the mode that an insn results in. */
15747 ix86_mode_after (int entity, int mode, rtx insn)
15749 switch (entity)
15751 case AVX_U128:
15752 return ix86_avx_u128_mode_after (mode, insn);
15753 case I387_TRUNC:
15754 case I387_FLOOR:
15755 case I387_CEIL:
15756 case I387_MASK_PM:
15757 return mode;
15758 default:
15759 gcc_unreachable ();
15763 static int
15764 ix86_avx_u128_mode_entry (void)
15766 tree arg;
15768 /* Entry mode is set to AVX_U128_DIRTY if there are
15769 256bit modes used in function arguments. */
15770 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15771 arg = TREE_CHAIN (arg))
15773 rtx incoming = DECL_INCOMING_RTL (arg);
15775 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15776 return AVX_U128_DIRTY;
15779 return AVX_U128_CLEAN;
15782 /* Return a mode that ENTITY is assumed to be
15783 switched to at function entry. */
15786 ix86_mode_entry (int entity)
15788 switch (entity)
15790 case AVX_U128:
15791 return ix86_avx_u128_mode_entry ();
15792 case I387_TRUNC:
15793 case I387_FLOOR:
15794 case I387_CEIL:
15795 case I387_MASK_PM:
15796 return I387_CW_ANY;
15797 default:
15798 gcc_unreachable ();
15802 static int
15803 ix86_avx_u128_mode_exit (void)
15805 rtx reg = crtl->return_rtx;
15807 /* Exit mode is set to AVX_U128_DIRTY if there are
15808 256bit modes used in the function return register. */
15809 if (reg && ix86_check_avx256_register (&reg, NULL))
15810 return AVX_U128_DIRTY;
15812 return AVX_U128_CLEAN;
15815 /* Return a mode that ENTITY is assumed to be
15816 switched to at function exit. */
15819 ix86_mode_exit (int entity)
15821 switch (entity)
15823 case AVX_U128:
15824 return ix86_avx_u128_mode_exit ();
15825 case I387_TRUNC:
15826 case I387_FLOOR:
15827 case I387_CEIL:
15828 case I387_MASK_PM:
15829 return I387_CW_ANY;
15830 default:
15831 gcc_unreachable ();
15835 /* Output code to initialize the control word copies used by trunc?f?i
15836 and rounding patterns. The current control word is saved, and a copy
15837 modified for MODE is stored in the stack slot for MODE. */
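/* The x87 control word keeps the rounding-control field in bits 10-11
   (mask 0x0c00): 00 = round to nearest, 01 = round down, 10 = round up,
   11 = truncate.  The precision-exception mask is bit 5 (0x0020).  These
   are the bits selected by the constants used below.  */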
15839 static void
15840 emit_i387_cw_initialization (int mode)
15842 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15843 rtx new_mode;
15845 enum ix86_stack_slot slot;
15847 rtx reg = gen_reg_rtx (HImode);
15849 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15850 emit_move_insn (reg, copy_rtx (stored_mode));
15852 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15853 || optimize_insn_for_size_p ())
15855 switch (mode)
15857 case I387_CW_TRUNC:
15858 /* round toward zero (truncate) */
15859 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15860 slot = SLOT_CW_TRUNC;
15861 break;
15863 case I387_CW_FLOOR:
15864 /* round down toward -oo */
15865 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15866 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15867 slot = SLOT_CW_FLOOR;
15868 break;
15870 case I387_CW_CEIL:
15871 /* round up toward +oo */
15872 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15873 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15874 slot = SLOT_CW_CEIL;
15875 break;
15877 case I387_CW_MASK_PM:
15878 /* mask precision exception for nearbyint() */
15879 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15880 slot = SLOT_CW_MASK_PM;
15881 break;
15883 default:
15884 gcc_unreachable ();
15887 else
15889 switch (mode)
15891 case I387_CW_TRUNC:
15892 /* round toward zero (truncate) */
15893 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15894 slot = SLOT_CW_TRUNC;
15895 break;
15897 case I387_CW_FLOOR:
15898 /* round down toward -oo */
15899 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15900 slot = SLOT_CW_FLOOR;
15901 break;
15903 case I387_CW_CEIL:
15904 /* round up toward +oo */
15905 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15906 slot = SLOT_CW_CEIL;
15907 break;
15909 case I387_CW_MASK_PM:
15910 /* mask precision exception for nearbyint() */
15911 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15912 slot = SLOT_CW_MASK_PM;
15913 break;
15915 default:
15916 gcc_unreachable ();
15920 gcc_assert (slot < MAX_386_STACK_LOCALS);
15922 new_mode = assign_386_stack_local (HImode, slot);
15923 emit_move_insn (new_mode, reg);
15926 /* Emit vzeroupper. */
15928 void
15929 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15931 int i;
15933 /* Cancel automatic vzeroupper insertion if there are
15934 live call-saved SSE registers at the insertion point. */
15936 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15937 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15938 return;
15940 if (TARGET_64BIT)
15941 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15942 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15943 return;
15945 emit_insn (gen_avx_vzeroupper ());
15948 /* Generate one or more insns to set ENTITY to MODE. */
15950 void
15951 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15953 switch (entity)
15955 case AVX_U128:
15956 if (mode == AVX_U128_CLEAN)
15957 ix86_avx_emit_vzeroupper (regs_live);
15958 break;
15959 case I387_TRUNC:
15960 case I387_FLOOR:
15961 case I387_CEIL:
15962 case I387_MASK_PM:
15963 if (mode != I387_CW_ANY
15964 && mode != I387_CW_UNINITIALIZED)
15965 emit_i387_cw_initialization (mode);
15966 break;
15967 default:
15968 gcc_unreachable ();
15972 /* Output code for INSN to convert a float to a signed int. OPERANDS
15973 are the insn operands. The output may be [HSD]Imode and the input
15974 operand may be [SDX]Fmode. */
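/* Unless a fisttp (SSE3) instruction can be used, the x87 rounding mode
   must be switched to truncation around the store: the first fldcw
   emitted below is assumed to load the truncating control word prepared
   by emit_i387_cw_initialization, and the second fldcw restores the
   saved control word.  */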
15976 const char *
15977 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15979 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15980 int dimode_p = GET_MODE (operands[0]) == DImode;
15981 int round_mode = get_attr_i387_cw (insn);
15983 /* Jump through a hoop or two for DImode, since the hardware has no
15984 non-popping instruction. We used to do this a different way, but
15985 that was somewhat fragile and broke with post-reload splitters. */
15986 if ((dimode_p || fisttp) && !stack_top_dies)
15987 output_asm_insn ("fld\t%y1", operands);
15989 gcc_assert (STACK_TOP_P (operands[1]));
15990 gcc_assert (MEM_P (operands[0]));
15991 gcc_assert (GET_MODE (operands[1]) != TFmode);
15993 if (fisttp)
15994 output_asm_insn ("fisttp%Z0\t%0", operands);
15995 else
15997 if (round_mode != I387_CW_ANY)
15998 output_asm_insn ("fldcw\t%3", operands);
15999 if (stack_top_dies || dimode_p)
16000 output_asm_insn ("fistp%Z0\t%0", operands);
16001 else
16002 output_asm_insn ("fist%Z0\t%0", operands);
16003 if (round_mode != I387_CW_ANY)
16004 output_asm_insn ("fldcw\t%2", operands);
16007 return "";
16010 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16011 have the values zero or one, indicates the ffreep insn's operand
16012 from the OPERANDS array. */
16014 static const char *
16015 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16017 if (TARGET_USE_FFREEP)
16018 #ifdef HAVE_AS_IX86_FFREEP
16019 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16020 #else
16022 static char retval[32];
16023 int regno = REGNO (operands[opno]);
16025 gcc_assert (STACK_REGNO_P (regno));
16027 regno -= FIRST_STACK_REG;
16029 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16030 return retval;
16032 #endif
16034 return opno ? "fstp\t%y1" : "fstp\t%y0";
16038 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16039 should be used. UNORDERED_P is true when fucom should be used. */
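/* The SSE path below emits [v]ucomiss/[v]comiss or [v]ucomisd/[v]comisd
   directly; the x87 path picks among ftst, fcom/fucom/ficom followed by
   fnstsw, and the fcomi/fucomi variants, depending on the operands and
   on whether the stack top dies.  */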
16041 const char *
16042 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16044 int stack_top_dies;
16045 rtx cmp_op0, cmp_op1;
16046 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16048 if (eflags_p)
16050 cmp_op0 = operands[0];
16051 cmp_op1 = operands[1];
16053 else
16055 cmp_op0 = operands[1];
16056 cmp_op1 = operands[2];
16059 if (is_sse)
16061 if (GET_MODE (operands[0]) == SFmode)
16062 if (unordered_p)
16063 return "%vucomiss\t{%1, %0|%0, %1}";
16064 else
16065 return "%vcomiss\t{%1, %0|%0, %1}";
16066 else
16067 if (unordered_p)
16068 return "%vucomisd\t{%1, %0|%0, %1}";
16069 else
16070 return "%vcomisd\t{%1, %0|%0, %1}";
16073 gcc_assert (STACK_TOP_P (cmp_op0));
16075 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16077 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16079 if (stack_top_dies)
16081 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16082 return output_387_ffreep (operands, 1);
16084 else
16085 return "ftst\n\tfnstsw\t%0";
16088 if (STACK_REG_P (cmp_op1)
16089 && stack_top_dies
16090 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16091 && REGNO (cmp_op1) != FIRST_STACK_REG)
16093 /* If the top of the 387 stack dies, and the other operand
16094 is also a stack register that dies, then this must be a
16095 `fcompp' float compare. */
16097 if (eflags_p)
16099 /* There is no double popping fcomi variant. Fortunately,
16100 eflags is immune from the fstp's cc clobbering. */
16101 if (unordered_p)
16102 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16103 else
16104 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16105 return output_387_ffreep (operands, 0);
16107 else
16109 if (unordered_p)
16110 return "fucompp\n\tfnstsw\t%0";
16111 else
16112 return "fcompp\n\tfnstsw\t%0";
16115 else
16117 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16119 static const char * const alt[16] =
16121 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16122 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16123 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16124 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16126 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16127 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16128 NULL,
16129 NULL,
16131 "fcomi\t{%y1, %0|%0, %y1}",
16132 "fcomip\t{%y1, %0|%0, %y1}",
16133 "fucomi\t{%y1, %0|%0, %y1}",
16134 "fucomip\t{%y1, %0|%0, %y1}",
16136 NULL,
16137 NULL,
16138 NULL,
16139 NULL
16142 int mask;
16143 const char *ret;
16145 mask = eflags_p << 3;
16146 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16147 mask |= unordered_p << 1;
16148 mask |= stack_top_dies;
16150 gcc_assert (mask < 16);
16151 ret = alt[mask];
16152 gcc_assert (ret);
16154 return ret;
16158 void
16159 ix86_output_addr_vec_elt (FILE *file, int value)
16161 const char *directive = ASM_LONG;
16163 #ifdef ASM_QUAD
16164 if (TARGET_LP64)
16165 directive = ASM_QUAD;
16166 #else
16167 gcc_assert (!TARGET_64BIT);
16168 #endif
16170 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16173 void
16174 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16176 const char *directive = ASM_LONG;
16178 #ifdef ASM_QUAD
16179 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16180 directive = ASM_QUAD;
16181 #else
16182 gcc_assert (!TARGET_64BIT);
16183 #endif
16184 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16185 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16186 fprintf (file, "%s%s%d-%s%d\n",
16187 directive, LPREFIX, value, LPREFIX, rel);
16188 else if (HAVE_AS_GOTOFF_IN_DATA)
16189 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16190 #if TARGET_MACHO
16191 else if (TARGET_MACHO)
16193 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16194 machopic_output_function_base_name (file);
16195 putc ('\n', file);
16197 #endif
16198 else
16199 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16200 GOT_SYMBOL_NAME, LPREFIX, value);
16203 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16204 for the target. */
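/* The xor form clobbers the flags register, so when it may be chosen the
   SET is wrapped in a PARALLEL with a (clobber (reg:CC FLAGS_REG)), as
   done below.  */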
16206 void
16207 ix86_expand_clear (rtx dest)
16209 rtx tmp;
16211 /* We play register width games, which are only valid after reload. */
16212 gcc_assert (reload_completed);
16214 /* Avoid HImode and its attendant prefix byte. */
16215 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16216 dest = gen_rtx_REG (SImode, REGNO (dest));
16217 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16219 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16220 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16222 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16223 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16226 emit_insn (tmp);
16229 /* X is an unchanging MEM. If it is a constant pool reference, return
16230 the constant pool rtx, else NULL. */
16233 maybe_get_pool_constant (rtx x)
16235 x = ix86_delegitimize_address (XEXP (x, 0));
16237 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16238 return get_pool_constant (x);
16240 return NULL_RTX;
16243 void
16244 ix86_expand_move (enum machine_mode mode, rtx operands[])
16246 rtx op0, op1;
16247 enum tls_model model;
16249 op0 = operands[0];
16250 op1 = operands[1];
16252 if (GET_CODE (op1) == SYMBOL_REF)
16254 rtx tmp;
16256 model = SYMBOL_REF_TLS_MODEL (op1);
16257 if (model)
16259 op1 = legitimize_tls_address (op1, model, true);
16260 op1 = force_operand (op1, op0);
16261 if (op1 == op0)
16262 return;
16263 op1 = convert_to_mode (mode, op1, 1);
16265 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16266 op1 = tmp;
16268 else if (GET_CODE (op1) == CONST
16269 && GET_CODE (XEXP (op1, 0)) == PLUS
16270 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16272 rtx addend = XEXP (XEXP (op1, 0), 1);
16273 rtx symbol = XEXP (XEXP (op1, 0), 0);
16274 rtx tmp;
16276 model = SYMBOL_REF_TLS_MODEL (symbol);
16277 if (model)
16278 tmp = legitimize_tls_address (symbol, model, true);
16279 else
16280 tmp = legitimize_pe_coff_symbol (symbol, true);
16282 if (tmp)
16284 tmp = force_operand (tmp, NULL);
16285 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16286 op0, 1, OPTAB_DIRECT);
16287 if (tmp == op0)
16288 return;
16289 op1 = convert_to_mode (mode, tmp, 1);
16293 if ((flag_pic || MACHOPIC_INDIRECT)
16294 && symbolic_operand (op1, mode))
16296 if (TARGET_MACHO && !TARGET_64BIT)
16298 #if TARGET_MACHO
16299 /* dynamic-no-pic */
16300 if (MACHOPIC_INDIRECT)
16302 rtx temp = ((reload_in_progress
16303 || ((op0 && REG_P (op0))
16304 && mode == Pmode))
16305 ? op0 : gen_reg_rtx (Pmode));
16306 op1 = machopic_indirect_data_reference (op1, temp);
16307 if (MACHOPIC_PURE)
16308 op1 = machopic_legitimize_pic_address (op1, mode,
16309 temp == op1 ? 0 : temp);
16311 if (op0 != op1 && GET_CODE (op0) != MEM)
16313 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16314 emit_insn (insn);
16315 return;
16317 if (GET_CODE (op0) == MEM)
16318 op1 = force_reg (Pmode, op1);
16319 else
16321 rtx temp = op0;
16322 if (GET_CODE (temp) != REG)
16323 temp = gen_reg_rtx (Pmode);
16324 temp = legitimize_pic_address (op1, temp);
16325 if (temp == op0)
16326 return;
16327 op1 = temp;
16329 /* dynamic-no-pic */
16330 #endif
16332 else
16334 if (MEM_P (op0))
16335 op1 = force_reg (mode, op1);
16336 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16338 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16339 op1 = legitimize_pic_address (op1, reg);
16340 if (op0 == op1)
16341 return;
16342 op1 = convert_to_mode (mode, op1, 1);
16346 else
16348 if (MEM_P (op0)
16349 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16350 || !push_operand (op0, mode))
16351 && MEM_P (op1))
16352 op1 = force_reg (mode, op1);
16354 if (push_operand (op0, mode)
16355 && ! general_no_elim_operand (op1, mode))
16356 op1 = copy_to_mode_reg (mode, op1);
16358 /* Force large constants in 64bit compilation into a register
16359 to get them CSEed. */
16360 if (can_create_pseudo_p ()
16361 && (mode == DImode) && TARGET_64BIT
16362 && immediate_operand (op1, mode)
16363 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16364 && !register_operand (op0, mode)
16365 && optimize)
16366 op1 = copy_to_mode_reg (mode, op1);
16368 if (can_create_pseudo_p ()
16369 && FLOAT_MODE_P (mode)
16370 && GET_CODE (op1) == CONST_DOUBLE)
16372 /* If we are loading a floating point constant to a register,
16373 force the value to memory now, since we'll get better code
16374 out of the back end. */
16376 op1 = validize_mem (force_const_mem (mode, op1));
16377 if (!register_operand (op0, mode))
16379 rtx temp = gen_reg_rtx (mode);
16380 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16381 emit_move_insn (op0, temp);
16382 return;
16387 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16390 void
16391 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16393 rtx op0 = operands[0], op1 = operands[1];
16394 unsigned int align = GET_MODE_ALIGNMENT (mode);
16396 /* Force constants other than zero into memory. We do not know how
16397 the instructions used to build constants modify the upper 64 bits
16398 of the register; once we have that information we may be able
16399 to handle some of them more efficiently. */
16400 if (can_create_pseudo_p ()
16401 && register_operand (op0, mode)
16402 && (CONSTANT_P (op1)
16403 || (GET_CODE (op1) == SUBREG
16404 && CONSTANT_P (SUBREG_REG (op1))))
16405 && !standard_sse_constant_p (op1))
16406 op1 = validize_mem (force_const_mem (mode, op1));
16408 /* We need to check memory alignment for SSE modes since an attribute
16409 can make operands unaligned. */
16410 if (can_create_pseudo_p ()
16411 && SSE_REG_MODE_P (mode)
16412 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16413 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16415 rtx tmp[2];
16417 /* ix86_expand_vector_move_misalign() does not like constants ... */
16418 if (CONSTANT_P (op1)
16419 || (GET_CODE (op1) == SUBREG
16420 && CONSTANT_P (SUBREG_REG (op1))))
16421 op1 = validize_mem (force_const_mem (mode, op1));
16423 /* ... nor both arguments in memory. */
16424 if (!register_operand (op0, mode)
16425 && !register_operand (op1, mode))
16426 op1 = force_reg (mode, op1);
16428 tmp[0] = op0; tmp[1] = op1;
16429 ix86_expand_vector_move_misalign (mode, tmp);
16430 return;
16433 /* Make operand1 a register if it isn't already. */
16434 if (can_create_pseudo_p ()
16435 && !register_operand (op0, mode)
16436 && !register_operand (op1, mode))
16438 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16439 return;
16442 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16445 /* Split 32-byte AVX unaligned load and store if needed. */
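/* When the corresponding split is enabled, an unaligned 256-bit load is
   performed as two 16-byte loads combined with a VEC_CONCAT, and an
   unaligned 256-bit store as two vextractf128 stores; otherwise a single
   unaligned 256-bit move insn is used.  */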
16447 static void
16448 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16450 rtx m;
16451 rtx (*extract) (rtx, rtx, rtx);
16452 rtx (*load_unaligned) (rtx, rtx);
16453 rtx (*store_unaligned) (rtx, rtx);
16454 enum machine_mode mode;
16456 switch (GET_MODE (op0))
16458 default:
16459 gcc_unreachable ();
16460 case V32QImode:
16461 extract = gen_avx_vextractf128v32qi;
16462 load_unaligned = gen_avx_loaddqu256;
16463 store_unaligned = gen_avx_storedqu256;
16464 mode = V16QImode;
16465 break;
16466 case V8SFmode:
16467 extract = gen_avx_vextractf128v8sf;
16468 load_unaligned = gen_avx_loadups256;
16469 store_unaligned = gen_avx_storeups256;
16470 mode = V4SFmode;
16471 break;
16472 case V4DFmode:
16473 extract = gen_avx_vextractf128v4df;
16474 load_unaligned = gen_avx_loadupd256;
16475 store_unaligned = gen_avx_storeupd256;
16476 mode = V2DFmode;
16477 break;
16480 if (MEM_P (op1))
16482 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16484 rtx r = gen_reg_rtx (mode);
16485 m = adjust_address (op1, mode, 0);
16486 emit_move_insn (r, m);
16487 m = adjust_address (op1, mode, 16);
16488 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16489 emit_move_insn (op0, r);
16491 else
16492 emit_insn (load_unaligned (op0, op1));
16494 else if (MEM_P (op0))
16496 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16498 m = adjust_address (op0, mode, 0);
16499 emit_insn (extract (m, op1, const0_rtx));
16500 m = adjust_address (op0, mode, 16);
16501 emit_insn (extract (m, op1, const1_rtx));
16503 else
16504 emit_insn (store_unaligned (op0, op1));
16506 else
16507 gcc_unreachable ();
16510 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16511 straight to ix86_expand_vector_move. */
16512 /* Code generation for scalar reg-reg moves of single and double precision data:
16513 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16514 movaps reg, reg
16515 else
16516 movss reg, reg
16517 if (x86_sse_partial_reg_dependency == true)
16518 movapd reg, reg
16519 else
16520 movsd reg, reg
16522 Code generation for scalar loads of double precision data:
16523 if (x86_sse_split_regs == true)
16524 movlpd mem, reg (gas syntax)
16525 else
16526 movsd mem, reg
16528 Code generation for unaligned packed loads of single precision data
16529 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16530 if (x86_sse_unaligned_move_optimal)
16531 movups mem, reg
16533 if (x86_sse_partial_reg_dependency == true)
16535 xorps reg, reg
16536 movlps mem, reg
16537 movhps mem+8, reg
16539 else
16541 movlps mem, reg
16542 movhps mem+8, reg
16545 Code generation for unaligned packed loads of double precision data
16546 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16547 if (x86_sse_unaligned_move_optimal)
16548 movupd mem, reg
16550 if (x86_sse_split_regs == true)
16552 movlpd mem, reg
16553 movhpd mem+8, reg
16555 else
16557 movsd mem, reg
16558 movhpd mem+8, reg
16562 void
16563 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16565 rtx op0, op1, m;
16567 op0 = operands[0];
16568 op1 = operands[1];
16570 if (TARGET_AVX
16571 && GET_MODE_SIZE (mode) == 32)
16573 switch (GET_MODE_CLASS (mode))
16575 case MODE_VECTOR_INT:
16576 case MODE_INT:
16577 op0 = gen_lowpart (V32QImode, op0);
16578 op1 = gen_lowpart (V32QImode, op1);
16579 /* FALLTHRU */
16581 case MODE_VECTOR_FLOAT:
16582 ix86_avx256_split_vector_move_misalign (op0, op1);
16583 break;
16585 default:
16586 gcc_unreachable ();
16589 return;
16592 if (MEM_P (op1))
16594 /* ??? If we have typed data, then it would appear that using
16595 movdqu is the only way to get unaligned data loaded with
16596 integer type. */
16597 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16599 op0 = gen_lowpart (V16QImode, op0);
16600 op1 = gen_lowpart (V16QImode, op1);
16601 /* We will eventually emit movups based on insn attributes. */
16602 emit_insn (gen_sse2_loaddqu (op0, op1));
16604 else if (TARGET_SSE2 && mode == V2DFmode)
16606 rtx zero;
16608 if (TARGET_AVX
16609 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16610 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16611 || optimize_insn_for_size_p ())
16613 /* We will eventually emit movups based on insn attributes. */
16614 emit_insn (gen_sse2_loadupd (op0, op1));
16615 return;
16618 /* When SSE registers are split into halves, we can avoid
16619 writing to the top half twice. */
16620 if (TARGET_SSE_SPLIT_REGS)
16622 emit_clobber (op0);
16623 zero = op0;
16625 else
16627 /* ??? Not sure about the best option for the Intel chips.
16628 The following would seem to satisfy; the register is
16629 entirely cleared, breaking the dependency chain. We
16630 then store to the upper half, with a dependency depth
16631 of one. A rumor has it that Intel recommends two movsd
16632 followed by an unpacklpd, but this is unconfirmed. And
16633 given that the dependency depth of the unpacklpd would
16634 still be one, I'm not sure why this would be better. */
16635 zero = CONST0_RTX (V2DFmode);
16638 m = adjust_address (op1, DFmode, 0);
16639 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16640 m = adjust_address (op1, DFmode, 8);
16641 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16643 else
16645 if (TARGET_AVX
16646 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16647 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16648 || optimize_insn_for_size_p ())
16650 op0 = gen_lowpart (V4SFmode, op0);
16651 op1 = gen_lowpart (V4SFmode, op1);
16652 emit_insn (gen_sse_loadups (op0, op1));
16653 return;
16656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16657 emit_move_insn (op0, CONST0_RTX (mode));
16658 else
16659 emit_clobber (op0);
16661 if (mode != V4SFmode)
16662 op0 = gen_lowpart (V4SFmode, op0);
16664 m = adjust_address (op1, V2SFmode, 0);
16665 emit_insn (gen_sse_loadlps (op0, op0, m));
16666 m = adjust_address (op1, V2SFmode, 8);
16667 emit_insn (gen_sse_loadhps (op0, op0, m));
16670 else if (MEM_P (op0))
16672 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16674 op0 = gen_lowpart (V16QImode, op0);
16675 op1 = gen_lowpart (V16QImode, op1);
16676 /* We will eventually emit movups based on insn attributes. */
16677 emit_insn (gen_sse2_storedqu (op0, op1));
16679 else if (TARGET_SSE2 && mode == V2DFmode)
16681 if (TARGET_AVX
16682 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16683 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16684 || optimize_insn_for_size_p ())
16685 /* We will eventually emit movups based on insn attributes. */
16686 emit_insn (gen_sse2_storeupd (op0, op1));
16687 else
16689 m = adjust_address (op0, DFmode, 0);
16690 emit_insn (gen_sse2_storelpd (m, op1));
16691 m = adjust_address (op0, DFmode, 8);
16692 emit_insn (gen_sse2_storehpd (m, op1));
16695 else
16697 if (mode != V4SFmode)
16698 op1 = gen_lowpart (V4SFmode, op1);
16700 if (TARGET_AVX
16701 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16702 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16703 || optimize_insn_for_size_p ())
16705 op0 = gen_lowpart (V4SFmode, op0);
16706 emit_insn (gen_sse_storeups (op0, op1));
16708 else
16710 m = adjust_address (op0, V2SFmode, 0);
16711 emit_insn (gen_sse_storelps (m, op1));
16712 m = adjust_address (op0, V2SFmode, 8);
16713 emit_insn (gen_sse_storehps (m, op1));
16717 else
16718 gcc_unreachable ();
16721 /* Expand a push in MODE. This is some mode for which we do not support
16722 proper push instructions, at least from the registers that we expect
16723 the value to live in. */
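/* In effect this expands to "sp -= GET_MODE_SIZE (mode); *sp = x": an
   explicit stack pointer adjustment followed by an ordinary move into
   the newly allocated stack slot.  */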
16725 void
16726 ix86_expand_push (enum machine_mode mode, rtx x)
16728 rtx tmp;
16730 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16731 GEN_INT (-GET_MODE_SIZE (mode)),
16732 stack_pointer_rtx, 1, OPTAB_DIRECT);
16733 if (tmp != stack_pointer_rtx)
16734 emit_move_insn (stack_pointer_rtx, tmp);
16736 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16738 /* When we push an operand onto the stack, it has to be aligned at
16739 least to the function argument boundary. However, since we don't
16740 have the argument type, we can't determine the actual argument
16741 boundary. */
16742 emit_move_insn (tmp, x);
16745 /* Helper function of ix86_fixup_binary_operands to canonicalize
16746 operand order. Returns true if the operands should be swapped. */
16748 static bool
16749 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16750 rtx operands[])
16752 rtx dst = operands[0];
16753 rtx src1 = operands[1];
16754 rtx src2 = operands[2];
16756 /* If the operation is not commutative, we can't do anything. */
16757 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16758 return false;
16760 /* Highest priority is that src1 should match dst. */
16761 if (rtx_equal_p (dst, src1))
16762 return false;
16763 if (rtx_equal_p (dst, src2))
16764 return true;
16766 /* Next highest priority is that immediate constants come second. */
16767 if (immediate_operand (src2, mode))
16768 return false;
16769 if (immediate_operand (src1, mode))
16770 return true;
16772 /* Lowest priority is that memory references should come second. */
16773 if (MEM_P (src2))
16774 return false;
16775 if (MEM_P (src1))
16776 return true;
16778 return false;
16782 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16783 destination to use for the operation. If different from the true
16784 destination in operands[0], a copy operation will be required. */
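/* For a commutative operation the sources are reordered so that the one
   matching the destination (or, failing that, a non-immediate,
   non-memory operand) becomes src1; operands the machine insn cannot
   accept directly are then forced into registers.  */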
16787 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16788 rtx operands[])
16790 rtx dst = operands[0];
16791 rtx src1 = operands[1];
16792 rtx src2 = operands[2];
16794 /* Canonicalize operand order. */
16795 if (ix86_swap_binary_operands_p (code, mode, operands))
16797 rtx temp;
16799 /* It is invalid to swap operands of different modes. */
16800 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16802 temp = src1;
16803 src1 = src2;
16804 src2 = temp;
16807 /* Both source operands cannot be in memory. */
16808 if (MEM_P (src1) && MEM_P (src2))
16810 /* Optimization: Only read from memory once. */
16811 if (rtx_equal_p (src1, src2))
16813 src2 = force_reg (mode, src2);
16814 src1 = src2;
16816 else
16817 src2 = force_reg (mode, src2);
16820 /* If the destination is memory, and we do not have matching source
16821 operands, do things in registers. */
16822 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16823 dst = gen_reg_rtx (mode);
16825 /* Source 1 cannot be a constant. */
16826 if (CONSTANT_P (src1))
16827 src1 = force_reg (mode, src1);
16829 /* Source 1 cannot be a non-matching memory. */
16830 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16831 src1 = force_reg (mode, src1);
16833 /* Improve address combine. */
16834 if (code == PLUS
16835 && GET_MODE_CLASS (mode) == MODE_INT
16836 && MEM_P (src2))
16837 src2 = force_reg (mode, src2);
16839 operands[1] = src1;
16840 operands[2] = src2;
16841 return dst;
16844 /* Similarly, but assume that the destination has already been
16845 set up properly. */
16847 void
16848 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16849 enum machine_mode mode, rtx operands[])
16851 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16852 gcc_assert (dst == operands[0]);
16855 /* Attempt to expand a binary operator. Make the expansion closer to the
16856 actual machine than just general_operand, which would allow 3 separate
16857 memory references (one output, two inputs) in a single insn. */
16859 void
16860 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16861 rtx operands[])
16863 rtx src1, src2, dst, op, clob;
16865 dst = ix86_fixup_binary_operands (code, mode, operands);
16866 src1 = operands[1];
16867 src2 = operands[2];
16869 /* Emit the instruction. */
16871 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16872 if (reload_in_progress)
16874 /* Reload doesn't know about the flags register, and doesn't know that
16875 it doesn't want to clobber it. We can only do this with PLUS. */
16876 gcc_assert (code == PLUS);
16877 emit_insn (op);
16879 else if (reload_completed
16880 && code == PLUS
16881 && !rtx_equal_p (dst, src1))
16883 /* This is going to be an LEA; avoid splitting it later. */
16884 emit_insn (op);
16886 else
16888 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16889 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16892 /* Fix up the destination if needed. */
16893 if (dst != operands[0])
16894 emit_move_insn (operands[0], dst);
16897 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16898 the given OPERANDS. */
16900 void
16901 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16902 rtx operands[])
16904 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16905 if (GET_CODE (operands[1]) == SUBREG)
16907 op1 = operands[1];
16908 op2 = operands[2];
16910 else if (GET_CODE (operands[2]) == SUBREG)
16912 op1 = operands[2];
16913 op2 = operands[1];
16915 /* Optimize (__m128i) d | (__m128i) e and similar code, when d and e
16916 are float vectors, into a float vector logical insn. In C/C++,
16917 without using intrinsics, there is no other way
16918 to express a vector logical operation on float vectors than
16919 to cast them temporarily to integer vectors. */
16920 if (op1
16921 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16922 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16923 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16924 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16925 && SUBREG_BYTE (op1) == 0
16926 && (GET_CODE (op2) == CONST_VECTOR
16927 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16928 && SUBREG_BYTE (op2) == 0))
16929 && can_create_pseudo_p ())
16931 rtx dst;
16932 switch (GET_MODE (SUBREG_REG (op1)))
16934 case V4SFmode:
16935 case V8SFmode:
16936 case V2DFmode:
16937 case V4DFmode:
16938 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16939 if (GET_CODE (op2) == CONST_VECTOR)
16941 op2 = gen_lowpart (GET_MODE (dst), op2);
16942 op2 = force_reg (GET_MODE (dst), op2);
16944 else
16946 op1 = operands[1];
16947 op2 = SUBREG_REG (operands[2]);
16948 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16949 op2 = force_reg (GET_MODE (dst), op2);
16951 op1 = SUBREG_REG (op1);
16952 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16953 op1 = force_reg (GET_MODE (dst), op1);
16954 emit_insn (gen_rtx_SET (VOIDmode, dst,
16955 gen_rtx_fmt_ee (code, GET_MODE (dst),
16956 op1, op2)));
16957 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16958 return;
16959 default:
16960 break;
16963 if (!nonimmediate_operand (operands[1], mode))
16964 operands[1] = force_reg (mode, operands[1]);
16965 if (!nonimmediate_operand (operands[2], mode))
16966 operands[2] = force_reg (mode, operands[2]);
16967 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16968 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16969 gen_rtx_fmt_ee (code, mode, operands[1],
16970 operands[2])));
16973 /* Return TRUE or FALSE depending on whether the binary operator meets the
16974 appropriate constraints. */
16976 bool
16977 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16978 rtx operands[3])
16980 rtx dst = operands[0];
16981 rtx src1 = operands[1];
16982 rtx src2 = operands[2];
16984 /* Both source operands cannot be in memory. */
16985 if (MEM_P (src1) && MEM_P (src2))
16986 return false;
16988 /* Canonicalize operand order for commutative operators. */
16989 if (ix86_swap_binary_operands_p (code, mode, operands))
16991 rtx temp = src1;
16992 src1 = src2;
16993 src2 = temp;
16996 /* If the destination is memory, we must have a matching source operand. */
16997 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16998 return false;
17000 /* Source 1 cannot be a constant. */
17001 if (CONSTANT_P (src1))
17002 return false;
17004 /* Source 1 cannot be a non-matching memory. */
17005 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17006 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17007 return (code == AND
17008 && (mode == HImode
17009 || mode == SImode
17010 || (TARGET_64BIT && mode == DImode))
17011 && satisfies_constraint_L (src2));
17013 return true;
17016 /* Attempt to expand a unary operator.  Make the expansion closer to the
17017 actual machine than just general_operand, which will allow 2 separate
17018 memory references (one output, one input) in a single insn.  */
17020 void
17021 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17022 rtx operands[])
17024 int matching_memory;
17025 rtx src, dst, op, clob;
17027 dst = operands[0];
17028 src = operands[1];
17030 /* If the destination is memory, and we do not have matching source
17031 operands, do things in registers. */
17032 matching_memory = 0;
17033 if (MEM_P (dst))
17035 if (rtx_equal_p (dst, src))
17036 matching_memory = 1;
17037 else
17038 dst = gen_reg_rtx (mode);
17041 /* When source operand is memory, destination must match. */
17042 if (MEM_P (src) && !matching_memory)
17043 src = force_reg (mode, src);
17045 /* Emit the instruction. */
17047 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17048 if (reload_in_progress || code == NOT)
17050 /* Reload doesn't know about the flags register, and doesn't know that
17051 it doesn't want to clobber it. */
17052 gcc_assert (code == NOT);
17053 emit_insn (op);
17055 else
17057 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17058 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17061 /* Fix up the destination if needed. */
17062 if (dst != operands[0])
17063 emit_move_insn (operands[0], dst);
17066 /* Split a 32bit/64bit divmod with an 8bit unsigned divmod if the dividend
17067 and divisor are within the range [0-255].  */
17069 void
17070 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17071 bool signed_p)
17073 rtx end_label, qimode_label;
17074 rtx insn, div, mod;
17075 rtx scratch, tmp0, tmp1, tmp2;
17076 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17077 rtx (*gen_zero_extend) (rtx, rtx);
17078 rtx (*gen_test_ccno_1) (rtx, rtx);
17080 switch (mode)
17082 case SImode:
17083 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17084 gen_test_ccno_1 = gen_testsi_ccno_1;
17085 gen_zero_extend = gen_zero_extendqisi2;
17086 break;
17087 case DImode:
17088 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17089 gen_test_ccno_1 = gen_testdi_ccno_1;
17090 gen_zero_extend = gen_zero_extendqidi2;
17091 break;
17092 default:
17093 gcc_unreachable ();
17096 end_label = gen_label_rtx ();
17097 qimode_label = gen_label_rtx ();
17099 scratch = gen_reg_rtx (mode);
17101 /* Use 8bit unsigned divmod if the dividend and divisor are within
17102 the range [0-255].  */
17103 emit_move_insn (scratch, operands[2]);
17104 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17105 scratch, 1, OPTAB_DIRECT);
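/* The IOR of dividend and divisor has a bit set above bit 7 iff at least
   one of them does not fit in 8 bits, so a single test against ~0xff
   decides whether the 8bit path below can be taken.  */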
17106 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17107 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17108 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17109 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17110 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17111 pc_rtx);
17112 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17113 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17114 JUMP_LABEL (insn) = qimode_label;
17116 /* Generate original signed/unsigned divmod.  */
17117 div = gen_divmod4_1 (operands[0], operands[1],
17118 operands[2], operands[3]);
17119 emit_insn (div);
17121 /* Branch to the end. */
17122 emit_jump_insn (gen_jump (end_label));
17123 emit_barrier ();
17125 /* Generate 8bit unsigned divide. */
17126 emit_label (qimode_label);
17127 /* Don't use operands[0] for result of 8bit divide since not all
17128 registers support QImode ZERO_EXTRACT. */
17129 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17130 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17131 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17132 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
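/* gen_udivmodhiqi3 leaves the quotient in the low byte and the remainder
   in the high byte of the register underlying TMP0; both are extracted
   below.  */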
17134 if (signed_p)
17136 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17137 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17139 else
17141 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17142 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17145 /* Extract remainder from AH. */
17146 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17147 if (REG_P (operands[1]))
17148 insn = emit_move_insn (operands[1], tmp1);
17149 else
17151 /* Need a new scratch register since the old one has result
17152 of 8bit divide. */
17153 scratch = gen_reg_rtx (mode);
17154 emit_move_insn (scratch, tmp1);
17155 insn = emit_move_insn (operands[1], scratch);
17157 set_unique_reg_note (insn, REG_EQUAL, mod);
17159 /* Zero extend quotient from AL. */
17160 tmp1 = gen_lowpart (QImode, tmp0);
17161 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17162 set_unique_reg_note (insn, REG_EQUAL, div);
17164 emit_label (end_label);
17167 #define LEA_MAX_STALL (3)
17168 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
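/* Distances below are measured in half-cycles, so the search threshold is
   twice the maximum AGU stall we are willing to tolerate.  */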
17170 /* Increase given DISTANCE in half-cycles according to
17171 dependencies between PREV and NEXT instructions.
17172 Add 1 half-cycle if there is no dependency and
17173 go to the next cycle if there is some dependency.  */
17175 static unsigned int
17176 increase_distance (rtx prev, rtx next, unsigned int distance)
17178 df_ref *use_rec;
17179 df_ref *def_rec;
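/* With no instruction to inspect, conservatively assume a dependency:
   round DISTANCE up to a full cycle boundary and add one more cycle.  */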
17181 if (!prev || !next)
17182 return distance + (distance & 1) + 2;
17184 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17185 return distance + 1;
17187 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17188 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17189 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17190 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17191 return distance + (distance & 1) + 2;
17193 return distance + 1;
17196 /* Function checks if instruction INSN defines register number
17197 REGNO1 or REGNO2. */
17199 static bool
17200 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17201 rtx insn)
17203 df_ref *def_rec;
17205 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17206 if (DF_REF_REG_DEF_P (*def_rec)
17207 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17208 && (regno1 == DF_REF_REGNO (*def_rec)
17209 || regno2 == DF_REF_REGNO (*def_rec)))
17211 return true;
17214 return false;
17217 /* Function checks if instruction INSN uses register number
17218 REGNO as a part of address expression. */
17220 static bool
17221 insn_uses_reg_mem (unsigned int regno, rtx insn)
17223 df_ref *use_rec;
17225 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17226 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17227 return true;
17229 return false;
17232 /* Search backward for a non-AGU definition of register number REGNO1
17233 or register number REGNO2 in the basic block, starting from instruction
17234 START up to the head of the basic block or instruction INSN.
17236 The function puts a true value into the *FOUND var if a definition was
17237 found and false otherwise.
17239 The distance in half-cycles between START and the found instruction or
17240 the head of the BB is added to DISTANCE and returned.  */
17242 static int
17243 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17244 rtx insn, int distance,
17245 rtx start, bool *found)
17247 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17248 rtx prev = start;
17249 rtx next = NULL;
17251 *found = false;
17253 while (prev
17254 && prev != insn
17255 && distance < LEA_SEARCH_THRESHOLD)
17257 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17259 distance = increase_distance (prev, next, distance);
17260 if (insn_defines_reg (regno1, regno2, prev))
17262 if (recog_memoized (prev) < 0
17263 || get_attr_type (prev) != TYPE_LEA)
17265 *found = true;
17266 return distance;
17270 next = prev;
17272 if (prev == BB_HEAD (bb))
17273 break;
17275 prev = PREV_INSN (prev);
17278 return distance;
17281 /* Search backward for a non-AGU definition of register number REGNO1
17282 or register number REGNO2 in INSN's basic block until we
17283 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17284 2. Reach a neighbouring BB's boundary, or
17285 3. Reach an AGU definition.
17286 Returns the distance between the non-AGU definition point and INSN.
17287 If there is no definition point, returns -1.  */
17289 static int
17290 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17291 rtx insn)
17293 basic_block bb = BLOCK_FOR_INSN (insn);
17294 int distance = 0;
17295 bool found = false;
17297 if (insn != BB_HEAD (bb))
17298 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17299 distance, PREV_INSN (insn),
17300 &found);
17302 if (!found && distance < LEA_SEARCH_THRESHOLD)
17304 edge e;
17305 edge_iterator ei;
17306 bool simple_loop = false;
17308 FOR_EACH_EDGE (e, ei, bb->preds)
17309 if (e->src == bb)
17311 simple_loop = true;
17312 break;
17315 if (simple_loop)
17316 distance = distance_non_agu_define_in_bb (regno1, regno2,
17317 insn, distance,
17318 BB_END (bb), &found);
17319 else
17321 int shortest_dist = -1;
17322 bool found_in_bb = false;
17324 FOR_EACH_EDGE (e, ei, bb->preds)
17326 int bb_dist
17327 = distance_non_agu_define_in_bb (regno1, regno2,
17328 insn, distance,
17329 BB_END (e->src),
17330 &found_in_bb);
17331 if (found_in_bb)
17333 if (shortest_dist < 0)
17334 shortest_dist = bb_dist;
17335 else if (bb_dist > 0)
17336 shortest_dist = MIN (bb_dist, shortest_dist);
17338 found = true;
17342 distance = shortest_dist;
17346 /* get_attr_type may modify recog data. We want to make sure
17347 that recog data is valid for instruction INSN, on which
17348 distance_non_agu_define is called. INSN is unchanged here. */
17349 extract_insn_cached (insn);
17351 if (!found)
17352 return -1;
17354 return distance >> 1;
17357 /* Return the distance in half-cycles between INSN and the next
17358 insn that uses register number REGNO in a memory address, added
17359 to DISTANCE.  Return -1 if REGNO is set.
17361 Put a true value into *FOUND if a register usage was found and
17362 false otherwise.
17363 Put a true value into *REDEFINED if a register redefinition was
17364 found and false otherwise.  */
17366 static int
17367 distance_agu_use_in_bb (unsigned int regno,
17368 rtx insn, int distance, rtx start,
17369 bool *found, bool *redefined)
17371 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17372 rtx next = start;
17373 rtx prev = NULL;
17375 *found = false;
17376 *redefined = false;
17378 while (next
17379 && next != insn
17380 && distance < LEA_SEARCH_THRESHOLD)
17382 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17384 distance = increase_distance (prev, next, distance);
17385 if (insn_uses_reg_mem (regno, next))
17387 /* Return DISTANCE if OP0 is used in memory
17388 address in NEXT. */
17389 *found = true;
17390 return distance;
17393 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17395 /* Return -1 if OP0 is set in NEXT. */
17396 *redefined = true;
17397 return -1;
17400 prev = next;
17403 if (next == BB_END (bb))
17404 break;
17406 next = NEXT_INSN (next);
17409 return distance;
17412 /* Return the distance between INSN and the next insn that uses
17413 register number REGNO0 in a memory address.  Return -1 if no such
17414 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
17416 static int
17417 distance_agu_use (unsigned int regno0, rtx insn)
17419 basic_block bb = BLOCK_FOR_INSN (insn);
17420 int distance = 0;
17421 bool found = false;
17422 bool redefined = false;
17424 if (insn != BB_END (bb))
17425 distance = distance_agu_use_in_bb (regno0, insn, distance,
17426 NEXT_INSN (insn),
17427 &found, &redefined);
17429 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17431 edge e;
17432 edge_iterator ei;
17433 bool simple_loop = false;
17435 FOR_EACH_EDGE (e, ei, bb->succs)
17436 if (e->dest == bb)
17438 simple_loop = true;
17439 break;
17442 if (simple_loop)
17443 distance = distance_agu_use_in_bb (regno0, insn,
17444 distance, BB_HEAD (bb),
17445 &found, &redefined);
17446 else
17448 int shortest_dist = -1;
17449 bool found_in_bb = false;
17450 bool redefined_in_bb = false;
17452 FOR_EACH_EDGE (e, ei, bb->succs)
17454 int bb_dist
17455 = distance_agu_use_in_bb (regno0, insn,
17456 distance, BB_HEAD (e->dest),
17457 &found_in_bb, &redefined_in_bb);
17458 if (found_in_bb)
17460 if (shortest_dist < 0)
17461 shortest_dist = bb_dist;
17462 else if (bb_dist > 0)
17463 shortest_dist = MIN (bb_dist, shortest_dist);
17465 found = true;
17469 distance = shortest_dist;
17473 if (!found || redefined)
17474 return -1;
17476 return distance >> 1;
17479 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17480 there is a dilemma of choosing LEA or ADD.
17481 Negative value: ADD is preferred over LEA
17482 Zero: Neutral
17483 Positive value: LEA is preferred over ADD.  */
17484 #define IX86_LEA_PRIORITY 0
17486 /* Return true if using the lea INSN has a performance advantage
17487 over a sequence of instructions.  The instruction sequence has
17488 SPLIT_COST cycles higher latency than the lea latency.  */
17490 static bool
17491 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17492 unsigned int regno2, int split_cost, bool has_scale)
17494 int dist_define, dist_use;
17496 /* For Silvermont, if we are using a 2-source or 3-source LEA for a
17497 non-destructive destination, or because we want the ability to use
17498 SCALE, the use of LEA is justified.  */
17499 if (ix86_tune == PROCESSOR_SLM)
17501 if (has_scale)
17502 return true;
17503 if (split_cost < 1)
17504 return false;
17505 if (regno0 == regno1 || regno0 == regno2)
17506 return false;
17507 return true;
17510 dist_define = distance_non_agu_define (regno1, regno2, insn);
17511 dist_use = distance_agu_use (regno0, insn);
17513 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17515 /* If there is no non-AGU operand definition, no AGU
17516 operand usage and the split cost is 0, then both the lea
17517 and non-lea variants have the same priority.  Currently
17518 we prefer lea for 64-bit code and non-lea for 32-bit
17519 code.  */
17520 if (dist_use < 0 && split_cost == 0)
17521 return TARGET_64BIT || IX86_LEA_PRIORITY;
17522 else
17523 return true;
17526 /* With a longer definition distance, lea is preferable.
17527 Here we adjust it to take into account the splitting cost and
17528 lea priority.  */
17529 dist_define += split_cost + IX86_LEA_PRIORITY;
17531 /* If there is no use in a memory address then we just check
17532 that the split cost exceeds the AGU stall.  */
17533 if (dist_use < 0)
17534 return dist_define > LEA_MAX_STALL;
17536 /* If this insn has both a backward non-agu dependence and a forward
17537 agu dependence, the one with the shorter distance takes effect.  */
17538 return dist_define >= dist_use;
17541 /* Return true if it is legal to clobber flags by INSN and
17542 false otherwise. */
17544 static bool
17545 ix86_ok_to_clobber_flags (rtx insn)
17547 basic_block bb = BLOCK_FOR_INSN (insn);
17548 df_ref *use;
17549 bitmap live;
17551 while (insn)
17553 if (NONDEBUG_INSN_P (insn))
17555 for (use = DF_INSN_USES (insn); *use; use++)
17556 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17557 return false;
17559 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17560 return true;
17563 if (insn == BB_END (bb))
17564 break;
17566 insn = NEXT_INSN (insn);
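/* Neither a use nor a definition of the flags was found between INSN and
   the end of the block, so the flags are dead here iff they are not live
   on exit from the block.  */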
17569 live = df_get_live_out (bb);
17570 return !REGNO_REG_SET_P (live, FLAGS_REG);
17573 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17574 move and add to avoid AGU stalls. */
17576 bool
17577 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17579 unsigned int regno0, regno1, regno2;
17581 /* Check if we need to optimize. */
17582 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17583 return false;
17585 /* Check it is correct to split here. */
17586 if (!ix86_ok_to_clobber_flags (insn))
17587 return false;
17589 regno0 = true_regnum (operands[0]);
17590 regno1 = true_regnum (operands[1]);
17591 regno2 = true_regnum (operands[2]);
17593 /* We need to split only adds with a non-destructive
17594 destination operand.  */
17595 if (regno0 == regno1 || regno0 == regno2)
17596 return false;
17597 else
17598 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17601 /* Return true if we should emit lea instruction instead of mov
17602 instruction. */
17604 bool
17605 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17607 unsigned int regno0, regno1;
17609 /* Check if we need to optimize. */
17610 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17611 return false;
17613 /* Use lea for reg to reg moves only. */
17614 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17615 return false;
17617 regno0 = true_regnum (operands[0]);
17618 regno1 = true_regnum (operands[1]);
17620 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17623 /* Return true if we need to split lea into a sequence of
17624 instructions to avoid AGU stalls. */
17626 bool
17627 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17629 unsigned int regno0, regno1, regno2;
17630 int split_cost;
17631 struct ix86_address parts;
17632 int ok;
17634 /* Check we need to optimize. */
17635 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17636 return false;
17638 /* Check it is correct to split here. */
17639 if (!ix86_ok_to_clobber_flags (insn))
17640 return false;
17642 ok = ix86_decompose_address (operands[1], &parts);
17643 gcc_assert (ok);
17645 /* There should be at least two components in the address. */
17646 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17647 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17648 return false;
17650 /* We should not split into add if a non-legitimate PIC
17651 operand is used as the displacement.  */
17652 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17653 return false;
17655 regno0 = true_regnum (operands[0]);
17656 regno1 = INVALID_REGNUM;
17657 regno2 = INVALID_REGNUM;
17659 if (parts.base)
17660 regno1 = true_regnum (parts.base);
17661 if (parts.index)
17662 regno2 = true_regnum (parts.index);
17664 split_cost = 0;
17666 /* Compute how many cycles we will add to execution time
17667 if we split the lea into a sequence of instructions.  */
17668 if (parts.base || parts.index)
17670 /* Have to use a mov instruction if the non-destructive
17671 destination form is used.  */
17672 if (regno1 != regno0 && regno2 != regno0)
17673 split_cost += 1;
17675 /* Have to add index to base if both exist. */
17676 if (parts.base && parts.index)
17677 split_cost += 1;
17679 /* Have to use shift and adds if scale is 2 or greater. */
17680 if (parts.scale > 1)
17682 if (regno0 != regno1)
17683 split_cost += 1;
17684 else if (regno2 == regno0)
17685 split_cost += 4;
17686 else
17687 split_cost += parts.scale;
17690 /* Have to use add instruction with immediate if
17691 disp is non zero. */
17692 if (parts.disp && parts.disp != const0_rtx)
17693 split_cost += 1;
17695 /* Subtract the price of lea. */
17696 split_cost -= 1;
17699 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17700 parts.scale > 1);
17703 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17704 matches destination. RTX includes clobber of FLAGS_REG. */
17706 static void
17707 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17708 rtx dst, rtx src)
17710 rtx op, clob;
17712 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17713 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17715 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17718 /* Return true if regno1 def is nearest to the insn. */
17720 static bool
17721 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17723 rtx prev = insn;
17724 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17726 if (insn == start)
17727 return false;
17728 while (prev && prev != start)
17730 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17732 prev = PREV_INSN (prev);
17733 continue;
17735 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17736 return true;
17737 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17738 return false;
17739 prev = PREV_INSN (prev);
17742 /* None of the regs is defined in the bb. */
17743 return false;
17746 /* Split lea instructions into a sequence of instructions
17747 which are executed on ALU to avoid AGU stalls.
17748 It is assumed that it is allowed to clobber flags register
17749 at lea position. */
17751 void
17752 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17754 unsigned int regno0, regno1, regno2;
17755 struct ix86_address parts;
17756 rtx target, tmp;
17757 int ok, adds;
17759 ok = ix86_decompose_address (operands[1], &parts);
17760 gcc_assert (ok);
17762 target = gen_lowpart (mode, operands[0]);
17764 regno0 = true_regnum (target);
17765 regno1 = INVALID_REGNUM;
17766 regno2 = INVALID_REGNUM;
17768 if (parts.base)
17770 parts.base = gen_lowpart (mode, parts.base);
17771 regno1 = true_regnum (parts.base);
17774 if (parts.index)
17776 parts.index = gen_lowpart (mode, parts.index);
17777 regno2 = true_regnum (parts.index);
17780 if (parts.disp)
17781 parts.disp = gen_lowpart (mode, parts.disp);
17783 if (parts.scale > 1)
17785 /* Case r1 = r1 + ... */
17786 if (regno1 == regno0)
17788 /* If we have the case r1 = r1 + C * r1 then we
17789 would have to use multiplication, which is very
17790 expensive.  Assume the cost model is wrong if we
17791 get such a case here.  */
17792 gcc_assert (regno2 != regno0);
17794 for (adds = parts.scale; adds > 0; adds--)
17795 ix86_emit_binop (PLUS, mode, target, parts.index);
17797 else
17799 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17800 if (regno0 != regno2)
17801 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17803 /* Use shift for scaling. */
17804 ix86_emit_binop (ASHIFT, mode, target,
17805 GEN_INT (exact_log2 (parts.scale)));
17807 if (parts.base)
17808 ix86_emit_binop (PLUS, mode, target, parts.base);
17810 if (parts.disp && parts.disp != const0_rtx)
17811 ix86_emit_binop (PLUS, mode, target, parts.disp);
17814 else if (!parts.base && !parts.index)
17816 gcc_assert (parts.disp);
17817 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17819 else
17821 if (!parts.base)
17823 if (regno0 != regno2)
17824 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17826 else if (!parts.index)
17828 if (regno0 != regno1)
17829 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17831 else
17833 if (regno0 == regno1)
17834 tmp = parts.index;
17835 else if (regno0 == regno2)
17836 tmp = parts.base;
17837 else
17839 rtx tmp1;
17841 /* Find better operand for SET instruction, depending
17842 on which definition is farther from the insn. */
17843 if (find_nearest_reg_def (insn, regno1, regno2))
17844 tmp = parts.index, tmp1 = parts.base;
17845 else
17846 tmp = parts.base, tmp1 = parts.index;
17848 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17850 if (parts.disp && parts.disp != const0_rtx)
17851 ix86_emit_binop (PLUS, mode, target, parts.disp);
17853 ix86_emit_binop (PLUS, mode, target, tmp1);
17854 return;
17857 ix86_emit_binop (PLUS, mode, target, tmp);
17860 if (parts.disp && parts.disp != const0_rtx)
17861 ix86_emit_binop (PLUS, mode, target, parts.disp);
17865 /* Return true if it is ok to optimize an ADD operation to a LEA
17866 operation to avoid flag register consumption.  For most processors,
17867 ADD is faster than LEA.  For processors like ATOM, if the
17868 destination register of the LEA holds an actual address which will be
17869 used soon, LEA is better, otherwise ADD is better.  */
17871 bool
17872 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17874 unsigned int regno0 = true_regnum (operands[0]);
17875 unsigned int regno1 = true_regnum (operands[1]);
17876 unsigned int regno2 = true_regnum (operands[2]);
17878 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17879 if (regno0 != regno1 && regno0 != regno2)
17880 return true;
17882 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17883 return false;
17885 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
17888 /* Return true if destination reg of SET_BODY is shift count of
17889 USE_BODY. */
17891 static bool
17892 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17894 rtx set_dest;
17895 rtx shift_rtx;
17896 int i;
17898 /* Retrieve destination of SET_BODY. */
17899 switch (GET_CODE (set_body))
17901 case SET:
17902 set_dest = SET_DEST (set_body);
17903 if (!set_dest || !REG_P (set_dest))
17904 return false;
17905 break;
17906 case PARALLEL:
17907 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17908 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17909 use_body))
17910 return true;
17911 default:
17912 return false;
17913 break;
17916 /* Retrieve shift count of USE_BODY. */
17917 switch (GET_CODE (use_body))
17919 case SET:
17920 shift_rtx = XEXP (use_body, 1);
17921 break;
17922 case PARALLEL:
17923 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17924 if (ix86_dep_by_shift_count_body (set_body,
17925 XVECEXP (use_body, 0, i)))
17926 return true;
17927 default:
17928 return false;
17929 break;
17932 if (shift_rtx
17933 && (GET_CODE (shift_rtx) == ASHIFT
17934 || GET_CODE (shift_rtx) == LSHIFTRT
17935 || GET_CODE (shift_rtx) == ASHIFTRT
17936 || GET_CODE (shift_rtx) == ROTATE
17937 || GET_CODE (shift_rtx) == ROTATERT))
17939 rtx shift_count = XEXP (shift_rtx, 1);
17941 /* Return true if shift count is dest of SET_BODY. */
17942 if (REG_P (shift_count))
17944 /* Add check since it can be invoked before register
17945 allocation in pre-reload schedule. */
17946 if (reload_completed
17947 && true_regnum (set_dest) == true_regnum (shift_count))
17948 return true;
17949 else if (REGNO (set_dest) == REGNO (shift_count))
17950 return true;
17954 return false;
17957 /* Return true if destination reg of SET_INSN is shift count of
17958 USE_INSN. */
17960 bool
17961 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17963 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17964 PATTERN (use_insn));
17967 /* Return TRUE or FALSE depending on whether the unary operator meets the
17968 appropriate constraints. */
17970 bool
17971 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17972 enum machine_mode mode ATTRIBUTE_UNUSED,
17973 rtx operands[2] ATTRIBUTE_UNUSED)
17975 /* If one of operands is memory, source and destination must match. */
17976 if ((MEM_P (operands[0])
17977 || MEM_P (operands[1]))
17978 && ! rtx_equal_p (operands[0], operands[1]))
17979 return false;
17980 return true;
17983 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17984 are ok, keeping in mind the possible movddup alternative. */
17986 bool
17987 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17989 if (MEM_P (operands[0]))
17990 return rtx_equal_p (operands[0], operands[1 + high]);
17991 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17992 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17993 return true;
17996 /* Post-reload splitter for converting an SF or DFmode value in an
17997 SSE register into an unsigned SImode. */
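/* In outline, the split sequence below computes
     large = (value >= 2^31) ? all-ones-mask : 0
     value -= large & 2^31
     result = (signed int) value ^ (large << 31)
   i.e. values of 2^31 or more are brought into signed range before the
   signed conversion and the sign bit is patched back in afterwards.  */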
17999 void
18000 ix86_split_convert_uns_si_sse (rtx operands[])
18002 enum machine_mode vecmode;
18003 rtx value, large, zero_or_two31, input, two31, x;
18005 large = operands[1];
18006 zero_or_two31 = operands[2];
18007 input = operands[3];
18008 two31 = operands[4];
18009 vecmode = GET_MODE (large);
18010 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18012 /* Load up the value into the low element. We must ensure that the other
18013 elements are valid floats -- zero is the easiest such value. */
18014 if (MEM_P (input))
18016 if (vecmode == V4SFmode)
18017 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18018 else
18019 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18021 else
18023 input = gen_rtx_REG (vecmode, REGNO (input));
18024 emit_move_insn (value, CONST0_RTX (vecmode));
18025 if (vecmode == V4SFmode)
18026 emit_insn (gen_sse_movss (value, value, input));
18027 else
18028 emit_insn (gen_sse2_movsd (value, value, input));
18031 emit_move_insn (large, two31);
18032 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18034 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18035 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18037 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18038 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18040 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18041 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18043 large = gen_rtx_REG (V4SImode, REGNO (large));
18044 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18046 x = gen_rtx_REG (V4SImode, REGNO (value));
18047 if (vecmode == V4SFmode)
18048 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18049 else
18050 emit_insn (gen_sse2_cvttpd2dq (x, value));
18051 value = x;
18053 emit_insn (gen_xorv4si3 (value, value, large));
18056 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18057 Expects the 64-bit DImode to be supplied in a pair of integral
18058 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18059 -mfpmath=sse, !optimize_size only. */
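/* In outline: the two 32-bit halves of INPUT are interleaved with the
   exponent words 0x43300000 and 0x45300000, forming the doubles
   2^52 + lo and 2^84 + hi * 2^32 exactly; subtracting the 2^52 and 2^84
   biases and adding the two halves then yields (double) INPUT.  */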
18061 void
18062 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18064 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18065 rtx int_xmm, fp_xmm;
18066 rtx biases, exponents;
18067 rtx x;
18069 int_xmm = gen_reg_rtx (V4SImode);
18070 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18071 emit_insn (gen_movdi_to_sse (int_xmm, input));
18072 else if (TARGET_SSE_SPLIT_REGS)
18074 emit_clobber (int_xmm);
18075 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18077 else
18079 x = gen_reg_rtx (V2DImode);
18080 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18081 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18084 x = gen_rtx_CONST_VECTOR (V4SImode,
18085 gen_rtvec (4, GEN_INT (0x43300000UL),
18086 GEN_INT (0x45300000UL),
18087 const0_rtx, const0_rtx));
18088 exponents = validize_mem (force_const_mem (V4SImode, x));
18090 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18091 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18093 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
18094 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18095 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18096 (0x1.0p84 + double(fp_value_hi_xmm)).
18097 Note these exponents differ by 32. */
18099 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18101 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18102 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18103 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18104 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18105 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18106 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18107 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18108 biases = validize_mem (force_const_mem (V2DFmode, biases));
18109 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18111 /* Add the upper and lower DFmode values together. */
18112 if (TARGET_SSE3)
18113 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18114 else
18116 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18117 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18118 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18121 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18124 /* Not used, but eases macroization of patterns. */
18125 void
18126 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18127 rtx input ATTRIBUTE_UNUSED)
18129 gcc_unreachable ();
18132 /* Convert an unsigned SImode value into a DFmode. Only currently used
18133 for SSE, but applicable anywhere. */
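/* In outline: adding INT_MIN flips the sign bit, so converting the result
   as a signed SImode value and adding 2^31 back reconstructs the unsigned
   INPUT exactly in DFmode.  */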
18135 void
18136 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18138 REAL_VALUE_TYPE TWO31r;
18139 rtx x, fp;
18141 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18142 NULL, 1, OPTAB_DIRECT);
18144 fp = gen_reg_rtx (DFmode);
18145 emit_insn (gen_floatsidf2 (fp, x));
18147 real_ldexp (&TWO31r, &dconst1, 31);
18148 x = const_double_from_real_value (TWO31r, DFmode);
18150 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18151 if (x != target)
18152 emit_move_insn (target, x);
18155 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18156 32-bit mode; otherwise we have a direct convert instruction. */
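/* In outline: the high SImode half is converted as a signed value and
   scaled by 2^32, the low half is converted as an unsigned value, and the
   two DFmode partial results are added.  */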
18158 void
18159 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18161 REAL_VALUE_TYPE TWO32r;
18162 rtx fp_lo, fp_hi, x;
18164 fp_lo = gen_reg_rtx (DFmode);
18165 fp_hi = gen_reg_rtx (DFmode);
18167 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18169 real_ldexp (&TWO32r, &dconst1, 32);
18170 x = const_double_from_real_value (TWO32r, DFmode);
18171 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18173 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18175 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18176 0, OPTAB_DIRECT);
18177 if (x != target)
18178 emit_move_insn (target, x);
18181 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18182 For x86_32, -mfpmath=sse, !optimize_size only. */
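/* In outline: INPUT is split into its low and high 16-bit halves, each
   half is converted as a signed SImode value, the high half is scaled by
   2^16, and the two SFmode partial results are added.  */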
18183 void
18184 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18186 REAL_VALUE_TYPE ONE16r;
18187 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18189 real_ldexp (&ONE16r, &dconst1, 16);
18190 x = const_double_from_real_value (ONE16r, SFmode);
18191 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18192 NULL, 0, OPTAB_DIRECT);
18193 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18194 NULL, 0, OPTAB_DIRECT);
18195 fp_hi = gen_reg_rtx (SFmode);
18196 fp_lo = gen_reg_rtx (SFmode);
18197 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18198 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18199 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18200 0, OPTAB_DIRECT);
18201 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18202 0, OPTAB_DIRECT);
18203 if (!rtx_equal_p (target, fp_hi))
18204 emit_move_insn (target, fp_hi);
18207 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18208 a vector of unsigned ints VAL to vector of floats TARGET. */
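/* This uses the same low/high 16-bit split as
   ix86_expand_convert_uns_sisf_sse, applied element-wise to the whole
   vector.  */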
18210 void
18211 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18213 rtx tmp[8];
18214 REAL_VALUE_TYPE TWO16r;
18215 enum machine_mode intmode = GET_MODE (val);
18216 enum machine_mode fltmode = GET_MODE (target);
18217 rtx (*cvt) (rtx, rtx);
18219 if (intmode == V4SImode)
18220 cvt = gen_floatv4siv4sf2;
18221 else
18222 cvt = gen_floatv8siv8sf2;
18223 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18224 tmp[0] = force_reg (intmode, tmp[0]);
18225 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18226 OPTAB_DIRECT);
18227 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18228 NULL_RTX, 1, OPTAB_DIRECT);
18229 tmp[3] = gen_reg_rtx (fltmode);
18230 emit_insn (cvt (tmp[3], tmp[1]));
18231 tmp[4] = gen_reg_rtx (fltmode);
18232 emit_insn (cvt (tmp[4], tmp[2]));
18233 real_ldexp (&TWO16r, &dconst1, 16);
18234 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18235 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18236 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18237 OPTAB_DIRECT);
18238 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18239 OPTAB_DIRECT);
18240 if (tmp[7] != target)
18241 emit_move_insn (target, tmp[7]);
18244 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18245 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18246 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18247 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18250 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18252 REAL_VALUE_TYPE TWO31r;
18253 rtx two31r, tmp[4];
18254 enum machine_mode mode = GET_MODE (val);
18255 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18256 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18257 rtx (*cmp) (rtx, rtx, rtx, rtx);
18258 int i;
18260 for (i = 0; i < 3; i++)
18261 tmp[i] = gen_reg_rtx (mode);
18262 real_ldexp (&TWO31r, &dconst1, 31);
18263 two31r = const_double_from_real_value (TWO31r, scalarmode);
18264 two31r = ix86_build_const_vector (mode, 1, two31r);
18265 two31r = force_reg (mode, two31r);
18266 switch (mode)
18268 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18269 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18270 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18271 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18272 default: gcc_unreachable ();
18274 tmp[3] = gen_rtx_LE (mode, two31r, val);
18275 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18276 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18277 0, OPTAB_DIRECT);
18278 if (intmode == V4SImode || TARGET_AVX2)
18279 *xorp = expand_simple_binop (intmode, ASHIFT,
18280 gen_lowpart (intmode, tmp[0]),
18281 GEN_INT (31), NULL_RTX, 0,
18282 OPTAB_DIRECT);
18283 else
18285 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18286 two31 = ix86_build_const_vector (intmode, 1, two31);
18287 *xorp = expand_simple_binop (intmode, AND,
18288 gen_lowpart (intmode, tmp[0]),
18289 two31, NULL_RTX, 0,
18290 OPTAB_DIRECT);
18292 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18293 0, OPTAB_DIRECT);
18296 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18297 then replicate the value for all elements of the vector
18298 register. */
18301 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18303 int i, n_elt;
18304 rtvec v;
18305 enum machine_mode scalar_mode;
18307 switch (mode)
18309 case V32QImode:
18310 case V16QImode:
18311 case V16HImode:
18312 case V8HImode:
18313 case V8SImode:
18314 case V4SImode:
18315 case V4DImode:
18316 case V2DImode:
18317 gcc_assert (vect);
18318 case V8SFmode:
18319 case V4SFmode:
18320 case V4DFmode:
18321 case V2DFmode:
18322 n_elt = GET_MODE_NUNITS (mode);
18323 v = rtvec_alloc (n_elt);
18324 scalar_mode = GET_MODE_INNER (mode);
18326 RTVEC_ELT (v, 0) = value;
18328 for (i = 1; i < n_elt; ++i)
18329 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18331 return gen_rtx_CONST_VECTOR (mode, v);
18333 default:
18334 gcc_unreachable ();
18338 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18339 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18340 for an SSE register. If VECT is true, then replicate the mask for
18341 all elements of the vector register. If INVERT is true, then create
18342 a mask excluding the sign bit. */
18345 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18347 enum machine_mode vec_mode, imode;
18348 HOST_WIDE_INT hi, lo;
18349 int shift = 63;
18350 rtx v;
18351 rtx mask;
18353 /* Find the sign bit, sign extended to 2*HWI. */
18354 switch (mode)
18356 case V8SImode:
18357 case V4SImode:
18358 case V8SFmode:
18359 case V4SFmode:
18360 vec_mode = mode;
18361 mode = GET_MODE_INNER (mode);
18362 imode = SImode;
18363 lo = 0x80000000, hi = lo < 0;
18364 break;
18366 case V4DImode:
18367 case V2DImode:
18368 case V4DFmode:
18369 case V2DFmode:
18370 vec_mode = mode;
18371 mode = GET_MODE_INNER (mode);
18372 imode = DImode;
18373 if (HOST_BITS_PER_WIDE_INT >= 64)
18374 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18375 else
18376 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18377 break;
18379 case TImode:
18380 case TFmode:
18381 vec_mode = VOIDmode;
18382 if (HOST_BITS_PER_WIDE_INT >= 64)
18384 imode = TImode;
18385 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18387 else
18389 rtvec vec;
18391 imode = DImode;
18392 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18394 if (invert)
18396 lo = ~lo, hi = ~hi;
18397 v = constm1_rtx;
18399 else
18400 v = const0_rtx;
18402 mask = immed_double_const (lo, hi, imode);
18404 vec = gen_rtvec (2, v, mask);
18405 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18406 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18408 return v;
18410 break;
18412 default:
18413 gcc_unreachable ();
18416 if (invert)
18417 lo = ~lo, hi = ~hi;
18419 /* Force this value into the low part of a fp vector constant. */
18420 mask = immed_double_const (lo, hi, imode);
18421 mask = gen_lowpart (mode, mask);
18423 if (vec_mode == VOIDmode)
18424 return force_reg (mode, mask);
18426 v = ix86_build_const_vector (vec_mode, vect, mask);
18427 return force_reg (vec_mode, v);
18430 /* Generate code for floating point ABS or NEG. */
18432 void
18433 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18434 rtx operands[])
18436 rtx mask, set, dst, src;
18437 bool use_sse = false;
18438 bool vector_mode = VECTOR_MODE_P (mode);
18439 enum machine_mode vmode = mode;
18441 if (vector_mode)
18442 use_sse = true;
18443 else if (mode == TFmode)
18444 use_sse = true;
18445 else if (TARGET_SSE_MATH)
18447 use_sse = SSE_FLOAT_MODE_P (mode);
18448 if (mode == SFmode)
18449 vmode = V4SFmode;
18450 else if (mode == DFmode)
18451 vmode = V2DFmode;
18454 /* NEG and ABS performed with SSE use bitwise mask operations.
18455 Create the appropriate mask now. */
18456 if (use_sse)
18457 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18458 else
18459 mask = NULL_RTX;
18461 dst = operands[0];
18462 src = operands[1];
18464 set = gen_rtx_fmt_e (code, mode, src);
18465 set = gen_rtx_SET (VOIDmode, dst, set);
18467 if (mask)
18469 rtx use, clob;
18470 rtvec par;
18472 use = gen_rtx_USE (VOIDmode, mask);
18473 if (vector_mode)
18474 par = gen_rtvec (2, set, use);
18475 else
18477 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18478 par = gen_rtvec (3, set, use, clob);
18480 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18482 else
18483 emit_insn (set);
18486 /* Expand a copysign operation. Special case operand 0 being a constant. */
18488 void
18489 ix86_expand_copysign (rtx operands[])
18491 enum machine_mode mode, vmode;
18492 rtx dest, op0, op1, mask, nmask;
18494 dest = operands[0];
18495 op0 = operands[1];
18496 op1 = operands[2];
18498 mode = GET_MODE (dest);
18500 if (mode == SFmode)
18501 vmode = V4SFmode;
18502 else if (mode == DFmode)
18503 vmode = V2DFmode;
18504 else
18505 vmode = mode;
18507 if (GET_CODE (op0) == CONST_DOUBLE)
18509 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18511 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18512 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18514 if (mode == SFmode || mode == DFmode)
18516 if (op0 == CONST0_RTX (mode))
18517 op0 = CONST0_RTX (vmode);
18518 else
18520 rtx v = ix86_build_const_vector (vmode, false, op0);
18522 op0 = force_reg (vmode, v);
18525 else if (op0 != CONST0_RTX (mode))
18526 op0 = force_reg (mode, op0);
18528 mask = ix86_build_signbit_mask (vmode, 0, 0);
18530 if (mode == SFmode)
18531 copysign_insn = gen_copysignsf3_const;
18532 else if (mode == DFmode)
18533 copysign_insn = gen_copysigndf3_const;
18534 else
18535 copysign_insn = gen_copysigntf3_const;
18537 emit_insn (copysign_insn (dest, op0, op1, mask));
18539 else
18541 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18543 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18544 mask = ix86_build_signbit_mask (vmode, 0, 0);
18546 if (mode == SFmode)
18547 copysign_insn = gen_copysignsf3_var;
18548 else if (mode == DFmode)
18549 copysign_insn = gen_copysigndf3_var;
18550 else
18551 copysign_insn = gen_copysigntf3_var;
18553 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18557 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18558 be a constant, and so has already been expanded into a vector constant. */
18560 void
18561 ix86_split_copysign_const (rtx operands[])
18563 enum machine_mode mode, vmode;
18564 rtx dest, op0, mask, x;
18566 dest = operands[0];
18567 op0 = operands[1];
18568 mask = operands[3];
18570 mode = GET_MODE (dest);
18571 vmode = GET_MODE (mask);
18573 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18574 x = gen_rtx_AND (vmode, dest, mask);
18575 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18577 if (op0 != CONST0_RTX (vmode))
18579 x = gen_rtx_IOR (vmode, dest, op0);
18580 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18584 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18585 so we have to do two masks. */
18587 void
18588 ix86_split_copysign_var (rtx operands[])
18590 enum machine_mode mode, vmode;
18591 rtx dest, scratch, op0, op1, mask, nmask, x;
18593 dest = operands[0];
18594 scratch = operands[1];
18595 op0 = operands[2];
18596 op1 = operands[3];
18597 nmask = operands[4];
18598 mask = operands[5];
18600 mode = GET_MODE (dest);
18601 vmode = GET_MODE (mask);
18603 if (rtx_equal_p (op0, op1))
18605 /* Shouldn't happen often (it's useless, obviously), but when it does
18606 we'd generate incorrect code if we continue below. */
18607 emit_move_insn (dest, op0);
18608 return;
18611 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18613 gcc_assert (REGNO (op1) == REGNO (scratch));
18615 x = gen_rtx_AND (vmode, scratch, mask);
18616 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18618 dest = mask;
18619 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18620 x = gen_rtx_NOT (vmode, dest);
18621 x = gen_rtx_AND (vmode, x, op0);
18622 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18624 else
18626 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18628 x = gen_rtx_AND (vmode, scratch, mask);
18630 else /* alternative 2,4 */
18632 gcc_assert (REGNO (mask) == REGNO (scratch));
18633 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18634 x = gen_rtx_AND (vmode, scratch, op1);
18636 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18638 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18640 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18641 x = gen_rtx_AND (vmode, dest, nmask);
18643 else /* alternative 3,4 */
18645 gcc_assert (REGNO (nmask) == REGNO (dest));
18646 dest = nmask;
18647 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18648 x = gen_rtx_AND (vmode, dest, op0);
18650 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18653 x = gen_rtx_IOR (vmode, dest, scratch);
18654 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18657 /* Return TRUE or FALSE depending on whether the first SET in INSN
18658 has source and destination with matching CC modes, and that the
18659 CC mode is at least as constrained as REQ_MODE. */
18661 bool
18662 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18664 rtx set;
18665 enum machine_mode set_mode;
18667 set = PATTERN (insn);
18668 if (GET_CODE (set) == PARALLEL)
18669 set = XVECEXP (set, 0, 0);
18670 gcc_assert (GET_CODE (set) == SET);
18671 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18673 set_mode = GET_MODE (SET_DEST (set));
18674 switch (set_mode)
18676 case CCNOmode:
18677 if (req_mode != CCNOmode
18678 && (req_mode != CCmode
18679 || XEXP (SET_SRC (set), 1) != const0_rtx))
18680 return false;
18681 break;
18682 case CCmode:
18683 if (req_mode == CCGCmode)
18684 return false;
18685 /* FALLTHRU */
18686 case CCGCmode:
18687 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18688 return false;
18689 /* FALLTHRU */
18690 case CCGOCmode:
18691 if (req_mode == CCZmode)
18692 return false;
18693 /* FALLTHRU */
18694 case CCZmode:
18695 break;
18697 case CCAmode:
18698 case CCCmode:
18699 case CCOmode:
18700 case CCSmode:
18701 if (set_mode != req_mode)
18702 return false;
18703 break;
18705 default:
18706 gcc_unreachable ();
18709 return GET_MODE (SET_SRC (set)) == set_mode;
18712 /* Generate insn patterns to do an integer compare of OPERANDS. */
18714 static rtx
18715 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18717 enum machine_mode cmpmode;
18718 rtx tmp, flags;
18720 cmpmode = SELECT_CC_MODE (code, op0, op1);
18721 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18723 /* This is very simple, but making the interface the same as in the
18724 FP case makes the rest of the code easier. */
18725 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18726 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18728 /* Return the test that should be put into the flags user, i.e.
18729 the bcc, scc, or cmov instruction. */
18730 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18733 /* Figure out whether to use ordered or unordered fp comparisons.
18734 Return the appropriate mode to use. */
18736 enum machine_mode
18737 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18739 /* ??? In order to make all comparisons reversible, we do all comparisons
18740 non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18741 all forms of trapping and nontrapping comparisons, we can make inequality
18742 comparisons trapping again, since it results in better code when using
18743 FCOM based compares.  */
18744 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18747 enum machine_mode
18748 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18750 enum machine_mode mode = GET_MODE (op0);
18752 if (SCALAR_FLOAT_MODE_P (mode))
18754 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18755 return ix86_fp_compare_mode (code);
18758 switch (code)
18760 /* Only zero flag is needed. */
18761 case EQ: /* ZF=0 */
18762 case NE: /* ZF!=0 */
18763 return CCZmode;
18764 /* Codes needing carry flag. */
18765 case GEU: /* CF=0 */
18766 case LTU: /* CF=1 */
18767 /* Detect overflow checks. They need just the carry flag. */
18768 if (GET_CODE (op0) == PLUS
18769 && rtx_equal_p (op1, XEXP (op0, 0)))
18770 return CCCmode;
18771 else
18772 return CCmode;
18773 case GTU: /* CF=0 & ZF=0 */
18774 case LEU: /* CF=1 | ZF=1 */
18775 /* Detect overflow checks. They need just the carry flag. */
18776 if (GET_CODE (op0) == MINUS
18777 && rtx_equal_p (op1, XEXP (op0, 0)))
18778 return CCCmode;
18779 else
18780 return CCmode;
18781 /* Codes possibly doable only with sign flag when
18782 comparing against zero. */
18783 case GE: /* SF=OF or SF=0 */
18784 case LT: /* SF<>OF or SF=1 */
18785 if (op1 == const0_rtx)
18786 return CCGOCmode;
18787 else
18788 /* For other cases Carry flag is not required. */
18789 return CCGCmode;
18790 /* Codes doable only with the sign flag when comparing
18791 against zero, but for which we miss the jump instruction,
18792 so we need to use relational tests against overflow,
18793 which thus needs to be zero.  */
18794 case GT: /* ZF=0 & SF=OF */
18795 case LE: /* ZF=1 | SF<>OF */
18796 if (op1 == const0_rtx)
18797 return CCNOmode;
18798 else
18799 return CCGCmode;
18800 /* The strcmp pattern does (use flags) and combine may ask us for the
18801 proper mode.  */
18802 case USE:
18803 return CCmode;
18804 default:
18805 gcc_unreachable ();
18809 /* Return the fixed registers used for condition codes. */
18811 static bool
18812 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18814 *p1 = FLAGS_REG;
18815 *p2 = FPSR_REG;
18816 return true;
18819 /* If two condition code modes are compatible, return a condition code
18820 mode which is compatible with both. Otherwise, return
18821 VOIDmode. */
18823 static enum machine_mode
18824 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18826 if (m1 == m2)
18827 return m1;
18829 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18830 return VOIDmode;
18832 if ((m1 == CCGCmode && m2 == CCGOCmode)
18833 || (m1 == CCGOCmode && m2 == CCGCmode))
18834 return CCGCmode;
18836 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18837 return m2;
18838 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18839 return m1;
18841 switch (m1)
18843 default:
18844 gcc_unreachable ();
18846 case CCmode:
18847 case CCGCmode:
18848 case CCGOCmode:
18849 case CCNOmode:
18850 case CCAmode:
18851 case CCCmode:
18852 case CCOmode:
18853 case CCSmode:
18854 case CCZmode:
18855 switch (m2)
18857 default:
18858 return VOIDmode;
18860 case CCmode:
18861 case CCGCmode:
18862 case CCGOCmode:
18863 case CCNOmode:
18864 case CCAmode:
18865 case CCCmode:
18866 case CCOmode:
18867 case CCSmode:
18868 case CCZmode:
18869 return CCmode;
18872 case CCFPmode:
18873 case CCFPUmode:
18874 /* These are only compatible with themselves, which we already
18875 checked above. */
18876 return VOIDmode;
18881 /* Return a comparison we can do that is equivalent to
18882 swap_condition (code), apart possibly from orderedness.
18883 But never change orderedness if TARGET_IEEE_FP, returning
18884 UNKNOWN in that case if necessary.  */
18886 static enum rtx_code
18887 ix86_fp_swap_condition (enum rtx_code code)
18889 switch (code)
18891 case GT: /* GTU - CF=0 & ZF=0 */
18892 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18893 case GE: /* GEU - CF=0 */
18894 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18895 case UNLT: /* LTU - CF=1 */
18896 return TARGET_IEEE_FP ? UNKNOWN : GT;
18897 case UNLE: /* LEU - CF=1 | ZF=1 */
18898 return TARGET_IEEE_FP ? UNKNOWN : GE;
18899 default:
18900 return swap_condition (code);
18904 /* Return the cost of comparison CODE using the best strategy for performance.
18905 All the following functions use the number of instructions as a cost metric.
18906 In the future this should be tweaked to compute bytes for optimize_size and
18907 to take into account the performance of various instructions on various CPUs.  */
18909 static int
18910 ix86_fp_comparison_cost (enum rtx_code code)
18912 int arith_cost;
18914 /* The cost of code using bit-twiddling on %ah. */
18915 switch (code)
18917 case UNLE:
18918 case UNLT:
18919 case LTGT:
18920 case GT:
18921 case GE:
18922 case UNORDERED:
18923 case ORDERED:
18924 case UNEQ:
18925 arith_cost = 4;
18926 break;
18927 case LT:
18928 case NE:
18929 case EQ:
18930 case UNGE:
18931 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18932 break;
18933 case LE:
18934 case UNGT:
18935 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18936 break;
18937 default:
18938 gcc_unreachable ();
18941 switch (ix86_fp_comparison_strategy (code))
18943 case IX86_FPCMP_COMI:
18944 return arith_cost > 4 ? 3 : 2;
18945 case IX86_FPCMP_SAHF:
18946 return arith_cost > 4 ? 4 : 3;
18947 default:
18948 return arith_cost;
18952 /* Return the strategy to use for a floating-point comparison.  We assume that
18953 fcomi is always preferable where available, since that is also true when looking
18954 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18956 enum ix86_fpcmp_strategy
18957 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18959 /* Do fcomi/sahf based test when profitable. */
18961 if (TARGET_CMOVE)
18962 return IX86_FPCMP_COMI;
18964 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
18965 return IX86_FPCMP_SAHF;
18967 return IX86_FPCMP_ARITH;
18970 /* Swap, force into registers, or otherwise massage the two operands
18971 to a fp comparison. The operands are updated in place; the new
18972 comparison code is returned. */
18974 static enum rtx_code
18975 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18977 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18978 rtx op0 = *pop0, op1 = *pop1;
18979 enum machine_mode op_mode = GET_MODE (op0);
18980 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18982 /* All of the unordered compare instructions only work on registers.
18983 The same is true of the fcomi compare instructions. The XFmode
18984 compare instructions require registers except when comparing
18985 against zero or when converting operand 1 from fixed point to
18986 floating point. */
18988 if (!is_sse
18989 && (fpcmp_mode == CCFPUmode
18990 || (op_mode == XFmode
18991 && ! (standard_80387_constant_p (op0) == 1
18992 || standard_80387_constant_p (op1) == 1)
18993 && GET_CODE (op1) != FLOAT)
18994 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18996 op0 = force_reg (op_mode, op0);
18997 op1 = force_reg (op_mode, op1);
18999 else
19001 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19002 things around if they appear profitable, otherwise force op0
19003 into a register. */
19005 if (standard_80387_constant_p (op0) == 0
19006 || (MEM_P (op0)
19007 && ! (standard_80387_constant_p (op1) == 0
19008 || MEM_P (op1))))
19010 enum rtx_code new_code = ix86_fp_swap_condition (code);
19011 if (new_code != UNKNOWN)
19013 rtx tmp;
19014 tmp = op0, op0 = op1, op1 = tmp;
19015 code = new_code;
19019 if (!REG_P (op0))
19020 op0 = force_reg (op_mode, op0);
19022 if (CONSTANT_P (op1))
19024 int tmp = standard_80387_constant_p (op1);
19025 if (tmp == 0)
19026 op1 = validize_mem (force_const_mem (op_mode, op1));
19027 else if (tmp == 1)
19029 if (TARGET_CMOVE)
19030 op1 = force_reg (op_mode, op1);
19032 else
19033 op1 = force_reg (op_mode, op1);
19037 /* Try to rearrange the comparison to make it cheaper. */
19038 if (ix86_fp_comparison_cost (code)
19039 > ix86_fp_comparison_cost (swap_condition (code))
19040 && (REG_P (op1) || can_create_pseudo_p ()))
19042 rtx tmp;
19043 tmp = op0, op0 = op1, op1 = tmp;
19044 code = swap_condition (code);
19045 if (!REG_P (op0))
19046 op0 = force_reg (op_mode, op0);
19049 *pop0 = op0;
19050 *pop1 = op1;
19051 return code;
19054 /* Convert comparison codes we use to represent FP comparison to integer
19055 code that will result in proper branch. Return UNKNOWN if no such code
19056 is available. */
19058 enum rtx_code
19059 ix86_fp_compare_code_to_integer (enum rtx_code code)
19061 switch (code)
19063 case GT:
19064 return GTU;
19065 case GE:
19066 return GEU;
19067 case ORDERED:
19068 case UNORDERED:
19069 return code;
19070 break;
19071 case UNEQ:
19072 return EQ;
19073 break;
19074 case UNLT:
19075 return LTU;
19076 break;
19077 case UNLE:
19078 return LEU;
19079 break;
19080 case LTGT:
19081 return NE;
19082 break;
19083 default:
19084 return UNKNOWN;
19088 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19090 static rtx
19091 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19093 enum machine_mode fpcmp_mode, intcmp_mode;
19094 rtx tmp, tmp2;
19096 fpcmp_mode = ix86_fp_compare_mode (code);
19097 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19099 /* Do fcomi/sahf based test when profitable. */
19100 switch (ix86_fp_comparison_strategy (code))
19102 case IX86_FPCMP_COMI:
19103 intcmp_mode = fpcmp_mode;
19104 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19105 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19106 tmp);
19107 emit_insn (tmp);
19108 break;
19110 case IX86_FPCMP_SAHF:
19111 intcmp_mode = fpcmp_mode;
19112 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19113 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19114 tmp);
19116 if (!scratch)
19117 scratch = gen_reg_rtx (HImode);
19118 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19119 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19120 break;
19122 case IX86_FPCMP_ARITH:
19123 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19124 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19125 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19126 if (!scratch)
19127 scratch = gen_reg_rtx (HImode);
19128 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19130 /* In the unordered case, we have to check C2 for NaN's, which
19131 doesn't happen to work out to anything nice combination-wise.
19132 So do some bit twiddling on the value we've got in AH to come
19133 up with an appropriate set of condition codes. */
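/* For reference: after fnstsw, %ah holds the FPU condition bits
   C0 (0x01), C1 (0x02), C2 (0x04) and C3 (0x40).  The masks used below
   are therefore 0x45 = C3|C2|C0 (the bits that mirror ZF/PF/CF after
   sahf), 0x44 = C3|C2, 0x40 = C3, 0x05 = C2|C0, 0x04 = C2 (set only for
   unordered results) and 0x01 = C0.  */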
19135 intcmp_mode = CCNOmode;
19136 switch (code)
19138 case GT:
19139 case UNGT:
19140 if (code == GT || !TARGET_IEEE_FP)
19142 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19143 code = EQ;
19145 else
19147 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19148 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19149 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19150 intcmp_mode = CCmode;
19151 code = GEU;
19153 break;
19154 case LT:
19155 case UNLT:
19156 if (code == LT && TARGET_IEEE_FP)
19158 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19159 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19160 intcmp_mode = CCmode;
19161 code = EQ;
19163 else
19165 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19166 code = NE;
19168 break;
19169 case GE:
19170 case UNGE:
19171 if (code == GE || !TARGET_IEEE_FP)
19173 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19174 code = EQ;
19176 else
19178 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19179 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19180 code = NE;
19182 break;
19183 case LE:
19184 case UNLE:
19185 if (code == LE && TARGET_IEEE_FP)
19187 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19188 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19189 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19190 intcmp_mode = CCmode;
19191 code = LTU;
19193 else
19195 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19196 code = NE;
19198 break;
19199 case EQ:
19200 case UNEQ:
19201 if (code == EQ && TARGET_IEEE_FP)
19203 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19204 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19205 intcmp_mode = CCmode;
19206 code = EQ;
19208 else
19210 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19211 code = NE;
19213 break;
19214 case NE:
19215 case LTGT:
19216 if (code == NE && TARGET_IEEE_FP)
19218 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19219 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19220 GEN_INT (0x40)));
19221 code = NE;
19223 else
19225 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19226 code = EQ;
19228 break;
19230 case UNORDERED:
19231 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19232 code = NE;
19233 break;
19234 case ORDERED:
19235 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19236 code = EQ;
19237 break;
19239 default:
19240 gcc_unreachable ();
19242 break;
19244 default:
19245 gcc_unreachable();
19248 /* Return the test that should be put into the flags user, i.e.
19249 the bcc, scc, or cmov instruction. */
19250 return gen_rtx_fmt_ee (code, VOIDmode,
19251 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19252 const0_rtx);
19255 static rtx
19256 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19258 rtx ret;
19260 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19261 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19263 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19265 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19266 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19268 else
19269 ret = ix86_expand_int_compare (code, op0, op1);
19271 return ret;
19274 void
19275 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19277 enum machine_mode mode = GET_MODE (op0);
19278 rtx tmp;
19280 switch (mode)
19282 case SFmode:
19283 case DFmode:
19284 case XFmode:
19285 case QImode:
19286 case HImode:
19287 case SImode:
19288 simple:
19289 tmp = ix86_expand_compare (code, op0, op1);
19290 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19291 gen_rtx_LABEL_REF (VOIDmode, label),
19292 pc_rtx);
19293 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19294 return;
19296 case DImode:
19297 if (TARGET_64BIT)
19298 goto simple;
19299 case TImode:
19300 /* Expand DImode branch into multiple compare+branch. */
19302 rtx lo[2], hi[2], label2;
19303 enum rtx_code code1, code2, code3;
19304 enum machine_mode submode;
19306 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19308 tmp = op0, op0 = op1, op1 = tmp;
19309 code = swap_condition (code);
19312 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19313 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19315 submode = mode == DImode ? SImode : DImode;
19317 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19318 avoid two branches. This costs one extra insn, so disable when
19319 optimizing for size. */
19321 if ((code == EQ || code == NE)
19322 && (!optimize_insn_for_size_p ()
19323 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19325 rtx xor0, xor1;
19327 xor1 = hi[0];
19328 if (hi[1] != const0_rtx)
19329 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19330 NULL_RTX, 0, OPTAB_WIDEN);
19332 xor0 = lo[0];
19333 if (lo[1] != const0_rtx)
19334 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19335 NULL_RTX, 0, OPTAB_WIDEN);
19337 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19338 NULL_RTX, 0, OPTAB_WIDEN);
19340 ix86_expand_branch (code, tmp, const0_rtx, label);
19341 return;
19344 /* Otherwise, if we are doing a less-than or greater-than-or-equal
19345 comparison, op1 is a constant and the low word is zero, then we can
19346 just examine the high word. Similarly for a low word of -1 and a
19347 less-than-or-equal or greater-than comparison. */
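/* For example, on 32-bit targets a DImode test such as
   (x < 0x500000000LL) has lo[1] == 0, so it reduces to comparing
   only the high words: hi(x) < 5.  */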
19349 if (CONST_INT_P (hi[1]))
19350 switch (code)
19352 case LT: case LTU: case GE: case GEU:
19353 if (lo[1] == const0_rtx)
19355 ix86_expand_branch (code, hi[0], hi[1], label);
19356 return;
19358 break;
19359 case LE: case LEU: case GT: case GTU:
19360 if (lo[1] == constm1_rtx)
19362 ix86_expand_branch (code, hi[0], hi[1], label);
19363 return;
19365 break;
19366 default:
19367 break;
19370 /* Otherwise, we need two or three jumps. */
19372 label2 = gen_label_rtx ();
19374 code1 = code;
19375 code2 = swap_condition (code);
19376 code3 = unsigned_condition (code);
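/* CODE1 and CODE2 are applied to the high words: CODE1 jumps to the
   true label when the high words already decide the comparison, CODE2
   jumps to the false label, and equal high words fall through to the
   low-word compare.  CODE3 is the unsigned variant of CODE, since the
   low words act as unsigned digits of the double-word value.  */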
19378 switch (code)
19380 case LT: case GT: case LTU: case GTU:
19381 break;
19383 case LE: code1 = LT; code2 = GT; break;
19384 case GE: code1 = GT; code2 = LT; break;
19385 case LEU: code1 = LTU; code2 = GTU; break;
19386 case GEU: code1 = GTU; code2 = LTU; break;
19388 case EQ: code1 = UNKNOWN; code2 = NE; break;
19389 case NE: code2 = UNKNOWN; break;
19391 default:
19392 gcc_unreachable ();
19396 * a < b =>
19397 * if (hi(a) < hi(b)) goto true;
19398 * if (hi(a) > hi(b)) goto false;
19399 * if (lo(a) < lo(b)) goto true;
19400 * false:
19403 if (code1 != UNKNOWN)
19404 ix86_expand_branch (code1, hi[0], hi[1], label);
19405 if (code2 != UNKNOWN)
19406 ix86_expand_branch (code2, hi[0], hi[1], label2);
19408 ix86_expand_branch (code3, lo[0], lo[1], label);
19410 if (code2 != UNKNOWN)
19411 emit_label (label2);
19412 return;
19415 default:
19416 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19417 goto simple;
19421 /* Split branch based on floating point condition. */
19422 void
19423 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19424 rtx target1, rtx target2, rtx tmp, rtx pushed)
19426 rtx condition;
19427 rtx i;
19429 if (target2 != pc_rtx)
19431 rtx tmp = target2;
19432 code = reverse_condition_maybe_unordered (code);
19433 target2 = target1;
19434 target1 = tmp;
19437 condition = ix86_expand_fp_compare (code, op1, op2,
19438 tmp);
19440 /* Remove pushed operand from stack. */
19441 if (pushed)
19442 ix86_free_from_memory (GET_MODE (pushed));
19444 i = emit_jump_insn (gen_rtx_SET
19445 (VOIDmode, pc_rtx,
19446 gen_rtx_IF_THEN_ELSE (VOIDmode,
19447 condition, target1, target2)));
19448 if (split_branch_probability >= 0)
19449 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19452 void
19453 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19455 rtx ret;
19457 gcc_assert (GET_MODE (dest) == QImode);
19459 ret = ix86_expand_compare (code, op0, op1);
19460 PUT_MODE (ret, QImode);
19461 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19464 /* Expand a comparison setting or clearing the carry flag. Return true
19465 when successful and set *POP to the resulting comparison. */
19466 static bool
19467 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19469 enum machine_mode mode =
19470 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19472 /* Do not handle double-mode compares that go through special path. */
19473 if (mode == (TARGET_64BIT ? TImode : DImode))
19474 return false;
19476 if (SCALAR_FLOAT_MODE_P (mode))
19478 rtx compare_op, compare_seq;
19480 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19482 /* Shortcut: following common codes never translate
19483 into carry flag compares. */
19484 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19485 || code == ORDERED || code == UNORDERED)
19486 return false;
19488 /* These comparisons require zero flag; swap operands so they won't. */
19489 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19490 && !TARGET_IEEE_FP)
19492 rtx tmp = op0;
19493 op0 = op1;
19494 op1 = tmp;
19495 code = swap_condition (code);
19498 /* Try to expand the comparison and verify that we end up with
19499 a carry-flag-based comparison. This fails only when we decide
19500 to expand the comparison using arithmetic, which is not a
19501 common scenario. */
19502 start_sequence ();
19503 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19504 compare_seq = get_insns ();
19505 end_sequence ();
19507 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19508 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19509 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19510 else
19511 code = GET_CODE (compare_op);
19513 if (code != LTU && code != GEU)
19514 return false;
19516 emit_insn (compare_seq);
19517 *pop = compare_op;
19518 return true;
19521 if (!INTEGRAL_MODE_P (mode))
19522 return false;
19524 switch (code)
19526 case LTU:
19527 case GEU:
19528 break;
19530 /* Convert a==0 into (unsigned)a<1. */
19531 case EQ:
19532 case NE:
19533 if (op1 != const0_rtx)
19534 return false;
19535 op1 = const1_rtx;
19536 code = (code == EQ ? LTU : GEU);
19537 break;
19539 /* Convert a>b into b<a or a>=b+1. */
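/* E.g. (unsigned) a > 41 becomes (unsigned) a >= 42, which maps
   directly onto the carry flag (GEU).  */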
19540 case GTU:
19541 case LEU:
19542 if (CONST_INT_P (op1))
19544 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19545 /* Bail out on overflow. We still can swap operands but that
19546 would force loading of the constant into register. */
19547 if (op1 == const0_rtx
19548 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19549 return false;
19550 code = (code == GTU ? GEU : LTU);
19552 else
19554 rtx tmp = op1;
19555 op1 = op0;
19556 op0 = tmp;
19557 code = (code == GTU ? LTU : GEU);
19559 break;
19561 /* Convert a>=0 into (unsigned)a<0x80000000. */
19562 case LT:
19563 case GE:
19564 if (mode == DImode || op1 != const0_rtx)
19565 return false;
19566 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19567 code = (code == LT ? GEU : LTU);
19568 break;
19569 case LE:
19570 case GT:
19571 if (mode == DImode || op1 != constm1_rtx)
19572 return false;
19573 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19574 code = (code == LE ? GEU : LTU);
19575 break;
19577 default:
19578 return false;
19580 /* Swapping operands may cause constant to appear as first operand. */
19581 if (!nonimmediate_operand (op0, VOIDmode))
19583 if (!can_create_pseudo_p ())
19584 return false;
19585 op0 = force_reg (mode, op0);
19587 *pop = ix86_expand_compare (code, op0, op1);
19588 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19589 return true;
19592 bool
19593 ix86_expand_int_movcc (rtx operands[])
19595 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19596 rtx compare_seq, compare_op;
19597 enum machine_mode mode = GET_MODE (operands[0]);
19598 bool sign_bit_compare_p = false;
19599 rtx op0 = XEXP (operands[1], 0);
19600 rtx op1 = XEXP (operands[1], 1);
19602 if (GET_MODE (op0) == TImode
19603 || (GET_MODE (op0) == DImode
19604 && !TARGET_64BIT))
19605 return false;
19607 start_sequence ();
19608 compare_op = ix86_expand_compare (code, op0, op1);
19609 compare_seq = get_insns ();
19610 end_sequence ();
19612 compare_code = GET_CODE (compare_op);
19614 if ((op1 == const0_rtx && (code == GE || code == LT))
19615 || (op1 == constm1_rtx && (code == GT || code == LE)))
19616 sign_bit_compare_p = true;
19618 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19619 HImode insns, we'd be swallowed in word prefix ops. */
19621 if ((mode != HImode || TARGET_FAST_PREFIX)
19622 && (mode != (TARGET_64BIT ? TImode : DImode))
19623 && CONST_INT_P (operands[2])
19624 && CONST_INT_P (operands[3]))
19626 rtx out = operands[0];
19627 HOST_WIDE_INT ct = INTVAL (operands[2]);
19628 HOST_WIDE_INT cf = INTVAL (operands[3]);
19629 HOST_WIDE_INT diff;
19631 diff = ct - cf;
19632 /* Sign-bit compares are better done using shifts than by using
19633 sbb. */
19634 if (sign_bit_compare_p
19635 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19637 /* Detect overlap between destination and compare sources. */
19638 rtx tmp = out;
19640 if (!sign_bit_compare_p)
19642 rtx flags;
19643 bool fpcmp = false;
19645 compare_code = GET_CODE (compare_op);
19647 flags = XEXP (compare_op, 0);
19649 if (GET_MODE (flags) == CCFPmode
19650 || GET_MODE (flags) == CCFPUmode)
19652 fpcmp = true;
19653 compare_code
19654 = ix86_fp_compare_code_to_integer (compare_code);
19657 /* To simplify rest of code, restrict to the GEU case. */
19658 if (compare_code == LTU)
19660 HOST_WIDE_INT tmp = ct;
19661 ct = cf;
19662 cf = tmp;
19663 compare_code = reverse_condition (compare_code);
19664 code = reverse_condition (code);
19666 else
19668 if (fpcmp)
19669 PUT_CODE (compare_op,
19670 reverse_condition_maybe_unordered
19671 (GET_CODE (compare_op)));
19672 else
19673 PUT_CODE (compare_op,
19674 reverse_condition (GET_CODE (compare_op)));
19676 diff = ct - cf;
19678 if (reg_overlap_mentioned_p (out, op0)
19679 || reg_overlap_mentioned_p (out, op1))
19680 tmp = gen_reg_rtx (mode);
19682 if (mode == DImode)
19683 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19684 else
19685 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19686 flags, compare_op));
19688 else
19690 if (code == GT || code == GE)
19691 code = reverse_condition (code);
19692 else
19694 HOST_WIDE_INT tmp = ct;
19695 ct = cf;
19696 cf = tmp;
19697 diff = ct - cf;
19699 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19702 if (diff == 1)
19705 * cmpl op0,op1
19706 * sbbl dest,dest
19707 * [addl dest, ct]
19709 * Size 5 - 8.
19711 if (ct)
19712 tmp = expand_simple_binop (mode, PLUS,
19713 tmp, GEN_INT (ct),
19714 copy_rtx (tmp), 1, OPTAB_DIRECT);
19716 else if (cf == -1)
19719 * cmpl op0,op1
19720 * sbbl dest,dest
19721 * orl $ct, dest
19723 * Size 8.
19725 tmp = expand_simple_binop (mode, IOR,
19726 tmp, GEN_INT (ct),
19727 copy_rtx (tmp), 1, OPTAB_DIRECT);
19729 else if (diff == -1 && ct)
19732 * cmpl op0,op1
19733 * sbbl dest,dest
19734 * notl dest
19735 * [addl dest, cf]
19737 * Size 8 - 11.
19739 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19740 if (cf)
19741 tmp = expand_simple_binop (mode, PLUS,
19742 copy_rtx (tmp), GEN_INT (cf),
19743 copy_rtx (tmp), 1, OPTAB_DIRECT);
19745 else
19748 * cmpl op0,op1
19749 * sbbl dest,dest
19750 * [notl dest]
19751 * andl cf - ct, dest
19752 * [addl dest, ct]
19754 * Size 8 - 11.
19757 if (cf == 0)
19759 cf = ct;
19760 ct = 0;
19761 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19764 tmp = expand_simple_binop (mode, AND,
19765 copy_rtx (tmp),
19766 gen_int_mode (cf - ct, mode),
19767 copy_rtx (tmp), 1, OPTAB_DIRECT);
19768 if (ct)
19769 tmp = expand_simple_binop (mode, PLUS,
19770 copy_rtx (tmp), GEN_INT (ct),
19771 copy_rtx (tmp), 1, OPTAB_DIRECT);
19774 if (!rtx_equal_p (tmp, out))
19775 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19777 return true;
19780 if (diff < 0)
19782 enum machine_mode cmp_mode = GET_MODE (op0);
19784 HOST_WIDE_INT tmp;
19785 tmp = ct, ct = cf, cf = tmp;
19786 diff = -diff;
19788 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19790 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19792 /* We may be reversing an unordered compare to a normal compare, which
19793 is not valid in general (we may convert a non-trapping condition
19794 to a trapping one); however, on i386 we currently emit all
19795 comparisons unordered. */
19796 compare_code = reverse_condition_maybe_unordered (compare_code);
19797 code = reverse_condition_maybe_unordered (code);
19799 else
19801 compare_code = reverse_condition (compare_code);
19802 code = reverse_condition (code);
19806 compare_code = UNKNOWN;
19807 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19808 && CONST_INT_P (op1))
19810 if (op1 == const0_rtx
19811 && (code == LT || code == GE))
19812 compare_code = code;
19813 else if (op1 == constm1_rtx)
19815 if (code == LE)
19816 compare_code = LT;
19817 else if (code == GT)
19818 compare_code = GE;
19822 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19823 if (compare_code != UNKNOWN
19824 && GET_MODE (op0) == GET_MODE (out)
19825 && (cf == -1 || ct == -1))
19827 /* If lea code below could be used, only optimize
19828 if it results in a 2 insn sequence. */
19830 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19831 || diff == 3 || diff == 5 || diff == 9)
19832 || (compare_code == LT && ct == -1)
19833 || (compare_code == GE && cf == -1))
19836 * notl op1 (if necessary)
19837 * sarl $31, op1
19838 * orl cf, op1
19840 if (ct != -1)
19842 cf = ct;
19843 ct = -1;
19844 code = reverse_condition (code);
19847 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19849 out = expand_simple_binop (mode, IOR,
19850 out, GEN_INT (cf),
19851 out, 1, OPTAB_DIRECT);
19852 if (out != operands[0])
19853 emit_move_insn (operands[0], out);
19855 return true;
19860 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19861 || diff == 3 || diff == 5 || diff == 9)
19862 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19863 && (mode != DImode
19864 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19867 * xorl dest,dest
19868 * cmpl op1,op2
19869 * setcc dest
19870 * lea cf(dest*(ct-cf)),dest
19872 * Size 14.
19874 * This also catches the degenerate setcc-only case.
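* For example, for diff == 3 with ct = 5 and cf = 2 this emits
* setcc (giving 0 or 1) followed by lea 2(%reg,%reg,2), i.e.
* dest = dest * 3 + 2, yielding 5 or 2.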
19877 rtx tmp;
19878 int nops;
19880 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19882 nops = 0;
19883 /* On x86_64 the lea instruction operates on Pmode, so we need
19884 to get the arithmetic done in the proper mode to match. */
19885 if (diff == 1)
19886 tmp = copy_rtx (out);
19887 else
19889 rtx out1;
19890 out1 = copy_rtx (out);
19891 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19892 nops++;
19893 if (diff & 1)
19895 tmp = gen_rtx_PLUS (mode, tmp, out1);
19896 nops++;
19899 if (cf != 0)
19901 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19902 nops++;
19904 if (!rtx_equal_p (tmp, out))
19906 if (nops == 1)
19907 out = force_operand (tmp, copy_rtx (out));
19908 else
19909 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19911 if (!rtx_equal_p (out, operands[0]))
19912 emit_move_insn (operands[0], copy_rtx (out));
19914 return true;
19918 * General case: Jumpful:
19919 * xorl dest,dest cmpl op1, op2
19920 * cmpl op1, op2 movl ct, dest
19921 * setcc dest jcc 1f
19922 * decl dest movl cf, dest
19923 * andl (cf-ct),dest 1:
19924 * addl ct,dest
19926 * Size 20. Size 14.
19928 * This is reasonably steep, but branch mispredict costs are
19929 * high on modern cpus, so consider failing only if optimizing
19930 * for space.
19933 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19934 && BRANCH_COST (optimize_insn_for_speed_p (),
19935 false) >= 2)
19937 if (cf == 0)
19939 enum machine_mode cmp_mode = GET_MODE (op0);
19941 cf = ct;
19942 ct = 0;
19944 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19946 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19948 /* We may be reversing an unordered compare to a normal compare,
19949 which is not valid in general (we may convert a non-trapping
19950 condition to a trapping one); however, on i386 we currently
19951 emit all comparisons unordered. */
19952 code = reverse_condition_maybe_unordered (code);
19954 else
19956 code = reverse_condition (code);
19957 if (compare_code != UNKNOWN)
19958 compare_code = reverse_condition (compare_code);
19962 if (compare_code != UNKNOWN)
19964 /* notl op1 (if needed)
19965 sarl $31, op1
19966 andl (cf-ct), op1
19967 addl ct, op1
19969 For x < 0 (resp. x <= -1) there will be no notl,
19970 so if possible swap the constants to get rid of the
19971 complement.
19972 True/false will be -1/0 while code below (store flag
19973 followed by decrement) is 0/-1, so the constants need
19974 to be exchanged once more. */
19976 if (compare_code == GE || !cf)
19978 code = reverse_condition (code);
19979 compare_code = LT;
19981 else
19983 HOST_WIDE_INT tmp = cf;
19984 cf = ct;
19985 ct = tmp;
19988 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19990 else
19992 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19994 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19995 constm1_rtx,
19996 copy_rtx (out), 1, OPTAB_DIRECT);
19999 out = expand_simple_binop (mode, AND, copy_rtx (out),
20000 gen_int_mode (cf - ct, mode),
20001 copy_rtx (out), 1, OPTAB_DIRECT);
20002 if (ct)
20003 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20004 copy_rtx (out), 1, OPTAB_DIRECT);
20005 if (!rtx_equal_p (out, operands[0]))
20006 emit_move_insn (operands[0], copy_rtx (out));
20008 return true;
20012 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20014 /* Try a few things more with specific constants and a variable. */
20016 optab op;
20017 rtx var, orig_out, out, tmp;
20019 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20020 return false;
20022 /* If one of the two operands is an interesting constant, load a
20023 constant with the above and mask it in with a logical operation. */
20025 if (CONST_INT_P (operands[2]))
20027 var = operands[3];
20028 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20029 operands[3] = constm1_rtx, op = and_optab;
20030 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20031 operands[3] = const0_rtx, op = ior_optab;
20032 else
20033 return false;
20035 else if (CONST_INT_P (operands[3]))
20037 var = operands[2];
20038 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20039 operands[2] = constm1_rtx, op = and_optab;
20040 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20041 operands[2] = const0_rtx, op = ior_optab;
20042 else
20043 return false;
20045 else
20046 return false;
20048 orig_out = operands[0];
20049 tmp = gen_reg_rtx (mode);
20050 operands[0] = tmp;
20052 /* Recurse to get the constant loaded. */
20053 if (ix86_expand_int_movcc (operands) == 0)
20054 return false;
20056 /* Mask in the interesting variable. */
20057 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20058 OPTAB_WIDEN);
20059 if (!rtx_equal_p (out, orig_out))
20060 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20062 return true;
20066 * For comparison with above,
20068 * movl cf,dest
20069 * movl ct,tmp
20070 * cmpl op1,op2
20071 * cmovcc tmp,dest
20073 * Size 15.
20076 if (! nonimmediate_operand (operands[2], mode))
20077 operands[2] = force_reg (mode, operands[2]);
20078 if (! nonimmediate_operand (operands[3], mode))
20079 operands[3] = force_reg (mode, operands[3]);
20081 if (! register_operand (operands[2], VOIDmode)
20082 && (mode == QImode
20083 || ! register_operand (operands[3], VOIDmode)))
20084 operands[2] = force_reg (mode, operands[2]);
20086 if (mode == QImode
20087 && ! register_operand (operands[3], VOIDmode))
20088 operands[3] = force_reg (mode, operands[3]);
20090 emit_insn (compare_seq);
20091 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20092 gen_rtx_IF_THEN_ELSE (mode,
20093 compare_op, operands[2],
20094 operands[3])));
20095 return true;
20098 /* Swap, force into registers, or otherwise massage the two operands
20099 to an sse comparison with a mask result. Thus we differ a bit from
20100 ix86_prepare_fp_compare_args which expects to produce a flags result.
20102 The DEST operand exists to help determine whether to commute commutative
20103 operators. The POP0/POP1 operands are updated in place. The new
20104 comparison code is returned, or UNKNOWN if not implementable. */
20106 static enum rtx_code
20107 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20108 rtx *pop0, rtx *pop1)
20110 rtx tmp;
20112 switch (code)
20114 case LTGT:
20115 case UNEQ:
20116 /* AVX supports all the needed comparisons. */
20117 if (TARGET_AVX)
20118 break;
20119 /* We have no LTGT as an operator. We could implement it with
20120 NE & ORDERED, but this requires an extra temporary. It's
20121 not clear that it's worth it. */
20122 return UNKNOWN;
20124 case LT:
20125 case LE:
20126 case UNGT:
20127 case UNGE:
20128 /* These are supported directly. */
20129 break;
20131 case EQ:
20132 case NE:
20133 case UNORDERED:
20134 case ORDERED:
20135 /* AVX has 3 operand comparisons, no need to swap anything. */
20136 if (TARGET_AVX)
20137 break;
20138 /* For commutative operators, try to canonicalize the destination
20139 operand to be first in the comparison - this helps reload to
20140 avoid extra moves. */
20141 if (!dest || !rtx_equal_p (dest, *pop1))
20142 break;
20143 /* FALLTHRU */
20145 case GE:
20146 case GT:
20147 case UNLE:
20148 case UNLT:
20149 /* These are not supported directly before AVX, and furthermore
20150 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20151 comparison operands to transform into something that is
20152 supported. */
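/* For example, GT is rewritten as LT with swapped operands, since the
   pre-AVX cmpps/cmppd predicates provide eq/lt/le/unord/neq/nlt/nle/ord
   but no gt/ge.  */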
20153 tmp = *pop0;
20154 *pop0 = *pop1;
20155 *pop1 = tmp;
20156 code = swap_condition (code);
20157 break;
20159 default:
20160 gcc_unreachable ();
20163 return code;
20166 /* Detect conditional moves that exactly match min/max operational
20167 semantics. Note that this is IEEE safe, as long as we don't
20168 interchange the operands.
20170 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20171 and TRUE if the operation is successful and instructions are emitted. */
20173 static bool
20174 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20175 rtx cmp_op1, rtx if_true, rtx if_false)
20177 enum machine_mode mode;
20178 bool is_min;
20179 rtx tmp;
20181 if (code == LT)
20183 else if (code == UNGE)
20185 tmp = if_true;
20186 if_true = if_false;
20187 if_false = tmp;
20189 else
20190 return false;
20192 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20193 is_min = true;
20194 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20195 is_min = false;
20196 else
20197 return false;
20199 mode = GET_MODE (dest);
20201 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20202 but MODE may be a vector mode and thus not appropriate. */
20203 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20205 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20206 rtvec v;
20208 if_true = force_reg (mode, if_true);
20209 v = gen_rtvec (2, if_true, if_false);
20210 tmp = gen_rtx_UNSPEC (mode, v, u);
20212 else
20214 code = is_min ? SMIN : SMAX;
20215 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20218 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20219 return true;
20222 /* Expand an sse vector comparison. Return the register with the result. */
20224 static rtx
20225 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20226 rtx op_true, rtx op_false)
20228 enum machine_mode mode = GET_MODE (dest);
20229 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20230 rtx x;
20232 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20233 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20234 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20236 if (optimize
20237 || reg_overlap_mentioned_p (dest, op_true)
20238 || reg_overlap_mentioned_p (dest, op_false))
20239 dest = gen_reg_rtx (mode);
20241 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20242 if (cmp_mode != mode)
20244 x = force_reg (cmp_mode, x);
20245 convert_move (dest, x, false);
20247 else
20248 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20250 return dest;
20253 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20254 operations. This is used for both scalar and vector conditional moves. */
20256 static void
20257 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20259 enum machine_mode mode = GET_MODE (dest);
20260 rtx t2, t3, x;
20262 if (vector_all_ones_operand (op_true, mode)
20263 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20265 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20267 else if (op_false == CONST0_RTX (mode))
20269 op_true = force_reg (mode, op_true);
20270 x = gen_rtx_AND (mode, cmp, op_true);
20271 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20273 else if (op_true == CONST0_RTX (mode))
20275 op_false = force_reg (mode, op_false);
20276 x = gen_rtx_NOT (mode, cmp);
20277 x = gen_rtx_AND (mode, x, op_false);
20278 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20280 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20282 op_false = force_reg (mode, op_false);
20283 x = gen_rtx_IOR (mode, cmp, op_false);
20284 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20286 else if (TARGET_XOP)
20288 op_true = force_reg (mode, op_true);
20290 if (!nonimmediate_operand (op_false, mode))
20291 op_false = force_reg (mode, op_false);
20293 emit_insn (gen_rtx_SET (mode, dest,
20294 gen_rtx_IF_THEN_ELSE (mode, cmp,
20295 op_true,
20296 op_false)));
20298 else
20300 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20302 if (!nonimmediate_operand (op_true, mode))
20303 op_true = force_reg (mode, op_true);
20305 op_false = force_reg (mode, op_false);
20307 switch (mode)
20309 case V4SFmode:
20310 if (TARGET_SSE4_1)
20311 gen = gen_sse4_1_blendvps;
20312 break;
20313 case V2DFmode:
20314 if (TARGET_SSE4_1)
20315 gen = gen_sse4_1_blendvpd;
20316 break;
20317 case V16QImode:
20318 case V8HImode:
20319 case V4SImode:
20320 case V2DImode:
20321 if (TARGET_SSE4_1)
20323 gen = gen_sse4_1_pblendvb;
20324 dest = gen_lowpart (V16QImode, dest);
20325 op_false = gen_lowpart (V16QImode, op_false);
20326 op_true = gen_lowpart (V16QImode, op_true);
20327 cmp = gen_lowpart (V16QImode, cmp);
20329 break;
20330 case V8SFmode:
20331 if (TARGET_AVX)
20332 gen = gen_avx_blendvps256;
20333 break;
20334 case V4DFmode:
20335 if (TARGET_AVX)
20336 gen = gen_avx_blendvpd256;
20337 break;
20338 case V32QImode:
20339 case V16HImode:
20340 case V8SImode:
20341 case V4DImode:
20342 if (TARGET_AVX2)
20344 gen = gen_avx2_pblendvb;
20345 dest = gen_lowpart (V32QImode, dest);
20346 op_false = gen_lowpart (V32QImode, op_false);
20347 op_true = gen_lowpart (V32QImode, op_true);
20348 cmp = gen_lowpart (V32QImode, cmp);
20350 break;
20351 default:
20352 break;
20355 if (gen != NULL)
20356 emit_insn (gen (dest, op_false, op_true, cmp));
20357 else
20359 op_true = force_reg (mode, op_true);
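/* Fallback: compute dest = (cmp & op_true) | (~cmp & op_false)
   with three logical operations.  */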
20361 t2 = gen_reg_rtx (mode);
20362 if (optimize)
20363 t3 = gen_reg_rtx (mode);
20364 else
20365 t3 = dest;
20367 x = gen_rtx_AND (mode, op_true, cmp);
20368 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20370 x = gen_rtx_NOT (mode, cmp);
20371 x = gen_rtx_AND (mode, x, op_false);
20372 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20374 x = gen_rtx_IOR (mode, t3, t2);
20375 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20380 /* Expand a floating-point conditional move. Return true if successful. */
20382 bool
20383 ix86_expand_fp_movcc (rtx operands[])
20385 enum machine_mode mode = GET_MODE (operands[0]);
20386 enum rtx_code code = GET_CODE (operands[1]);
20387 rtx tmp, compare_op;
20388 rtx op0 = XEXP (operands[1], 0);
20389 rtx op1 = XEXP (operands[1], 1);
20391 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20393 enum machine_mode cmode;
20395 /* Since we've no cmove for sse registers, don't force bad register
20396 allocation just to gain access to it. Deny movcc when the
20397 comparison mode doesn't match the move mode. */
20398 cmode = GET_MODE (op0);
20399 if (cmode == VOIDmode)
20400 cmode = GET_MODE (op1);
20401 if (cmode != mode)
20402 return false;
20404 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20405 if (code == UNKNOWN)
20406 return false;
20408 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20409 operands[2], operands[3]))
20410 return true;
20412 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20413 operands[2], operands[3]);
20414 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20415 return true;
20418 if (GET_MODE (op0) == TImode
20419 || (GET_MODE (op0) == DImode
20420 && !TARGET_64BIT))
20421 return false;
20423 /* The floating point conditional move instructions don't directly
20424 support conditions resulting from a signed integer comparison. */
20426 compare_op = ix86_expand_compare (code, op0, op1);
20427 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20429 tmp = gen_reg_rtx (QImode);
20430 ix86_expand_setcc (tmp, code, op0, op1);
20432 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20435 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20436 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20437 operands[2], operands[3])));
20439 return true;
20442 /* Expand a floating-point vector conditional move; a vcond operation
20443 rather than a movcc operation. */
20445 bool
20446 ix86_expand_fp_vcond (rtx operands[])
20448 enum rtx_code code = GET_CODE (operands[3]);
20449 rtx cmp;
20451 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20452 &operands[4], &operands[5]);
20453 if (code == UNKNOWN)
20455 rtx temp;
20456 switch (GET_CODE (operands[3]))
20458 case LTGT:
20459 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20460 operands[5], operands[0], operands[0]);
20461 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20462 operands[5], operands[1], operands[2]);
20463 code = AND;
20464 break;
20465 case UNEQ:
20466 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20467 operands[5], operands[0], operands[0]);
20468 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20469 operands[5], operands[1], operands[2]);
20470 code = IOR;
20471 break;
20472 default:
20473 gcc_unreachable ();
20475 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20476 OPTAB_DIRECT);
20477 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20478 return true;
20481 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20482 operands[5], operands[1], operands[2]))
20483 return true;
20485 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20486 operands[1], operands[2]);
20487 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20488 return true;
20491 /* Expand a signed/unsigned integral vector conditional move. */
20493 bool
20494 ix86_expand_int_vcond (rtx operands[])
20496 enum machine_mode data_mode = GET_MODE (operands[0]);
20497 enum machine_mode mode = GET_MODE (operands[4]);
20498 enum rtx_code code = GET_CODE (operands[3]);
20499 bool negate = false;
20500 rtx x, cop0, cop1;
20502 cop0 = operands[4];
20503 cop1 = operands[5];
20505 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20506 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20507 if ((code == LT || code == GE)
20508 && data_mode == mode
20509 && cop1 == CONST0_RTX (mode)
20510 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20511 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20512 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20513 && (GET_MODE_SIZE (data_mode) == 16
20514 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20516 rtx negop = operands[2 - (code == LT)];
20517 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20518 if (negop == CONST1_RTX (data_mode))
20520 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20521 operands[0], 1, OPTAB_DIRECT);
20522 if (res != operands[0])
20523 emit_move_insn (operands[0], res);
20524 return true;
20526 else if (GET_MODE_INNER (data_mode) != DImode
20527 && vector_all_ones_operand (negop, data_mode))
20529 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20530 operands[0], 0, OPTAB_DIRECT);
20531 if (res != operands[0])
20532 emit_move_insn (operands[0], res);
20533 return true;
20537 if (!nonimmediate_operand (cop1, mode))
20538 cop1 = force_reg (mode, cop1);
20539 if (!general_operand (operands[1], data_mode))
20540 operands[1] = force_reg (data_mode, operands[1]);
20541 if (!general_operand (operands[2], data_mode))
20542 operands[2] = force_reg (data_mode, operands[2]);
20544 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20545 if (TARGET_XOP
20546 && (mode == V16QImode || mode == V8HImode
20547 || mode == V4SImode || mode == V2DImode))
20549 else
20551 /* Canonicalize the comparison to EQ, GT, GTU. */
20552 switch (code)
20554 case EQ:
20555 case GT:
20556 case GTU:
20557 break;
20559 case NE:
20560 case LE:
20561 case LEU:
20562 code = reverse_condition (code);
20563 negate = true;
20564 break;
20566 case GE:
20567 case GEU:
20568 code = reverse_condition (code);
20569 negate = true;
20570 /* FALLTHRU */
20572 case LT:
20573 case LTU:
20574 code = swap_condition (code);
20575 x = cop0, cop0 = cop1, cop1 = x;
20576 break;
20578 default:
20579 gcc_unreachable ();
20582 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20583 if (mode == V2DImode)
20585 switch (code)
20587 case EQ:
20588 /* SSE4.1 supports EQ. */
20589 if (!TARGET_SSE4_1)
20590 return false;
20591 break;
20593 case GT:
20594 case GTU:
20595 /* SSE4.2 supports GT/GTU. */
20596 if (!TARGET_SSE4_2)
20597 return false;
20598 break;
20600 default:
20601 gcc_unreachable ();
20605 /* Unsigned parallel compare is not supported by the hardware.
20606 Play some tricks to turn this into a signed comparison
20607 against 0. */
20608 if (code == GTU)
20610 cop0 = force_reg (mode, cop0);
20612 switch (mode)
20614 case V8SImode:
20615 case V4DImode:
20616 case V4SImode:
20617 case V2DImode:
20619 rtx t1, t2, mask;
20620 rtx (*gen_sub3) (rtx, rtx, rtx);
20622 switch (mode)
20624 case V8SImode: gen_sub3 = gen_subv8si3; break;
20625 case V4DImode: gen_sub3 = gen_subv4di3; break;
20626 case V4SImode: gen_sub3 = gen_subv4si3; break;
20627 case V2DImode: gen_sub3 = gen_subv2di3; break;
20628 default:
20629 gcc_unreachable ();
20631 /* Subtract (-(INT MAX) - 1) from both operands to make
20632 them signed. */
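/* Subtracting the sign-bit constant just flips the sign bit, so
   a >u b becomes (a ^ msb) >s (b ^ msb), turning the unsigned
   compare into a signed one.  */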
20633 mask = ix86_build_signbit_mask (mode, true, false);
20634 t1 = gen_reg_rtx (mode);
20635 emit_insn (gen_sub3 (t1, cop0, mask));
20637 t2 = gen_reg_rtx (mode);
20638 emit_insn (gen_sub3 (t2, cop1, mask));
20640 cop0 = t1;
20641 cop1 = t2;
20642 code = GT;
20644 break;
20646 case V32QImode:
20647 case V16HImode:
20648 case V16QImode:
20649 case V8HImode:
20650 /* Perform a parallel unsigned saturating subtraction. */
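/* a >u b is equivalent to (a -us b) != 0, where -us is unsigned
   saturating subtraction; since only EQ is available here, we instead
   test (a -us b) == 0 and flip NEGATE.  */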
20651 x = gen_reg_rtx (mode);
20652 emit_insn (gen_rtx_SET (VOIDmode, x,
20653 gen_rtx_US_MINUS (mode, cop0, cop1)));
20655 cop0 = x;
20656 cop1 = CONST0_RTX (mode);
20657 code = EQ;
20658 negate = !negate;
20659 break;
20661 default:
20662 gcc_unreachable ();
20667 /* Allow the comparison to be done in one mode, but the movcc to
20668 happen in another mode. */
20669 if (data_mode == mode)
20671 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20672 operands[1+negate], operands[2-negate]);
20674 else
20676 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20677 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20678 code, cop0, cop1,
20679 operands[1+negate], operands[2-negate]);
20680 x = gen_lowpart (data_mode, x);
20683 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20684 operands[2-negate]);
20685 return true;
20688 /* Expand a variable vector permutation. */
20690 void
20691 ix86_expand_vec_perm (rtx operands[])
20693 rtx target = operands[0];
20694 rtx op0 = operands[1];
20695 rtx op1 = operands[2];
20696 rtx mask = operands[3];
20697 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20698 enum machine_mode mode = GET_MODE (op0);
20699 enum machine_mode maskmode = GET_MODE (mask);
20700 int w, e, i;
20701 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20703 /* Number of elements in the vector. */
20704 w = GET_MODE_NUNITS (mode);
20705 e = GET_MODE_UNIT_SIZE (mode);
20706 gcc_assert (w <= 32);
20708 if (TARGET_AVX2)
20710 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20712 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20713 a constant shuffle operand. With a tiny bit of effort we can
20714 use VPERMD instead. A re-interpretation stall for V4DFmode is
20715 unfortunate but there's no avoiding it.
20716 Similarly, V16HImode has no instructions for variable shuffling,
20717 while for V32QImode we can, after preparing suitable masks, use
20718 vpshufb; vpshufb; vpermq; vpor. */
20720 if (mode == V16HImode)
20722 maskmode = mode = V32QImode;
20723 w = 32;
20724 e = 1;
20726 else
20728 maskmode = mode = V8SImode;
20729 w = 8;
20730 e = 4;
20732 t1 = gen_reg_rtx (maskmode);
20734 /* Replicate the low bits of the V4DImode mask into V8SImode:
20735 mask = { A B C D }
20736 t1 = { A A B B C C D D }. */
20737 for (i = 0; i < w / 2; ++i)
20738 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20739 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20740 vt = force_reg (maskmode, vt);
20741 mask = gen_lowpart (maskmode, mask);
20742 if (maskmode == V8SImode)
20743 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20744 else
20745 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20747 /* Multiply the shuffle indices by two. */
20748 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20749 OPTAB_DIRECT);
20751 /* Add one to the odd shuffle indices:
20752 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20753 for (i = 0; i < w / 2; ++i)
20755 vec[i * 2] = const0_rtx;
20756 vec[i * 2 + 1] = const1_rtx;
20758 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20759 vt = validize_mem (force_const_mem (maskmode, vt));
20760 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20761 OPTAB_DIRECT);
20763 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20764 operands[3] = mask = t1;
20765 target = gen_lowpart (mode, target);
20766 op0 = gen_lowpart (mode, op0);
20767 op1 = gen_lowpart (mode, op1);
20770 switch (mode)
20772 case V8SImode:
20773 /* The VPERMD and VPERMPS instructions already properly ignore
20774 the high bits of the shuffle elements. No need for us to
20775 perform an AND ourselves. */
20776 if (one_operand_shuffle)
20777 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20778 else
20780 t1 = gen_reg_rtx (V8SImode);
20781 t2 = gen_reg_rtx (V8SImode);
20782 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20783 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20784 goto merge_two;
20786 return;
20788 case V8SFmode:
20789 mask = gen_lowpart (V8SFmode, mask);
20790 if (one_operand_shuffle)
20791 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20792 else
20794 t1 = gen_reg_rtx (V8SFmode);
20795 t2 = gen_reg_rtx (V8SFmode);
20796 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20797 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20798 goto merge_two;
20800 return;
20802 case V4SImode:
20803 /* By combining the two 128-bit input vectors into one 256-bit
20804 input vector, we can use VPERMD and VPERMPS for the full
20805 two-operand shuffle. */
20806 t1 = gen_reg_rtx (V8SImode);
20807 t2 = gen_reg_rtx (V8SImode);
20808 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20809 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20810 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20811 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20812 return;
20814 case V4SFmode:
20815 t1 = gen_reg_rtx (V8SFmode);
20816 t2 = gen_reg_rtx (V8SImode);
20817 mask = gen_lowpart (V4SImode, mask);
20818 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20819 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20820 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20821 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20822 return;
20824 case V32QImode:
20825 t1 = gen_reg_rtx (V32QImode);
20826 t2 = gen_reg_rtx (V32QImode);
20827 t3 = gen_reg_rtx (V32QImode);
20828 vt2 = GEN_INT (128);
20829 for (i = 0; i < 32; i++)
20830 vec[i] = vt2;
20831 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20832 vt = force_reg (V32QImode, vt);
20833 for (i = 0; i < 32; i++)
20834 vec[i] = i < 16 ? vt2 : const0_rtx;
20835 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20836 vt2 = force_reg (V32QImode, vt2);
20837 /* From mask create two adjusted masks, which contain the same
20838 bits as mask in the low 7 bits of each vector element.
20839 The first mask will have the most significant bit clear
20840 if it requests element from the same 128-bit lane
20841 and MSB set if it requests element from the other 128-bit lane.
20842 The second mask will have the opposite values of the MSB,
20843 and additionally will have its 128-bit lanes swapped.
20844 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20845 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20846 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20847 stands for other 12 bytes. */
20848 /* The bit that says whether an element is from the same lane or the
20849 other lane is bit 4, so shift it up by 3 to the MSB position. */
20850 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20851 gen_lowpart (V4DImode, mask),
20852 GEN_INT (3)));
20853 /* Clear MSB bits from the mask just in case it had them set. */
20854 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20855 /* After this t1 will have MSB set for elements from other lane. */
20856 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20857 /* Clear bits other than MSB. */
20858 emit_insn (gen_andv32qi3 (t1, t1, vt));
20859 /* Or in the lower bits from mask into t3. */
20860 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20861 /* And invert MSB bits in t1, so MSB is set for elements from the same
20862 lane. */
20863 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20864 /* Swap 128-bit lanes in t3. */
20865 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20866 gen_lowpart (V4DImode, t3),
20867 const2_rtx, GEN_INT (3),
20868 const0_rtx, const1_rtx));
20869 /* And or in the lower bits from mask into t1. */
20870 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20871 if (one_operand_shuffle)
20873 /* Each of these shuffles will put 0s in places where
20874 element from the other 128-bit lane is needed, otherwise
20875 will shuffle in the requested value. */
20876 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20877 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20878 /* For t3 the 128-bit lanes are swapped again. */
20879 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20880 gen_lowpart (V4DImode, t3),
20881 const2_rtx, GEN_INT (3),
20882 const0_rtx, const1_rtx));
20883 /* And oring both together leads to the result. */
20884 emit_insn (gen_iorv32qi3 (target, t1, t3));
20885 return;
20888 t4 = gen_reg_rtx (V32QImode);
20889 /* Similar to the above one_operand_shuffle code, just repeated
20890 twice, once for each operand. The merge_two: code below will
20891 merge the two results together. */
20892 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20893 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20894 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20895 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20896 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20897 gen_lowpart (V4DImode, t4),
20898 const2_rtx, GEN_INT (3),
20899 const0_rtx, const1_rtx));
20900 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20901 gen_lowpart (V4DImode, t3),
20902 const2_rtx, GEN_INT (3),
20903 const0_rtx, const1_rtx));
20904 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20905 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20906 t1 = t4;
20907 t2 = t3;
20908 goto merge_two;
20910 default:
20911 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20912 break;
20916 if (TARGET_XOP)
20918 /* The XOP VPPERM insn supports three inputs. By ignoring the
20919 one_operand_shuffle special case, we avoid creating another
20920 set of constant vectors in memory. */
20921 one_operand_shuffle = false;
20923 /* mask = mask & {2*w-1, ...} */
20924 vt = GEN_INT (2*w - 1);
20926 else
20928 /* mask = mask & {w-1, ...} */
20929 vt = GEN_INT (w - 1);
20932 for (i = 0; i < w; i++)
20933 vec[i] = vt;
20934 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20935 mask = expand_simple_binop (maskmode, AND, mask, vt,
20936 NULL_RTX, 0, OPTAB_DIRECT);
20938 /* For non-QImode operations, convert the word permutation control
20939 into a byte permutation control. */
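/* E.g. for V4SImode (e == 4) a control element of 2 is first scaled
   to 8, replicated to { 8, 8, 8, 8 } by the pshufb below, and then
   turned into the byte indices { 8, 9, 10, 11 } by the final add.  */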
20940 if (mode != V16QImode)
20942 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20943 GEN_INT (exact_log2 (e)),
20944 NULL_RTX, 0, OPTAB_DIRECT);
20946 /* Convert mask to vector of chars. */
20947 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20949 /* Replicate each of the input bytes into byte positions:
20950 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20951 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20952 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20953 for (i = 0; i < 16; ++i)
20954 vec[i] = GEN_INT (i/e * e);
20955 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20956 vt = validize_mem (force_const_mem (V16QImode, vt));
20957 if (TARGET_XOP)
20958 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20959 else
20960 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20962 /* Convert it into the byte positions by doing
20963 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20964 for (i = 0; i < 16; ++i)
20965 vec[i] = GEN_INT (i % e);
20966 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20967 vt = validize_mem (force_const_mem (V16QImode, vt));
20968 emit_insn (gen_addv16qi3 (mask, mask, vt));
20971 /* The actual shuffle operations all operate on V16QImode. */
20972 op0 = gen_lowpart (V16QImode, op0);
20973 op1 = gen_lowpart (V16QImode, op1);
20974 target = gen_lowpart (V16QImode, target);
20976 if (TARGET_XOP)
20978 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20980 else if (one_operand_shuffle)
20982 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20984 else
20986 rtx xops[6];
20987 bool ok;
20989 /* Shuffle the two input vectors independently. */
20990 t1 = gen_reg_rtx (V16QImode);
20991 t2 = gen_reg_rtx (V16QImode);
20992 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20993 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20995 merge_two:
20996 /* Then merge them together. The key is whether any given control
20997 element contained a bit set that indicates the second word. */
20998 mask = operands[3];
20999 vt = GEN_INT (w);
21000 if (maskmode == V2DImode && !TARGET_SSE4_1)
21002 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21003 more shuffle to convert the V2DI input mask into a V4SI
21004 input mask, at which point the masking done by
21005 ix86_expand_int_vcond will work as desired. */
21006 rtx t3 = gen_reg_rtx (V4SImode);
21007 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21008 const0_rtx, const0_rtx,
21009 const2_rtx, const2_rtx));
21010 mask = t3;
21011 maskmode = V4SImode;
21012 e = w = 4;
21015 for (i = 0; i < w; i++)
21016 vec[i] = vt;
21017 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21018 vt = force_reg (maskmode, vt);
21019 mask = expand_simple_binop (maskmode, AND, mask, vt,
21020 NULL_RTX, 0, OPTAB_DIRECT);
21022 xops[0] = gen_lowpart (mode, operands[0]);
21023 xops[1] = gen_lowpart (mode, t2);
21024 xops[2] = gen_lowpart (mode, t1);
21025 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21026 xops[4] = mask;
21027 xops[5] = vt;
21028 ok = ix86_expand_int_vcond (xops);
21029 gcc_assert (ok);
21033 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
21034 true if we should do zero extension, else sign extension. HIGH_P is
21035 true if we want the N/2 high elements, else the low elements. */
21037 void
21038 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21040 enum machine_mode imode = GET_MODE (src);
21041 rtx tmp;
21043 if (TARGET_SSE4_1)
21045 rtx (*unpack)(rtx, rtx);
21046 rtx (*extract)(rtx, rtx) = NULL;
21047 enum machine_mode halfmode = BLKmode;
21049 switch (imode)
21051 case V32QImode:
21052 if (unsigned_p)
21053 unpack = gen_avx2_zero_extendv16qiv16hi2;
21054 else
21055 unpack = gen_avx2_sign_extendv16qiv16hi2;
21056 halfmode = V16QImode;
21057 extract
21058 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21059 break;
21060 case V16HImode:
21061 if (unsigned_p)
21062 unpack = gen_avx2_zero_extendv8hiv8si2;
21063 else
21064 unpack = gen_avx2_sign_extendv8hiv8si2;
21065 halfmode = V8HImode;
21066 extract
21067 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21068 break;
21069 case V8SImode:
21070 if (unsigned_p)
21071 unpack = gen_avx2_zero_extendv4siv4di2;
21072 else
21073 unpack = gen_avx2_sign_extendv4siv4di2;
21074 halfmode = V4SImode;
21075 extract
21076 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21077 break;
21078 case V16QImode:
21079 if (unsigned_p)
21080 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21081 else
21082 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21083 break;
21084 case V8HImode:
21085 if (unsigned_p)
21086 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21087 else
21088 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21089 break;
21090 case V4SImode:
21091 if (unsigned_p)
21092 unpack = gen_sse4_1_zero_extendv2siv2di2;
21093 else
21094 unpack = gen_sse4_1_sign_extendv2siv2di2;
21095 break;
21096 default:
21097 gcc_unreachable ();
21100 if (GET_MODE_SIZE (imode) == 32)
21102 tmp = gen_reg_rtx (halfmode);
21103 emit_insn (extract (tmp, src));
21105 else if (high_p)
21107 /* Shift higher 8 bytes to lower 8 bytes. */
21108 tmp = gen_reg_rtx (imode);
21109 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
21110 gen_lowpart (V1TImode, src),
21111 GEN_INT (64)));
21113 else
21114 tmp = src;
21116 emit_insn (unpack (dest, tmp));
21118 else
21120 rtx (*unpack)(rtx, rtx, rtx);
21122 switch (imode)
21124 case V16QImode:
21125 if (high_p)
21126 unpack = gen_vec_interleave_highv16qi;
21127 else
21128 unpack = gen_vec_interleave_lowv16qi;
21129 break;
21130 case V8HImode:
21131 if (high_p)
21132 unpack = gen_vec_interleave_highv8hi;
21133 else
21134 unpack = gen_vec_interleave_lowv8hi;
21135 break;
21136 case V4SImode:
21137 if (high_p)
21138 unpack = gen_vec_interleave_highv4si;
21139 else
21140 unpack = gen_vec_interleave_lowv4si;
21141 break;
21142 default:
21143 gcc_unreachable ();
21146 if (unsigned_p)
21147 tmp = force_reg (imode, CONST0_RTX (imode));
21148 else
21149 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21150 src, pc_rtx, pc_rtx);
21152 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
21156 /* Expand conditional increment or decrement using adc/sbb instructions.
21157 The default case using setcc followed by the conditional move can be
21158 done by generic code. */
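/* A rough illustration (not a literal dump of the expander's output): for
   something like "if (a < b) x++;" with unsigned operands, the code below
   reduces to
       cmp  b, a        ; carry flag = (a < b)
       adc  $0, x       ; x += carry
   and the conditional decrement case uses "sbb $0, x" the same way.  */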
21159 bool
21160 ix86_expand_int_addcc (rtx operands[])
21162 enum rtx_code code = GET_CODE (operands[1]);
21163 rtx flags;
21164 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21165 rtx compare_op;
21166 rtx val = const0_rtx;
21167 bool fpcmp = false;
21168 enum machine_mode mode;
21169 rtx op0 = XEXP (operands[1], 0);
21170 rtx op1 = XEXP (operands[1], 1);
21172 if (operands[3] != const1_rtx
21173 && operands[3] != constm1_rtx)
21174 return false;
21175 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21176 return false;
21177 code = GET_CODE (compare_op);
21179 flags = XEXP (compare_op, 0);
21181 if (GET_MODE (flags) == CCFPmode
21182 || GET_MODE (flags) == CCFPUmode)
21184 fpcmp = true;
21185 code = ix86_fp_compare_code_to_integer (code);
21188 if (code != LTU)
21190 val = constm1_rtx;
21191 if (fpcmp)
21192 PUT_CODE (compare_op,
21193 reverse_condition_maybe_unordered
21194 (GET_CODE (compare_op)));
21195 else
21196 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21199 mode = GET_MODE (operands[0]);
21201 /* Construct either adc or sbb insn. */
21202 if ((code == LTU) == (operands[3] == constm1_rtx))
21204 switch (mode)
21206 case QImode:
21207 insn = gen_subqi3_carry;
21208 break;
21209 case HImode:
21210 insn = gen_subhi3_carry;
21211 break;
21212 case SImode:
21213 insn = gen_subsi3_carry;
21214 break;
21215 case DImode:
21216 insn = gen_subdi3_carry;
21217 break;
21218 default:
21219 gcc_unreachable ();
21222 else
21224 switch (mode)
21226 case QImode:
21227 insn = gen_addqi3_carry;
21228 break;
21229 case HImode:
21230 insn = gen_addhi3_carry;
21231 break;
21232 case SImode:
21233 insn = gen_addsi3_carry;
21234 break;
21235 case DImode:
21236 insn = gen_adddi3_carry;
21237 break;
21238 default:
21239 gcc_unreachable ();
21242 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21244 return true;
21248 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21249 but works for floating point parameters and non-offsettable memories.
21250 For pushes, it returns just stack offsets; the values will be saved
21251 in the right order. Maximally four parts are generated. */
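/* Example of the splitting (a sketch): on a 32-bit target a DFmode constant
   such as 1.0 comes back as two SImode immediates holding the low and high
   words of its IEEE-754 image, parts[0] = 0x00000000 and
   parts[1] = 0x3ff00000; an XFmode value yields three parts, and a push
   operand is simply returned in word_mode for every part.  */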
21253 static int
21254 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21256 int size;
21258 if (!TARGET_64BIT)
21259 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21260 else
21261 size = (GET_MODE_SIZE (mode) + 4) / 8;
21263 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21264 gcc_assert (size >= 2 && size <= 4);
21266 /* Optimize constant pool reference to immediates. This is used by fp
21267 moves, which force all constants to memory to allow combining. */
21268 if (MEM_P (operand) && MEM_READONLY_P (operand))
21270 rtx tmp = maybe_get_pool_constant (operand);
21271 if (tmp)
21272 operand = tmp;
21275 if (MEM_P (operand) && !offsettable_memref_p (operand))
21277 /* The only non-offsettable memories we handle are pushes. */
21278 int ok = push_operand (operand, VOIDmode);
21280 gcc_assert (ok);
21282 operand = copy_rtx (operand);
21283 PUT_MODE (operand, word_mode);
21284 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21285 return size;
21288 if (GET_CODE (operand) == CONST_VECTOR)
21290 enum machine_mode imode = int_mode_for_mode (mode);
21291 /* Caution: if we looked through a constant pool memory above,
21292 the operand may actually have a different mode now. That's
21293 ok, since we want to pun this all the way back to an integer. */
21294 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21295 gcc_assert (operand != NULL);
21296 mode = imode;
21299 if (!TARGET_64BIT)
21301 if (mode == DImode)
21302 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21303 else
21305 int i;
21307 if (REG_P (operand))
21309 gcc_assert (reload_completed);
21310 for (i = 0; i < size; i++)
21311 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21313 else if (offsettable_memref_p (operand))
21315 operand = adjust_address (operand, SImode, 0);
21316 parts[0] = operand;
21317 for (i = 1; i < size; i++)
21318 parts[i] = adjust_address (operand, SImode, 4 * i);
21320 else if (GET_CODE (operand) == CONST_DOUBLE)
21322 REAL_VALUE_TYPE r;
21323 long l[4];
21325 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21326 switch (mode)
21328 case TFmode:
21329 real_to_target (l, &r, mode);
21330 parts[3] = gen_int_mode (l[3], SImode);
21331 parts[2] = gen_int_mode (l[2], SImode);
21332 break;
21333 case XFmode:
21334 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21335 long double may not be 80-bit. */
21336 real_to_target (l, &r, mode);
21337 parts[2] = gen_int_mode (l[2], SImode);
21338 break;
21339 case DFmode:
21340 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21341 break;
21342 default:
21343 gcc_unreachable ();
21345 parts[1] = gen_int_mode (l[1], SImode);
21346 parts[0] = gen_int_mode (l[0], SImode);
21348 else
21349 gcc_unreachable ();
21352 else
21354 if (mode == TImode)
21355 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21356 if (mode == XFmode || mode == TFmode)
21358 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21359 if (REG_P (operand))
21361 gcc_assert (reload_completed);
21362 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21363 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21365 else if (offsettable_memref_p (operand))
21367 operand = adjust_address (operand, DImode, 0);
21368 parts[0] = operand;
21369 parts[1] = adjust_address (operand, upper_mode, 8);
21371 else if (GET_CODE (operand) == CONST_DOUBLE)
21373 REAL_VALUE_TYPE r;
21374 long l[4];
21376 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21377 real_to_target (l, &r, mode);
21379 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21380 if (HOST_BITS_PER_WIDE_INT >= 64)
21381 parts[0]
21382 = gen_int_mode
21383 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21384 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21385 DImode);
21386 else
21387 parts[0] = immed_double_const (l[0], l[1], DImode);
21389 if (upper_mode == SImode)
21390 parts[1] = gen_int_mode (l[2], SImode);
21391 else if (HOST_BITS_PER_WIDE_INT >= 64)
21392 parts[1]
21393 = gen_int_mode
21394 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21395 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21396 DImode);
21397 else
21398 parts[1] = immed_double_const (l[2], l[3], DImode);
21400 else
21401 gcc_unreachable ();
21405 return size;
21408 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21409 All required insns are emitted here. Operands 2-4 contain the
21410 input values in the correct order; operands 5-7 contain the
21411 output values. */
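/* To make the collision handling below concrete (an illustrative sketch):
   a DImode load into %edx:%eax from memory addressed by %eax becomes
       mov  4(%eax), %edx
       mov  (%eax), %eax
   i.e. the parts are ordered so the address register is overwritten last;
   with more overlaps the code instead computes the address with an lea
   and uses only one colliding move.  */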
21413 void
21414 ix86_split_long_move (rtx operands[])
21416 rtx part[2][4];
21417 int nparts, i, j;
21418 int push = 0;
21419 int collisions = 0;
21420 enum machine_mode mode = GET_MODE (operands[0]);
21421 bool collisionparts[4];
21423 /* The DFmode expanders may ask us to move a double.
21424 For a 64-bit target this is a single move. By hiding that fact
21425 here we simplify the i386.md splitters. */
21426 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21428 /* Optimize constant pool reference to immediates. This is used by
21429 fp moves, which force all constants to memory to allow combining. */
21431 if (MEM_P (operands[1])
21432 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21433 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21434 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21435 if (push_operand (operands[0], VOIDmode))
21437 operands[0] = copy_rtx (operands[0]);
21438 PUT_MODE (operands[0], word_mode);
21440 else
21441 operands[0] = gen_lowpart (DImode, operands[0]);
21442 operands[1] = gen_lowpart (DImode, operands[1]);
21443 emit_move_insn (operands[0], operands[1]);
21444 return;
21447 /* The only non-offsettable memory we handle is push. */
21448 if (push_operand (operands[0], VOIDmode))
21449 push = 1;
21450 else
21451 gcc_assert (!MEM_P (operands[0])
21452 || offsettable_memref_p (operands[0]));
21454 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21455 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21457 /* When emitting a push, take care of source operands on the stack. */
21458 if (push && MEM_P (operands[1])
21459 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21461 rtx src_base = XEXP (part[1][nparts - 1], 0);
21463 /* Compensate for the stack decrement by 4. */
21464 if (!TARGET_64BIT && nparts == 3
21465 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21466 src_base = plus_constant (Pmode, src_base, 4);
21468 /* src_base refers to the stack pointer and is
21469 automatically decreased by emitted push. */
21470 for (i = 0; i < nparts; i++)
21471 part[1][i] = change_address (part[1][i],
21472 GET_MODE (part[1][i]), src_base);
21475 /* We need to do the copy in the right order in case an address register
21476 of the source overlaps the destination. */
21477 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21479 rtx tmp;
21481 for (i = 0; i < nparts; i++)
21483 collisionparts[i]
21484 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21485 if (collisionparts[i])
21486 collisions++;
21489 /* Collision in the middle part can be handled by reordering. */
21490 if (collisions == 1 && nparts == 3 && collisionparts [1])
21492 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21493 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21495 else if (collisions == 1
21496 && nparts == 4
21497 && (collisionparts [1] || collisionparts [2]))
21499 if (collisionparts [1])
21501 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21502 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21504 else
21506 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21507 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21511 /* If there are more collisions, we can't handle it by reordering.
21512 Do an lea to the last part and use only one colliding move. */
21513 else if (collisions > 1)
21515 rtx base;
21517 collisions = 1;
21519 base = part[0][nparts - 1];
21521 /* Handle the case when the last part isn't valid for lea.
21522 Happens in 64-bit mode storing the 12-byte XFmode. */
21523 if (GET_MODE (base) != Pmode)
21524 base = gen_rtx_REG (Pmode, REGNO (base));
21526 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21527 part[1][0] = replace_equiv_address (part[1][0], base);
21528 for (i = 1; i < nparts; i++)
21530 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21531 part[1][i] = replace_equiv_address (part[1][i], tmp);
21536 if (push)
21538 if (!TARGET_64BIT)
21540 if (nparts == 3)
21542 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21543 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21544 stack_pointer_rtx, GEN_INT (-4)));
21545 emit_move_insn (part[0][2], part[1][2]);
21547 else if (nparts == 4)
21549 emit_move_insn (part[0][3], part[1][3]);
21550 emit_move_insn (part[0][2], part[1][2]);
21553 else
21555 /* In 64-bit mode we don't have a 32-bit push available. In case this is a
21556 register, that is OK - we will just use the larger counterpart. We also
21557 retype the memory - this comes from an attempt to avoid a REX prefix on
21558 moving the second half of a TFmode value. */
21559 if (GET_MODE (part[1][1]) == SImode)
21561 switch (GET_CODE (part[1][1]))
21563 case MEM:
21564 part[1][1] = adjust_address (part[1][1], DImode, 0);
21565 break;
21567 case REG:
21568 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21569 break;
21571 default:
21572 gcc_unreachable ();
21575 if (GET_MODE (part[1][0]) == SImode)
21576 part[1][0] = part[1][1];
21579 emit_move_insn (part[0][1], part[1][1]);
21580 emit_move_insn (part[0][0], part[1][0]);
21581 return;
21584 /* Choose correct order to not overwrite the source before it is copied. */
21585 if ((REG_P (part[0][0])
21586 && REG_P (part[1][1])
21587 && (REGNO (part[0][0]) == REGNO (part[1][1])
21588 || (nparts == 3
21589 && REGNO (part[0][0]) == REGNO (part[1][2]))
21590 || (nparts == 4
21591 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21592 || (collisions > 0
21593 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21595 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21597 operands[2 + i] = part[0][j];
21598 operands[6 + i] = part[1][j];
21601 else
21603 for (i = 0; i < nparts; i++)
21605 operands[2 + i] = part[0][i];
21606 operands[6 + i] = part[1][i];
21610 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21611 if (optimize_insn_for_size_p ())
21613 for (j = 0; j < nparts - 1; j++)
21614 if (CONST_INT_P (operands[6 + j])
21615 && operands[6 + j] != const0_rtx
21616 && REG_P (operands[2 + j]))
21617 for (i = j; i < nparts - 1; i++)
21618 if (CONST_INT_P (operands[7 + i])
21619 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21620 operands[7 + i] = operands[2 + j];
21623 for (i = 0; i < nparts; i++)
21624 emit_move_insn (operands[2 + i], operands[6 + i]);
21626 return;
21629 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21630 left shift by a constant, either using a single shift or
21631 a sequence of add instructions, whichever is cheaper. */
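/* For instance (illustrative, subject to the cost tables): a left shift by 1
   is emitted as a single "add %reg, %reg"; a shift by 2 may become two such
   adds when adds are cheaper than a constant shift and we are not optimizing
   for size; otherwise a plain "shl $count, %reg" is used.  */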
21633 static void
21634 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21636 rtx (*insn)(rtx, rtx, rtx);
21638 if (count == 1
21639 || (count * ix86_cost->add <= ix86_cost->shift_const
21640 && !optimize_insn_for_size_p ()))
21642 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21643 while (count-- > 0)
21644 emit_insn (insn (operand, operand, operand));
21646 else
21648 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21649 emit_insn (insn (operand, operand, GEN_INT (count)));
21653 void
21654 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21656 rtx (*gen_ashl3)(rtx, rtx, rtx);
21657 rtx (*gen_shld)(rtx, rtx, rtx);
21658 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21660 rtx low[2], high[2];
21661 int count;
21663 if (CONST_INT_P (operands[2]))
21665 split_double_mode (mode, operands, 2, low, high);
21666 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21668 if (count >= half_width)
21670 emit_move_insn (high[0], low[1]);
21671 emit_move_insn (low[0], const0_rtx);
21673 if (count > half_width)
21674 ix86_expand_ashl_const (high[0], count - half_width, mode);
21676 else
21678 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21680 if (!rtx_equal_p (operands[0], operands[1]))
21681 emit_move_insn (operands[0], operands[1]);
21683 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21684 ix86_expand_ashl_const (low[0], count, mode);
21686 return;
21689 split_double_mode (mode, operands, 1, low, high);
21691 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21693 if (operands[1] == const1_rtx)
21695 /* Assuming we've chosen QImode-capable registers, 1 << N
21696 can be done with two 32/64-bit shifts, no branches, no cmoves. */
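/* Sketch of the sequence this branch produces for a DImode 1 << n on a
   32-bit target (register choice illustrative):
       xor   %eax, %eax          ; low  = 0
       xor   %edx, %edx          ; high = 0
       test  $32, %cl
       sete  %al                 ; low  = ((n & 32) == 0)
       setne %dl                 ; high = ((n & 32) != 0)
   followed by the two variable shifts emitted after this if/else, whose
   counts the hardware masks to the 0..31 range.  */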
21697 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21699 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21701 ix86_expand_clear (low[0]);
21702 ix86_expand_clear (high[0]);
21703 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21705 d = gen_lowpart (QImode, low[0]);
21706 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21707 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21708 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21710 d = gen_lowpart (QImode, high[0]);
21711 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21712 s = gen_rtx_NE (QImode, flags, const0_rtx);
21713 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21716 /* Otherwise, we can get the same results by manually performing
21717 a bit extract operation on bit 5/6, and then performing the two
21718 shifts. The two methods of getting 0/1 into low/high are exactly
21719 the same size. Avoiding the shift in the bit extract case helps
21720 pentium4 a bit; no one else seems to care much either way. */
21721 else
21723 enum machine_mode half_mode;
21724 rtx (*gen_lshr3)(rtx, rtx, rtx);
21725 rtx (*gen_and3)(rtx, rtx, rtx);
21726 rtx (*gen_xor3)(rtx, rtx, rtx);
21727 HOST_WIDE_INT bits;
21728 rtx x;
21730 if (mode == DImode)
21732 half_mode = SImode;
21733 gen_lshr3 = gen_lshrsi3;
21734 gen_and3 = gen_andsi3;
21735 gen_xor3 = gen_xorsi3;
21736 bits = 5;
21738 else
21740 half_mode = DImode;
21741 gen_lshr3 = gen_lshrdi3;
21742 gen_and3 = gen_anddi3;
21743 gen_xor3 = gen_xordi3;
21744 bits = 6;
21747 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21748 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21749 else
21750 x = gen_lowpart (half_mode, operands[2]);
21751 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21753 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21754 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21755 emit_move_insn (low[0], high[0]);
21756 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21759 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21760 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21761 return;
21764 if (operands[1] == constm1_rtx)
21766 /* For -1 << N, we can avoid the shld instruction, because we
21767 know that we're shifting 0...31/63 ones into a -1. */
21768 emit_move_insn (low[0], constm1_rtx);
21769 if (optimize_insn_for_size_p ())
21770 emit_move_insn (high[0], low[0]);
21771 else
21772 emit_move_insn (high[0], constm1_rtx);
21774 else
21776 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21778 if (!rtx_equal_p (operands[0], operands[1]))
21779 emit_move_insn (operands[0], operands[1]);
21781 split_double_mode (mode, operands, 1, low, high);
21782 emit_insn (gen_shld (high[0], low[0], operands[2]));
21785 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21787 if (TARGET_CMOVE && scratch)
21789 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21790 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21792 ix86_expand_clear (scratch);
21793 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21795 else
21797 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21798 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21800 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21804 void
21805 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21807 rtx (*gen_ashr3)(rtx, rtx, rtx)
21808 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21809 rtx (*gen_shrd)(rtx, rtx, rtx);
21810 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21812 rtx low[2], high[2];
21813 int count;
21815 if (CONST_INT_P (operands[2]))
21817 split_double_mode (mode, operands, 2, low, high);
21818 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21820 if (count == GET_MODE_BITSIZE (mode) - 1)
21822 emit_move_insn (high[0], high[1]);
21823 emit_insn (gen_ashr3 (high[0], high[0],
21824 GEN_INT (half_width - 1)));
21825 emit_move_insn (low[0], high[0]);
21828 else if (count >= half_width)
21830 emit_move_insn (low[0], high[1]);
21831 emit_move_insn (high[0], low[0]);
21832 emit_insn (gen_ashr3 (high[0], high[0],
21833 GEN_INT (half_width - 1)));
21835 if (count > half_width)
21836 emit_insn (gen_ashr3 (low[0], low[0],
21837 GEN_INT (count - half_width)));
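/* E.g. (illustrative): a DImode arithmetic shift right by 40 on a 32-bit
   target moves the old high word into the low result, fills the high
   result with the sign bit via a shift by 31, and then shifts the low
   result arithmetically by 40 - 32 = 8.  */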
21839 else
21841 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21843 if (!rtx_equal_p (operands[0], operands[1]))
21844 emit_move_insn (operands[0], operands[1]);
21846 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21847 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21850 else
21852 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21854 if (!rtx_equal_p (operands[0], operands[1]))
21855 emit_move_insn (operands[0], operands[1]);
21857 split_double_mode (mode, operands, 1, low, high);
21859 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21860 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21862 if (TARGET_CMOVE && scratch)
21864 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21865 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21867 emit_move_insn (scratch, high[0]);
21868 emit_insn (gen_ashr3 (scratch, scratch,
21869 GEN_INT (half_width - 1)));
21870 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21871 scratch));
21873 else
21875 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21876 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21878 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21883 void
21884 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21886 rtx (*gen_lshr3)(rtx, rtx, rtx)
21887 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21888 rtx (*gen_shrd)(rtx, rtx, rtx);
21889 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21891 rtx low[2], high[2];
21892 int count;
21894 if (CONST_INT_P (operands[2]))
21896 split_double_mode (mode, operands, 2, low, high);
21897 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21899 if (count >= half_width)
21901 emit_move_insn (low[0], high[1]);
21902 ix86_expand_clear (high[0]);
21904 if (count > half_width)
21905 emit_insn (gen_lshr3 (low[0], low[0],
21906 GEN_INT (count - half_width)));
21908 else
21910 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21912 if (!rtx_equal_p (operands[0], operands[1]))
21913 emit_move_insn (operands[0], operands[1]);
21915 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21916 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21919 else
21921 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21923 if (!rtx_equal_p (operands[0], operands[1]))
21924 emit_move_insn (operands[0], operands[1]);
21926 split_double_mode (mode, operands, 1, low, high);
21928 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21929 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21931 if (TARGET_CMOVE && scratch)
21933 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21934 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21936 ix86_expand_clear (scratch);
21937 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21938 scratch));
21940 else
21942 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21943 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21945 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21951 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21951 static void
21952 predict_jump (int prob)
21954 rtx insn = get_last_insn ();
21955 gcc_assert (JUMP_P (insn));
21956 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21959 /* Helper function for the string operations below. Test VARIABLE to see whether
21960 it is aligned to VALUE bytes. If it is, jump to the label. */
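/* A minimal sketch of what this helper emits, assuming VARIABLE is a
   pointer register and VALUE is 4 (label name illustrative):
       mov  variable, tmp
       and  $4, tmp
       test tmp, tmp        ; usually folded into the and
       je   .Laligned       ; the bit is clear, skip the fixup code
   plus a REG_BR_PROB note on the jump with the requested probability.  */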
21961 static rtx
21962 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21964 rtx label = gen_label_rtx ();
21965 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21966 if (GET_MODE (variable) == DImode)
21967 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21968 else
21969 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21970 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21971 1, label);
21972 if (epilogue)
21973 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21974 else
21975 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21976 return label;
21979 /* Adjust COUNTREG by subtracting VALUE. */
21980 static void
21981 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21983 rtx (*gen_add)(rtx, rtx, rtx)
21984 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21986 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21989 /* Zero-extend EXP, which may be SImode, to a Pmode register. */
21990 rtx
21991 ix86_zero_extend_to_Pmode (rtx exp)
21993 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21996 /* Divide COUNTREG by SCALE. */
21997 static rtx
21998 scale_counter (rtx countreg, int scale)
22000 rtx sc;
22002 if (scale == 1)
22003 return countreg;
22004 if (CONST_INT_P (countreg))
22005 return GEN_INT (INTVAL (countreg) / scale);
22006 gcc_assert (REG_P (countreg));
22008 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22009 GEN_INT (exact_log2 (scale)),
22010 NULL, 1, OPTAB_DIRECT);
22011 return sc;
22014 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22015 DImode for constant loop counts. */
22017 static enum machine_mode
22018 counter_mode (rtx count_exp)
22020 if (GET_MODE (count_exp) != VOIDmode)
22021 return GET_MODE (count_exp);
22022 if (!CONST_INT_P (count_exp))
22023 return Pmode;
22024 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22025 return DImode;
22026 return SImode;
22029 /* When SRCPTR is non-NULL, output a simple loop to move the memory
22030 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
22031 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
22032 the equivalent loop to set memory to VALUE (expected to be in MODE).
22034 The size is rounded down to a whole number of chunks moved at once.
22035 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
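/* The generated control flow looks roughly like this (a pseudo-C sketch;
   the labels are illustrative):
       size = count & ~(piece_size - 1);
       iter = 0;
     top:
       copy (or store) one unrolled group of MODE-sized chunks;
       iter += piece_size;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;   (when SRCPTR is non-NULL)
     out:
   with an extra "size == 0" bypass to OUT when the piece size is 1.  */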
22038 static void
22039 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22040 rtx destptr, rtx srcptr, rtx value,
22041 rtx count, enum machine_mode mode, int unroll,
22042 int expected_size)
22044 rtx out_label, top_label, iter, tmp;
22045 enum machine_mode iter_mode = counter_mode (count);
22046 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22047 rtx piece_size = GEN_INT (piece_size_n);
22048 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22049 rtx size;
22050 int i;
22052 top_label = gen_label_rtx ();
22053 out_label = gen_label_rtx ();
22054 iter = gen_reg_rtx (iter_mode);
22056 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22057 NULL, 1, OPTAB_DIRECT);
22058 /* Those two should combine. */
22059 if (piece_size == const1_rtx)
22061 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22062 true, out_label);
22063 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22065 emit_move_insn (iter, const0_rtx);
22067 emit_label (top_label);
22069 tmp = convert_modes (Pmode, iter_mode, iter, true);
22071 /* This assert could be relaxed - in that case we'd need to compute the
22072 smallest power of two containing PIECE_SIZE_N and pass it to
22073 offset_address. */
22074 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22075 destmem = offset_address (destmem, tmp, piece_size_n);
22076 destmem = adjust_address (destmem, mode, 0);
22078 if (srcmem)
22080 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22081 srcmem = adjust_address (srcmem, mode, 0);
22083 /* When unrolling for chips that reorder memory reads and writes,
22084 we can save registers by using a single temporary.
22085 Also, using 4 temporaries is overkill in 32-bit mode. */
22086 if (!TARGET_64BIT && 0)
22088 for (i = 0; i < unroll; i++)
22090 if (i)
22092 destmem =
22093 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22094 srcmem =
22095 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22097 emit_move_insn (destmem, srcmem);
22100 else
22102 rtx tmpreg[4];
22103 gcc_assert (unroll <= 4);
22104 for (i = 0; i < unroll; i++)
22106 tmpreg[i] = gen_reg_rtx (mode);
22107 if (i)
22109 srcmem =
22110 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22112 emit_move_insn (tmpreg[i], srcmem);
22114 for (i = 0; i < unroll; i++)
22116 if (i)
22118 destmem =
22119 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22121 emit_move_insn (destmem, tmpreg[i]);
22125 else
22126 for (i = 0; i < unroll; i++)
22128 if (i)
22129 destmem =
22130 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22131 emit_move_insn (destmem, value);
22134 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22135 true, OPTAB_LIB_WIDEN);
22136 if (tmp != iter)
22137 emit_move_insn (iter, tmp);
22139 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22140 true, top_label);
22141 if (expected_size != -1)
22143 expected_size /= GET_MODE_SIZE (mode) * unroll;
22144 if (expected_size == 0)
22145 predict_jump (0);
22146 else if (expected_size > REG_BR_PROB_BASE)
22147 predict_jump (REG_BR_PROB_BASE - 1);
22148 else
22149 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22151 else
22152 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22153 iter = ix86_zero_extend_to_Pmode (iter);
22154 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22155 true, OPTAB_LIB_WIDEN);
22156 if (tmp != destptr)
22157 emit_move_insn (destptr, tmp);
22158 if (srcptr)
22160 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22161 true, OPTAB_LIB_WIDEN);
22162 if (tmp != srcptr)
22163 emit_move_insn (srcptr, tmp);
22165 emit_label (out_label);
22168 /* Output "rep; mov" instruction.
22169 Arguments have the same meaning as for the previous function. */
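/* For example (a sketch): when the byte count is a known multiple of 4 on a
   32-bit target, a QImode request is promoted to SImode and the expansion
   is essentially
       mov  $count/4, %ecx
       rep  movsl
   while the remaining cases keep the element size they were given, down to
   a byte-wise "rep movsb".  */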
22170 static void
22171 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
22172 rtx destptr, rtx srcptr,
22173 rtx count,
22174 enum machine_mode mode)
22176 rtx destexp;
22177 rtx srcexp;
22178 rtx countreg;
22179 HOST_WIDE_INT rounded_count;
22181 /* If the size is known, it is shorter to use rep movs. */
22182 if (mode == QImode && CONST_INT_P (count)
22183 && !(INTVAL (count) & 3))
22184 mode = SImode;
22186 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22187 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22188 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22189 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22190 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22191 if (mode != QImode)
22193 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22194 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22195 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22196 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22197 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22198 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22200 else
22202 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22203 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22205 if (CONST_INT_P (count))
22207 rounded_count = (INTVAL (count)
22208 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22209 destmem = shallow_copy_rtx (destmem);
22210 srcmem = shallow_copy_rtx (srcmem);
22211 set_mem_size (destmem, rounded_count);
22212 set_mem_size (srcmem, rounded_count);
22214 else
22216 if (MEM_SIZE_KNOWN_P (destmem))
22217 clear_mem_size (destmem);
22218 if (MEM_SIZE_KNOWN_P (srcmem))
22219 clear_mem_size (srcmem);
22221 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22222 destexp, srcexp));
22225 /* Output "rep; stos" instruction.
22226 Arguments have the same meaning as for the previous function. */
22227 static void
22228 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22229 rtx count, enum machine_mode mode,
22230 rtx orig_value)
22232 rtx destexp;
22233 rtx countreg;
22234 HOST_WIDE_INT rounded_count;
22236 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22237 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22238 value = force_reg (mode, gen_lowpart (mode, value));
22239 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22240 if (mode != QImode)
22242 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22243 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22244 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22246 else
22247 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22248 if (orig_value == const0_rtx && CONST_INT_P (count))
22250 rounded_count = (INTVAL (count)
22251 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22252 destmem = shallow_copy_rtx (destmem);
22253 set_mem_size (destmem, rounded_count);
22255 else if (MEM_SIZE_KNOWN_P (destmem))
22256 clear_mem_size (destmem);
22257 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22260 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22261 DESTMEM.
22262 SRCMEM is passed by pointer, to be updated on return.
22263 The return value is the updated DESTMEM. */
22264 static rtx
22265 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22266 HOST_WIDE_INT size_to_move)
22268 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22269 enum insn_code code;
22270 enum machine_mode move_mode;
22271 int piece_size, i;
22273 /* Find the widest mode in which we could perform moves.
22274 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
22275 it until a move of that size is supported. */
22276 piece_size = 1 << floor_log2 (size_to_move);
22277 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22278 code = optab_handler (mov_optab, move_mode);
22279 while (code == CODE_FOR_nothing && piece_size > 1)
22281 piece_size >>= 1;
22282 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22283 code = optab_handler (mov_optab, move_mode);
22286 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22287 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22288 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22290 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22291 move_mode = mode_for_vector (word_mode, nunits);
22292 code = optab_handler (mov_optab, move_mode);
22293 if (code == CODE_FOR_nothing)
22295 move_mode = word_mode;
22296 piece_size = GET_MODE_SIZE (move_mode);
22297 code = optab_handler (mov_optab, move_mode);
22300 gcc_assert (code != CODE_FOR_nothing);
22302 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22303 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22305 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22306 gcc_assert (size_to_move % piece_size == 0);
22307 adjust = GEN_INT (piece_size);
22308 for (i = 0; i < size_to_move; i += piece_size)
22310 /* We move from memory to memory, so we'll need to do it via
22311 a temporary register. */
22312 tempreg = gen_reg_rtx (move_mode);
22313 emit_insn (GEN_FCN (code) (tempreg, src));
22314 emit_insn (GEN_FCN (code) (dst, tempreg));
22316 emit_move_insn (destptr,
22317 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22318 emit_move_insn (srcptr,
22319 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22321 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22322 piece_size);
22323 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22324 piece_size);
22327 /* Update DST and SRC rtx. */
22328 *srcmem = src;
22329 return dst;
22332 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22333 static void
22334 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22335 rtx destptr, rtx srcptr, rtx count, int max_size)
22337 rtx src, dest;
22338 if (CONST_INT_P (count))
22340 HOST_WIDE_INT countval = INTVAL (count);
22341 HOST_WIDE_INT epilogue_size = countval % max_size;
22342 int i;
22344 /* For now MAX_SIZE should be a power of 2. This assert could be
22345 relaxed, but it would require a somewhat more complicated epilogue
22346 expansion. */
22347 gcc_assert ((max_size & (max_size - 1)) == 0);
22348 for (i = max_size; i >= 1; i >>= 1)
22350 if (epilogue_size & i)
22351 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22353 return;
22355 if (max_size > 8)
22357 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22358 count, 1, OPTAB_DIRECT);
22359 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22360 count, QImode, 1, 4);
22361 return;
22364 /* When single (non-rep) string instructions are usable (TARGET_SINGLE_STRINGOP),
22365 we can cheaply increase the dest and src pointers. Otherwise we save code size
22366 by maintaining an offset (zero is readily available from the preceding rep operation) and using x86 addressing modes.
22368 if (TARGET_SINGLE_STRINGOP)
22370 if (max_size > 4)
22372 rtx label = ix86_expand_aligntest (count, 4, true);
22373 src = change_address (srcmem, SImode, srcptr);
22374 dest = change_address (destmem, SImode, destptr);
22375 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22376 emit_label (label);
22377 LABEL_NUSES (label) = 1;
22379 if (max_size > 2)
22381 rtx label = ix86_expand_aligntest (count, 2, true);
22382 src = change_address (srcmem, HImode, srcptr);
22383 dest = change_address (destmem, HImode, destptr);
22384 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22385 emit_label (label);
22386 LABEL_NUSES (label) = 1;
22388 if (max_size > 1)
22390 rtx label = ix86_expand_aligntest (count, 1, true);
22391 src = change_address (srcmem, QImode, srcptr);
22392 dest = change_address (destmem, QImode, destptr);
22393 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22394 emit_label (label);
22395 LABEL_NUSES (label) = 1;
22398 else
22400 rtx offset = force_reg (Pmode, const0_rtx);
22401 rtx tmp;
22403 if (max_size > 4)
22405 rtx label = ix86_expand_aligntest (count, 4, true);
22406 src = change_address (srcmem, SImode, srcptr);
22407 dest = change_address (destmem, SImode, destptr);
22408 emit_move_insn (dest, src);
22409 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22410 true, OPTAB_LIB_WIDEN);
22411 if (tmp != offset)
22412 emit_move_insn (offset, tmp);
22413 emit_label (label);
22414 LABEL_NUSES (label) = 1;
22416 if (max_size > 2)
22418 rtx label = ix86_expand_aligntest (count, 2, true);
22419 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22420 src = change_address (srcmem, HImode, tmp);
22421 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22422 dest = change_address (destmem, HImode, tmp);
22423 emit_move_insn (dest, src);
22424 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22425 true, OPTAB_LIB_WIDEN);
22426 if (tmp != offset)
22427 emit_move_insn (offset, tmp);
22428 emit_label (label);
22429 LABEL_NUSES (label) = 1;
22431 if (max_size > 1)
22433 rtx label = ix86_expand_aligntest (count, 1, true);
22434 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22435 src = change_address (srcmem, QImode, tmp);
22436 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22437 dest = change_address (destmem, QImode, tmp);
22438 emit_move_insn (dest, src);
22439 emit_label (label);
22440 LABEL_NUSES (label) = 1;
22445 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22446 static void
22447 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22448 rtx count, int max_size)
22450 count =
22451 expand_simple_binop (counter_mode (count), AND, count,
22452 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22453 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22454 gen_lowpart (QImode, value), count, QImode,
22455 1, max_size / 2);
22458 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22459 static void
22460 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22462 rtx dest;
22464 if (CONST_INT_P (count))
22466 HOST_WIDE_INT countval = INTVAL (count);
22467 int offset = 0;
22469 if ((countval & 0x10) && max_size > 16)
22471 if (TARGET_64BIT)
22473 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22474 emit_insn (gen_strset (destptr, dest, value));
22475 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22476 emit_insn (gen_strset (destptr, dest, value));
22478 else
22479 gcc_unreachable ();
22480 offset += 16;
22482 if ((countval & 0x08) && max_size > 8)
22484 if (TARGET_64BIT)
22486 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22487 emit_insn (gen_strset (destptr, dest, value));
22489 else
22491 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22492 emit_insn (gen_strset (destptr, dest, value));
22493 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22494 emit_insn (gen_strset (destptr, dest, value));
22496 offset += 8;
22498 if ((countval & 0x04) && max_size > 4)
22500 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22501 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22502 offset += 4;
22504 if ((countval & 0x02) && max_size > 2)
22506 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22507 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22508 offset += 2;
22510 if ((countval & 0x01) && max_size > 1)
22512 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22513 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22514 offset += 1;
22516 return;
22518 if (max_size > 32)
22520 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22521 return;
22523 if (max_size > 16)
22525 rtx label = ix86_expand_aligntest (count, 16, true);
22526 if (TARGET_64BIT)
22528 dest = change_address (destmem, DImode, destptr);
22529 emit_insn (gen_strset (destptr, dest, value));
22530 emit_insn (gen_strset (destptr, dest, value));
22532 else
22534 dest = change_address (destmem, SImode, destptr);
22535 emit_insn (gen_strset (destptr, dest, value));
22536 emit_insn (gen_strset (destptr, dest, value));
22537 emit_insn (gen_strset (destptr, dest, value));
22538 emit_insn (gen_strset (destptr, dest, value));
22540 emit_label (label);
22541 LABEL_NUSES (label) = 1;
22543 if (max_size > 8)
22545 rtx label = ix86_expand_aligntest (count, 8, true);
22546 if (TARGET_64BIT)
22548 dest = change_address (destmem, DImode, destptr);
22549 emit_insn (gen_strset (destptr, dest, value));
22551 else
22553 dest = change_address (destmem, SImode, destptr);
22554 emit_insn (gen_strset (destptr, dest, value));
22555 emit_insn (gen_strset (destptr, dest, value));
22557 emit_label (label);
22558 LABEL_NUSES (label) = 1;
22560 if (max_size > 4)
22562 rtx label = ix86_expand_aligntest (count, 4, true);
22563 dest = change_address (destmem, SImode, destptr);
22564 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22565 emit_label (label);
22566 LABEL_NUSES (label) = 1;
22568 if (max_size > 2)
22570 rtx label = ix86_expand_aligntest (count, 2, true);
22571 dest = change_address (destmem, HImode, destptr);
22572 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22573 emit_label (label);
22574 LABEL_NUSES (label) = 1;
22576 if (max_size > 1)
22578 rtx label = ix86_expand_aligntest (count, 1, true);
22579 dest = change_address (destmem, QImode, destptr);
22580 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22581 emit_label (label);
22582 LABEL_NUSES (label) = 1;
22586 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN, to
22587 DESIRED_ALIGNMENT.
22588 The return value is the updated DESTMEM. */
22589 static rtx
22590 expand_movmem_prologue (rtx destmem, rtx srcmem,
22591 rtx destptr, rtx srcptr, rtx count,
22592 int align, int desired_alignment)
22594 int i;
22595 for (i = 1; i < desired_alignment; i <<= 1)
22597 if (align <= i)
22599 rtx label = ix86_expand_aligntest (destptr, i, false);
22600 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22601 ix86_adjust_counter (count, i);
22602 emit_label (label);
22603 LABEL_NUSES (label) = 1;
22604 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22607 return destmem;
22610 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22611 ALIGN_BYTES is how many bytes need to be copied.
22612 The function updates DST and SRC; namely, it sets their proper alignment.
22613 DST is returned via the return value, SRC is updated via the pointer SRCP. */
22614 static rtx
22615 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22616 int desired_align, int align_bytes)
22618 rtx src = *srcp;
22619 rtx orig_dst = dst;
22620 rtx orig_src = src;
22621 int piece_size = 1;
22622 int copied_bytes = 0;
22623 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22624 if (src_align_bytes >= 0)
22625 src_align_bytes = desired_align - src_align_bytes;
22627 for (piece_size = 1;
22628 piece_size <= desired_align && copied_bytes < align_bytes;
22629 piece_size <<= 1)
22631 if (align_bytes & piece_size)
22633 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
22634 copied_bytes += piece_size;
22638 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22639 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22640 if (src_align_bytes >= 0)
22642 unsigned int src_align;
22643 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
22645 if ((src_align_bytes & (src_align - 1))
22646 == (align_bytes & (src_align - 1)))
22647 break;
22649 if (src_align > (unsigned int) desired_align)
22650 src_align = desired_align;
22651 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22652 set_mem_align (src, src_align * BITS_PER_UNIT);
22654 if (MEM_SIZE_KNOWN_P (orig_dst))
22655 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22656 if (MEM_SIZE_KNOWN_P (orig_src))
22657 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22658 *srcp = src;
22659 return dst;
22662 /* Store enough into DEST to align DEST, known to be aligned by ALIGN, to
22663 DESIRED_ALIGNMENT. */
22664 static void
22665 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22666 int align, int desired_alignment)
22668 if (align <= 1 && desired_alignment > 1)
22670 rtx label = ix86_expand_aligntest (destptr, 1, false);
22671 destmem = change_address (destmem, QImode, destptr);
22672 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22673 ix86_adjust_counter (count, 1);
22674 emit_label (label);
22675 LABEL_NUSES (label) = 1;
22677 if (align <= 2 && desired_alignment > 2)
22679 rtx label = ix86_expand_aligntest (destptr, 2, false);
22680 destmem = change_address (destmem, HImode, destptr);
22681 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22682 ix86_adjust_counter (count, 2);
22683 emit_label (label);
22684 LABEL_NUSES (label) = 1;
22686 if (align <= 4 && desired_alignment > 4)
22688 rtx label = ix86_expand_aligntest (destptr, 4, false);
22689 destmem = change_address (destmem, SImode, destptr);
22690 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22691 ix86_adjust_counter (count, 4);
22692 emit_label (label);
22693 LABEL_NUSES (label) = 1;
22695 gcc_assert (desired_alignment <= 8);
22698 /* Store enough into DST to align DST, known to be aligned by ALIGN, to
22699 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22700 static rtx
22701 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22702 int desired_align, int align_bytes)
22704 int off = 0;
22705 rtx orig_dst = dst;
22706 if (align_bytes & 1)
22708 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22709 off = 1;
22710 emit_insn (gen_strset (destreg, dst,
22711 gen_lowpart (QImode, value)));
22713 if (align_bytes & 2)
22715 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22716 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22717 set_mem_align (dst, 2 * BITS_PER_UNIT);
22718 off = 2;
22719 emit_insn (gen_strset (destreg, dst,
22720 gen_lowpart (HImode, value)));
22722 if (align_bytes & 4)
22724 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22725 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22726 set_mem_align (dst, 4 * BITS_PER_UNIT);
22727 off = 4;
22728 emit_insn (gen_strset (destreg, dst,
22729 gen_lowpart (SImode, value)));
22731 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22732 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22733 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22734 if (MEM_SIZE_KNOWN_P (orig_dst))
22735 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22736 return dst;
22739 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
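/* Two illustrative outcomes of the logic below (not exhaustive): when
   optimizing for size with an unknown or non-multiple-of-4 count, the
   choice is rep_prefix_1_byte (a plain "rep movsb"/"rep stosb") as the
   smallest variant, provided the rep registers have not been fixed by the
   user; a very small expected size (below 4 bytes) instead selects the
   simple byte loop, since setting up REP is comparatively expensive.  */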
22740 static enum stringop_alg
22741 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22742 int *dynamic_check, bool *noalign)
22744 const struct stringop_algs * algs;
22745 bool optimize_for_speed;
22746 /* Algorithms using the rep prefix want at least edi and ecx;
22747 additionally, memset wants eax and memcpy wants esi. Don't
22748 consider such algorithms if the user has appropriated those
22749 registers for their own purposes. */
22750 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22751 || (memset
22752 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22753 *noalign = false;
22755 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22756 || (alg != rep_prefix_1_byte \
22757 && alg != rep_prefix_4_byte \
22758 && alg != rep_prefix_8_byte))
22759 const struct processor_costs *cost;
22761 /* Even if the string operation call is cold, we still might spend a lot
22762 of time processing large blocks. */
22763 if (optimize_function_for_size_p (cfun)
22764 || (optimize_insn_for_size_p ()
22765 && expected_size != -1 && expected_size < 256))
22766 optimize_for_speed = false;
22767 else
22768 optimize_for_speed = true;
22770 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22772 *dynamic_check = -1;
22773 if (memset)
22774 algs = &cost->memset[TARGET_64BIT != 0];
22775 else
22776 algs = &cost->memcpy[TARGET_64BIT != 0];
22777 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22778 return ix86_stringop_alg;
22779 /* rep; movq or rep; movl is the smallest variant. */
22780 else if (!optimize_for_speed)
22782 if (!count || (count & 3))
22783 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22784 else
22785 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22787 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22789 else if (expected_size != -1 && expected_size < 4)
22790 return loop_1_byte;
22791 else if (expected_size != -1)
22793 unsigned int i;
22794 enum stringop_alg alg = libcall;
22795 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22797 /* We get here if the algorithms that were not libcall-based
22798 were rep-prefix based and we are unable to use rep prefixes
22799 based on global register usage. Break out of the loop and
22800 use the heuristic below. */
22801 if (algs->size[i].max == 0)
22802 break;
22803 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22805 enum stringop_alg candidate = algs->size[i].alg;
22807 if (candidate != libcall && ALG_USABLE_P (candidate))
22808 alg = candidate;
22809 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22810 last non-libcall inline algorithm. */
22811 if (TARGET_INLINE_ALL_STRINGOPS)
22813 /* When the current size is best copied by a libcall,
22814 but we are still forced to inline, run the heuristic below,
22815 which will pick code for medium-sized blocks. */
22816 if (alg != libcall)
22817 return alg;
22818 break;
22820 else if (ALG_USABLE_P (candidate))
22822 *noalign = algs->size[i].noalign;
22823 return candidate;
22827 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22829 /* When asked to inline the call anyway, try to pick a meaningful choice.
22830 We look for the maximal block size that is faster to copy by hand and
22831 take blocks of at most that size, guessing that the average size will
22832 be roughly half of the block.
22834 If this turns out to be bad, we might simply specify the preferred
22835 choice in ix86_costs. */
22836 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22837 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22839 int max = -1;
22840 enum stringop_alg alg;
22841 int i;
22842 bool any_alg_usable_p = true;
22844 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22846 enum stringop_alg candidate = algs->size[i].alg;
22847 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22849 if (candidate != libcall && candidate
22850 && ALG_USABLE_P (candidate))
22851 max = algs->size[i].max;
22853 /* If there aren't any usable algorithms, then recursing on
22854 smaller sizes isn't going to find anything. Just return the
22855 simple byte-at-a-time copy loop. */
22856 if (!any_alg_usable_p)
22858 /* Pick something reasonable. */
22859 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22860 *dynamic_check = 128;
22861 return loop_1_byte;
22863 if (max == -1)
22864 max = 4096;
22865 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22866 gcc_assert (*dynamic_check == -1);
22867 gcc_assert (alg != libcall);
22868 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22869 *dynamic_check = max;
22870 return alg;
22872 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22873 #undef ALG_USABLE_P
22876 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22877 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22878 static int
22879 decide_alignment (int align,
22880 enum stringop_alg alg,
22881 int expected_size,
22882 enum machine_mode move_mode)
22884 int desired_align = 0;
22886 gcc_assert (alg != no_stringop);
22888 if (alg == libcall)
22889 return 0;
22890 if (move_mode == VOIDmode)
22891 return 0;
22893 desired_align = GET_MODE_SIZE (move_mode);
22894 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22895 copying a whole cache line at once. */
22896 if (TARGET_PENTIUMPRO
22897 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
22898 desired_align = 8;
22900 if (optimize_size)
22901 desired_align = 1;
22902 if (desired_align < align)
22903 desired_align = align;
22904 if (expected_size != -1 && expected_size < 4)
22905 desired_align = align;
22907 return desired_align;
22910 /* Expand string move (memcpy) operation. Use i386 string operations
22911 when profitable. expand_setmem contains similar code. The code
22912 depends upon architecture, block size and alignment, but always has
22913 the same overall structure:
22915 1) Prologue guard: Conditional that jumps up to epilogues for small
22916 blocks that can be handled by epilogue alone. This is faster
22917 but also needed for correctness, since the prologue assumes the block
22918 is larger than the desired alignment.
22920 Optional dynamic check for size and libcall for large
22921 blocks is emitted here too, with -minline-stringops-dynamically.
22923 2) Prologue: copy first few bytes in order to get destination
22924 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22925 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22926 copied. We emit either a jump tree on power of two sized
22927 blocks, or a byte loop.
22929 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22930 with specified algorithm.
22932 4) Epilogue: code copying tail of the block that is too small to be
22933 handled by main body (or up to size guarded by prologue guard). */
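/* A minimal C-level sketch of the emitted structure described above.
   Illustrative only, not part of GCC: the function name and the fixed
   word-sized main loop are assumptions for the example, and SRC may
   stay misaligned just as in the emitted code.  */
#if 0
static void
movmem_shape_sketch (char *dst, const char *src, unsigned long n)
{
  const unsigned long chunk = sizeof (unsigned long);  /* SIZE_NEEDED */
  unsigned long i = 0;

  /* 1) Prologue guard: small blocks are handled by the epilogue alone.  */
  if (n >= chunk)
    {
      /* 2) Alignment prologue: byte copies until DST is aligned.  */
      while ((unsigned long) (dst + i) % chunk != 0)
        {
          dst[i] = src[i];
          i++;
        }
      /* 3) Main body: copy SIZE_NEEDED bytes per iteration.  */
      for (; i + chunk <= n; i += chunk)
        *(unsigned long *) (dst + i) = *(const unsigned long *) (src + i);
    }
  /* 4) Epilogue: copy the remaining tail that the main body skipped.  */
  for (; i < n; i++)
    dst[i] = src[i];
}
#endif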
22935 bool
22936 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22937 rtx expected_align_exp, rtx expected_size_exp)
22939 rtx destreg;
22940 rtx srcreg;
22941 rtx label = NULL;
22942 rtx tmp;
22943 rtx jump_around_label = NULL;
22944 HOST_WIDE_INT align = 1;
22945 unsigned HOST_WIDE_INT count = 0;
22946 HOST_WIDE_INT expected_size = -1;
22947 int size_needed = 0, epilogue_size_needed;
22948 int desired_align = 0, align_bytes = 0;
22949 enum stringop_alg alg;
22950 int dynamic_check;
22951 bool need_zero_guard = false;
22952 bool noalign;
22953 enum machine_mode move_mode = VOIDmode;
22954 int unroll_factor = 1;
22956 if (CONST_INT_P (align_exp))
22957 align = INTVAL (align_exp);
22958 /* i386 can do misaligned access at a reasonably increased cost. */
22959 if (CONST_INT_P (expected_align_exp)
22960 && INTVAL (expected_align_exp) > align)
22961 align = INTVAL (expected_align_exp);
22962 /* ALIGN is the minimum of destination and source alignment, but we care here
22963 just about destination alignment. */
22964 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22965 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22967 if (CONST_INT_P (count_exp))
22968 count = expected_size = INTVAL (count_exp);
22969 if (CONST_INT_P (expected_size_exp) && count == 0)
22970 expected_size = INTVAL (expected_size_exp);
22972 /* Make sure we don't need to care about overflow later on. */
22973 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22974 return false;
22976 /* Step 0: Decide on preferred algorithm, desired alignment and
22977 size of chunks to be copied by main loop. */
22978 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22979 if (alg == libcall)
22980 return false;
22981 gcc_assert (alg != no_stringop);
22983 if (!count)
22984 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22985 destreg = copy_addr_to_reg (XEXP (dst, 0));
22986 srcreg = copy_addr_to_reg (XEXP (src, 0));
22988 unroll_factor = 1;
22989 move_mode = word_mode;
22990 switch (alg)
22992 case libcall:
22993 case no_stringop:
22994 case last_alg:
22995 gcc_unreachable ();
22996 case loop_1_byte:
22997 need_zero_guard = true;
22998 move_mode = QImode;
22999 break;
23000 case loop:
23001 need_zero_guard = true;
23002 break;
23003 case unrolled_loop:
23004 need_zero_guard = true;
23005 unroll_factor = (TARGET_64BIT ? 4 : 2);
23006 break;
23007 case vector_loop:
23008 need_zero_guard = true;
23009 unroll_factor = 4;
23010 /* Find the widest supported mode. */
23011 move_mode = word_mode;
23012 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23013 != CODE_FOR_nothing)
23014 move_mode = GET_MODE_WIDER_MODE (move_mode);
23016 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23017 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23018 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23020 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23021 move_mode = mode_for_vector (word_mode, nunits);
23022 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23023 move_mode = word_mode;
23025 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23026 break;
23027 case rep_prefix_8_byte:
23028 move_mode = DImode;
23029 break;
23030 case rep_prefix_4_byte:
23031 move_mode = SImode;
23032 break;
23033 case rep_prefix_1_byte:
23034 move_mode = QImode;
23035 break;
23037 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23038 epilogue_size_needed = size_needed;
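/* E.g. the unrolled_loop case on TARGET_64BIT uses word_mode (8 bytes)
   with unroll_factor 4, so SIZE_NEEDED is 32 bytes per iteration
   (illustrative values).  */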
23040 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23041 if (!TARGET_ALIGN_STRINGOPS || noalign)
23042 align = desired_align;
23044 /* Step 1: Prologue guard. */
23046 /* Alignment code needs count to be in register. */
23047 if (CONST_INT_P (count_exp) && desired_align > align)
23049 if (INTVAL (count_exp) > desired_align
23050 && INTVAL (count_exp) > size_needed)
23052 align_bytes
23053 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23054 if (align_bytes <= 0)
23055 align_bytes = 0;
23056 else
23057 align_bytes = desired_align - align_bytes;
23059 if (align_bytes == 0)
23060 count_exp = force_reg (counter_mode (count_exp), count_exp);
23062 gcc_assert (desired_align >= 1 && align >= 1);
23064 /* Ensure that alignment prologue won't copy past end of block. */
23065 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23067 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23068 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23069 Make sure it is a power of 2. */
23070 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
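/* E.g. size_needed == 16 gives MAX (15, ...) == 15, floor_log2 (15) == 3,
   so EPILOGUE_SIZE_NEEDED is rounded up to 1 << 4 == 16 (illustrative
   values).  */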
23072 if (count)
23074 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23076 /* If the main algorithm works on QImode, no epilogue is needed.
23077 For small sizes just don't align anything. */
23078 if (size_needed == 1)
23079 desired_align = align;
23080 else
23081 goto epilogue;
23084 else
23086 label = gen_label_rtx ();
23087 emit_cmp_and_jump_insns (count_exp,
23088 GEN_INT (epilogue_size_needed),
23089 LTU, 0, counter_mode (count_exp), 1, label);
23090 if (expected_size == -1 || expected_size < epilogue_size_needed)
23091 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23092 else
23093 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23097 /* Emit code to decide at runtime whether a library call or inline code
23098 should be used. */
23099 if (dynamic_check != -1)
23101 if (CONST_INT_P (count_exp))
23103 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23105 emit_block_move_via_libcall (dst, src, count_exp, false);
23106 count_exp = const0_rtx;
23107 goto epilogue;
23110 else
23112 rtx hot_label = gen_label_rtx ();
23113 jump_around_label = gen_label_rtx ();
23114 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23115 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23116 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23117 emit_block_move_via_libcall (dst, src, count_exp, false);
23118 emit_jump (jump_around_label);
23119 emit_label (hot_label);
23123 /* Step 2: Alignment prologue. */
23125 if (desired_align > align)
23127 if (align_bytes == 0)
23129 /* Except for the first move in the epilogue, we no longer know
23130 the constant offset in the aliasing info. It doesn't seem worth
23131 the pain to maintain it for the first move, so throw away
23132 the info early. */
23133 src = change_address (src, BLKmode, srcreg);
23134 dst = change_address (dst, BLKmode, destreg);
23135 dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
23136 desired_align);
23138 else
23140 /* If we know how many bytes need to be stored before dst is
23141 sufficiently aligned, maintain aliasing info accurately. */
23142 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
23143 desired_align, align_bytes);
23144 count_exp = plus_constant (counter_mode (count_exp),
23145 count_exp, -align_bytes);
23146 count -= align_bytes;
23148 if (need_zero_guard
23149 && (count < (unsigned HOST_WIDE_INT) size_needed
23150 || (align_bytes == 0
23151 && count < ((unsigned HOST_WIDE_INT) size_needed
23152 + desired_align - align))))
23154 /* It is possible that we copied enough so the main loop will not
23155 execute. */
23156 gcc_assert (size_needed > 1);
23157 if (label == NULL_RTX)
23158 label = gen_label_rtx ();
23159 emit_cmp_and_jump_insns (count_exp,
23160 GEN_INT (size_needed),
23161 LTU, 0, counter_mode (count_exp), 1, label);
23162 if (expected_size == -1
23163 || expected_size < (desired_align - align) / 2 + size_needed)
23164 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23165 else
23166 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23169 if (label && size_needed == 1)
23171 emit_label (label);
23172 LABEL_NUSES (label) = 1;
23173 label = NULL;
23174 epilogue_size_needed = 1;
23176 else if (label == NULL_RTX)
23177 epilogue_size_needed = size_needed;
23179 /* Step 3: Main loop. */
23181 switch (alg)
23183 case libcall:
23184 case no_stringop:
23185 case last_alg:
23186 gcc_unreachable ();
23187 case loop_1_byte:
23188 case loop:
23189 case unrolled_loop:
23190 case vector_loop:
23191 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23192 count_exp, move_mode, unroll_factor,
23193 expected_size);
23194 break;
23195 case rep_prefix_8_byte:
23196 case rep_prefix_4_byte:
23197 case rep_prefix_1_byte:
23198 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23199 move_mode);
23200 break;
23202 /* Adjust properly the offset of src and dest memory for aliasing. */
23203 if (CONST_INT_P (count_exp))
23205 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23206 (count / size_needed) * size_needed);
23207 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23208 (count / size_needed) * size_needed);
23210 else
23212 src = change_address (src, BLKmode, srcreg);
23213 dst = change_address (dst, BLKmode, destreg);
23216 /* Step 4: Epilogue to copy the remaining bytes. */
23217 epilogue:
23218 if (label)
23220 /* When the main loop is done, COUNT_EXP might hold the original count,
23221 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23222 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23223 bytes. Compensate if needed. */
23225 if (size_needed < epilogue_size_needed)
23227 tmp =
23228 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23229 GEN_INT (size_needed - 1), count_exp, 1,
23230 OPTAB_DIRECT);
23231 if (tmp != count_exp)
23232 emit_move_insn (count_exp, tmp);
23234 emit_label (label);
23235 LABEL_NUSES (label) = 1;
23238 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23239 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23240 size_needed);
23241 if (jump_around_label)
23242 emit_label (jump_around_label);
23243 return true;
23246 /* Helper function for memset. For the QImode value 0xXY produce
23247 0xXYXYXYXY of the width specified by MODE. This is essentially
23248 a * 0x01010101, but we can do slightly better than
23249 synth_mult by unwinding the sequence by hand on CPUs with
23250 slow multiply. */
23251 static rtx
23252 promote_duplicated_reg (enum machine_mode mode, rtx val)
23254 enum machine_mode valmode = GET_MODE (val);
23255 rtx tmp;
23256 int nops = mode == DImode ? 3 : 2;
23258 gcc_assert (mode == SImode || mode == DImode);
23259 if (val == const0_rtx)
23260 return copy_to_mode_reg (mode, const0_rtx);
23261 if (CONST_INT_P (val))
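      /* Worked example (illustrative): VAL == 0xAB becomes 0xABAB, then
         0xABABABAB, and 0xABABABABABABABAB when MODE is DImode.  */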
23263 HOST_WIDE_INT v = INTVAL (val) & 255;
23265 v |= v << 8;
23266 v |= v << 16;
23267 if (mode == DImode)
23268 v |= (v << 16) << 16;
23269 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23272 if (valmode == VOIDmode)
23273 valmode = QImode;
23274 if (valmode != QImode)
23275 val = gen_lowpart (QImode, val);
23276 if (mode == QImode)
23277 return val;
23278 if (!TARGET_PARTIAL_REG_STALL)
23279 nops--;
23280 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23281 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23282 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23283 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23285 rtx reg = convert_modes (mode, QImode, val, true);
23286 tmp = promote_duplicated_reg (mode, const1_rtx);
23287 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23288 OPTAB_DIRECT);
23290 else
23292 rtx reg = convert_modes (mode, QImode, val, true);
23294 if (!TARGET_PARTIAL_REG_STALL)
23295 if (mode == SImode)
23296 emit_insn (gen_movsi_insv_1 (reg, reg));
23297 else
23298 emit_insn (gen_movdi_insv_1 (reg, reg));
23299 else
23301 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23302 NULL, 1, OPTAB_DIRECT);
23303 reg =
23304 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23306 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23307 NULL, 1, OPTAB_DIRECT);
23308 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23309 if (mode == SImode)
23310 return reg;
23311 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23312 NULL, 1, OPTAB_DIRECT);
23313 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23314 return reg;
23318 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
23319 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
23320 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
23321 static rtx
23322 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23324 rtx promoted_val;
23326 if (TARGET_64BIT
23327 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23328 promoted_val = promote_duplicated_reg (DImode, val);
23329 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23330 promoted_val = promote_duplicated_reg (SImode, val);
23331 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23332 promoted_val = promote_duplicated_reg (HImode, val);
23333 else
23334 promoted_val = val;
23336 return promoted_val;
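/* Worked example (illustrative): a 16-byte main loop (SIZE_NEEDED == 16)
   on TARGET_64BIT promotes VAL to DImode, a 4-byte loop promotes to
   SImode, and a 2-byte loop to HImode.  */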
23339 /* Expand string clear operation (bzero). Use i386 string operations when
23340 profitable. See expand_movmem comment for explanation of individual
23341 steps performed. */
23342 bool
23343 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23344 rtx expected_align_exp, rtx expected_size_exp)
23346 rtx destreg;
23347 rtx label = NULL;
23348 rtx tmp;
23349 rtx jump_around_label = NULL;
23350 HOST_WIDE_INT align = 1;
23351 unsigned HOST_WIDE_INT count = 0;
23352 HOST_WIDE_INT expected_size = -1;
23353 int size_needed = 0, epilogue_size_needed;
23354 int desired_align = 0, align_bytes = 0;
23355 enum stringop_alg alg;
23356 rtx promoted_val = NULL;
23357 bool force_loopy_epilogue = false;
23358 int dynamic_check;
23359 bool need_zero_guard = false;
23360 bool noalign;
23361 enum machine_mode move_mode = VOIDmode;
23362 int unroll_factor;
23364 if (CONST_INT_P (align_exp))
23365 align = INTVAL (align_exp);
23366 /* i386 can do misaligned access at a reasonably increased cost. */
23367 if (CONST_INT_P (expected_align_exp)
23368 && INTVAL (expected_align_exp) > align)
23369 align = INTVAL (expected_align_exp);
23370 if (CONST_INT_P (count_exp))
23371 count = expected_size = INTVAL (count_exp);
23372 if (CONST_INT_P (expected_size_exp) && count == 0)
23373 expected_size = INTVAL (expected_size_exp);
23375 /* Make sure we don't need to care about overflow later on. */
23376 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23377 return false;
23379 /* Step 0: Decide on preferred algorithm, desired alignment and
23380 size of chunks to be copied by main loop. */
23382 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23383 if (alg == libcall)
23384 return false;
23385 gcc_assert (alg != no_stringop);
23387 if (!count)
23388 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23389 destreg = copy_addr_to_reg (XEXP (dst, 0));
23391 move_mode = word_mode;
23392 unroll_factor = 1;
23393 switch (alg)
23395 case libcall:
23396 case no_stringop:
23397 case last_alg:
23398 gcc_unreachable ();
23399 case loop:
23400 need_zero_guard = true;
23401 break;
23402 case vector_loop:
23403 case unrolled_loop:
23404 need_zero_guard = true;
23405 unroll_factor = 4;
23406 break;
23407 case rep_prefix_8_byte:
23408 move_mode = DImode;
23409 break;
23410 case rep_prefix_4_byte:
23411 move_mode = SImode;
23412 break;
23413 case rep_prefix_1_byte:
23414 move_mode = QImode;
23415 break;
23416 case loop_1_byte:
23417 need_zero_guard = true;
23418 move_mode = QImode;
23419 break;
23421 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23422 epilogue_size_needed = size_needed;
23424 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23425 if (!TARGET_ALIGN_STRINGOPS || noalign)
23426 align = desired_align;
23428 /* Step 1: Prologue guard. */
23430 /* Alignment code needs count to be in register. */
23431 if (CONST_INT_P (count_exp) && desired_align > align)
23433 if (INTVAL (count_exp) > desired_align
23434 && INTVAL (count_exp) > size_needed)
23436 align_bytes
23437 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23438 if (align_bytes <= 0)
23439 align_bytes = 0;
23440 else
23441 align_bytes = desired_align - align_bytes;
23443 if (align_bytes == 0)
23445 enum machine_mode mode = SImode;
23446 if (TARGET_64BIT && (count & ~0xffffffff))
23447 mode = DImode;
23448 count_exp = force_reg (mode, count_exp);
23451 /* Do the cheap promotion to allow better CSE across the
23452 main loop and epilogue (i.e. one load of the big constant in
23453 front of all the code). */
23454 if (CONST_INT_P (val_exp))
23455 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23456 desired_align, align);
23457 /* Ensure that alignment prologue won't copy past end of block. */
23458 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23460 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23461 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23462 Make sure it is a power of 2. */
23463 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23465 /* To improve performance for small blocks, we jump around the VAL
23466 promoting code. This means that if the promoted VAL is not constant,
23467 we might not use it in the epilogue and have to use the byte
23468 loop variant. */
23469 if (epilogue_size_needed > 2 && !promoted_val)
23470 force_loopy_epilogue = true;
23471 if (count)
23473 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23476 /* If the main algorithm works on QImode, no epilogue is needed.
23476 For small sizes just don't align anything. */
23477 if (size_needed == 1)
23478 desired_align = align;
23479 else
23480 goto epilogue;
23483 else
23485 label = gen_label_rtx ();
23486 emit_cmp_and_jump_insns (count_exp,
23487 GEN_INT (epilogue_size_needed),
23488 LTU, 0, counter_mode (count_exp), 1, label);
23489 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23490 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23491 else
23492 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23495 if (dynamic_check != -1)
23497 rtx hot_label = gen_label_rtx ();
23498 jump_around_label = gen_label_rtx ();
23499 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23500 LEU, 0, counter_mode (count_exp), 1, hot_label);
23501 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23502 set_storage_via_libcall (dst, count_exp, val_exp, false);
23503 emit_jump (jump_around_label);
23504 emit_label (hot_label);
23507 /* Step 2: Alignment prologue. */
23509 /* Do the expensive promotion once we branched off the small blocks. */
23510 if (!promoted_val)
23511 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23512 desired_align, align);
23513 gcc_assert (desired_align >= 1 && align >= 1);
23515 if (desired_align > align)
23517 if (align_bytes == 0)
23519 /* Except for the first move in the epilogue, we no longer know
23520 the constant offset in the aliasing info. It doesn't seem worth
23521 the pain to maintain it for the first move, so throw away
23522 the info early. */
23523 dst = change_address (dst, BLKmode, destreg);
23524 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23525 desired_align);
23527 else
23529 /* If we know how many bytes need to be stored before dst is
23530 sufficiently aligned, maintain aliasing info accurately. */
23531 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23532 desired_align, align_bytes);
23533 count_exp = plus_constant (counter_mode (count_exp),
23534 count_exp, -align_bytes);
23535 count -= align_bytes;
23537 if (need_zero_guard
23538 && (count < (unsigned HOST_WIDE_INT) size_needed
23539 || (align_bytes == 0
23540 && count < ((unsigned HOST_WIDE_INT) size_needed
23541 + desired_align - align))))
23543 /* It is possible that we copied enough so the main loop will not
23544 execute. */
23545 gcc_assert (size_needed > 1);
23546 if (label == NULL_RTX)
23547 label = gen_label_rtx ();
23548 emit_cmp_and_jump_insns (count_exp,
23549 GEN_INT (size_needed),
23550 LTU, 0, counter_mode (count_exp), 1, label);
23551 if (expected_size == -1
23552 || expected_size < (desired_align - align) / 2 + size_needed)
23553 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23554 else
23555 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23558 if (label && size_needed == 1)
23560 emit_label (label);
23561 LABEL_NUSES (label) = 1;
23562 label = NULL;
23563 promoted_val = val_exp;
23564 epilogue_size_needed = 1;
23566 else if (label == NULL_RTX)
23567 epilogue_size_needed = size_needed;
23569 /* Step 3: Main loop. */
23571 switch (alg)
23573 case libcall:
23574 case no_stringop:
23575 case last_alg:
23576 gcc_unreachable ();
23577 case loop_1_byte:
23578 case loop:
23579 case vector_loop:
23580 case unrolled_loop:
23581 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23582 count_exp, move_mode, unroll_factor,
23583 expected_size);
23584 break;
23585 case rep_prefix_8_byte:
23586 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23587 DImode, val_exp);
23588 break;
23589 case rep_prefix_4_byte:
23590 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23591 SImode, val_exp);
23592 break;
23593 case rep_prefix_1_byte:
23594 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23595 QImode, val_exp);
23596 break;
23598 /* Adjust properly the offset of src and dest memory for aliasing. */
23599 if (CONST_INT_P (count_exp))
23600 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23601 (count / size_needed) * size_needed);
23602 else
23603 dst = change_address (dst, BLKmode, destreg);
23605 /* Step 4: Epilogue to copy the remaining bytes. */
23607 if (label)
23609 /* When the main loop is done, COUNT_EXP might hold the original count,
23610 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23611 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23612 bytes. Compensate if needed. */
23614 if (size_needed < epilogue_size_needed)
23616 tmp =
23617 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23618 GEN_INT (size_needed - 1), count_exp, 1,
23619 OPTAB_DIRECT);
23620 if (tmp != count_exp)
23621 emit_move_insn (count_exp, tmp);
23623 emit_label (label);
23624 LABEL_NUSES (label) = 1;
23626 epilogue:
23627 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23629 if (force_loopy_epilogue)
23630 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23631 epilogue_size_needed);
23632 else
23633 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23634 epilogue_size_needed);
23636 if (jump_around_label)
23637 emit_label (jump_around_label);
23638 return true;
23641 /* Expand the appropriate insns for doing strlen if not just doing
23642 repnz; scasb
23644 out = result, initialized with the start address
23645 align_rtx = alignment of the address.
23646 scratch = scratch register, initialized with the start address when
23647 not aligned, otherwise undefined
23649 This is just the body. It needs the initializations mentioned above and
23650 some address computing at the end. These things are done in i386.md. */
23652 static void
23653 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23655 int align;
23656 rtx tmp;
23657 rtx align_2_label = NULL_RTX;
23658 rtx align_3_label = NULL_RTX;
23659 rtx align_4_label = gen_label_rtx ();
23660 rtx end_0_label = gen_label_rtx ();
23661 rtx mem;
23662 rtx tmpreg = gen_reg_rtx (SImode);
23663 rtx scratch = gen_reg_rtx (SImode);
23664 rtx cmp;
23666 align = 0;
23667 if (CONST_INT_P (align_rtx))
23668 align = INTVAL (align_rtx);
23670 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23672 /* Is there a known alignment and is it less than 4? */
23673 if (align < 4)
23675 rtx scratch1 = gen_reg_rtx (Pmode);
23676 emit_move_insn (scratch1, out);
23677 /* Is there a known alignment and is it not 2? */
23678 if (align != 2)
23680 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23681 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23683 /* Leave just the 3 lower bits. */
23684 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23685 NULL_RTX, 0, OPTAB_WIDEN);
23687 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23688 Pmode, 1, align_4_label);
23689 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23690 Pmode, 1, align_2_label);
23691 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23692 Pmode, 1, align_3_label);
23694 else
23696 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23697 check whether it is aligned to 4 bytes. */
23699 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23700 NULL_RTX, 0, OPTAB_WIDEN);
23702 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23703 Pmode, 1, align_4_label);
23706 mem = change_address (src, QImode, out);
23708 /* Now compare the bytes. */
23710 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23711 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23712 QImode, 1, end_0_label);
23714 /* Increment the address. */
23715 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23717 /* Not needed with an alignment of 2 */
23718 if (align != 2)
23720 emit_label (align_2_label);
23722 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23723 end_0_label);
23725 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23727 emit_label (align_3_label);
23730 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23731 end_0_label);
23733 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23736 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23737 align this loop; it only makes programs bigger and does not help to
23738 speed them up. */
23739 emit_label (align_4_label);
23741 mem = change_address (src, SImode, out);
23742 emit_move_insn (scratch, mem);
23743 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23745 /* This formula yields a nonzero result iff one of the bytes is zero.
23746 This saves three branches inside the loop and many cycles. */
23748 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23749 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23750 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23751 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23752 gen_int_mode (0x80808080, SImode)));
23753 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23754 align_4_label);
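/* The sequence above is the classic "does this word contain a zero byte"
   bit trick.  A stand-alone C sketch of the same test, illustrative only
   and not part of GCC (the function name is made up):  */
#if 0
static int
word_has_zero_byte (unsigned int v)
{
  /* Nonzero exactly when at least one byte of V is zero: the subtraction
     propagates a borrow into a byte's top bit when that byte was zero,
     and masking with ~v rejects bytes whose top bit was already set.  */
  return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
}
#endif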
23756 if (TARGET_CMOVE)
23758 rtx reg = gen_reg_rtx (SImode);
23759 rtx reg2 = gen_reg_rtx (Pmode);
23760 emit_move_insn (reg, tmpreg);
23761 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23763 /* If zero is not in the first two bytes, move two bytes forward. */
23764 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23765 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23766 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23767 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23768 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23769 reg,
23770 tmpreg)));
23771 /* Emit lea manually to avoid clobbering of flags. */
23772 emit_insn (gen_rtx_SET (SImode, reg2,
23773 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23775 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23776 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23777 emit_insn (gen_rtx_SET (VOIDmode, out,
23778 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23779 reg2,
23780 out)));
23782 else
23784 rtx end_2_label = gen_label_rtx ();
23785 /* Is zero in the first two bytes? */
23787 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23788 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23789 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23790 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23791 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23792 pc_rtx);
23793 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23794 JUMP_LABEL (tmp) = end_2_label;
23796 /* Not in the first two. Move two bytes forward. */
23797 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23798 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23800 emit_label (end_2_label);
23804 /* Avoid branch in fixing the byte. */
23805 tmpreg = gen_lowpart (QImode, tmpreg);
23806 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23807 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23808 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23809 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23811 emit_label (end_0_label);
23814 /* Expand strlen. */
23816 bool
23817 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23819 rtx addr, scratch1, scratch2, scratch3, scratch4;
23821 /* The generic case of the strlen expander is long. Avoid
23822 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
23824 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23825 && !TARGET_INLINE_ALL_STRINGOPS
23826 && !optimize_insn_for_size_p ()
23827 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23828 return false;
23830 addr = force_reg (Pmode, XEXP (src, 0));
23831 scratch1 = gen_reg_rtx (Pmode);
23833 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23834 && !optimize_insn_for_size_p ())
23836 /* Well it seems that some optimizer does not combine a call like
23837 foo(strlen(bar), strlen(bar));
23838 when the move and the subtraction are done here. It does calculate
23839 the length just once when these instructions are done inside of
23840 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23841 often used and I use one fewer register for the lifetime of
23842 output_strlen_unroll() this is better. */
23844 emit_move_insn (out, addr);
23846 ix86_expand_strlensi_unroll_1 (out, src, align);
23848 /* strlensi_unroll_1 returns the address of the zero at the end of
23849 the string, like memchr(), so compute the length by subtracting
23850 the start address. */
23851 emit_insn (ix86_gen_sub3 (out, out, addr));
23853 else
23855 rtx unspec;
23857 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23858 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23859 return false;
23861 scratch2 = gen_reg_rtx (Pmode);
23862 scratch3 = gen_reg_rtx (Pmode);
23863 scratch4 = force_reg (Pmode, constm1_rtx);
23865 emit_move_insn (scratch3, addr);
23866 eoschar = force_reg (QImode, eoschar);
23868 src = replace_equiv_address_nv (src, scratch3);
23870 /* If .md starts supporting :P, this can be done in .md. */
23871 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23872 scratch4), UNSPEC_SCAS);
23873 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23874 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23875 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23877 return true;
23880 /* For a given symbol (function), construct code to compute the address of
23881 its PLT entry in the large x86-64 PIC model. */
23882 static rtx
23883 construct_plt_address (rtx symbol)
23885 rtx tmp, unspec;
23887 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23888 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
23889 gcc_assert (Pmode == DImode);
23891 tmp = gen_reg_rtx (Pmode);
23892 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23894 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23895 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23896 return tmp;
23900 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23901 rtx callarg2,
23902 rtx pop, bool sibcall)
23904 unsigned int const cregs_size
23905 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
23906 rtx vec[3 + cregs_size];
23907 rtx use = NULL, call;
23908 unsigned int vec_len = 0;
23910 if (pop == const0_rtx)
23911 pop = NULL;
23912 gcc_assert (!TARGET_64BIT || !pop);
23914 if (TARGET_MACHO && !TARGET_64BIT)
23916 #if TARGET_MACHO
23917 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23918 fnaddr = machopic_indirect_call_target (fnaddr);
23919 #endif
23921 else
23923 /* Static functions and indirect calls don't need the pic register. */
23924 if (flag_pic
23925 && (!TARGET_64BIT
23926 || (ix86_cmodel == CM_LARGE_PIC
23927 && DEFAULT_ABI != MS_ABI))
23928 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23929 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23930 use_reg (&use, pic_offset_table_rtx);
23933 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23935 rtx al = gen_rtx_REG (QImode, AX_REG);
23936 emit_move_insn (al, callarg2);
23937 use_reg (&use, al);
23940 if (ix86_cmodel == CM_LARGE_PIC
23941 && !TARGET_PECOFF
23942 && MEM_P (fnaddr)
23943 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23944 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23945 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23946 else if (sibcall
23947 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23948 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23950 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23951 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23954 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23955 if (retval)
23956 call = gen_rtx_SET (VOIDmode, retval, call);
23957 vec[vec_len++] = call;
23959 if (pop)
23961 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23962 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23963 vec[vec_len++] = pop;
23966 if (TARGET_64BIT_MS_ABI
23967 && (!callarg2 || INTVAL (callarg2) != -2))
23969 unsigned i;
23971 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23972 UNSPEC_MS_TO_SYSV_CALL);
23974 for (i = 0; i < cregs_size; i++)
23976 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
23977 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
23979 vec[vec_len++]
23980 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
23984 if (vec_len > 1)
23985 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23986 call = emit_call_insn (call);
23987 if (use)
23988 CALL_INSN_FUNCTION_USAGE (call) = use;
23990 return call;
23993 /* Output the assembly for a call instruction. */
23995 const char *
23996 ix86_output_call_insn (rtx insn, rtx call_op)
23998 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23999 bool seh_nop_p = false;
24000 const char *xasm;
24002 if (SIBLING_CALL_P (insn))
24004 if (direct_p)
24005 xasm = "jmp\t%P0";
24006 /* SEH epilogue detection requires the indirect branch case
24007 to include REX.W. */
24008 else if (TARGET_SEH)
24009 xasm = "rex.W jmp %A0";
24010 else
24011 xasm = "jmp\t%A0";
24013 output_asm_insn (xasm, &call_op);
24014 return "";
24017 /* SEH unwinding can require an extra nop to be emitted in several
24018 circumstances. Determine if we have one of those. */
24019 if (TARGET_SEH)
24021 rtx i;
24023 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24025 /* If we get to another real insn, we don't need the nop. */
24026 if (INSN_P (i))
24027 break;
24029 /* If we get to the epilogue note, prevent a catch region from
24030 being adjacent to the standard epilogue sequence. If non-
24031 call-exceptions, we'll have done this during epilogue emission. */
24032 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24033 && !flag_non_call_exceptions
24034 && !can_throw_internal (insn))
24036 seh_nop_p = true;
24037 break;
24041 /* If we didn't find a real insn following the call, prevent the
24042 unwinder from looking into the next function. */
24043 if (i == NULL)
24044 seh_nop_p = true;
24047 if (direct_p)
24048 xasm = "call\t%P0";
24049 else
24050 xasm = "call\t%A0";
24052 output_asm_insn (xasm, &call_op);
24054 if (seh_nop_p)
24055 return "nop";
24057 return "";
24060 /* Clear stack slot assignments remembered from previous functions.
24061 This is called from INIT_EXPANDERS once before RTL is emitted for each
24062 function. */
24064 static struct machine_function *
24065 ix86_init_machine_status (void)
24067 struct machine_function *f;
24069 f = ggc_alloc_cleared_machine_function ();
24070 f->use_fast_prologue_epilogue_nregs = -1;
24071 f->call_abi = ix86_abi;
24073 return f;
24076 /* Return a MEM corresponding to a stack slot with mode MODE.
24077 Allocate a new slot if necessary.
24079 The RTL for a function can have several slots available: N is
24080 which slot to use. */
24083 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24085 struct stack_local_entry *s;
24087 gcc_assert (n < MAX_386_STACK_LOCALS);
24089 for (s = ix86_stack_locals; s; s = s->next)
24090 if (s->mode == mode && s->n == n)
24091 return validize_mem (copy_rtx (s->rtl));
24093 s = ggc_alloc_stack_local_entry ();
24094 s->n = n;
24095 s->mode = mode;
24096 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24098 s->next = ix86_stack_locals;
24099 ix86_stack_locals = s;
24100 return validize_mem (s->rtl);
24103 static void
24104 ix86_instantiate_decls (void)
24106 struct stack_local_entry *s;
24108 for (s = ix86_stack_locals; s; s = s->next)
24109 if (s->rtl != NULL_RTX)
24110 instantiate_decl_rtl (s->rtl);
24113 /* Calculate the length of the memory address in the instruction encoding.
24114 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24115 or other prefixes. We never generate addr32 prefix for LEA insn. */
24118 memory_address_length (rtx addr, bool lea)
24120 struct ix86_address parts;
24121 rtx base, index, disp;
24122 int len;
24123 int ok;
24125 if (GET_CODE (addr) == PRE_DEC
24126 || GET_CODE (addr) == POST_INC
24127 || GET_CODE (addr) == PRE_MODIFY
24128 || GET_CODE (addr) == POST_MODIFY)
24129 return 0;
24131 ok = ix86_decompose_address (addr, &parts);
24132 gcc_assert (ok);
24134 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24136 /* If this is not LEA instruction, add the length of addr32 prefix. */
24137 if (TARGET_64BIT && !lea
24138 && (SImode_address_operand (addr, VOIDmode)
24139 || (parts.base && GET_MODE (parts.base) == SImode)
24140 || (parts.index && GET_MODE (parts.index) == SImode)))
24141 len++;
24143 base = parts.base;
24144 index = parts.index;
24145 disp = parts.disp;
24147 if (base && GET_CODE (base) == SUBREG)
24148 base = SUBREG_REG (base);
24149 if (index && GET_CODE (index) == SUBREG)
24150 index = SUBREG_REG (index);
24152 gcc_assert (base == NULL_RTX || REG_P (base));
24153 gcc_assert (index == NULL_RTX || REG_P (index));
24155 /* Rule of thumb:
24156 - esp as the base always wants an index,
24157 - ebp as the base always wants a displacement,
24158 - r12 as the base always wants an index,
24159 - r13 as the base always wants a displacement. */
24161 /* Register Indirect. */
24162 if (base && !index && !disp)
24164 /* esp (for its index) and ebp (for its displacement) need
24165 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24166 code. */
24167 if (base == arg_pointer_rtx
24168 || base == frame_pointer_rtx
24169 || REGNO (base) == SP_REG
24170 || REGNO (base) == BP_REG
24171 || REGNO (base) == R12_REG
24172 || REGNO (base) == R13_REG)
24173 len++;
24176 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24177 is not disp32, but disp32(%rip), so for disp32
24178 SIB byte is needed, unless print_operand_address
24179 optimizes it into disp32(%rip) or (%rip) is implied
24180 by UNSPEC. */
24181 else if (disp && !base && !index)
24183 len += 4;
24184 if (TARGET_64BIT)
24186 rtx symbol = disp;
24188 if (GET_CODE (disp) == CONST)
24189 symbol = XEXP (disp, 0);
24190 if (GET_CODE (symbol) == PLUS
24191 && CONST_INT_P (XEXP (symbol, 1)))
24192 symbol = XEXP (symbol, 0);
24194 if (GET_CODE (symbol) != LABEL_REF
24195 && (GET_CODE (symbol) != SYMBOL_REF
24196 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24197 && (GET_CODE (symbol) != UNSPEC
24198 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24199 && XINT (symbol, 1) != UNSPEC_PCREL
24200 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24201 len++;
24204 else
24206 /* Find the length of the displacement constant. */
24207 if (disp)
24209 if (base && satisfies_constraint_K (disp))
24210 len += 1;
24211 else
24212 len += 4;
24214 /* ebp always wants a displacement. Similarly r13. */
24215 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24216 len++;
24218 /* An index requires the two-byte modrm form.... */
24219 if (index
24220 /* ...like esp (or r12), which always wants an index. */
24221 || base == arg_pointer_rtx
24222 || base == frame_pointer_rtx
24223 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24224 len++;
24227 return len;
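/* Illustrative examples of the returned length, i.e. excluding the modrm
   and opcode bytes (values assumed, not exhaustive):
     (%eax)            -> 0
     (%esp)            -> 1   SIB byte
     (%ebp)            -> 1   forced 8-bit displacement
     16(%eax)          -> 1   8-bit displacement
     16(%eax,%ebx,4)   -> 2   8-bit displacement + SIB byte
     absolute disp32   -> 4   plus 1 more in 64-bit code when a SIB byte
                              is needed instead of %rip addressing.  */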
24230 /* Compute the default value for the "length_immediate" attribute. When
24231 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
24233 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24235 int len = 0;
24236 int i;
24237 extract_insn_cached (insn);
24238 for (i = recog_data.n_operands - 1; i >= 0; --i)
24239 if (CONSTANT_P (recog_data.operand[i]))
24241 enum attr_mode mode = get_attr_mode (insn);
24243 gcc_assert (!len);
24244 if (shortform && CONST_INT_P (recog_data.operand[i]))
24246 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24247 switch (mode)
24249 case MODE_QI:
24250 len = 1;
24251 continue;
24252 case MODE_HI:
24253 ival = trunc_int_for_mode (ival, HImode);
24254 break;
24255 case MODE_SI:
24256 ival = trunc_int_for_mode (ival, SImode);
24257 break;
24258 default:
24259 break;
24261 if (IN_RANGE (ival, -128, 127))
24263 len = 1;
24264 continue;
24267 switch (mode)
24269 case MODE_QI:
24270 len = 1;
24271 break;
24272 case MODE_HI:
24273 len = 2;
24274 break;
24275 case MODE_SI:
24276 len = 4;
24277 break;
24278 /* Immediates for DImode instructions are encoded
24279 as 32-bit sign-extended values. */
24280 case MODE_DI:
24281 len = 4;
24282 break;
24283 default:
24284 fatal_insn ("unknown insn mode", insn);
24287 return len;
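/* Illustrative examples (values assumed): with SHORTFORM, an SImode insn
   with immediate 100 counts 1 byte (imm8) while immediate 300 counts 4
   bytes; without SHORTFORM an SImode immediate always counts 4 bytes, and
   a DImode immediate also counts 4 (32-bit sign-extended).  */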
24290 /* Compute default value for "length_address" attribute. */
24292 ix86_attr_length_address_default (rtx insn)
24294 int i;
24296 if (get_attr_type (insn) == TYPE_LEA)
24298 rtx set = PATTERN (insn), addr;
24300 if (GET_CODE (set) == PARALLEL)
24301 set = XVECEXP (set, 0, 0);
24303 gcc_assert (GET_CODE (set) == SET);
24305 addr = SET_SRC (set);
24307 return memory_address_length (addr, true);
24310 extract_insn_cached (insn);
24311 for (i = recog_data.n_operands - 1; i >= 0; --i)
24312 if (MEM_P (recog_data.operand[i]))
24314 constrain_operands_cached (reload_completed);
24315 if (which_alternative != -1)
24317 const char *constraints = recog_data.constraints[i];
24318 int alt = which_alternative;
24320 while (*constraints == '=' || *constraints == '+')
24321 constraints++;
24322 while (alt-- > 0)
24323 while (*constraints++ != ',')
24325 /* Skip ignored operands. */
24326 if (*constraints == 'X')
24327 continue;
24329 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24331 return 0;
24334 /* Compute default value for "length_vex" attribute. It includes
24335 2 or 3 byte VEX prefix and 1 opcode byte. */
24338 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24340 int i;
24342 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
24343 requires the 3-byte VEX prefix. */
24344 if (!has_0f_opcode || has_vex_w)
24345 return 3 + 1;
24347 /* We can always use 2 byte VEX prefix in 32bit. */
24348 if (!TARGET_64BIT)
24349 return 2 + 1;
24351 extract_insn_cached (insn);
24353 for (i = recog_data.n_operands - 1; i >= 0; --i)
24354 if (REG_P (recog_data.operand[i]))
24356 /* REX.W bit uses 3 byte VEX prefix. */
24357 if (GET_MODE (recog_data.operand[i]) == DImode
24358 && GENERAL_REG_P (recog_data.operand[i]))
24359 return 3 + 1;
24361 else
24363 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24364 if (MEM_P (recog_data.operand[i])
24365 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24366 return 3 + 1;
24369 return 2 + 1;
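/* Illustrative example (assumed encodings): a VEX 0f-opcode insn that only
   touches the low eight registers can use the 2-byte C5 prefix, so the
   function returns 2 + 1; a DImode general-register operand or an extended
   register in a memory operand (REX.W/X/B) forces the 3-byte C4 prefix,
   returning 3 + 1.  */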
24372 /* Return the maximum number of instructions a cpu can issue. */
24374 static int
24375 ix86_issue_rate (void)
24377 switch (ix86_tune)
24379 case PROCESSOR_PENTIUM:
24380 case PROCESSOR_ATOM:
24381 case PROCESSOR_SLM:
24382 case PROCESSOR_K6:
24383 case PROCESSOR_BTVER2:
24384 return 2;
24386 case PROCESSOR_PENTIUMPRO:
24387 case PROCESSOR_PENTIUM4:
24388 case PROCESSOR_CORE2:
24389 case PROCESSOR_COREI7:
24390 case PROCESSOR_HASWELL:
24391 case PROCESSOR_ATHLON:
24392 case PROCESSOR_K8:
24393 case PROCESSOR_AMDFAM10:
24394 case PROCESSOR_NOCONA:
24395 case PROCESSOR_GENERIC32:
24396 case PROCESSOR_GENERIC64:
24397 case PROCESSOR_BDVER1:
24398 case PROCESSOR_BDVER2:
24399 case PROCESSOR_BDVER3:
24400 case PROCESSOR_BTVER1:
24401 return 3;
24403 default:
24404 return 1;
24408 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24409 set by DEP_INSN and nothing else set by DEP_INSN. */
24411 static bool
24412 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24414 rtx set, set2;
24416 /* Simplify the test for uninteresting insns. */
24417 if (insn_type != TYPE_SETCC
24418 && insn_type != TYPE_ICMOV
24419 && insn_type != TYPE_FCMOV
24420 && insn_type != TYPE_IBR)
24421 return false;
24423 if ((set = single_set (dep_insn)) != 0)
24425 set = SET_DEST (set);
24426 set2 = NULL_RTX;
24428 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24429 && XVECLEN (PATTERN (dep_insn), 0) == 2
24430 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24431 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24433 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24434 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24436 else
24437 return false;
24439 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24440 return false;
24442 /* This test is true if the dependent insn reads the flags but
24443 not any other potentially set register. */
24444 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24445 return false;
24447 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24448 return false;
24450 return true;
24453 /* Return true iff USE_INSN has a memory address with operands set by
24454 SET_INSN. */
24456 bool
24457 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24459 int i;
24460 extract_insn_cached (use_insn);
24461 for (i = recog_data.n_operands - 1; i >= 0; --i)
24462 if (MEM_P (recog_data.operand[i]))
24464 rtx addr = XEXP (recog_data.operand[i], 0);
24465 return modified_in_p (addr, set_insn) != 0;
24467 return false;
24470 /* Helper function for exact_store_load_dependency.
24471 Return true if addr is found in insn. */
24472 static bool
24473 exact_dependency_1 (rtx addr, rtx insn)
24475 enum rtx_code code;
24476 const char *format_ptr;
24477 int i, j;
24479 code = GET_CODE (insn);
24480 switch (code)
24482 case MEM:
24483 if (rtx_equal_p (addr, insn))
24484 return true;
24485 break;
24486 case REG:
24487 CASE_CONST_ANY:
24488 case SYMBOL_REF:
24489 case CODE_LABEL:
24490 case PC:
24491 case CC0:
24492 case EXPR_LIST:
24493 return false;
24494 default:
24495 break;
24498 format_ptr = GET_RTX_FORMAT (code);
24499 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24501 switch (*format_ptr++)
24503 case 'e':
24504 if (exact_dependency_1 (addr, XEXP (insn, i)))
24505 return true;
24506 break;
24507 case 'E':
24508 for (j = 0; j < XVECLEN (insn, i); j++)
24509 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24510 return true;
24511 break;
24514 return false;
24517 /* Return true if there exists an exact dependency between the store and the
24518 load, i.e. the same memory address is used in both. */
24519 static bool
24520 exact_store_load_dependency (rtx store, rtx load)
24522 rtx set1, set2;
24524 set1 = single_set (store);
24525 if (!set1)
24526 return false;
24527 if (!MEM_P (SET_DEST (set1)))
24528 return false;
24529 set2 = single_set (load);
24530 if (!set2)
24531 return false;
24532 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24533 return true;
24534 return false;
24537 static int
24538 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24540 enum attr_type insn_type, dep_insn_type;
24541 enum attr_memory memory;
24542 rtx set, set2;
24543 int dep_insn_code_number;
24545 /* Anti and output dependencies have zero cost on all CPUs. */
24546 if (REG_NOTE_KIND (link) != 0)
24547 return 0;
24549 dep_insn_code_number = recog_memoized (dep_insn);
24551 /* If we can't recognize the insns, we can't really do anything. */
24552 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24553 return cost;
24555 insn_type = get_attr_type (insn);
24556 dep_insn_type = get_attr_type (dep_insn);
24558 switch (ix86_tune)
24560 case PROCESSOR_PENTIUM:
24561 /* Address Generation Interlock adds a cycle of latency. */
24562 if (insn_type == TYPE_LEA)
24564 rtx addr = PATTERN (insn);
24566 if (GET_CODE (addr) == PARALLEL)
24567 addr = XVECEXP (addr, 0, 0);
24569 gcc_assert (GET_CODE (addr) == SET);
24571 addr = SET_SRC (addr);
24572 if (modified_in_p (addr, dep_insn))
24573 cost += 1;
24575 else if (ix86_agi_dependent (dep_insn, insn))
24576 cost += 1;
24578 /* ??? Compares pair with jump/setcc. */
24579 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24580 cost = 0;
24582 /* Floating point stores require value to be ready one cycle earlier. */
24583 if (insn_type == TYPE_FMOV
24584 && get_attr_memory (insn) == MEMORY_STORE
24585 && !ix86_agi_dependent (dep_insn, insn))
24586 cost += 1;
24587 break;
24589 case PROCESSOR_PENTIUMPRO:
24590 memory = get_attr_memory (insn);
24592 /* INT->FP conversion is expensive. */
24593 if (get_attr_fp_int_src (dep_insn))
24594 cost += 5;
24596 /* There is one cycle extra latency between an FP op and a store. */
24597 if (insn_type == TYPE_FMOV
24598 && (set = single_set (dep_insn)) != NULL_RTX
24599 && (set2 = single_set (insn)) != NULL_RTX
24600 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24601 && MEM_P (SET_DEST (set2)))
24602 cost += 1;
24604 /* Model the ability of the reorder buffer to hide the latency of a load by
24605 executing it in parallel with the previous instruction when the previous
24606 instruction is not needed to compute the address. */
24607 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24608 && !ix86_agi_dependent (dep_insn, insn))
24610 /* Claim moves to take one cycle, as the core can issue one load
24611 at a time and the next load can start a cycle later. */
24612 if (dep_insn_type == TYPE_IMOV
24613 || dep_insn_type == TYPE_FMOV)
24614 cost = 1;
24615 else if (cost > 1)
24616 cost--;
24618 break;
24620 case PROCESSOR_K6:
24621 memory = get_attr_memory (insn);
24623 /* The esp dependency is resolved before the instruction is really
24624 finished. */
24625 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24626 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24627 return 1;
24629 /* INT->FP conversion is expensive. */
24630 if (get_attr_fp_int_src (dep_insn))
24631 cost += 5;
24633 /* Model the ability of the reorder buffer to hide the latency of a load by
24634 executing it in parallel with the previous instruction when the previous
24635 instruction is not needed to compute the address. */
24636 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24637 && !ix86_agi_dependent (dep_insn, insn))
24639 /* Claim moves to take one cycle, as the core can issue one load
24640 at a time and the next load can start a cycle later. */
24641 if (dep_insn_type == TYPE_IMOV
24642 || dep_insn_type == TYPE_FMOV)
24643 cost = 1;
24644 else if (cost > 2)
24645 cost -= 2;
24646 else
24647 cost = 1;
24649 break;
24651 case PROCESSOR_ATHLON:
24652 case PROCESSOR_K8:
24653 case PROCESSOR_AMDFAM10:
24654 case PROCESSOR_BDVER1:
24655 case PROCESSOR_BDVER2:
24656 case PROCESSOR_BDVER3:
24657 case PROCESSOR_BTVER1:
24658 case PROCESSOR_BTVER2:
24659 case PROCESSOR_ATOM:
24660 case PROCESSOR_GENERIC32:
24661 case PROCESSOR_GENERIC64:
24662 memory = get_attr_memory (insn);
24664 /* Model the ability of the reorder buffer to hide the latency of a load by
24665 executing it in parallel with the previous instruction when the previous
24666 instruction is not needed to compute the address. */
24667 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24668 && !ix86_agi_dependent (dep_insn, insn))
24670 enum attr_unit unit = get_attr_unit (insn);
24671 int loadcost = 3;
24673 /* Because of the difference between the length of integer and
24674 floating unit pipeline preparation stages, the memory operands
24675 for floating point are cheaper.
24677 ??? For Athlon the difference is most probably 2. */
24678 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24679 loadcost = 3;
24680 else
24681 loadcost = TARGET_ATHLON ? 2 : 0;
24683 if (cost >= loadcost)
24684 cost -= loadcost;
24685 else
24686 cost = 0;
24688 break;
24690 case PROCESSOR_SLM:
24691 if (!reload_completed)
24692 return cost;
24694 /* Increase cost of integer loads. */
24695 memory = get_attr_memory (dep_insn);
24696 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24698 enum attr_unit unit = get_attr_unit (dep_insn);
24699 if (unit == UNIT_INTEGER && cost == 1)
24701 if (memory == MEMORY_LOAD)
24702 cost = 3;
24703 else
24705 /* Increase cost of ld/st for short int types only
24706 because of store forwarding issue. */
24707 rtx set = single_set (dep_insn);
24708 if (set && (GET_MODE (SET_DEST (set)) == QImode
24709 || GET_MODE (SET_DEST (set)) == HImode))
24711 /* Increase cost of store/load insn if exact
24712 dependence exists and it is load insn. */
24713 enum attr_memory insn_memory = get_attr_memory (insn);
24714 if (insn_memory == MEMORY_LOAD
24715 && exact_store_load_dependency (dep_insn, insn))
24716 cost = 3;
24722 default:
24723 break;
24726 return cost;
24729 /* How many alternative schedules to try. This should be as wide as the
24730 scheduling freedom in the DFA, but no wider. Making this value too
24731 large results in extra work for the scheduler. */
24733 static int
24734 ia32_multipass_dfa_lookahead (void)
24736 switch (ix86_tune)
24738 case PROCESSOR_PENTIUM:
24739 return 2;
24741 case PROCESSOR_PENTIUMPRO:
24742 case PROCESSOR_K6:
24743 return 1;
24745 case PROCESSOR_CORE2:
24746 case PROCESSOR_COREI7:
24747 case PROCESSOR_HASWELL:
24748 case PROCESSOR_ATOM:
24749 case PROCESSOR_SLM:
24750 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24751 as the number of instructions that can be executed in a cycle, i.e.,
24752 issue_rate. I wonder why tuning for many CPUs does not do this. */
24753 if (reload_completed)
24754 return ix86_issue_rate ();
24755 /* Don't use lookahead for pre-reload schedule to save compile time. */
24756 return 0;
24758 default:
24759 return 0;
24763 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24764 execution. It is applied if
24765 (1) an IMUL instruction is on top of the list;
24766 (2) the ready list contains exactly one producer of an independent
24767 IMUL instruction.
24768 Return the index of the IMUL producer if it was found and -1 otherwise. */
24769 static int
24770 do_reorder_for_imul (rtx *ready, int n_ready)
24772 rtx insn, set, insn1, insn2;
24773 sd_iterator_def sd_it;
24774 dep_t dep;
24775 int index = -1;
24776 int i;
24778 if (ix86_tune != PROCESSOR_ATOM)
24779 return index;
24781 /* Check that IMUL instruction is on the top of ready list. */
24782 insn = ready[n_ready - 1];
24783 set = single_set (insn);
24784 if (!set)
24785 return index;
24786 if (!(GET_CODE (SET_SRC (set)) == MULT
24787 && GET_MODE (SET_SRC (set)) == SImode))
24788 return index;
24790 /* Search for producer of independent IMUL instruction. */
24791 for (i = n_ready - 2; i >= 0; i--)
24793 insn = ready[i];
24794 if (!NONDEBUG_INSN_P (insn))
24795 continue;
24796 /* Skip IMUL instruction. */
24797 insn2 = PATTERN (insn);
24798 if (GET_CODE (insn2) == PARALLEL)
24799 insn2 = XVECEXP (insn2, 0, 0);
24800 if (GET_CODE (insn2) == SET
24801 && GET_CODE (SET_SRC (insn2)) == MULT
24802 && GET_MODE (SET_SRC (insn2)) == SImode)
24803 continue;
24805 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24807 rtx con;
24808 con = DEP_CON (dep);
24809 if (!NONDEBUG_INSN_P (con))
24810 continue;
24811 insn1 = PATTERN (con);
24812 if (GET_CODE (insn1) == PARALLEL)
24813 insn1 = XVECEXP (insn1, 0, 0);
24815 if (GET_CODE (insn1) == SET
24816 && GET_CODE (SET_SRC (insn1)) == MULT
24817 && GET_MODE (SET_SRC (insn1)) == SImode)
24819 sd_iterator_def sd_it1;
24820 dep_t dep1;
24821 /* Check if there is no other dependee for IMUL. */
24822 index = i;
24823 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24825 rtx pro;
24826 pro = DEP_PRO (dep1);
24827 if (!NONDEBUG_INSN_P (pro))
24828 continue;
24829 if (pro != insn)
24830 index = -1;
24832 if (index >= 0)
24833 break;
24836 if (index >= 0)
24837 break;
24839 return index;
24842 /* Try to find the best candidate for the top of the ready list if two insns
24843 have the same priority - a candidate is best if its dependees were
24844 scheduled earlier. Applied for Silvermont only.
24845 Return true if the top 2 insns must be interchanged. */
24846 static bool
24847 swap_top_of_ready_list (rtx *ready, int n_ready)
24849 rtx top = ready[n_ready - 1];
24850 rtx next = ready[n_ready - 2];
24851 rtx set;
24852 sd_iterator_def sd_it;
24853 dep_t dep;
24854 int clock1 = -1;
24855 int clock2 = -1;
24856 #define INSN_TICK(INSN) (HID (INSN)->tick)
24858 if (ix86_tune != PROCESSOR_SLM)
24859 return false;
24861 if (!NONDEBUG_INSN_P (top))
24862 return false;
24863 if (!NONJUMP_INSN_P (top))
24864 return false;
24865 if (!NONDEBUG_INSN_P (next))
24866 return false;
24867 if (!NONJUMP_INSN_P (next))
24868 return false;
24869 set = single_set (top);
24870 if (!set)
24871 return false;
24872 set = single_set (next);
24873 if (!set)
24874 return false;
24876 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
24878 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
24879 return false;
24880 /* Determine the winner more precisely. */
24881 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
24883 rtx pro;
24884 pro = DEP_PRO (dep);
24885 if (!NONDEBUG_INSN_P (pro))
24886 continue;
24887 if (INSN_TICK (pro) > clock1)
24888 clock1 = INSN_TICK (pro);
24890 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
24892 rtx pro;
24893 pro = DEP_PRO (dep);
24894 if (!NONDEBUG_INSN_P (pro))
24895 continue;
24896 if (INSN_TICK (pro) > clock2)
24897 clock2 = INSN_TICK (pro);
24900 if (clock1 == clock2)
24902 /* Determine winner - load must win. */
24903 enum attr_memory memory1, memory2;
24904 memory1 = get_attr_memory (top);
24905 memory2 = get_attr_memory (next);
24906 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
24907 return true;
24909 return (bool) (clock2 < clock1);
24911 return false;
24912 #undef INSN_TICK
24915 /* Perform possible reordering of ready list for Atom/Silvermont only.
24916 Return issue rate. */
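/* In outline: on Atom, the producer of an independent IMUL is moved to the
   top of the ready list so the pipelined IMUL unit stays busy; on Silvermont,
   the two top insns of equal priority may be swapped so that the one whose
   producers finished earlier (with a load winning ties) issues first.  */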
24917 static int
24918 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24919 int clock_var)
24921 int issue_rate = -1;
24922 int n_ready = *pn_ready;
24923 int i;
24924 rtx insn;
24925 int index = -1;
24927 /* Set up issue rate. */
24928 issue_rate = ix86_issue_rate ();
24930 /* Do reordering for Atom/SLM only. */
24931 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
24932 return issue_rate;
24934 /* Nothing to do if ready list contains only 1 instruction. */
24935 if (n_ready <= 1)
24936 return issue_rate;
24938 /* Do reordering for post-reload scheduler only. */
24939 if (!reload_completed)
24940 return issue_rate;
24942 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
24944 if (sched_verbose > 1)
24945 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
24946 INSN_UID (ready[index]));
24948 /* Put IMUL producer (ready[index]) at the top of ready list. */
24949 insn = ready[index];
24950 for (i = index; i < n_ready - 1; i++)
24951 ready[i] = ready[i + 1];
24952 ready[n_ready - 1] = insn;
24953 return issue_rate;
24955 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
24957 if (sched_verbose > 1)
24958 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
24959 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
24960 /* Swap 2 top elements of ready list. */
24961 insn = ready[n_ready - 1];
24962 ready[n_ready - 1] = ready[n_ready - 2];
24963 ready[n_ready - 2] = insn;
24965 return issue_rate;
24968 static bool
24969 ix86_class_likely_spilled_p (reg_class_t);
24971 /* Return true if the lhs of insn is a HW function argument register, and set
24972 is_spilled to true if it is a likely-spilled HW register. */
24973 static bool
24974 insn_is_function_arg (rtx insn, bool* is_spilled)
24976 rtx dst;
24978 if (!NONDEBUG_INSN_P (insn))
24979 return false;
24980 /* Call instructions are not movable; ignore them. */
24981 if (CALL_P (insn))
24982 return false;
24983 insn = PATTERN (insn);
24984 if (GET_CODE (insn) == PARALLEL)
24985 insn = XVECEXP (insn, 0, 0);
24986 if (GET_CODE (insn) != SET)
24987 return false;
24988 dst = SET_DEST (insn);
24989 if (REG_P (dst) && HARD_REGISTER_P (dst)
24990 && ix86_function_arg_regno_p (REGNO (dst)))
24992 /* Is it likely spilled HW register? */
24993 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24994 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24995 *is_spilled = true;
24996 return true;
24998 return false;
25001 /* Add output dependencies for a chain of adjacent function arguments, but only
25002 if there is a move to a likely-spilled HW register. Return the first argument
25003 if at least one dependence was added, or NULL otherwise. */
25004 static rtx
25005 add_parameter_dependencies (rtx call, rtx head)
25007 rtx insn;
25008 rtx last = call;
25009 rtx first_arg = NULL;
25010 bool is_spilled = false;
25012 head = PREV_INSN (head);
25014 /* Find the argument-passing instruction nearest to the call. */
25015 while (true)
25017 last = PREV_INSN (last);
25018 if (last == head)
25019 return NULL;
25020 if (!NONDEBUG_INSN_P (last))
25021 continue;
25022 if (insn_is_function_arg (last, &is_spilled))
25023 break;
25024 return NULL;
25027 first_arg = last;
25028 while (true)
25030 insn = PREV_INSN (last);
25031 if (!INSN_P (insn))
25032 break;
25033 if (insn == head)
25034 break;
25035 if (!NONDEBUG_INSN_P (insn))
25037 last = insn;
25038 continue;
25040 if (insn_is_function_arg (insn, &is_spilled))
25042 /* Add an output dependence between two function arguments if the chain
25043 of output arguments contains likely-spilled HW registers. */
25044 if (is_spilled)
25045 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25046 first_arg = last = insn;
25048 else
25049 break;
25051 if (!is_spilled)
25052 return NULL;
25053 return first_arg;
25056 /* Add output or anti dependency from insn to first_arg to restrict its code
25057 motion. */
25058 static void
25059 avoid_func_arg_motion (rtx first_arg, rtx insn)
25061 rtx set;
25062 rtx tmp;
25064 set = single_set (insn);
25065 if (!set)
25066 return;
25067 tmp = SET_DEST (set);
25068 if (REG_P (tmp))
25070 /* Add output dependency to the first function argument. */
25071 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25072 return;
25074 /* Add anti dependency. */
25075 add_dependence (first_arg, insn, REG_DEP_ANTI);
25078 /* Avoid cross-block motion of a function argument by adding a dependency
25079 from the first non-jump instruction in bb. */
25080 static void
25081 add_dependee_for_func_arg (rtx arg, basic_block bb)
25083 rtx insn = BB_END (bb);
25085 while (insn)
25087 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25089 rtx set = single_set (insn);
25090 if (set)
25092 avoid_func_arg_motion (arg, insn);
25093 return;
25096 if (insn == BB_HEAD (bb))
25097 return;
25098 insn = PREV_INSN (insn);
25102 /* Hook for pre-reload schedule - avoid motion of function arguments
25103 passed in likely spilled HW registers. */
25104 static void
25105 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25107 rtx insn;
25108 rtx first_arg = NULL;
25109 if (reload_completed)
25110 return;
25111 while (head != tail && DEBUG_INSN_P (head))
25112 head = NEXT_INSN (head);
25113 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25114 if (INSN_P (insn) && CALL_P (insn))
25116 first_arg = add_parameter_dependencies (insn, head);
25117 if (first_arg)
25119 /* Add a dependee for the first argument to predecessors, but only if the
25120 region contains more than one block. */
25121 basic_block bb = BLOCK_FOR_INSN (insn);
25122 int rgn = CONTAINING_RGN (bb->index);
25123 int nr_blks = RGN_NR_BLOCKS (rgn);
25124 /* Skip trivial regions and region head blocks that can have
25125 predecessors outside of region. */
25126 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25128 edge e;
25129 edge_iterator ei;
25130 /* Assume that region is SCC, i.e. all immediate predecessors
25131 of non-head block are in the same region. */
25132 FOR_EACH_EDGE (e, ei, bb->preds)
25134 /* Avoid creating loop-carried dependencies by
25135 using the topological ordering in the region. */
25136 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25137 add_dependee_for_func_arg (first_arg, e->src);
25140 insn = first_arg;
25141 if (insn == head)
25142 break;
25145 else if (first_arg)
25146 avoid_func_arg_motion (first_arg, insn);
25149 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25150 HW registers to maximum, to schedule them as soon as possible. These are
25151 moves from function argument registers at the top of the function entry
25152 and moves from function return value registers after call. */
25153 static int
25154 ix86_adjust_priority (rtx insn, int priority)
25156 rtx set;
25158 if (reload_completed)
25159 return priority;
25161 if (!NONDEBUG_INSN_P (insn))
25162 return priority;
25164 set = single_set (insn);
25165 if (set)
25167 rtx tmp = SET_SRC (set);
25168 if (REG_P (tmp)
25169 && HARD_REGISTER_P (tmp)
25170 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25171 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25172 return current_sched_info->sched_max_insns_priority;
25175 return priority;
25178 /* Model decoder of Core 2/i7.
25179 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25180 track the instruction fetch block boundaries and make sure that long
25181 (9+ bytes) instructions are assigned to D0. */
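/* Concretely, the hooks below keep two per-cycle counters: the number of
   bytes already consumed from the current 16-byte ifetch block and the number
   of insns already handed to the decoders.  A ready insn is masked out when a
   secondary decoder would have to take it but it is longer than 8 bytes, when
   its length would overflow the ifetch block, or when all decoder slots for
   the cycle are in use (see core2i7_first_cycle_multipass_filter_ready_try).  */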
25183 /* Maximum length of an insn that can be handled by
25184 a secondary decoder unit. '8' for Core 2/i7. */
25185 static int core2i7_secondary_decoder_max_insn_size;
25187 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25188 '16' for Core 2/i7. */
25189 static int core2i7_ifetch_block_size;
25191 /* Maximum number of instructions decoder can handle per cycle.
25192 '6' for Core 2/i7. */
25193 static int core2i7_ifetch_block_max_insns;
25195 typedef struct ix86_first_cycle_multipass_data_ *
25196 ix86_first_cycle_multipass_data_t;
25197 typedef const struct ix86_first_cycle_multipass_data_ *
25198 const_ix86_first_cycle_multipass_data_t;
25200 /* A variable to store target state across calls to max_issue within
25201 one cycle. */
25202 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25203 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25205 /* Initialize DATA. */
25206 static void
25207 core2i7_first_cycle_multipass_init (void *_data)
25209 ix86_first_cycle_multipass_data_t data
25210 = (ix86_first_cycle_multipass_data_t) _data;
25212 data->ifetch_block_len = 0;
25213 data->ifetch_block_n_insns = 0;
25214 data->ready_try_change = NULL;
25215 data->ready_try_change_size = 0;
25218 /* Advancing the cycle; reset ifetch block counts. */
25219 static void
25220 core2i7_dfa_post_advance_cycle (void)
25222 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25224 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25226 data->ifetch_block_len = 0;
25227 data->ifetch_block_n_insns = 0;
25230 static int min_insn_size (rtx);
25232 /* Filter out insns from ready_try that the core will not be able to issue
25233 on the current cycle due to decoder restrictions. */
25234 static void
25235 core2i7_first_cycle_multipass_filter_ready_try
25236 (const_ix86_first_cycle_multipass_data_t data,
25237 char *ready_try, int n_ready, bool first_cycle_insn_p)
25239 while (n_ready--)
25241 rtx insn;
25242 int insn_size;
25244 if (ready_try[n_ready])
25245 continue;
25247 insn = get_ready_element (n_ready);
25248 insn_size = min_insn_size (insn);
25250 if (/* If this is too long an insn for a secondary decoder ... */
25251 (!first_cycle_insn_p
25252 && insn_size > core2i7_secondary_decoder_max_insn_size)
25253 /* ... or it would not fit into the ifetch block ... */
25254 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25255 /* ... or the decoder is full already ... */
25256 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25257 /* ... mask the insn out. */
25259 ready_try[n_ready] = 1;
25261 if (data->ready_try_change)
25262 bitmap_set_bit (data->ready_try_change, n_ready);
25267 /* Prepare for a new round of multipass lookahead scheduling. */
25268 static void
25269 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25270 bool first_cycle_insn_p)
25272 ix86_first_cycle_multipass_data_t data
25273 = (ix86_first_cycle_multipass_data_t) _data;
25274 const_ix86_first_cycle_multipass_data_t prev_data
25275 = ix86_first_cycle_multipass_data;
25277 /* Restore the state from the end of the previous round. */
25278 data->ifetch_block_len = prev_data->ifetch_block_len;
25279 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25281 /* Filter instructions that cannot be issued on current cycle due to
25282 decoder restrictions. */
25283 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25284 first_cycle_insn_p);
25287 /* INSN is being issued in current solution. Account for its impact on
25288 the decoder model. */
25289 static void
25290 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25291 rtx insn, const void *_prev_data)
25293 ix86_first_cycle_multipass_data_t data
25294 = (ix86_first_cycle_multipass_data_t) _data;
25295 const_ix86_first_cycle_multipass_data_t prev_data
25296 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25298 int insn_size = min_insn_size (insn);
25300 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25301 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25302 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25303 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25305 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25306 if (!data->ready_try_change)
25308 data->ready_try_change = sbitmap_alloc (n_ready);
25309 data->ready_try_change_size = n_ready;
25311 else if (data->ready_try_change_size < n_ready)
25313 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25314 n_ready, 0);
25315 data->ready_try_change_size = n_ready;
25317 bitmap_clear (data->ready_try_change);
25319 /* Filter out insns from ready_try that the core will not be able to issue
25320 on the current cycle due to decoder restrictions. */
25321 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25322 false);
25325 /* Revert the effect on ready_try. */
25326 static void
25327 core2i7_first_cycle_multipass_backtrack (const void *_data,
25328 char *ready_try,
25329 int n_ready ATTRIBUTE_UNUSED)
25331 const_ix86_first_cycle_multipass_data_t data
25332 = (const_ix86_first_cycle_multipass_data_t) _data;
25333 unsigned int i = 0;
25334 sbitmap_iterator sbi;
25336 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25337 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25339 ready_try[i] = 0;
25343 /* Save the result of multipass lookahead scheduling for the next round. */
25344 static void
25345 core2i7_first_cycle_multipass_end (const void *_data)
25347 const_ix86_first_cycle_multipass_data_t data
25348 = (const_ix86_first_cycle_multipass_data_t) _data;
25349 ix86_first_cycle_multipass_data_t next_data
25350 = ix86_first_cycle_multipass_data;
25352 if (data != NULL)
25354 next_data->ifetch_block_len = data->ifetch_block_len;
25355 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25359 /* Deallocate target data. */
25360 static void
25361 core2i7_first_cycle_multipass_fini (void *_data)
25363 ix86_first_cycle_multipass_data_t data
25364 = (ix86_first_cycle_multipass_data_t) _data;
25366 if (data->ready_try_change)
25368 sbitmap_free (data->ready_try_change);
25369 data->ready_try_change = NULL;
25370 data->ready_try_change_size = 0;
25374 /* Prepare for scheduling pass. */
25375 static void
25376 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25377 int verbose ATTRIBUTE_UNUSED,
25378 int max_uid ATTRIBUTE_UNUSED)
25380 /* Install scheduling hooks for current CPU. Some of these hooks are used
25381 in time-critical parts of the scheduler, so we only set them up when
25382 they are actually used. */
25383 switch (ix86_tune)
25385 case PROCESSOR_CORE2:
25386 case PROCESSOR_COREI7:
25387 case PROCESSOR_HASWELL:
25388 /* Do not perform multipass scheduling for pre-reload schedule
25389 to save compile time. */
25390 if (reload_completed)
25392 targetm.sched.dfa_post_advance_cycle
25393 = core2i7_dfa_post_advance_cycle;
25394 targetm.sched.first_cycle_multipass_init
25395 = core2i7_first_cycle_multipass_init;
25396 targetm.sched.first_cycle_multipass_begin
25397 = core2i7_first_cycle_multipass_begin;
25398 targetm.sched.first_cycle_multipass_issue
25399 = core2i7_first_cycle_multipass_issue;
25400 targetm.sched.first_cycle_multipass_backtrack
25401 = core2i7_first_cycle_multipass_backtrack;
25402 targetm.sched.first_cycle_multipass_end
25403 = core2i7_first_cycle_multipass_end;
25404 targetm.sched.first_cycle_multipass_fini
25405 = core2i7_first_cycle_multipass_fini;
25407 /* Set decoder parameters. */
25408 core2i7_secondary_decoder_max_insn_size = 8;
25409 core2i7_ifetch_block_size = 16;
25410 core2i7_ifetch_block_max_insns = 6;
25411 break;
25413 /* ... Fall through ... */
25414 default:
25415 targetm.sched.dfa_post_advance_cycle = NULL;
25416 targetm.sched.first_cycle_multipass_init = NULL;
25417 targetm.sched.first_cycle_multipass_begin = NULL;
25418 targetm.sched.first_cycle_multipass_issue = NULL;
25419 targetm.sched.first_cycle_multipass_backtrack = NULL;
25420 targetm.sched.first_cycle_multipass_end = NULL;
25421 targetm.sched.first_cycle_multipass_fini = NULL;
25422 break;
25427 /* Compute the alignment given to a constant that is being placed in memory.
25428 EXP is the constant and ALIGN is the alignment that the object would
25429 ordinarily have.
25430 The value of this function is used instead of that alignment to align
25431 the object. */
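/* For example, a DFmode (double) constant whose default alignment is below
   64 bits is bumped to 64, constants whose mode satisfies ALIGN_MODE_128 are
   bumped to 128 so they can be used with aligned SSE loads, and sufficiently
   long string constants get word alignment when not optimizing for size.  */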
25434 ix86_constant_alignment (tree exp, int align)
25436 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25437 || TREE_CODE (exp) == INTEGER_CST)
25439 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25440 return 64;
25441 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25442 return 128;
25444 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25445 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25446 return BITS_PER_WORD;
25448 return align;
25451 /* Compute the alignment for a static variable.
25452 TYPE is the data type, and ALIGN is the alignment that
25453 the object would ordinarily have. The value of this function is used
25454 instead of that alignment to align the object. */
25457 ix86_data_alignment (tree type, int align, bool opt)
25459 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25461 if (opt
25462 && AGGREGATE_TYPE_P (type)
25463 && TYPE_SIZE (type)
25464 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25465 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25466 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25467 && align < max_align)
25468 align = max_align;
25470 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25471 to a 16-byte boundary. */
25472 if (TARGET_64BIT)
25474 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
25475 && TYPE_SIZE (type)
25476 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25477 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25478 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25479 return 128;
25482 if (!opt)
25483 return align;
25485 if (TREE_CODE (type) == ARRAY_TYPE)
25487 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25488 return 64;
25489 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25490 return 128;
25492 else if (TREE_CODE (type) == COMPLEX_TYPE)
25495 if (TYPE_MODE (type) == DCmode && align < 64)
25496 return 64;
25497 if ((TYPE_MODE (type) == XCmode
25498 || TYPE_MODE (type) == TCmode) && align < 128)
25499 return 128;
25501 else if ((TREE_CODE (type) == RECORD_TYPE
25502 || TREE_CODE (type) == UNION_TYPE
25503 || TREE_CODE (type) == QUAL_UNION_TYPE)
25504 && TYPE_FIELDS (type))
25506 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25507 return 64;
25508 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25509 return 128;
25511 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25512 || TREE_CODE (type) == INTEGER_TYPE)
25514 if (TYPE_MODE (type) == DFmode && align < 64)
25515 return 64;
25516 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25517 return 128;
25520 return align;
25523 /* Compute the alignment for a local variable or a stack slot. EXP is
25524 the data type or decl itself, MODE is the widest mode available and
25525 ALIGN is the alignment that the object would ordinarily have. The
25526 value of this macro is used instead of that alignment to align the
25527 object. */
25529 unsigned int
25530 ix86_local_alignment (tree exp, enum machine_mode mode,
25531 unsigned int align)
25533 tree type, decl;
25535 if (exp && DECL_P (exp))
25537 type = TREE_TYPE (exp);
25538 decl = exp;
25540 else
25542 type = exp;
25543 decl = NULL;
25546 /* Don't do dynamic stack realignment for long long objects with
25547 -mpreferred-stack-boundary=2. */
25548 if (!TARGET_64BIT
25549 && align == 64
25550 && ix86_preferred_stack_boundary < 64
25551 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25552 && (!type || !TYPE_USER_ALIGN (type))
25553 && (!decl || !DECL_USER_ALIGN (decl)))
25554 align = 32;
25556 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25557 register in MODE. We will return the largest alignment of XF
25558 and DF. */
25559 if (!type)
25561 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25562 align = GET_MODE_ALIGNMENT (DFmode);
25563 return align;
25566 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25567 to a 16-byte boundary. The exact wording is:
25569 An array uses the same alignment as its elements, except that a local or
25570 global array variable of length at least 16 bytes or
25571 a C99 variable-length array variable always has alignment of at least 16 bytes.
25573 This was added to allow the use of aligned SSE instructions on arrays. This
25574 rule is meant for static storage (where the compiler cannot do the analysis
25575 by itself). We follow it for automatic variables only when convenient.
25576 We fully control everything in the compiled function, and functions from
25577 other units cannot rely on the alignment.
25579 Exclude the va_list type. It is the common case of a local array where
25580 we cannot benefit from the alignment.
25582 TODO: Probably one should optimize for size only when var is not escaping. */
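/* For instance, under these rules a local "double buf[4]" (32 bytes) gets
   128-bit alignment when compiling for speed with SSE on x86-64, while a
   va_list object is excluded and keeps its natural alignment.  */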
25583 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25584 && TARGET_SSE)
25586 if (AGGREGATE_TYPE_P (type)
25587 && (va_list_type_node == NULL_TREE
25588 || (TYPE_MAIN_VARIANT (type)
25589 != TYPE_MAIN_VARIANT (va_list_type_node)))
25590 && TYPE_SIZE (type)
25591 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25592 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25593 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25594 return 128;
25596 if (TREE_CODE (type) == ARRAY_TYPE)
25598 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25599 return 64;
25600 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25601 return 128;
25603 else if (TREE_CODE (type) == COMPLEX_TYPE)
25605 if (TYPE_MODE (type) == DCmode && align < 64)
25606 return 64;
25607 if ((TYPE_MODE (type) == XCmode
25608 || TYPE_MODE (type) == TCmode) && align < 128)
25609 return 128;
25611 else if ((TREE_CODE (type) == RECORD_TYPE
25612 || TREE_CODE (type) == UNION_TYPE
25613 || TREE_CODE (type) == QUAL_UNION_TYPE)
25614 && TYPE_FIELDS (type))
25616 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25617 return 64;
25618 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25619 return 128;
25621 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25622 || TREE_CODE (type) == INTEGER_TYPE)
25625 if (TYPE_MODE (type) == DFmode && align < 64)
25626 return 64;
25627 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25628 return 128;
25630 return align;
25633 /* Compute the minimum required alignment for dynamic stack realignment
25634 purposes for a local variable, parameter or a stack slot. EXP is
25635 the data type or decl itself, MODE is its mode and ALIGN is the
25636 alignment that the object would ordinarily have. */
25638 unsigned int
25639 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25640 unsigned int align)
25642 tree type, decl;
25644 if (exp && DECL_P (exp))
25646 type = TREE_TYPE (exp);
25647 decl = exp;
25649 else
25651 type = exp;
25652 decl = NULL;
25655 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25656 return align;
25658 /* Don't do dynamic stack realignment for long long objects with
25659 -mpreferred-stack-boundary=2. */
25660 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25661 && (!type || !TYPE_USER_ALIGN (type))
25662 && (!decl || !DECL_USER_ALIGN (decl)))
25663 return 32;
25665 return align;
25668 /* Find a location for the static chain incoming to a nested function.
25669 This is a register, unless all free registers are used by arguments. */
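/* Summary of the cases handled below: R10 in 64-bit mode; ECX by default in
   32-bit mode; EAX for fastcall and thiscall functions (whose arguments
   occupy ECX/EDX); and for regparm(3) functions the chain is passed in ESI by
   direct callers and pushed to the stack by the trampoline, so the incoming
   location is the stack slot at arg_pointer - 8.  */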
25671 static rtx
25672 ix86_static_chain (const_tree fndecl, bool incoming_p)
25674 unsigned regno;
25676 if (!DECL_STATIC_CHAIN (fndecl))
25677 return NULL;
25679 if (TARGET_64BIT)
25681 /* We always use R10 in 64-bit mode. */
25682 regno = R10_REG;
25684 else
25686 tree fntype;
25687 unsigned int ccvt;
25689 /* By default in 32-bit mode we use ECX to pass the static chain. */
25690 regno = CX_REG;
25692 fntype = TREE_TYPE (fndecl);
25693 ccvt = ix86_get_callcvt (fntype);
25694 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25696 /* Fastcall functions use ecx/edx for arguments, which leaves
25697 us with EAX for the static chain.
25698 Thiscall functions use ecx for arguments, which also
25699 leaves us with EAX for the static chain. */
25700 regno = AX_REG;
25702 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25704 /* Thiscall functions use ecx for arguments, which leaves
25705 us with EAX and EDX for the static chain.
25706 We use EAX for ABI compatibility. */
25707 regno = AX_REG;
25709 else if (ix86_function_regparm (fntype, fndecl) == 3)
25711 /* For regparm 3, we have no free call-clobbered registers in
25712 which to store the static chain. In order to implement this,
25713 we have the trampoline push the static chain to the stack.
25714 However, we can't push a value below the return address when
25715 we call the nested function directly, so we have to use an
25716 alternate entry point. For this we use ESI, and have the
25717 alternate entry point push ESI, so that things appear the
25718 same once we're executing the nested function. */
25719 if (incoming_p)
25721 if (fndecl == current_function_decl)
25722 ix86_static_chain_on_stack = true;
25723 return gen_frame_mem (SImode,
25724 plus_constant (Pmode,
25725 arg_pointer_rtx, -8));
25727 regno = SI_REG;
25731 return gen_rtx_REG (Pmode, regno);
25734 /* Emit RTL insns to initialize the variable parts of a trampoline.
25735 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25736 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25737 to be passed to the target function. */
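/* For reference, the 64-bit trampoline emitted below is, byte for byte:
     49 bb <imm64>   movabs $fnaddr, %r11   (or 41 bb <imm32> movl, 6 bytes)
     49 ba <imm64>   movabs $chain,  %r10   (or 41 ba <imm32> movl, 6 bytes)
     49 ff e3        jmp    *%r11
     90              nop, padding the final write to a full 32-bit store
   The 32-bit variant is mov $chain, %eax/%ecx (b8/b9) or push $chain (68),
   followed by jmp rel32 (e9) to the target function.  */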
25739 static void
25740 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25742 rtx mem, fnaddr;
25743 int opcode;
25744 int offset = 0;
25746 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25748 if (TARGET_64BIT)
25750 int size;
25752 /* Load the function address to r11. Try to load address using
25753 the shorter movl instead of movabs. We may want to support
25754 movq for kernel mode, but kernel does not use trampolines at
25755 the moment. FNADDR is a 32bit address and may not be in
25756 DImode when ptr_mode == SImode. Always use movl in this
25757 case. */
25758 if (ptr_mode == SImode
25759 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25761 fnaddr = copy_addr_to_reg (fnaddr);
25763 mem = adjust_address (m_tramp, HImode, offset);
25764 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25766 mem = adjust_address (m_tramp, SImode, offset + 2);
25767 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25768 offset += 6;
25770 else
25772 mem = adjust_address (m_tramp, HImode, offset);
25773 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25775 mem = adjust_address (m_tramp, DImode, offset + 2);
25776 emit_move_insn (mem, fnaddr);
25777 offset += 10;
25780 /* Load static chain using movabs to r10. Use the shorter movl
25781 instead of movabs when ptr_mode == SImode. */
25782 if (ptr_mode == SImode)
25784 opcode = 0xba41;
25785 size = 6;
25787 else
25789 opcode = 0xba49;
25790 size = 10;
25793 mem = adjust_address (m_tramp, HImode, offset);
25794 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25796 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25797 emit_move_insn (mem, chain_value);
25798 offset += size;
25800 /* Jump to r11; the last (unused) byte is a nop, only there to
25801 pad the write out to a single 32-bit store. */
25802 mem = adjust_address (m_tramp, SImode, offset);
25803 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25804 offset += 4;
25806 else
25808 rtx disp, chain;
25810 /* Depending on the static chain location, either load a register
25811 with a constant, or push the constant to the stack. All of the
25812 instructions are the same size. */
25813 chain = ix86_static_chain (fndecl, true);
25814 if (REG_P (chain))
25816 switch (REGNO (chain))
25818 case AX_REG:
25819 opcode = 0xb8; break;
25820 case CX_REG:
25821 opcode = 0xb9; break;
25822 default:
25823 gcc_unreachable ();
25826 else
25827 opcode = 0x68;
25829 mem = adjust_address (m_tramp, QImode, offset);
25830 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25832 mem = adjust_address (m_tramp, SImode, offset + 1);
25833 emit_move_insn (mem, chain_value);
25834 offset += 5;
25836 mem = adjust_address (m_tramp, QImode, offset);
25837 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25839 mem = adjust_address (m_tramp, SImode, offset + 1);
25841 /* Compute offset from the end of the jmp to the target function.
25842 In the case in which the trampoline stores the static chain on
25843 the stack, we need to skip the first insn which pushes the
25844 (call-saved) register static chain; this push is 1 byte. */
25845 offset += 5;
25846 disp = expand_binop (SImode, sub_optab, fnaddr,
25847 plus_constant (Pmode, XEXP (m_tramp, 0),
25848 offset - (MEM_P (chain) ? 1 : 0)),
25849 NULL_RTX, 1, OPTAB_DIRECT);
25850 emit_move_insn (mem, disp);
25853 gcc_assert (offset <= TRAMPOLINE_SIZE);
25855 #ifdef HAVE_ENABLE_EXECUTE_STACK
25856 #ifdef CHECK_EXECUTE_STACK_ENABLED
25857 if (CHECK_EXECUTE_STACK_ENABLED)
25858 #endif
25859 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25860 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25861 #endif
25864 /* The following file contains several enumerations and data structures
25865 built from the definitions in i386-builtin-types.def. */
25867 #include "i386-builtin-types.inc"
25869 /* Table for the ix86 builtin non-function types. */
25870 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25872 /* Retrieve an element from the above table, building some of
25873 the types lazily. */
25875 static tree
25876 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25878 unsigned int index;
25879 tree type, itype;
25881 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25883 type = ix86_builtin_type_tab[(int) tcode];
25884 if (type != NULL)
25885 return type;
25887 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25888 if (tcode <= IX86_BT_LAST_VECT)
25890 enum machine_mode mode;
25892 index = tcode - IX86_BT_LAST_PRIM - 1;
25893 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25894 mode = ix86_builtin_type_vect_mode[index];
25896 type = build_vector_type_for_mode (itype, mode);
25898 else
25900 int quals;
25902 index = tcode - IX86_BT_LAST_VECT - 1;
25903 if (tcode <= IX86_BT_LAST_PTR)
25904 quals = TYPE_UNQUALIFIED;
25905 else
25906 quals = TYPE_QUAL_CONST;
25908 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25909 if (quals != TYPE_UNQUALIFIED)
25910 itype = build_qualified_type (itype, quals);
25912 type = build_pointer_type (itype);
25915 ix86_builtin_type_tab[(int) tcode] = type;
25916 return type;
25919 /* Table for the ix86 builtin function types. */
25920 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25922 /* Retrieve an element from the above table, building some of
25923 the types lazily. */
25925 static tree
25926 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25928 tree type;
25930 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25932 type = ix86_builtin_func_type_tab[(int) tcode];
25933 if (type != NULL)
25934 return type;
25936 if (tcode <= IX86_BT_LAST_FUNC)
25938 unsigned start = ix86_builtin_func_start[(int) tcode];
25939 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25940 tree rtype, atype, args = void_list_node;
25941 unsigned i;
25943 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25944 for (i = after - 1; i > start; --i)
25946 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25947 args = tree_cons (NULL, atype, args);
25950 type = build_function_type (rtype, args);
25952 else
25954 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25955 enum ix86_builtin_func_type icode;
25957 icode = ix86_builtin_func_alias_base[index];
25958 type = ix86_get_builtin_func_type (icode);
25961 ix86_builtin_func_type_tab[(int) tcode] = type;
25962 return type;
25966 /* Codes for all the SSE/MMX builtins. */
25967 enum ix86_builtins
25969 IX86_BUILTIN_ADDPS,
25970 IX86_BUILTIN_ADDSS,
25971 IX86_BUILTIN_DIVPS,
25972 IX86_BUILTIN_DIVSS,
25973 IX86_BUILTIN_MULPS,
25974 IX86_BUILTIN_MULSS,
25975 IX86_BUILTIN_SUBPS,
25976 IX86_BUILTIN_SUBSS,
25978 IX86_BUILTIN_CMPEQPS,
25979 IX86_BUILTIN_CMPLTPS,
25980 IX86_BUILTIN_CMPLEPS,
25981 IX86_BUILTIN_CMPGTPS,
25982 IX86_BUILTIN_CMPGEPS,
25983 IX86_BUILTIN_CMPNEQPS,
25984 IX86_BUILTIN_CMPNLTPS,
25985 IX86_BUILTIN_CMPNLEPS,
25986 IX86_BUILTIN_CMPNGTPS,
25987 IX86_BUILTIN_CMPNGEPS,
25988 IX86_BUILTIN_CMPORDPS,
25989 IX86_BUILTIN_CMPUNORDPS,
25990 IX86_BUILTIN_CMPEQSS,
25991 IX86_BUILTIN_CMPLTSS,
25992 IX86_BUILTIN_CMPLESS,
25993 IX86_BUILTIN_CMPNEQSS,
25994 IX86_BUILTIN_CMPNLTSS,
25995 IX86_BUILTIN_CMPNLESS,
25996 IX86_BUILTIN_CMPORDSS,
25997 IX86_BUILTIN_CMPUNORDSS,
25999 IX86_BUILTIN_COMIEQSS,
26000 IX86_BUILTIN_COMILTSS,
26001 IX86_BUILTIN_COMILESS,
26002 IX86_BUILTIN_COMIGTSS,
26003 IX86_BUILTIN_COMIGESS,
26004 IX86_BUILTIN_COMINEQSS,
26005 IX86_BUILTIN_UCOMIEQSS,
26006 IX86_BUILTIN_UCOMILTSS,
26007 IX86_BUILTIN_UCOMILESS,
26008 IX86_BUILTIN_UCOMIGTSS,
26009 IX86_BUILTIN_UCOMIGESS,
26010 IX86_BUILTIN_UCOMINEQSS,
26012 IX86_BUILTIN_CVTPI2PS,
26013 IX86_BUILTIN_CVTPS2PI,
26014 IX86_BUILTIN_CVTSI2SS,
26015 IX86_BUILTIN_CVTSI642SS,
26016 IX86_BUILTIN_CVTSS2SI,
26017 IX86_BUILTIN_CVTSS2SI64,
26018 IX86_BUILTIN_CVTTPS2PI,
26019 IX86_BUILTIN_CVTTSS2SI,
26020 IX86_BUILTIN_CVTTSS2SI64,
26022 IX86_BUILTIN_MAXPS,
26023 IX86_BUILTIN_MAXSS,
26024 IX86_BUILTIN_MINPS,
26025 IX86_BUILTIN_MINSS,
26027 IX86_BUILTIN_LOADUPS,
26028 IX86_BUILTIN_STOREUPS,
26029 IX86_BUILTIN_MOVSS,
26031 IX86_BUILTIN_MOVHLPS,
26032 IX86_BUILTIN_MOVLHPS,
26033 IX86_BUILTIN_LOADHPS,
26034 IX86_BUILTIN_LOADLPS,
26035 IX86_BUILTIN_STOREHPS,
26036 IX86_BUILTIN_STORELPS,
26038 IX86_BUILTIN_MASKMOVQ,
26039 IX86_BUILTIN_MOVMSKPS,
26040 IX86_BUILTIN_PMOVMSKB,
26042 IX86_BUILTIN_MOVNTPS,
26043 IX86_BUILTIN_MOVNTQ,
26045 IX86_BUILTIN_LOADDQU,
26046 IX86_BUILTIN_STOREDQU,
26048 IX86_BUILTIN_PACKSSWB,
26049 IX86_BUILTIN_PACKSSDW,
26050 IX86_BUILTIN_PACKUSWB,
26052 IX86_BUILTIN_PADDB,
26053 IX86_BUILTIN_PADDW,
26054 IX86_BUILTIN_PADDD,
26055 IX86_BUILTIN_PADDQ,
26056 IX86_BUILTIN_PADDSB,
26057 IX86_BUILTIN_PADDSW,
26058 IX86_BUILTIN_PADDUSB,
26059 IX86_BUILTIN_PADDUSW,
26060 IX86_BUILTIN_PSUBB,
26061 IX86_BUILTIN_PSUBW,
26062 IX86_BUILTIN_PSUBD,
26063 IX86_BUILTIN_PSUBQ,
26064 IX86_BUILTIN_PSUBSB,
26065 IX86_BUILTIN_PSUBSW,
26066 IX86_BUILTIN_PSUBUSB,
26067 IX86_BUILTIN_PSUBUSW,
26069 IX86_BUILTIN_PAND,
26070 IX86_BUILTIN_PANDN,
26071 IX86_BUILTIN_POR,
26072 IX86_BUILTIN_PXOR,
26074 IX86_BUILTIN_PAVGB,
26075 IX86_BUILTIN_PAVGW,
26077 IX86_BUILTIN_PCMPEQB,
26078 IX86_BUILTIN_PCMPEQW,
26079 IX86_BUILTIN_PCMPEQD,
26080 IX86_BUILTIN_PCMPGTB,
26081 IX86_BUILTIN_PCMPGTW,
26082 IX86_BUILTIN_PCMPGTD,
26084 IX86_BUILTIN_PMADDWD,
26086 IX86_BUILTIN_PMAXSW,
26087 IX86_BUILTIN_PMAXUB,
26088 IX86_BUILTIN_PMINSW,
26089 IX86_BUILTIN_PMINUB,
26091 IX86_BUILTIN_PMULHUW,
26092 IX86_BUILTIN_PMULHW,
26093 IX86_BUILTIN_PMULLW,
26095 IX86_BUILTIN_PSADBW,
26096 IX86_BUILTIN_PSHUFW,
26098 IX86_BUILTIN_PSLLW,
26099 IX86_BUILTIN_PSLLD,
26100 IX86_BUILTIN_PSLLQ,
26101 IX86_BUILTIN_PSRAW,
26102 IX86_BUILTIN_PSRAD,
26103 IX86_BUILTIN_PSRLW,
26104 IX86_BUILTIN_PSRLD,
26105 IX86_BUILTIN_PSRLQ,
26106 IX86_BUILTIN_PSLLWI,
26107 IX86_BUILTIN_PSLLDI,
26108 IX86_BUILTIN_PSLLQI,
26109 IX86_BUILTIN_PSRAWI,
26110 IX86_BUILTIN_PSRADI,
26111 IX86_BUILTIN_PSRLWI,
26112 IX86_BUILTIN_PSRLDI,
26113 IX86_BUILTIN_PSRLQI,
26115 IX86_BUILTIN_PUNPCKHBW,
26116 IX86_BUILTIN_PUNPCKHWD,
26117 IX86_BUILTIN_PUNPCKHDQ,
26118 IX86_BUILTIN_PUNPCKLBW,
26119 IX86_BUILTIN_PUNPCKLWD,
26120 IX86_BUILTIN_PUNPCKLDQ,
26122 IX86_BUILTIN_SHUFPS,
26124 IX86_BUILTIN_RCPPS,
26125 IX86_BUILTIN_RCPSS,
26126 IX86_BUILTIN_RSQRTPS,
26127 IX86_BUILTIN_RSQRTPS_NR,
26128 IX86_BUILTIN_RSQRTSS,
26129 IX86_BUILTIN_RSQRTF,
26130 IX86_BUILTIN_SQRTPS,
26131 IX86_BUILTIN_SQRTPS_NR,
26132 IX86_BUILTIN_SQRTSS,
26134 IX86_BUILTIN_UNPCKHPS,
26135 IX86_BUILTIN_UNPCKLPS,
26137 IX86_BUILTIN_ANDPS,
26138 IX86_BUILTIN_ANDNPS,
26139 IX86_BUILTIN_ORPS,
26140 IX86_BUILTIN_XORPS,
26142 IX86_BUILTIN_EMMS,
26143 IX86_BUILTIN_LDMXCSR,
26144 IX86_BUILTIN_STMXCSR,
26145 IX86_BUILTIN_SFENCE,
26147 IX86_BUILTIN_FXSAVE,
26148 IX86_BUILTIN_FXRSTOR,
26149 IX86_BUILTIN_FXSAVE64,
26150 IX86_BUILTIN_FXRSTOR64,
26152 IX86_BUILTIN_XSAVE,
26153 IX86_BUILTIN_XRSTOR,
26154 IX86_BUILTIN_XSAVE64,
26155 IX86_BUILTIN_XRSTOR64,
26157 IX86_BUILTIN_XSAVEOPT,
26158 IX86_BUILTIN_XSAVEOPT64,
26160 /* 3DNow! Original */
26161 IX86_BUILTIN_FEMMS,
26162 IX86_BUILTIN_PAVGUSB,
26163 IX86_BUILTIN_PF2ID,
26164 IX86_BUILTIN_PFACC,
26165 IX86_BUILTIN_PFADD,
26166 IX86_BUILTIN_PFCMPEQ,
26167 IX86_BUILTIN_PFCMPGE,
26168 IX86_BUILTIN_PFCMPGT,
26169 IX86_BUILTIN_PFMAX,
26170 IX86_BUILTIN_PFMIN,
26171 IX86_BUILTIN_PFMUL,
26172 IX86_BUILTIN_PFRCP,
26173 IX86_BUILTIN_PFRCPIT1,
26174 IX86_BUILTIN_PFRCPIT2,
26175 IX86_BUILTIN_PFRSQIT1,
26176 IX86_BUILTIN_PFRSQRT,
26177 IX86_BUILTIN_PFSUB,
26178 IX86_BUILTIN_PFSUBR,
26179 IX86_BUILTIN_PI2FD,
26180 IX86_BUILTIN_PMULHRW,
26182 /* 3DNow! Athlon Extensions */
26183 IX86_BUILTIN_PF2IW,
26184 IX86_BUILTIN_PFNACC,
26185 IX86_BUILTIN_PFPNACC,
26186 IX86_BUILTIN_PI2FW,
26187 IX86_BUILTIN_PSWAPDSI,
26188 IX86_BUILTIN_PSWAPDSF,
26190 /* SSE2 */
26191 IX86_BUILTIN_ADDPD,
26192 IX86_BUILTIN_ADDSD,
26193 IX86_BUILTIN_DIVPD,
26194 IX86_BUILTIN_DIVSD,
26195 IX86_BUILTIN_MULPD,
26196 IX86_BUILTIN_MULSD,
26197 IX86_BUILTIN_SUBPD,
26198 IX86_BUILTIN_SUBSD,
26200 IX86_BUILTIN_CMPEQPD,
26201 IX86_BUILTIN_CMPLTPD,
26202 IX86_BUILTIN_CMPLEPD,
26203 IX86_BUILTIN_CMPGTPD,
26204 IX86_BUILTIN_CMPGEPD,
26205 IX86_BUILTIN_CMPNEQPD,
26206 IX86_BUILTIN_CMPNLTPD,
26207 IX86_BUILTIN_CMPNLEPD,
26208 IX86_BUILTIN_CMPNGTPD,
26209 IX86_BUILTIN_CMPNGEPD,
26210 IX86_BUILTIN_CMPORDPD,
26211 IX86_BUILTIN_CMPUNORDPD,
26212 IX86_BUILTIN_CMPEQSD,
26213 IX86_BUILTIN_CMPLTSD,
26214 IX86_BUILTIN_CMPLESD,
26215 IX86_BUILTIN_CMPNEQSD,
26216 IX86_BUILTIN_CMPNLTSD,
26217 IX86_BUILTIN_CMPNLESD,
26218 IX86_BUILTIN_CMPORDSD,
26219 IX86_BUILTIN_CMPUNORDSD,
26221 IX86_BUILTIN_COMIEQSD,
26222 IX86_BUILTIN_COMILTSD,
26223 IX86_BUILTIN_COMILESD,
26224 IX86_BUILTIN_COMIGTSD,
26225 IX86_BUILTIN_COMIGESD,
26226 IX86_BUILTIN_COMINEQSD,
26227 IX86_BUILTIN_UCOMIEQSD,
26228 IX86_BUILTIN_UCOMILTSD,
26229 IX86_BUILTIN_UCOMILESD,
26230 IX86_BUILTIN_UCOMIGTSD,
26231 IX86_BUILTIN_UCOMIGESD,
26232 IX86_BUILTIN_UCOMINEQSD,
26234 IX86_BUILTIN_MAXPD,
26235 IX86_BUILTIN_MAXSD,
26236 IX86_BUILTIN_MINPD,
26237 IX86_BUILTIN_MINSD,
26239 IX86_BUILTIN_ANDPD,
26240 IX86_BUILTIN_ANDNPD,
26241 IX86_BUILTIN_ORPD,
26242 IX86_BUILTIN_XORPD,
26244 IX86_BUILTIN_SQRTPD,
26245 IX86_BUILTIN_SQRTSD,
26247 IX86_BUILTIN_UNPCKHPD,
26248 IX86_BUILTIN_UNPCKLPD,
26250 IX86_BUILTIN_SHUFPD,
26252 IX86_BUILTIN_LOADUPD,
26253 IX86_BUILTIN_STOREUPD,
26254 IX86_BUILTIN_MOVSD,
26256 IX86_BUILTIN_LOADHPD,
26257 IX86_BUILTIN_LOADLPD,
26259 IX86_BUILTIN_CVTDQ2PD,
26260 IX86_BUILTIN_CVTDQ2PS,
26262 IX86_BUILTIN_CVTPD2DQ,
26263 IX86_BUILTIN_CVTPD2PI,
26264 IX86_BUILTIN_CVTPD2PS,
26265 IX86_BUILTIN_CVTTPD2DQ,
26266 IX86_BUILTIN_CVTTPD2PI,
26268 IX86_BUILTIN_CVTPI2PD,
26269 IX86_BUILTIN_CVTSI2SD,
26270 IX86_BUILTIN_CVTSI642SD,
26272 IX86_BUILTIN_CVTSD2SI,
26273 IX86_BUILTIN_CVTSD2SI64,
26274 IX86_BUILTIN_CVTSD2SS,
26275 IX86_BUILTIN_CVTSS2SD,
26276 IX86_BUILTIN_CVTTSD2SI,
26277 IX86_BUILTIN_CVTTSD2SI64,
26279 IX86_BUILTIN_CVTPS2DQ,
26280 IX86_BUILTIN_CVTPS2PD,
26281 IX86_BUILTIN_CVTTPS2DQ,
26283 IX86_BUILTIN_MOVNTI,
26284 IX86_BUILTIN_MOVNTI64,
26285 IX86_BUILTIN_MOVNTPD,
26286 IX86_BUILTIN_MOVNTDQ,
26288 IX86_BUILTIN_MOVQ128,
26290 /* SSE2 MMX */
26291 IX86_BUILTIN_MASKMOVDQU,
26292 IX86_BUILTIN_MOVMSKPD,
26293 IX86_BUILTIN_PMOVMSKB128,
26295 IX86_BUILTIN_PACKSSWB128,
26296 IX86_BUILTIN_PACKSSDW128,
26297 IX86_BUILTIN_PACKUSWB128,
26299 IX86_BUILTIN_PADDB128,
26300 IX86_BUILTIN_PADDW128,
26301 IX86_BUILTIN_PADDD128,
26302 IX86_BUILTIN_PADDQ128,
26303 IX86_BUILTIN_PADDSB128,
26304 IX86_BUILTIN_PADDSW128,
26305 IX86_BUILTIN_PADDUSB128,
26306 IX86_BUILTIN_PADDUSW128,
26307 IX86_BUILTIN_PSUBB128,
26308 IX86_BUILTIN_PSUBW128,
26309 IX86_BUILTIN_PSUBD128,
26310 IX86_BUILTIN_PSUBQ128,
26311 IX86_BUILTIN_PSUBSB128,
26312 IX86_BUILTIN_PSUBSW128,
26313 IX86_BUILTIN_PSUBUSB128,
26314 IX86_BUILTIN_PSUBUSW128,
26316 IX86_BUILTIN_PAND128,
26317 IX86_BUILTIN_PANDN128,
26318 IX86_BUILTIN_POR128,
26319 IX86_BUILTIN_PXOR128,
26321 IX86_BUILTIN_PAVGB128,
26322 IX86_BUILTIN_PAVGW128,
26324 IX86_BUILTIN_PCMPEQB128,
26325 IX86_BUILTIN_PCMPEQW128,
26326 IX86_BUILTIN_PCMPEQD128,
26327 IX86_BUILTIN_PCMPGTB128,
26328 IX86_BUILTIN_PCMPGTW128,
26329 IX86_BUILTIN_PCMPGTD128,
26331 IX86_BUILTIN_PMADDWD128,
26333 IX86_BUILTIN_PMAXSW128,
26334 IX86_BUILTIN_PMAXUB128,
26335 IX86_BUILTIN_PMINSW128,
26336 IX86_BUILTIN_PMINUB128,
26338 IX86_BUILTIN_PMULUDQ,
26339 IX86_BUILTIN_PMULUDQ128,
26340 IX86_BUILTIN_PMULHUW128,
26341 IX86_BUILTIN_PMULHW128,
26342 IX86_BUILTIN_PMULLW128,
26344 IX86_BUILTIN_PSADBW128,
26345 IX86_BUILTIN_PSHUFHW,
26346 IX86_BUILTIN_PSHUFLW,
26347 IX86_BUILTIN_PSHUFD,
26349 IX86_BUILTIN_PSLLDQI128,
26350 IX86_BUILTIN_PSLLWI128,
26351 IX86_BUILTIN_PSLLDI128,
26352 IX86_BUILTIN_PSLLQI128,
26353 IX86_BUILTIN_PSRAWI128,
26354 IX86_BUILTIN_PSRADI128,
26355 IX86_BUILTIN_PSRLDQI128,
26356 IX86_BUILTIN_PSRLWI128,
26357 IX86_BUILTIN_PSRLDI128,
26358 IX86_BUILTIN_PSRLQI128,
26360 IX86_BUILTIN_PSLLDQ128,
26361 IX86_BUILTIN_PSLLW128,
26362 IX86_BUILTIN_PSLLD128,
26363 IX86_BUILTIN_PSLLQ128,
26364 IX86_BUILTIN_PSRAW128,
26365 IX86_BUILTIN_PSRAD128,
26366 IX86_BUILTIN_PSRLW128,
26367 IX86_BUILTIN_PSRLD128,
26368 IX86_BUILTIN_PSRLQ128,
26370 IX86_BUILTIN_PUNPCKHBW128,
26371 IX86_BUILTIN_PUNPCKHWD128,
26372 IX86_BUILTIN_PUNPCKHDQ128,
26373 IX86_BUILTIN_PUNPCKHQDQ128,
26374 IX86_BUILTIN_PUNPCKLBW128,
26375 IX86_BUILTIN_PUNPCKLWD128,
26376 IX86_BUILTIN_PUNPCKLDQ128,
26377 IX86_BUILTIN_PUNPCKLQDQ128,
26379 IX86_BUILTIN_CLFLUSH,
26380 IX86_BUILTIN_MFENCE,
26381 IX86_BUILTIN_LFENCE,
26382 IX86_BUILTIN_PAUSE,
26384 IX86_BUILTIN_BSRSI,
26385 IX86_BUILTIN_BSRDI,
26386 IX86_BUILTIN_RDPMC,
26387 IX86_BUILTIN_RDTSC,
26388 IX86_BUILTIN_RDTSCP,
26389 IX86_BUILTIN_ROLQI,
26390 IX86_BUILTIN_ROLHI,
26391 IX86_BUILTIN_RORQI,
26392 IX86_BUILTIN_RORHI,
26394 /* SSE3. */
26395 IX86_BUILTIN_ADDSUBPS,
26396 IX86_BUILTIN_HADDPS,
26397 IX86_BUILTIN_HSUBPS,
26398 IX86_BUILTIN_MOVSHDUP,
26399 IX86_BUILTIN_MOVSLDUP,
26400 IX86_BUILTIN_ADDSUBPD,
26401 IX86_BUILTIN_HADDPD,
26402 IX86_BUILTIN_HSUBPD,
26403 IX86_BUILTIN_LDDQU,
26405 IX86_BUILTIN_MONITOR,
26406 IX86_BUILTIN_MWAIT,
26408 /* SSSE3. */
26409 IX86_BUILTIN_PHADDW,
26410 IX86_BUILTIN_PHADDD,
26411 IX86_BUILTIN_PHADDSW,
26412 IX86_BUILTIN_PHSUBW,
26413 IX86_BUILTIN_PHSUBD,
26414 IX86_BUILTIN_PHSUBSW,
26415 IX86_BUILTIN_PMADDUBSW,
26416 IX86_BUILTIN_PMULHRSW,
26417 IX86_BUILTIN_PSHUFB,
26418 IX86_BUILTIN_PSIGNB,
26419 IX86_BUILTIN_PSIGNW,
26420 IX86_BUILTIN_PSIGND,
26421 IX86_BUILTIN_PALIGNR,
26422 IX86_BUILTIN_PABSB,
26423 IX86_BUILTIN_PABSW,
26424 IX86_BUILTIN_PABSD,
26426 IX86_BUILTIN_PHADDW128,
26427 IX86_BUILTIN_PHADDD128,
26428 IX86_BUILTIN_PHADDSW128,
26429 IX86_BUILTIN_PHSUBW128,
26430 IX86_BUILTIN_PHSUBD128,
26431 IX86_BUILTIN_PHSUBSW128,
26432 IX86_BUILTIN_PMADDUBSW128,
26433 IX86_BUILTIN_PMULHRSW128,
26434 IX86_BUILTIN_PSHUFB128,
26435 IX86_BUILTIN_PSIGNB128,
26436 IX86_BUILTIN_PSIGNW128,
26437 IX86_BUILTIN_PSIGND128,
26438 IX86_BUILTIN_PALIGNR128,
26439 IX86_BUILTIN_PABSB128,
26440 IX86_BUILTIN_PABSW128,
26441 IX86_BUILTIN_PABSD128,
26443 /* AMDFAM10 - SSE4A New Instructions. */
26444 IX86_BUILTIN_MOVNTSD,
26445 IX86_BUILTIN_MOVNTSS,
26446 IX86_BUILTIN_EXTRQI,
26447 IX86_BUILTIN_EXTRQ,
26448 IX86_BUILTIN_INSERTQI,
26449 IX86_BUILTIN_INSERTQ,
26451 /* SSE4.1. */
26452 IX86_BUILTIN_BLENDPD,
26453 IX86_BUILTIN_BLENDPS,
26454 IX86_BUILTIN_BLENDVPD,
26455 IX86_BUILTIN_BLENDVPS,
26456 IX86_BUILTIN_PBLENDVB128,
26457 IX86_BUILTIN_PBLENDW128,
26459 IX86_BUILTIN_DPPD,
26460 IX86_BUILTIN_DPPS,
26462 IX86_BUILTIN_INSERTPS128,
26464 IX86_BUILTIN_MOVNTDQA,
26465 IX86_BUILTIN_MPSADBW128,
26466 IX86_BUILTIN_PACKUSDW128,
26467 IX86_BUILTIN_PCMPEQQ,
26468 IX86_BUILTIN_PHMINPOSUW128,
26470 IX86_BUILTIN_PMAXSB128,
26471 IX86_BUILTIN_PMAXSD128,
26472 IX86_BUILTIN_PMAXUD128,
26473 IX86_BUILTIN_PMAXUW128,
26475 IX86_BUILTIN_PMINSB128,
26476 IX86_BUILTIN_PMINSD128,
26477 IX86_BUILTIN_PMINUD128,
26478 IX86_BUILTIN_PMINUW128,
26480 IX86_BUILTIN_PMOVSXBW128,
26481 IX86_BUILTIN_PMOVSXBD128,
26482 IX86_BUILTIN_PMOVSXBQ128,
26483 IX86_BUILTIN_PMOVSXWD128,
26484 IX86_BUILTIN_PMOVSXWQ128,
26485 IX86_BUILTIN_PMOVSXDQ128,
26487 IX86_BUILTIN_PMOVZXBW128,
26488 IX86_BUILTIN_PMOVZXBD128,
26489 IX86_BUILTIN_PMOVZXBQ128,
26490 IX86_BUILTIN_PMOVZXWD128,
26491 IX86_BUILTIN_PMOVZXWQ128,
26492 IX86_BUILTIN_PMOVZXDQ128,
26494 IX86_BUILTIN_PMULDQ128,
26495 IX86_BUILTIN_PMULLD128,
26497 IX86_BUILTIN_ROUNDSD,
26498 IX86_BUILTIN_ROUNDSS,
26500 IX86_BUILTIN_ROUNDPD,
26501 IX86_BUILTIN_ROUNDPS,
26503 IX86_BUILTIN_FLOORPD,
26504 IX86_BUILTIN_CEILPD,
26505 IX86_BUILTIN_TRUNCPD,
26506 IX86_BUILTIN_RINTPD,
26507 IX86_BUILTIN_ROUNDPD_AZ,
26509 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26510 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26511 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26513 IX86_BUILTIN_FLOORPS,
26514 IX86_BUILTIN_CEILPS,
26515 IX86_BUILTIN_TRUNCPS,
26516 IX86_BUILTIN_RINTPS,
26517 IX86_BUILTIN_ROUNDPS_AZ,
26519 IX86_BUILTIN_FLOORPS_SFIX,
26520 IX86_BUILTIN_CEILPS_SFIX,
26521 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26523 IX86_BUILTIN_PTESTZ,
26524 IX86_BUILTIN_PTESTC,
26525 IX86_BUILTIN_PTESTNZC,
26527 IX86_BUILTIN_VEC_INIT_V2SI,
26528 IX86_BUILTIN_VEC_INIT_V4HI,
26529 IX86_BUILTIN_VEC_INIT_V8QI,
26530 IX86_BUILTIN_VEC_EXT_V2DF,
26531 IX86_BUILTIN_VEC_EXT_V2DI,
26532 IX86_BUILTIN_VEC_EXT_V4SF,
26533 IX86_BUILTIN_VEC_EXT_V4SI,
26534 IX86_BUILTIN_VEC_EXT_V8HI,
26535 IX86_BUILTIN_VEC_EXT_V2SI,
26536 IX86_BUILTIN_VEC_EXT_V4HI,
26537 IX86_BUILTIN_VEC_EXT_V16QI,
26538 IX86_BUILTIN_VEC_SET_V2DI,
26539 IX86_BUILTIN_VEC_SET_V4SF,
26540 IX86_BUILTIN_VEC_SET_V4SI,
26541 IX86_BUILTIN_VEC_SET_V8HI,
26542 IX86_BUILTIN_VEC_SET_V4HI,
26543 IX86_BUILTIN_VEC_SET_V16QI,
26545 IX86_BUILTIN_VEC_PACK_SFIX,
26546 IX86_BUILTIN_VEC_PACK_SFIX256,
26548 /* SSE4.2. */
26549 IX86_BUILTIN_CRC32QI,
26550 IX86_BUILTIN_CRC32HI,
26551 IX86_BUILTIN_CRC32SI,
26552 IX86_BUILTIN_CRC32DI,
26554 IX86_BUILTIN_PCMPESTRI128,
26555 IX86_BUILTIN_PCMPESTRM128,
26556 IX86_BUILTIN_PCMPESTRA128,
26557 IX86_BUILTIN_PCMPESTRC128,
26558 IX86_BUILTIN_PCMPESTRO128,
26559 IX86_BUILTIN_PCMPESTRS128,
26560 IX86_BUILTIN_PCMPESTRZ128,
26561 IX86_BUILTIN_PCMPISTRI128,
26562 IX86_BUILTIN_PCMPISTRM128,
26563 IX86_BUILTIN_PCMPISTRA128,
26564 IX86_BUILTIN_PCMPISTRC128,
26565 IX86_BUILTIN_PCMPISTRO128,
26566 IX86_BUILTIN_PCMPISTRS128,
26567 IX86_BUILTIN_PCMPISTRZ128,
26569 IX86_BUILTIN_PCMPGTQ,
26571 /* AES instructions */
26572 IX86_BUILTIN_AESENC128,
26573 IX86_BUILTIN_AESENCLAST128,
26574 IX86_BUILTIN_AESDEC128,
26575 IX86_BUILTIN_AESDECLAST128,
26576 IX86_BUILTIN_AESIMC128,
26577 IX86_BUILTIN_AESKEYGENASSIST128,
26579 /* PCLMUL instruction */
26580 IX86_BUILTIN_PCLMULQDQ128,
26582 /* AVX */
26583 IX86_BUILTIN_ADDPD256,
26584 IX86_BUILTIN_ADDPS256,
26585 IX86_BUILTIN_ADDSUBPD256,
26586 IX86_BUILTIN_ADDSUBPS256,
26587 IX86_BUILTIN_ANDPD256,
26588 IX86_BUILTIN_ANDPS256,
26589 IX86_BUILTIN_ANDNPD256,
26590 IX86_BUILTIN_ANDNPS256,
26591 IX86_BUILTIN_BLENDPD256,
26592 IX86_BUILTIN_BLENDPS256,
26593 IX86_BUILTIN_BLENDVPD256,
26594 IX86_BUILTIN_BLENDVPS256,
26595 IX86_BUILTIN_DIVPD256,
26596 IX86_BUILTIN_DIVPS256,
26597 IX86_BUILTIN_DPPS256,
26598 IX86_BUILTIN_HADDPD256,
26599 IX86_BUILTIN_HADDPS256,
26600 IX86_BUILTIN_HSUBPD256,
26601 IX86_BUILTIN_HSUBPS256,
26602 IX86_BUILTIN_MAXPD256,
26603 IX86_BUILTIN_MAXPS256,
26604 IX86_BUILTIN_MINPD256,
26605 IX86_BUILTIN_MINPS256,
26606 IX86_BUILTIN_MULPD256,
26607 IX86_BUILTIN_MULPS256,
26608 IX86_BUILTIN_ORPD256,
26609 IX86_BUILTIN_ORPS256,
26610 IX86_BUILTIN_SHUFPD256,
26611 IX86_BUILTIN_SHUFPS256,
26612 IX86_BUILTIN_SUBPD256,
26613 IX86_BUILTIN_SUBPS256,
26614 IX86_BUILTIN_XORPD256,
26615 IX86_BUILTIN_XORPS256,
26616 IX86_BUILTIN_CMPSD,
26617 IX86_BUILTIN_CMPSS,
26618 IX86_BUILTIN_CMPPD,
26619 IX86_BUILTIN_CMPPS,
26620 IX86_BUILTIN_CMPPD256,
26621 IX86_BUILTIN_CMPPS256,
26622 IX86_BUILTIN_CVTDQ2PD256,
26623 IX86_BUILTIN_CVTDQ2PS256,
26624 IX86_BUILTIN_CVTPD2PS256,
26625 IX86_BUILTIN_CVTPS2DQ256,
26626 IX86_BUILTIN_CVTPS2PD256,
26627 IX86_BUILTIN_CVTTPD2DQ256,
26628 IX86_BUILTIN_CVTPD2DQ256,
26629 IX86_BUILTIN_CVTTPS2DQ256,
26630 IX86_BUILTIN_EXTRACTF128PD256,
26631 IX86_BUILTIN_EXTRACTF128PS256,
26632 IX86_BUILTIN_EXTRACTF128SI256,
26633 IX86_BUILTIN_VZEROALL,
26634 IX86_BUILTIN_VZEROUPPER,
26635 IX86_BUILTIN_VPERMILVARPD,
26636 IX86_BUILTIN_VPERMILVARPS,
26637 IX86_BUILTIN_VPERMILVARPD256,
26638 IX86_BUILTIN_VPERMILVARPS256,
26639 IX86_BUILTIN_VPERMILPD,
26640 IX86_BUILTIN_VPERMILPS,
26641 IX86_BUILTIN_VPERMILPD256,
26642 IX86_BUILTIN_VPERMILPS256,
26643 IX86_BUILTIN_VPERMIL2PD,
26644 IX86_BUILTIN_VPERMIL2PS,
26645 IX86_BUILTIN_VPERMIL2PD256,
26646 IX86_BUILTIN_VPERMIL2PS256,
26647 IX86_BUILTIN_VPERM2F128PD256,
26648 IX86_BUILTIN_VPERM2F128PS256,
26649 IX86_BUILTIN_VPERM2F128SI256,
26650 IX86_BUILTIN_VBROADCASTSS,
26651 IX86_BUILTIN_VBROADCASTSD256,
26652 IX86_BUILTIN_VBROADCASTSS256,
26653 IX86_BUILTIN_VBROADCASTPD256,
26654 IX86_BUILTIN_VBROADCASTPS256,
26655 IX86_BUILTIN_VINSERTF128PD256,
26656 IX86_BUILTIN_VINSERTF128PS256,
26657 IX86_BUILTIN_VINSERTF128SI256,
26658 IX86_BUILTIN_LOADUPD256,
26659 IX86_BUILTIN_LOADUPS256,
26660 IX86_BUILTIN_STOREUPD256,
26661 IX86_BUILTIN_STOREUPS256,
26662 IX86_BUILTIN_LDDQU256,
26663 IX86_BUILTIN_MOVNTDQ256,
26664 IX86_BUILTIN_MOVNTPD256,
26665 IX86_BUILTIN_MOVNTPS256,
26666 IX86_BUILTIN_LOADDQU256,
26667 IX86_BUILTIN_STOREDQU256,
26668 IX86_BUILTIN_MASKLOADPD,
26669 IX86_BUILTIN_MASKLOADPS,
26670 IX86_BUILTIN_MASKSTOREPD,
26671 IX86_BUILTIN_MASKSTOREPS,
26672 IX86_BUILTIN_MASKLOADPD256,
26673 IX86_BUILTIN_MASKLOADPS256,
26674 IX86_BUILTIN_MASKSTOREPD256,
26675 IX86_BUILTIN_MASKSTOREPS256,
26676 IX86_BUILTIN_MOVSHDUP256,
26677 IX86_BUILTIN_MOVSLDUP256,
26678 IX86_BUILTIN_MOVDDUP256,
26680 IX86_BUILTIN_SQRTPD256,
26681 IX86_BUILTIN_SQRTPS256,
26682 IX86_BUILTIN_SQRTPS_NR256,
26683 IX86_BUILTIN_RSQRTPS256,
26684 IX86_BUILTIN_RSQRTPS_NR256,
26686 IX86_BUILTIN_RCPPS256,
26688 IX86_BUILTIN_ROUNDPD256,
26689 IX86_BUILTIN_ROUNDPS256,
26691 IX86_BUILTIN_FLOORPD256,
26692 IX86_BUILTIN_CEILPD256,
26693 IX86_BUILTIN_TRUNCPD256,
26694 IX86_BUILTIN_RINTPD256,
26695 IX86_BUILTIN_ROUNDPD_AZ256,
26697 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26698 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26699 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26701 IX86_BUILTIN_FLOORPS256,
26702 IX86_BUILTIN_CEILPS256,
26703 IX86_BUILTIN_TRUNCPS256,
26704 IX86_BUILTIN_RINTPS256,
26705 IX86_BUILTIN_ROUNDPS_AZ256,
26707 IX86_BUILTIN_FLOORPS_SFIX256,
26708 IX86_BUILTIN_CEILPS_SFIX256,
26709 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26711 IX86_BUILTIN_UNPCKHPD256,
26712 IX86_BUILTIN_UNPCKLPD256,
26713 IX86_BUILTIN_UNPCKHPS256,
26714 IX86_BUILTIN_UNPCKLPS256,
26716 IX86_BUILTIN_SI256_SI,
26717 IX86_BUILTIN_PS256_PS,
26718 IX86_BUILTIN_PD256_PD,
26719 IX86_BUILTIN_SI_SI256,
26720 IX86_BUILTIN_PS_PS256,
26721 IX86_BUILTIN_PD_PD256,
26723 IX86_BUILTIN_VTESTZPD,
26724 IX86_BUILTIN_VTESTCPD,
26725 IX86_BUILTIN_VTESTNZCPD,
26726 IX86_BUILTIN_VTESTZPS,
26727 IX86_BUILTIN_VTESTCPS,
26728 IX86_BUILTIN_VTESTNZCPS,
26729 IX86_BUILTIN_VTESTZPD256,
26730 IX86_BUILTIN_VTESTCPD256,
26731 IX86_BUILTIN_VTESTNZCPD256,
26732 IX86_BUILTIN_VTESTZPS256,
26733 IX86_BUILTIN_VTESTCPS256,
26734 IX86_BUILTIN_VTESTNZCPS256,
26735 IX86_BUILTIN_PTESTZ256,
26736 IX86_BUILTIN_PTESTC256,
26737 IX86_BUILTIN_PTESTNZC256,
26739 IX86_BUILTIN_MOVMSKPD256,
26740 IX86_BUILTIN_MOVMSKPS256,
26742 /* AVX2 */
26743 IX86_BUILTIN_MPSADBW256,
26744 IX86_BUILTIN_PABSB256,
26745 IX86_BUILTIN_PABSW256,
26746 IX86_BUILTIN_PABSD256,
26747 IX86_BUILTIN_PACKSSDW256,
26748 IX86_BUILTIN_PACKSSWB256,
26749 IX86_BUILTIN_PACKUSDW256,
26750 IX86_BUILTIN_PACKUSWB256,
26751 IX86_BUILTIN_PADDB256,
26752 IX86_BUILTIN_PADDW256,
26753 IX86_BUILTIN_PADDD256,
26754 IX86_BUILTIN_PADDQ256,
26755 IX86_BUILTIN_PADDSB256,
26756 IX86_BUILTIN_PADDSW256,
26757 IX86_BUILTIN_PADDUSB256,
26758 IX86_BUILTIN_PADDUSW256,
26759 IX86_BUILTIN_PALIGNR256,
26760 IX86_BUILTIN_AND256I,
26761 IX86_BUILTIN_ANDNOT256I,
26762 IX86_BUILTIN_PAVGB256,
26763 IX86_BUILTIN_PAVGW256,
26764 IX86_BUILTIN_PBLENDVB256,
26765 IX86_BUILTIN_PBLENDVW256,
26766 IX86_BUILTIN_PCMPEQB256,
26767 IX86_BUILTIN_PCMPEQW256,
26768 IX86_BUILTIN_PCMPEQD256,
26769 IX86_BUILTIN_PCMPEQQ256,
26770 IX86_BUILTIN_PCMPGTB256,
26771 IX86_BUILTIN_PCMPGTW256,
26772 IX86_BUILTIN_PCMPGTD256,
26773 IX86_BUILTIN_PCMPGTQ256,
26774 IX86_BUILTIN_PHADDW256,
26775 IX86_BUILTIN_PHADDD256,
26776 IX86_BUILTIN_PHADDSW256,
26777 IX86_BUILTIN_PHSUBW256,
26778 IX86_BUILTIN_PHSUBD256,
26779 IX86_BUILTIN_PHSUBSW256,
26780 IX86_BUILTIN_PMADDUBSW256,
26781 IX86_BUILTIN_PMADDWD256,
26782 IX86_BUILTIN_PMAXSB256,
26783 IX86_BUILTIN_PMAXSW256,
26784 IX86_BUILTIN_PMAXSD256,
26785 IX86_BUILTIN_PMAXUB256,
26786 IX86_BUILTIN_PMAXUW256,
26787 IX86_BUILTIN_PMAXUD256,
26788 IX86_BUILTIN_PMINSB256,
26789 IX86_BUILTIN_PMINSW256,
26790 IX86_BUILTIN_PMINSD256,
26791 IX86_BUILTIN_PMINUB256,
26792 IX86_BUILTIN_PMINUW256,
26793 IX86_BUILTIN_PMINUD256,
26794 IX86_BUILTIN_PMOVMSKB256,
26795 IX86_BUILTIN_PMOVSXBW256,
26796 IX86_BUILTIN_PMOVSXBD256,
26797 IX86_BUILTIN_PMOVSXBQ256,
26798 IX86_BUILTIN_PMOVSXWD256,
26799 IX86_BUILTIN_PMOVSXWQ256,
26800 IX86_BUILTIN_PMOVSXDQ256,
26801 IX86_BUILTIN_PMOVZXBW256,
26802 IX86_BUILTIN_PMOVZXBD256,
26803 IX86_BUILTIN_PMOVZXBQ256,
26804 IX86_BUILTIN_PMOVZXWD256,
26805 IX86_BUILTIN_PMOVZXWQ256,
26806 IX86_BUILTIN_PMOVZXDQ256,
26807 IX86_BUILTIN_PMULDQ256,
26808 IX86_BUILTIN_PMULHRSW256,
26809 IX86_BUILTIN_PMULHUW256,
26810 IX86_BUILTIN_PMULHW256,
26811 IX86_BUILTIN_PMULLW256,
26812 IX86_BUILTIN_PMULLD256,
26813 IX86_BUILTIN_PMULUDQ256,
26814 IX86_BUILTIN_POR256,
26815 IX86_BUILTIN_PSADBW256,
26816 IX86_BUILTIN_PSHUFB256,
26817 IX86_BUILTIN_PSHUFD256,
26818 IX86_BUILTIN_PSHUFHW256,
26819 IX86_BUILTIN_PSHUFLW256,
26820 IX86_BUILTIN_PSIGNB256,
26821 IX86_BUILTIN_PSIGNW256,
26822 IX86_BUILTIN_PSIGND256,
26823 IX86_BUILTIN_PSLLDQI256,
26824 IX86_BUILTIN_PSLLWI256,
26825 IX86_BUILTIN_PSLLW256,
26826 IX86_BUILTIN_PSLLDI256,
26827 IX86_BUILTIN_PSLLD256,
26828 IX86_BUILTIN_PSLLQI256,
26829 IX86_BUILTIN_PSLLQ256,
26830 IX86_BUILTIN_PSRAWI256,
26831 IX86_BUILTIN_PSRAW256,
26832 IX86_BUILTIN_PSRADI256,
26833 IX86_BUILTIN_PSRAD256,
26834 IX86_BUILTIN_PSRLDQI256,
26835 IX86_BUILTIN_PSRLWI256,
26836 IX86_BUILTIN_PSRLW256,
26837 IX86_BUILTIN_PSRLDI256,
26838 IX86_BUILTIN_PSRLD256,
26839 IX86_BUILTIN_PSRLQI256,
26840 IX86_BUILTIN_PSRLQ256,
26841 IX86_BUILTIN_PSUBB256,
26842 IX86_BUILTIN_PSUBW256,
26843 IX86_BUILTIN_PSUBD256,
26844 IX86_BUILTIN_PSUBQ256,
26845 IX86_BUILTIN_PSUBSB256,
26846 IX86_BUILTIN_PSUBSW256,
26847 IX86_BUILTIN_PSUBUSB256,
26848 IX86_BUILTIN_PSUBUSW256,
26849 IX86_BUILTIN_PUNPCKHBW256,
26850 IX86_BUILTIN_PUNPCKHWD256,
26851 IX86_BUILTIN_PUNPCKHDQ256,
26852 IX86_BUILTIN_PUNPCKHQDQ256,
26853 IX86_BUILTIN_PUNPCKLBW256,
26854 IX86_BUILTIN_PUNPCKLWD256,
26855 IX86_BUILTIN_PUNPCKLDQ256,
26856 IX86_BUILTIN_PUNPCKLQDQ256,
26857 IX86_BUILTIN_PXOR256,
26858 IX86_BUILTIN_MOVNTDQA256,
26859 IX86_BUILTIN_VBROADCASTSS_PS,
26860 IX86_BUILTIN_VBROADCASTSS_PS256,
26861 IX86_BUILTIN_VBROADCASTSD_PD256,
26862 IX86_BUILTIN_VBROADCASTSI256,
26863 IX86_BUILTIN_PBLENDD256,
26864 IX86_BUILTIN_PBLENDD128,
26865 IX86_BUILTIN_PBROADCASTB256,
26866 IX86_BUILTIN_PBROADCASTW256,
26867 IX86_BUILTIN_PBROADCASTD256,
26868 IX86_BUILTIN_PBROADCASTQ256,
26869 IX86_BUILTIN_PBROADCASTB128,
26870 IX86_BUILTIN_PBROADCASTW128,
26871 IX86_BUILTIN_PBROADCASTD128,
26872 IX86_BUILTIN_PBROADCASTQ128,
26873 IX86_BUILTIN_VPERMVARSI256,
26874 IX86_BUILTIN_VPERMDF256,
26875 IX86_BUILTIN_VPERMVARSF256,
26876 IX86_BUILTIN_VPERMDI256,
26877 IX86_BUILTIN_VPERMTI256,
26878 IX86_BUILTIN_VEXTRACT128I256,
26879 IX86_BUILTIN_VINSERT128I256,
26880 IX86_BUILTIN_MASKLOADD,
26881 IX86_BUILTIN_MASKLOADQ,
26882 IX86_BUILTIN_MASKLOADD256,
26883 IX86_BUILTIN_MASKLOADQ256,
26884 IX86_BUILTIN_MASKSTORED,
26885 IX86_BUILTIN_MASKSTOREQ,
26886 IX86_BUILTIN_MASKSTORED256,
26887 IX86_BUILTIN_MASKSTOREQ256,
26888 IX86_BUILTIN_PSLLVV4DI,
26889 IX86_BUILTIN_PSLLVV2DI,
26890 IX86_BUILTIN_PSLLVV8SI,
26891 IX86_BUILTIN_PSLLVV4SI,
26892 IX86_BUILTIN_PSRAVV8SI,
26893 IX86_BUILTIN_PSRAVV4SI,
26894 IX86_BUILTIN_PSRLVV4DI,
26895 IX86_BUILTIN_PSRLVV2DI,
26896 IX86_BUILTIN_PSRLVV8SI,
26897 IX86_BUILTIN_PSRLVV4SI,
26899 IX86_BUILTIN_GATHERSIV2DF,
26900 IX86_BUILTIN_GATHERSIV4DF,
26901 IX86_BUILTIN_GATHERDIV2DF,
26902 IX86_BUILTIN_GATHERDIV4DF,
26903 IX86_BUILTIN_GATHERSIV4SF,
26904 IX86_BUILTIN_GATHERSIV8SF,
26905 IX86_BUILTIN_GATHERDIV4SF,
26906 IX86_BUILTIN_GATHERDIV8SF,
26907 IX86_BUILTIN_GATHERSIV2DI,
26908 IX86_BUILTIN_GATHERSIV4DI,
26909 IX86_BUILTIN_GATHERDIV2DI,
26910 IX86_BUILTIN_GATHERDIV4DI,
26911 IX86_BUILTIN_GATHERSIV4SI,
26912 IX86_BUILTIN_GATHERSIV8SI,
26913 IX86_BUILTIN_GATHERDIV4SI,
26914 IX86_BUILTIN_GATHERDIV8SI,
26916 /* Alternate 4 element gather for the vectorizer where
26917 all operands are 32-byte wide. */
26918 IX86_BUILTIN_GATHERALTSIV4DF,
26919 IX86_BUILTIN_GATHERALTDIV8SF,
26920 IX86_BUILTIN_GATHERALTSIV4DI,
26921 IX86_BUILTIN_GATHERALTDIV8SI,
26923 /* TFmode support builtins. */
26924 IX86_BUILTIN_INFQ,
26925 IX86_BUILTIN_HUGE_VALQ,
26926 IX86_BUILTIN_FABSQ,
26927 IX86_BUILTIN_COPYSIGNQ,
26929 /* Vectorizer support builtins. */
26930 IX86_BUILTIN_CPYSGNPS,
26931 IX86_BUILTIN_CPYSGNPD,
26932 IX86_BUILTIN_CPYSGNPS256,
26933 IX86_BUILTIN_CPYSGNPD256,
26935 /* FMA4 instructions. */
26936 IX86_BUILTIN_VFMADDSS,
26937 IX86_BUILTIN_VFMADDSD,
26938 IX86_BUILTIN_VFMADDPS,
26939 IX86_BUILTIN_VFMADDPD,
26940 IX86_BUILTIN_VFMADDPS256,
26941 IX86_BUILTIN_VFMADDPD256,
26942 IX86_BUILTIN_VFMADDSUBPS,
26943 IX86_BUILTIN_VFMADDSUBPD,
26944 IX86_BUILTIN_VFMADDSUBPS256,
26945 IX86_BUILTIN_VFMADDSUBPD256,
26947 /* FMA3 instructions. */
26948 IX86_BUILTIN_VFMADDSS3,
26949 IX86_BUILTIN_VFMADDSD3,
26951 /* XOP instructions. */
26952 IX86_BUILTIN_VPCMOV,
26953 IX86_BUILTIN_VPCMOV_V2DI,
26954 IX86_BUILTIN_VPCMOV_V4SI,
26955 IX86_BUILTIN_VPCMOV_V8HI,
26956 IX86_BUILTIN_VPCMOV_V16QI,
26957 IX86_BUILTIN_VPCMOV_V4SF,
26958 IX86_BUILTIN_VPCMOV_V2DF,
26959 IX86_BUILTIN_VPCMOV256,
26960 IX86_BUILTIN_VPCMOV_V4DI256,
26961 IX86_BUILTIN_VPCMOV_V8SI256,
26962 IX86_BUILTIN_VPCMOV_V16HI256,
26963 IX86_BUILTIN_VPCMOV_V32QI256,
26964 IX86_BUILTIN_VPCMOV_V8SF256,
26965 IX86_BUILTIN_VPCMOV_V4DF256,
26967 IX86_BUILTIN_VPPERM,
26969 IX86_BUILTIN_VPMACSSWW,
26970 IX86_BUILTIN_VPMACSWW,
26971 IX86_BUILTIN_VPMACSSWD,
26972 IX86_BUILTIN_VPMACSWD,
26973 IX86_BUILTIN_VPMACSSDD,
26974 IX86_BUILTIN_VPMACSDD,
26975 IX86_BUILTIN_VPMACSSDQL,
26976 IX86_BUILTIN_VPMACSSDQH,
26977 IX86_BUILTIN_VPMACSDQL,
26978 IX86_BUILTIN_VPMACSDQH,
26979 IX86_BUILTIN_VPMADCSSWD,
26980 IX86_BUILTIN_VPMADCSWD,
26982 IX86_BUILTIN_VPHADDBW,
26983 IX86_BUILTIN_VPHADDBD,
26984 IX86_BUILTIN_VPHADDBQ,
26985 IX86_BUILTIN_VPHADDWD,
26986 IX86_BUILTIN_VPHADDWQ,
26987 IX86_BUILTIN_VPHADDDQ,
26988 IX86_BUILTIN_VPHADDUBW,
26989 IX86_BUILTIN_VPHADDUBD,
26990 IX86_BUILTIN_VPHADDUBQ,
26991 IX86_BUILTIN_VPHADDUWD,
26992 IX86_BUILTIN_VPHADDUWQ,
26993 IX86_BUILTIN_VPHADDUDQ,
26994 IX86_BUILTIN_VPHSUBBW,
26995 IX86_BUILTIN_VPHSUBWD,
26996 IX86_BUILTIN_VPHSUBDQ,
26998 IX86_BUILTIN_VPROTB,
26999 IX86_BUILTIN_VPROTW,
27000 IX86_BUILTIN_VPROTD,
27001 IX86_BUILTIN_VPROTQ,
27002 IX86_BUILTIN_VPROTB_IMM,
27003 IX86_BUILTIN_VPROTW_IMM,
27004 IX86_BUILTIN_VPROTD_IMM,
27005 IX86_BUILTIN_VPROTQ_IMM,
27007 IX86_BUILTIN_VPSHLB,
27008 IX86_BUILTIN_VPSHLW,
27009 IX86_BUILTIN_VPSHLD,
27010 IX86_BUILTIN_VPSHLQ,
27011 IX86_BUILTIN_VPSHAB,
27012 IX86_BUILTIN_VPSHAW,
27013 IX86_BUILTIN_VPSHAD,
27014 IX86_BUILTIN_VPSHAQ,
27016 IX86_BUILTIN_VFRCZSS,
27017 IX86_BUILTIN_VFRCZSD,
27018 IX86_BUILTIN_VFRCZPS,
27019 IX86_BUILTIN_VFRCZPD,
27020 IX86_BUILTIN_VFRCZPS256,
27021 IX86_BUILTIN_VFRCZPD256,
27023 IX86_BUILTIN_VPCOMEQUB,
27024 IX86_BUILTIN_VPCOMNEUB,
27025 IX86_BUILTIN_VPCOMLTUB,
27026 IX86_BUILTIN_VPCOMLEUB,
27027 IX86_BUILTIN_VPCOMGTUB,
27028 IX86_BUILTIN_VPCOMGEUB,
27029 IX86_BUILTIN_VPCOMFALSEUB,
27030 IX86_BUILTIN_VPCOMTRUEUB,
27032 IX86_BUILTIN_VPCOMEQUW,
27033 IX86_BUILTIN_VPCOMNEUW,
27034 IX86_BUILTIN_VPCOMLTUW,
27035 IX86_BUILTIN_VPCOMLEUW,
27036 IX86_BUILTIN_VPCOMGTUW,
27037 IX86_BUILTIN_VPCOMGEUW,
27038 IX86_BUILTIN_VPCOMFALSEUW,
27039 IX86_BUILTIN_VPCOMTRUEUW,
27041 IX86_BUILTIN_VPCOMEQUD,
27042 IX86_BUILTIN_VPCOMNEUD,
27043 IX86_BUILTIN_VPCOMLTUD,
27044 IX86_BUILTIN_VPCOMLEUD,
27045 IX86_BUILTIN_VPCOMGTUD,
27046 IX86_BUILTIN_VPCOMGEUD,
27047 IX86_BUILTIN_VPCOMFALSEUD,
27048 IX86_BUILTIN_VPCOMTRUEUD,
27050 IX86_BUILTIN_VPCOMEQUQ,
27051 IX86_BUILTIN_VPCOMNEUQ,
27052 IX86_BUILTIN_VPCOMLTUQ,
27053 IX86_BUILTIN_VPCOMLEUQ,
27054 IX86_BUILTIN_VPCOMGTUQ,
27055 IX86_BUILTIN_VPCOMGEUQ,
27056 IX86_BUILTIN_VPCOMFALSEUQ,
27057 IX86_BUILTIN_VPCOMTRUEUQ,
27059 IX86_BUILTIN_VPCOMEQB,
27060 IX86_BUILTIN_VPCOMNEB,
27061 IX86_BUILTIN_VPCOMLTB,
27062 IX86_BUILTIN_VPCOMLEB,
27063 IX86_BUILTIN_VPCOMGTB,
27064 IX86_BUILTIN_VPCOMGEB,
27065 IX86_BUILTIN_VPCOMFALSEB,
27066 IX86_BUILTIN_VPCOMTRUEB,
27068 IX86_BUILTIN_VPCOMEQW,
27069 IX86_BUILTIN_VPCOMNEW,
27070 IX86_BUILTIN_VPCOMLTW,
27071 IX86_BUILTIN_VPCOMLEW,
27072 IX86_BUILTIN_VPCOMGTW,
27073 IX86_BUILTIN_VPCOMGEW,
27074 IX86_BUILTIN_VPCOMFALSEW,
27075 IX86_BUILTIN_VPCOMTRUEW,
27077 IX86_BUILTIN_VPCOMEQD,
27078 IX86_BUILTIN_VPCOMNED,
27079 IX86_BUILTIN_VPCOMLTD,
27080 IX86_BUILTIN_VPCOMLED,
27081 IX86_BUILTIN_VPCOMGTD,
27082 IX86_BUILTIN_VPCOMGED,
27083 IX86_BUILTIN_VPCOMFALSED,
27084 IX86_BUILTIN_VPCOMTRUED,
27086 IX86_BUILTIN_VPCOMEQQ,
27087 IX86_BUILTIN_VPCOMNEQ,
27088 IX86_BUILTIN_VPCOMLTQ,
27089 IX86_BUILTIN_VPCOMLEQ,
27090 IX86_BUILTIN_VPCOMGTQ,
27091 IX86_BUILTIN_VPCOMGEQ,
27092 IX86_BUILTIN_VPCOMFALSEQ,
27093 IX86_BUILTIN_VPCOMTRUEQ,
27095 /* LWP instructions. */
27096 IX86_BUILTIN_LLWPCB,
27097 IX86_BUILTIN_SLWPCB,
27098 IX86_BUILTIN_LWPVAL32,
27099 IX86_BUILTIN_LWPVAL64,
27100 IX86_BUILTIN_LWPINS32,
27101 IX86_BUILTIN_LWPINS64,
27103 IX86_BUILTIN_CLZS,
27105 /* RTM */
27106 IX86_BUILTIN_XBEGIN,
27107 IX86_BUILTIN_XEND,
27108 IX86_BUILTIN_XABORT,
27109 IX86_BUILTIN_XTEST,
27111 /* BMI instructions. */
27112 IX86_BUILTIN_BEXTR32,
27113 IX86_BUILTIN_BEXTR64,
27114 IX86_BUILTIN_CTZS,
27116 /* TBM instructions. */
27117 IX86_BUILTIN_BEXTRI32,
27118 IX86_BUILTIN_BEXTRI64,
27120 /* BMI2 instructions. */
27121 IX86_BUILTIN_BZHI32,
27122 IX86_BUILTIN_BZHI64,
27123 IX86_BUILTIN_PDEP32,
27124 IX86_BUILTIN_PDEP64,
27125 IX86_BUILTIN_PEXT32,
27126 IX86_BUILTIN_PEXT64,
27128 /* ADX instructions. */
27129 IX86_BUILTIN_ADDCARRYX32,
27130 IX86_BUILTIN_ADDCARRYX64,
27132 /* FSGSBASE instructions. */
27133 IX86_BUILTIN_RDFSBASE32,
27134 IX86_BUILTIN_RDFSBASE64,
27135 IX86_BUILTIN_RDGSBASE32,
27136 IX86_BUILTIN_RDGSBASE64,
27137 IX86_BUILTIN_WRFSBASE32,
27138 IX86_BUILTIN_WRFSBASE64,
27139 IX86_BUILTIN_WRGSBASE32,
27140 IX86_BUILTIN_WRGSBASE64,
27142 /* RDRND instructions. */
27143 IX86_BUILTIN_RDRAND16_STEP,
27144 IX86_BUILTIN_RDRAND32_STEP,
27145 IX86_BUILTIN_RDRAND64_STEP,
27147 /* RDSEED instructions. */
27148 IX86_BUILTIN_RDSEED16_STEP,
27149 IX86_BUILTIN_RDSEED32_STEP,
27150 IX86_BUILTIN_RDSEED64_STEP,
27152 /* F16C instructions. */
27153 IX86_BUILTIN_CVTPH2PS,
27154 IX86_BUILTIN_CVTPH2PS256,
27155 IX86_BUILTIN_CVTPS2PH,
27156 IX86_BUILTIN_CVTPS2PH256,
27158 /* CFString built-in for darwin */
27159 IX86_BUILTIN_CFSTRING,
27161 /* Builtins to get CPU type and supported features. */
27162 IX86_BUILTIN_CPU_INIT,
27163 IX86_BUILTIN_CPU_IS,
27164 IX86_BUILTIN_CPU_SUPPORTS,
  IX86_BUILTIN_MAX
};

/* Table for the ix86 builtin decls.  */
static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];

/* Table of all of the builtin functions that are possible with different ISAs
   but are waiting to be built until a function is declared to use that
   ISA.  */
struct builtin_isa {
  const char *name;                  /* function name */
  enum ix86_builtin_func_type tcode; /* type to use in the declaration */
  HOST_WIDE_INT isa;                 /* isa_flags this builtin is defined for */
  bool const_p;                      /* true if the declaration is constant */
  bool set_and_not_built_p;
};

static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
/* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
   of which isa_flags to use in the ix86_builtins_isa array.  Stores the
   function decl in the ix86_builtins array.  Returns the function decl or
   NULL_TREE, if the builtin was not added.

   If the front end has a special hook for builtin functions, delay adding
   builtin functions that aren't in the current ISA until the ISA is changed
   with function specific optimization.  Doing so can save about 300K for the
   default compiler.  When the builtin is expanded, check at that time whether
   it is valid.

   If the front end doesn't have a special hook, record all builtins, even if
   they aren't in the current ISA, in case the user uses function specific
   options for a different ISA, so that we don't get scope errors if a builtin
   is added in the middle of a function scope.  */
static inline tree
def_builtin (HOST_WIDE_INT mask, const char *name,
             enum ix86_builtin_func_type tcode,
             enum ix86_builtins code)
{
  tree decl = NULL_TREE;

  if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
    {
      ix86_builtins_isa[(int) code].isa = mask;

      mask &= ~OPTION_MASK_ISA_64BIT;
      if (mask == 0
          || (mask & ix86_isa_flags) != 0
          || (lang_hooks.builtin_function
              == lang_hooks.builtin_function_ext_scope))
        {
          tree type = ix86_get_builtin_func_type (tcode);
          decl = add_builtin_function (name, type, code, BUILT_IN_MD,
                                       NULL, NULL_TREE);
          ix86_builtins[(int) code] = decl;
          ix86_builtins_isa[(int) code].set_and_not_built_p = false;
        }
      else
        {
          ix86_builtins[(int) code] = NULL_TREE;
          ix86_builtins_isa[(int) code].tcode = tcode;
          ix86_builtins_isa[(int) code].name = name;
          ix86_builtins_isa[(int) code].const_p = false;
          ix86_builtins_isa[(int) code].set_and_not_built_p = true;
        }
    }

  return decl;
}
/* Like def_builtin, but also marks the function decl "const".  */

static inline tree
def_builtin_const (HOST_WIDE_INT mask, const char *name,
                   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
{
  tree decl = def_builtin (mask, name, tcode, code);
  if (decl)
    TREE_READONLY (decl) = 1;
  else
    ix86_builtins_isa[(int) code].const_p = true;

  return decl;
}
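/* Illustrative note (not part of the original file): a registration made
   through the helpers above would look roughly like

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                        V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_EXAMPLE);

   "__builtin_ia32_example" and IX86_BUILTIN_EXAMPLE are hypothetical
   placeholders; the real registrations are presumably driven by the bdesc_*
   tables below, which supply the actual names, function types and codes.  */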
/* Add any new builtin functions for a given ISA that may not have been
   declared.  This saves a bit of space compared to adding all of the
   declarations to the tree, even if we didn't use them.  */

static void
ix86_add_new_builtins (HOST_WIDE_INT isa)
{
  int i;

  for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
    {
      if ((ix86_builtins_isa[i].isa & isa) != 0
          && ix86_builtins_isa[i].set_and_not_built_p)
        {
          tree decl, type;

          /* Don't define the builtin again.  */
          ix86_builtins_isa[i].set_and_not_built_p = false;

          type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
          decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
                                                 type, i, BUILT_IN_MD, NULL,
                                                 NULL_TREE);

          ix86_builtins[i] = decl;
          if (ix86_builtins_isa[i].const_p)
            TREE_READONLY (decl) = 1;
        }
    }
}
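/* Explanatory aside (not part of the original file): ix86_add_new_builtins is
   presumably invoked from the target attribute / option handling elsewhere in
   this file once the effective isa_flags are extended, so that builtins whose
   declaration def_builtin deferred above are materialized on demand.  */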
/* Bits for builtin_description.flag.  */

/* Set when we don't support the comparison natively, and should
   swap_comparison in order to support it.  */
#define BUILTIN_DESC_SWAP_OPERANDS 1

struct builtin_description
{
  const HOST_WIDE_INT mask;
  const enum insn_code icode;
  const char *const name;
  const enum ix86_builtins code;
  const enum rtx_code comparison;
  const int flag;
};
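/* Reader's note (not part of the original file) on the bdesc_* tables that
   follow: each entry couples an ISA option mask and an insn_code with the
   "__builtin_ia32_*" name and IX86_BUILTIN_* code used to register it.  The
   meaning of the last field varies by table: the comi table uses 0 or
   BUILTIN_DESC_SWAP_OPERANDS, the pcmpestr/pcmpistr tables store a flags
   mode such as (int) CCZmode, and the argument tables store the builtin's
   function type such as (int) VOID_FTYPE_VOID.  */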
27301 static const struct builtin_description bdesc_comi[] =
{
27303 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27304 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27305 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27306 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27307 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27308 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27309 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27310 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27311 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27312 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27313 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27317 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27329 static const struct builtin_description bdesc_pcmpestr[] =
{
27331 /* SSE4.2 */
27332 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27333 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27334 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27335 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27336 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27337 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27338 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27341 static const struct builtin_description bdesc_pcmpistr[] =
{
27343 /* SSE4.2 */
27344 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27345 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27346 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27347 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27348 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27349 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27350 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27353 /* Special builtins with variable number of arguments. */
27354 static const struct builtin_description bdesc_special_args[] =
{
27356 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27357 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27358 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27360 /* MMX */
27361 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27363 /* 3DNow! */
27364 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27366 /* FXSR, XSAVE and XSAVEOPT */
27367 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27368 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27369 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27370 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27371 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27373 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27374 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27375 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27376 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27377 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27379 /* SSE */
27380 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27381 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27382 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27384 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27385 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27386 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27387 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27389 /* SSE or 3DNow!A */
27390 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27391 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27393 /* SSE2 */
27394 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27395 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27396 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27397 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27400 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27401 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27403 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27405 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27408 /* SSE3 */
27409 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27411 /* SSE4.1 */
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27414 /* SSE4A */
27415 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27416 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27418 /* AVX */
27419 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27420 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27422 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27423 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27424 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27425 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27426 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27428 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27429 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27430 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27431 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27432 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27433 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27434 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27436 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27437 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27440 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27443 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27449 /* AVX2 */
27450 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27451 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27452 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27453 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27454 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27455 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27456 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27457 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27458 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27460 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27461 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27462 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27463 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27464 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27465 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27467 /* FSGSBASE */
27468 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27469 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27470 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27471 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27472 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27473 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27474 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27475 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27477 /* RTM */
27478 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27479 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27480   { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
};
27483 /* Builtins with variable number of arguments. */
27484 static const struct builtin_description bdesc_args[] =
{
27486 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27487 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27488 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27489 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27490 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27491 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27492 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27494 /* MMX */
27495 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27496 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27497 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27498 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27499 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27500 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27502 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27503 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27504 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27505 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27506 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27507 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27508 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27509 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27511 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27512 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27514 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27515 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27516 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27517 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27519 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27520 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27521 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27522 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27523 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27524 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27526 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27527 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27528 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27529 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27530 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27531 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27533 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27534 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27535 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27537 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27539 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27540 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27541 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27542 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27543 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27544 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27546 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27547 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27548 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27549 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27550 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27551 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27553 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27554 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27555 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27556 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27558 /* 3DNow! */
27559 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27560 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27561 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27562 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27564 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27565 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27566 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27567 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27568 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27569 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27570 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27571 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27572 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27573 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27574 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27575 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27576 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27577 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27578 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27580 /* 3DNow!A */
27581 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27582 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27583 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27584 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27585 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27586 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27588 /* SSE */
27589 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27590 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27591 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27592 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27593 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27594 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27595 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27596 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27597 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27598 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27599 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27600 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27602 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27604 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27605 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27606 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27607 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27608 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27609 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27610 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27611 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27613 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27614 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27615 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27616 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27617 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27618 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27619 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27620 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27621 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27622 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27623 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27624 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27625 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27626 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27627 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27628 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27629 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27630 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27631 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27632 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27634 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27635 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27636 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27637 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27639 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27640 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27641 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27642 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27644 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27646 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27647 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27648 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27649 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27650 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27652 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27653 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27654 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27656 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27658 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27659 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27660 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27662 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27663 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27665 /* SSE MMX or 3Dnow!A */
27666 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27667 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27668 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27670 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27671 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27672 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27673 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27675 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27676 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27678 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27680 /* SSE2 */
27681 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27683 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27684 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27685 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27686 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27687 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27689 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27690 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27691 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27692 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27693 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27695 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27697 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27698 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27699 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27700 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27702 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27703 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27704 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27706 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27707 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27708 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27709 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27710 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27711 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27712 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27713 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
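  /* In the comparisons below, GT and GE are expanded as LT and LE with the
     operands swapped (the *_SWAP prototypes), and the negated forms use the
     unordered codes (cmpnlt -> UNGE, cmpnle -> UNGT).  */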
27715 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27716 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27717 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27718 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27719 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27720 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27721 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27722 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27723 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27724 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27725 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27726 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27727 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27728 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27729 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27730 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27731 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27732 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27733 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27734 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27737 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27739 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27741 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27743 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27744 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27746 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27748 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27749 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27750 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27752 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27754 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27755 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27756 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27757 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27758 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27759 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27760 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27761 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27763 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27764 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27765 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27766 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27767 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27768 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27770 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27772 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27773 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27775 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27777 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27778 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27781 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27790 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27791 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27792 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27793 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27795 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27796 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27797 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27798 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27799 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27800 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27801 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27802 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27804 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27805 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27808 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27811 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27812 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27817 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27819 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
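  /* In the shift entries below, the *_COUNT prototypes mark a trailing
     shift-count operand (an immediate or a vector count), while the
     *_CONVERT prototypes mark a mode conversion: the whole-register byte
     shifts are expanded through the V1TI shift patterns.  */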
27821 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27822 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27823 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27824 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27825 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27826 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27827 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27829 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27830 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27831 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27832 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27833 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27834 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27835 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27837 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27838 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27839 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27840 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27843 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27844 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27848 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27850 /* SSE2 MMX */
27851 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27852 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27854 /* SSE3 */
27855 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27856 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27858 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27859 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27860 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27861 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27862 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27863 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27865 /* SSSE3 */
27866 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27867 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27868 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27869 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27870 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27871 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27873 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27874 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27875 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27876 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27877 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27878 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27879 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27880 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27881 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27882 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27883 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27884 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27885 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27886 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27887 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27888 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27889 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27890 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27891 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27892 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27893 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27894 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27895 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27896 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27898 /* SSSE3. */
27899 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27900 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27902 /* SSE4.1 */
27903 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27904 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27905 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27906 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27907 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27908 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27909 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27910 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27911 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27912 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27914 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27915 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27916 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27917 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27918 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27919 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27920 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27921 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27922 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27923 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27924 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27925 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27926 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27928 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27929 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27930 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27931 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27932 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27933 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27934 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27935 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27936 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27937 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27938 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27939 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27941 /* SSE4.1 */
27942 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27943 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27944 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27945 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
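  /* For the floor/ceil/trunc/rint variants below (and the AVX ones further
     down) the comparison field does not hold an rtx code; it carries the
     ROUND_* rounding-mode immediate for the underlying round insn.  */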
27947 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27948 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27949 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27950 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27952 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27953 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27955 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27956 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27958 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27959 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27960 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27961 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27963 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27964 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27966 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27967 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
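  /* The ptest entries (and the AVX vtest and 256-bit ptest entries below)
     map the flags set by the insn onto comparison codes: EQ for the ZF test
     (ptestz), LTU for the CF test (ptestc), and GTU for the neither-flag-set
     test (ptestnzc).  */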
27969 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27970 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27971 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27973 /* SSE4.2 */
27974 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27975 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27976 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27977 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27978 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27980 /* SSE4A */
27981 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27982 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27983 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27984 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
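  /* The AES and PCLMUL entries use a name field of 0: the user-visible
     builtins are registered elsewhere in this file under their own AES and
     PCLMUL ISA guards, so these entries only provide expansion data.  */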
27986 /* AES */
27987 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27988 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27990 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27991 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27992 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27993 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27995 /* PCLMUL */
27996 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27998 /* AVX */
27999 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28000 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28001 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28003 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28004 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28007 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28009 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28011 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28012 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28013 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28014 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28015 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28016 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28017 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28018 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28019 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28020 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28021 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28022 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28023 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28024 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28026 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28027 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28028 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28029 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28031 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28032 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28033 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28034 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28035 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28036 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28037 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28038 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28039 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28040 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28041 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28042 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28043 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28044 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28045 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28046 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28047 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28048 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28051 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28052 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28070 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28071 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28072 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28073 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28074 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28076 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28078 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28079 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28081 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28082 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28083 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28084 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28086 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28087 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28089 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28090 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28092 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28093 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28094 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28095 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28097 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28098 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28100 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28101 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28103 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28104 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28105 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28106 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28108 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28109 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28110 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28111 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28112 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28113 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28115 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28116 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28117 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28118 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28119 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28120 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28121 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28122 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28123 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28124 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28125 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28128 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28129 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28132 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28134 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28135 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28137 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28139 /* AVX2 */
28140 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28141 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28142 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28143 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28144 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28145 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28146 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28147 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28148 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28149 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28150 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28151 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28152 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28153 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28154 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28155 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28156 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28157 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28158 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28159 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28160 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28161 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28162 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28163 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28164 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28165 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28166 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28167 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28168 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28169 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28170 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28171 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28172 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28173 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28174 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28175 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28176 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28177 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28178 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28179 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28180 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28181 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28182 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28183 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28184 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28185 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28186 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28187 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28188 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28189 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28190 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28191 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28194 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28196 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28201 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28202 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28203 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28204 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28205 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28206 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28207 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28208 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28209 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28210 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28211 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28212 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28213 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28214 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28215 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28216 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28217 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28218 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28219 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28220 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28221 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28222 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28223 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28224 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28225 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28226 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28227 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28228 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28229 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28230 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28231 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28232 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28233 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28234 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28235 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28236 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28237 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28238 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28239 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28240 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28241 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28242 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28243 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28244 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28245 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28246 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28247 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28248 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28249 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28250 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28251 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28253 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28254 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28255 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28256 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28257 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28258 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28259 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28260 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28261 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28262 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28263 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28264 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28265 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28266 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28267 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28268 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28269 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28270 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28271 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28272 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28273 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28274 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28275 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28276 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28277 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28278 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28287 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28289 /* BMI */
28290 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28291 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28292 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28294 /* TBM */
28295 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28296 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28298 /* F16C */
28299 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28300 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28301 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28302 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28304 /* BMI2 */
28305 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28306 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28307 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28308 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28309 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28310 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28311 };
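/* Illustrative sketch, not part of the original sources: each row in the
   table above becomes one const builtin (see def_builtin_const in
   ix86_init_mmx_sse_builtins below).  The BMI2 entry for
   "__builtin_ia32_pdep_si", for instance, is what user code reaches,
   usually through the <x86intrin.h> wrappers:

       unsigned int
       deposit_bits (unsigned int src, unsigned int mask)
       {
         return __builtin_ia32_pdep_si (src, mask);
       }

   compiled with -mbmi2 or an equivalent target attribute.  */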
28313 /* FMA4 and XOP. */
28314 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28315 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28316 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28317 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28318 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28319 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28320 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28321 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28322 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28323 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28324 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28325 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28326 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28327 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28328 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28329 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28330 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28331 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28332 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28333 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28334 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28335 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28336 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28337 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28338 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28339 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28340 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28341 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28342 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28343 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28344 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28345 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28346 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28347 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28348 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28349 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28350 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28351 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28352 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28353 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28354 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28355 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28356 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28357 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28358 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28359 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28360 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28361 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28362 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28363 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28364 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28365 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
28367 static const struct builtin_description bdesc_multi_arg[] =
28368 {
28369 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28370 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28371 UNKNOWN, (int)MULTI_ARG_3_SF },
28372 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28373 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28374 UNKNOWN, (int)MULTI_ARG_3_DF },
28376 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28377 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28378 UNKNOWN, (int)MULTI_ARG_3_SF },
28379 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28380 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28381 UNKNOWN, (int)MULTI_ARG_3_DF },
28383 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28384 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28385 UNKNOWN, (int)MULTI_ARG_3_SF },
28386 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28387 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28388 UNKNOWN, (int)MULTI_ARG_3_DF },
28389 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28390 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28391 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28392 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28393 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28394 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28396 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28397 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28398 UNKNOWN, (int)MULTI_ARG_3_SF },
28399 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28400 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28401 UNKNOWN, (int)MULTI_ARG_3_DF },
28402 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28403 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28404 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28405 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28406 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28407 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28409 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28410 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28411 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28412 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28413 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28414 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28415 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28417 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28418 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28419 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28420 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28421 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28422 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28423 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28425 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28427 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28428 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28429 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28430 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28431 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28432 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28433 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28434 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28435 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28436 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28437 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28438 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28440 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28441 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28442 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28443 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28444 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28445 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28446 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28447 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28448 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28449 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28450 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28451 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28452 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28453 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28454 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28455 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28457 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28458 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28459 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28460 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28461 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28462 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28464 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28465 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28466 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28467 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28468 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28469 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28470 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28471 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28472 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28473 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28474 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28475 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28476 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28477 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28478 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28480 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28481 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28482 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28483 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28484 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28485 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28486 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28488 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28489 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28490 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28491 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28492 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28493 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28494 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28496 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28499 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28500 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28504 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28507 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28508 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28509 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28510 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28512 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28513 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28514 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28515 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28516 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28517 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28518 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28520 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28521 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28522 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28523 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28524 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28525 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28526 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28528 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28529 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28530 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28531 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28532 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28533 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28539 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28540 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28541 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28551 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28556 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28559 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28564 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28566 };
28569 /* TM vector builtins. */
28571 /* Reuse the existing x86-specific `struct builtin_description' because
28572 we're lazy.  Add casts to make them fit.  */
28573 static const struct builtin_description bdesc_tm[] =
28574 {
28575 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28576 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28577 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28578 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28579 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28580 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28581 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28583 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28584 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28585 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28586 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28587 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28588 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28589 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28591 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28592 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28593 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28594 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28595 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28596 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28597 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28599 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28600 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28601 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28602 };
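/* Illustrative sketch, not part of the original sources: with -fgnu-tm the
   vector loads and stores performed inside a transaction are rewritten into
   calls to the builtins listed above.  A 16-byte store such as

       __m128 v = ...;
       __transaction_atomic { *dst = v; }

   would end up as the BUILT_IN_TM_STORE_M128 entry, i.e. the _ITM_WM128
   routine that libitm provides; ix86_builtin_tm_store below is what selects
   that decl from the vector type's size.  */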
28604 /* TM callbacks. */
28606 /* Return the builtin decl needed to load a vector of TYPE. */
28608 static tree
28609 ix86_builtin_tm_load (tree type)
28610 {
28611 if (TREE_CODE (type) == VECTOR_TYPE)
28612 {
28613 switch (tree_low_cst (TYPE_SIZE (type), 1))
28614 {
28615 case 64:
28616 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28617 case 128:
28618 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28619 case 256:
28620 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28621 }
28622 }
28623 return NULL_TREE;
28624 }
28626 /* Return the builtin decl needed to store a vector of TYPE. */
28628 static tree
28629 ix86_builtin_tm_store (tree type)
28630 {
28631 if (TREE_CODE (type) == VECTOR_TYPE)
28632 {
28633 switch (tree_low_cst (TYPE_SIZE (type), 1))
28634 {
28635 case 64:
28636 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28637 case 128:
28638 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28639 case 256:
28640 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28641 }
28642 }
28643 return NULL_TREE;
28644 }
28646 /* Initialize the transactional memory vector load/store builtins. */
28648 static void
28649 ix86_init_tm_builtins (void)
28650 {
28651 enum ix86_builtin_func_type ftype;
28652 const struct builtin_description *d;
28653 size_t i;
28654 tree decl;
28655 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28656 tree attrs_log, attrs_type_log;
28658 if (!flag_tm)
28659 return;
28661 /* If there are no builtins defined, we must be compiling in a
28662 language without trans-mem support. */
28663 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28664 return;
28666 /* Use whatever attributes a normal TM load has. */
28667 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28668 attrs_load = DECL_ATTRIBUTES (decl);
28669 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28670 /* Use whatever attributes a normal TM store has. */
28671 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28672 attrs_store = DECL_ATTRIBUTES (decl);
28673 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28674 /* Use whatever attributes a normal TM log has. */
28675 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28676 attrs_log = DECL_ATTRIBUTES (decl);
28677 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28679 for (i = 0, d = bdesc_tm;
28680 i < ARRAY_SIZE (bdesc_tm);
28681 i++, d++)
28682 {
28683 if ((d->mask & ix86_isa_flags) != 0
28684 || (lang_hooks.builtin_function
28685 == lang_hooks.builtin_function_ext_scope))
28686 {
28687 tree type, attrs, attrs_type;
28688 enum built_in_function code = (enum built_in_function) d->code;
28690 ftype = (enum ix86_builtin_func_type) d->flag;
28691 type = ix86_get_builtin_func_type (ftype);
28693 if (BUILTIN_TM_LOAD_P (code))
28694 {
28695 attrs = attrs_load;
28696 attrs_type = attrs_type_load;
28697 }
28698 else if (BUILTIN_TM_STORE_P (code))
28699 {
28700 attrs = attrs_store;
28701 attrs_type = attrs_type_store;
28702 }
28703 else
28704 {
28705 attrs = attrs_log;
28706 attrs_type = attrs_type_log;
28707 }
28708 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28709 /* The builtin without the prefix for
28710 calling it directly. */
28711 d->name + strlen ("__builtin_"),
28712 attrs);
28713 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28714 set the TYPE_ATTRIBUTES. */
28715 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28717 set_builtin_decl (code, decl, false);
28718 }
28719 }
28720 }
28722 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28723 not in the current target ISA, so that the user can compile particular
28724 modules with target-specific options that differ from the command-line
28725 options. */
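/* Illustrative sketch, not part of the original sources: defining every
   builtin up front is what allows a single translation unit to mix ISA
   levels through the target attribute, e.g. (with <immintrin.h> included
   for the __v32qi typedef)

       __attribute__((target ("avx2")))
       __v32qi abs256 (__v32qi x)
       {
         return __builtin_ia32_pabsb256 (x);
       }

   even when the file as a whole is not compiled with -mavx2.  */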
28726 static void
28727 ix86_init_mmx_sse_builtins (void)
28728 {
28729 const struct builtin_description * d;
28730 enum ix86_builtin_func_type ftype;
28731 size_t i;
28733 /* Add all special builtins with variable number of operands. */
28734 for (i = 0, d = bdesc_special_args;
28735 i < ARRAY_SIZE (bdesc_special_args);
28736 i++, d++)
28737 {
28738 if (d->name == 0)
28739 continue;
28741 ftype = (enum ix86_builtin_func_type) d->flag;
28742 def_builtin (d->mask, d->name, ftype, d->code);
28743 }
28745 /* Add all builtins with variable number of operands. */
28746 for (i = 0, d = bdesc_args;
28747 i < ARRAY_SIZE (bdesc_args);
28748 i++, d++)
28749 {
28750 if (d->name == 0)
28751 continue;
28753 ftype = (enum ix86_builtin_func_type) d->flag;
28754 def_builtin_const (d->mask, d->name, ftype, d->code);
28755 }
28757 /* pcmpestr[im] insns. */
28758 for (i = 0, d = bdesc_pcmpestr;
28759 i < ARRAY_SIZE (bdesc_pcmpestr);
28760 i++, d++)
28761 {
28762 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28763 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28764 else
28765 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28766 def_builtin_const (d->mask, d->name, ftype, d->code);
28767 }
28769 /* pcmpistr[im] insns. */
28770 for (i = 0, d = bdesc_pcmpistr;
28771 i < ARRAY_SIZE (bdesc_pcmpistr);
28772 i++, d++)
28773 {
28774 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28775 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28776 else
28777 ftype = INT_FTYPE_V16QI_V16QI_INT;
28778 def_builtin_const (d->mask, d->name, ftype, d->code);
28779 }
28781 /* comi/ucomi insns. */
28782 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28783 {
28784 if (d->mask == OPTION_MASK_ISA_SSE2)
28785 ftype = INT_FTYPE_V2DF_V2DF;
28786 else
28787 ftype = INT_FTYPE_V4SF_V4SF;
28788 def_builtin_const (d->mask, d->name, ftype, d->code);
28789 }
28791 /* SSE */
28792 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28793 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28794 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28795 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28797 /* SSE or 3DNow!A */
28798 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28799 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28800 IX86_BUILTIN_MASKMOVQ);
28802 /* SSE2 */
28803 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28804 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28806 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28807 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28808 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28809 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28811 /* SSE3. */
28812 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28813 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28814 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28815 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28817 /* AES */
28818 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28819 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28820 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28821 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28822 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28823 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28824 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28825 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28826 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28827 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28828 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28829 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28831 /* PCLMUL */
28832 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28833 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28835 /* RDRND */
28836 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28837 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28838 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28839 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28840 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28841 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28842 IX86_BUILTIN_RDRAND64_STEP);
28844 /* AVX2 */
28845 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28846 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28847 IX86_BUILTIN_GATHERSIV2DF);
28849 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28850 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28851 IX86_BUILTIN_GATHERSIV4DF);
28853 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28854 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28855 IX86_BUILTIN_GATHERDIV2DF);
28857 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28858 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28859 IX86_BUILTIN_GATHERDIV4DF);
28861 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28862 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28863 IX86_BUILTIN_GATHERSIV4SF);
28865 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28866 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28867 IX86_BUILTIN_GATHERSIV8SF);
28869 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28870 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28871 IX86_BUILTIN_GATHERDIV4SF);
28873 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28874 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28875 IX86_BUILTIN_GATHERDIV8SF);
28877 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28878 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28879 IX86_BUILTIN_GATHERSIV2DI);
28881 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28882 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28883 IX86_BUILTIN_GATHERSIV4DI);
28885 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28886 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28887 IX86_BUILTIN_GATHERDIV2DI);
28889 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28890 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28891 IX86_BUILTIN_GATHERDIV4DI);
28893 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28894 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28895 IX86_BUILTIN_GATHERSIV4SI);
28897 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28898 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28899 IX86_BUILTIN_GATHERSIV8SI);
28901 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28902 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28903 IX86_BUILTIN_GATHERDIV4SI);
28905 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28906 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28907 IX86_BUILTIN_GATHERDIV8SI);
 28909 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28910 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28911 IX86_BUILTIN_GATHERALTSIV4DF);
 28913 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28914 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28915 IX86_BUILTIN_GATHERALTDIV8SF);
 28917 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28918 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28919 IX86_BUILTIN_GATHERALTSIV4DI);
 28921 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28922 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28923 IX86_BUILTIN_GATHERALTDIV8SI);
28925 /* RTM. */
28926 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28927 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28929 /* MMX access to the vec_init patterns. */
28930 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28931 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28933 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28934 V4HI_FTYPE_HI_HI_HI_HI,
28935 IX86_BUILTIN_VEC_INIT_V4HI);
28937 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28938 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28939 IX86_BUILTIN_VEC_INIT_V8QI);
28941 /* Access to the vec_extract patterns. */
28942 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28943 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28944 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28945 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28946 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28947 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28948 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28949 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28950 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28951 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28953 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28954 "__builtin_ia32_vec_ext_v4hi",
28955 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28957 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28958 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28960 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28961 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28963 /* Access to the vec_set patterns. */
28964 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28965 "__builtin_ia32_vec_set_v2di",
28966 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28968 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28969 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28971 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28972 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28974 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28975 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28977 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28978 "__builtin_ia32_vec_set_v4hi",
28979 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28981 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28982 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28984 /* RDSEED */
28985 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28986 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28987 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28988 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28989 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28990 "__builtin_ia32_rdseed_di_step",
28991 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28993 /* ADCX */
28994 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28995 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28996 def_builtin (OPTION_MASK_ISA_64BIT,
28997 "__builtin_ia32_addcarryx_u64",
28998 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28999 IX86_BUILTIN_ADDCARRYX64);
29001 /* Add FMA4 multi-arg argument instructions */
29002 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29004 if (d->name == 0)
29005 continue;
29007 ftype = (enum ix86_builtin_func_type) d->flag;
29008 def_builtin_const (d->mask, d->name, ftype, d->code);
29012 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29013 to return a pointer to VERSION_DECL if the outcome of the expression
29014 formed by PREDICATE_CHAIN is true. This function will be called during
29015 version dispatch to decide which function version to execute. It returns
29016 the basic block at the end, to which more conditions can be added. */
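/* For illustration only (not part of the original sources): the GIMPLE
   appended to NEW_BB for a version guarded by two predicates is roughly
   of the shape

     cond_1 = predicate_1 (arg_1);
     cond_2 = predicate_2 (arg_2);
     and_tmp = MIN (cond_1, cond_2);     // acts as a logical AND
     if (and_tmp > 0)
       return (void *) &version_decl;
     // control falls through to the block returned to the caller,
     // where the condition for the next version can be added.

   The names cond_1, predicate_1 and and_tmp are made up for this sketch.  */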
29018 static basic_block
29019 add_condition_to_bb (tree function_decl, tree version_decl,
29020 tree predicate_chain, basic_block new_bb)
29022 gimple return_stmt;
29023 tree convert_expr, result_var;
29024 gimple convert_stmt;
29025 gimple call_cond_stmt;
29026 gimple if_else_stmt;
29028 basic_block bb1, bb2, bb3;
29029 edge e12, e23;
29031 tree cond_var, and_expr_var = NULL_TREE;
29032 gimple_seq gseq;
29034 tree predicate_decl, predicate_arg;
29036 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29038 gcc_assert (new_bb != NULL);
29039 gseq = bb_seq (new_bb);
29042 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29043 build_fold_addr_expr (version_decl));
29044 result_var = create_tmp_var (ptr_type_node, NULL);
29045 convert_stmt = gimple_build_assign (result_var, convert_expr);
29046 return_stmt = gimple_build_return (result_var);
29048 if (predicate_chain == NULL_TREE)
29050 gimple_seq_add_stmt (&gseq, convert_stmt);
29051 gimple_seq_add_stmt (&gseq, return_stmt);
29052 set_bb_seq (new_bb, gseq);
29053 gimple_set_bb (convert_stmt, new_bb);
29054 gimple_set_bb (return_stmt, new_bb);
29055 pop_cfun ();
29056 return new_bb;
29059 while (predicate_chain != NULL)
29061 cond_var = create_tmp_var (integer_type_node, NULL);
29062 predicate_decl = TREE_PURPOSE (predicate_chain);
29063 predicate_arg = TREE_VALUE (predicate_chain);
29064 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29065 gimple_call_set_lhs (call_cond_stmt, cond_var);
29067 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29068 gimple_set_bb (call_cond_stmt, new_bb);
29069 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29071 predicate_chain = TREE_CHAIN (predicate_chain);
29073 if (and_expr_var == NULL)
29074 and_expr_var = cond_var;
29075 else
29077 gimple assign_stmt;
 29078 /* Use MIN_EXPR to check whether any of the integers is zero:
 29079 and_expr_var = min_expr <cond_var, and_expr_var>.  */
29080 assign_stmt = gimple_build_assign (and_expr_var,
29081 build2 (MIN_EXPR, integer_type_node,
29082 cond_var, and_expr_var));
29084 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29085 gimple_set_bb (assign_stmt, new_bb);
29086 gimple_seq_add_stmt (&gseq, assign_stmt);
29090 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29091 integer_zero_node,
29092 NULL_TREE, NULL_TREE);
29093 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29094 gimple_set_bb (if_else_stmt, new_bb);
29095 gimple_seq_add_stmt (&gseq, if_else_stmt);
29097 gimple_seq_add_stmt (&gseq, convert_stmt);
29098 gimple_seq_add_stmt (&gseq, return_stmt);
29099 set_bb_seq (new_bb, gseq);
29101 bb1 = new_bb;
29102 e12 = split_block (bb1, if_else_stmt);
29103 bb2 = e12->dest;
29104 e12->flags &= ~EDGE_FALLTHRU;
29105 e12->flags |= EDGE_TRUE_VALUE;
29107 e23 = split_block (bb2, return_stmt);
29109 gimple_set_bb (convert_stmt, bb2);
29110 gimple_set_bb (return_stmt, bb2);
29112 bb3 = e23->dest;
29113 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29115 remove_edge (e23);
29116 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29118 pop_cfun ();
29120 return bb3;
29123 /* This parses the attribute arguments to target in DECL and determines
29124 the right builtin to use to match the platform specification.
29125 It returns the priority value for this version decl. If PREDICATE_LIST
29126 is not NULL, it stores the list of cpu features that need to be checked
29127 before dispatching this function. */
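/* Illustrative examples (hypothetical user attributes, not taken from this
   file) of the strings this function parses and the priorities it assigns:

     __attribute__ ((target ("default")))          -> priority 0
     __attribute__ ((target ("sse4.2")))           -> P_SSE4_2
     __attribute__ ((target ("arch=corei7")))      -> P_PROC_SSE4_2
     __attribute__ ((target ("arch=atom,ssse3")))  -> maximum of the two

   The exact values come from the feature_priority enum below; the examples
   only sketch how arch= and plain ISA names combine.  */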
29129 static unsigned int
29130 get_builtin_code_for_version (tree decl, tree *predicate_list)
29132 tree attrs;
29133 struct cl_target_option cur_target;
29134 tree target_node;
29135 struct cl_target_option *new_target;
29136 const char *arg_str = NULL;
29137 const char *attrs_str = NULL;
29138 char *tok_str = NULL;
29139 char *token;
29141 /* Priority of i386 features, greater value is higher priority. This is
29142 used to decide the order in which function dispatch must happen. For
29143 instance, a version specialized for SSE4.2 should be checked for dispatch
29144 before a version for SSE3, as SSE4.2 implies SSE3. */
29145 enum feature_priority
29147 P_ZERO = 0,
29148 P_MMX,
29149 P_SSE,
29150 P_SSE2,
29151 P_SSE3,
29152 P_SSSE3,
29153 P_PROC_SSSE3,
29154 P_SSE4_a,
29155 P_PROC_SSE4_a,
29156 P_SSE4_1,
29157 P_SSE4_2,
29158 P_PROC_SSE4_2,
29159 P_POPCNT,
29160 P_AVX,
29161 P_AVX2,
29162 P_FMA,
29163 P_PROC_FMA
29166 enum feature_priority priority = P_ZERO;
29168 /* These are the target attribute strings for which a dispatcher is
29169 available, from fold_builtin_cpu. */
29171 static struct _feature_list
29173 const char *const name;
29174 const enum feature_priority priority;
29176 const feature_list[] =
29178 {"mmx", P_MMX},
29179 {"sse", P_SSE},
29180 {"sse2", P_SSE2},
29181 {"sse3", P_SSE3},
29182 {"ssse3", P_SSSE3},
29183 {"sse4.1", P_SSE4_1},
29184 {"sse4.2", P_SSE4_2},
29185 {"popcnt", P_POPCNT},
29186 {"avx", P_AVX},
29187 {"avx2", P_AVX2}
29191 static unsigned int NUM_FEATURES
29192 = sizeof (feature_list) / sizeof (struct _feature_list);
29194 unsigned int i;
29196 tree predicate_chain = NULL_TREE;
29197 tree predicate_decl, predicate_arg;
29199 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29200 gcc_assert (attrs != NULL);
29202 attrs = TREE_VALUE (TREE_VALUE (attrs));
29204 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29205 attrs_str = TREE_STRING_POINTER (attrs);
29207 /* Return priority zero for default function. */
29208 if (strcmp (attrs_str, "default") == 0)
29209 return 0;
29211 /* Handle arch= if specified. For priority, set it to be 1 more than
29212 the best instruction set the processor can handle. For instance, if
29213 there is a version for atom and a version for ssse3 (the highest ISA
29214 priority for atom), the atom version must be checked for dispatch
29215 before the ssse3 version. */
29216 if (strstr (attrs_str, "arch=") != NULL)
29218 cl_target_option_save (&cur_target, &global_options);
29219 target_node = ix86_valid_target_attribute_tree (attrs);
29221 gcc_assert (target_node);
29222 new_target = TREE_TARGET_OPTION (target_node);
29223 gcc_assert (new_target);
29225 if (new_target->arch_specified && new_target->arch > 0)
29227 switch (new_target->arch)
29229 case PROCESSOR_CORE2:
29230 arg_str = "core2";
29231 priority = P_PROC_SSSE3;
29232 break;
29233 case PROCESSOR_COREI7:
29234 arg_str = "corei7";
29235 priority = P_PROC_SSE4_2;
29236 break;
29237 case PROCESSOR_ATOM:
29238 arg_str = "atom";
29239 priority = P_PROC_SSSE3;
29240 break;
29241 case PROCESSOR_AMDFAM10:
29242 arg_str = "amdfam10h";
29243 priority = P_PROC_SSE4_a;
29244 break;
29245 case PROCESSOR_BDVER1:
29246 arg_str = "bdver1";
29247 priority = P_PROC_FMA;
29248 break;
29249 case PROCESSOR_BDVER2:
29250 arg_str = "bdver2";
29251 priority = P_PROC_FMA;
29252 break;
29256 cl_target_option_restore (&global_options, &cur_target);
29258 if (predicate_list && arg_str == NULL)
29260 error_at (DECL_SOURCE_LOCATION (decl),
29261 "No dispatcher found for the versioning attributes");
29262 return 0;
29265 if (predicate_list)
29267 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29268 /* For a C string literal the length includes the trailing NULL. */
29269 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29270 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29271 predicate_chain);
29275 /* Process feature name. */
29276 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29277 strcpy (tok_str, attrs_str);
29278 token = strtok (tok_str, ",");
29279 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29281 while (token != NULL)
29283 /* Do not process "arch=" */
29284 if (strncmp (token, "arch=", 5) == 0)
29286 token = strtok (NULL, ",");
29287 continue;
29289 for (i = 0; i < NUM_FEATURES; ++i)
29291 if (strcmp (token, feature_list[i].name) == 0)
29293 if (predicate_list)
29295 predicate_arg = build_string_literal (
29296 strlen (feature_list[i].name) + 1,
29297 feature_list[i].name);
29298 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29299 predicate_chain);
29301 /* Find the maximum priority feature. */
29302 if (feature_list[i].priority > priority)
29303 priority = feature_list[i].priority;
29305 break;
29308 if (predicate_list && i == NUM_FEATURES)
29310 error_at (DECL_SOURCE_LOCATION (decl),
29311 "No dispatcher found for %s", token);
29312 return 0;
29314 token = strtok (NULL, ",");
29316 free (tok_str);
29318 if (predicate_list && predicate_chain == NULL_TREE)
29320 error_at (DECL_SOURCE_LOCATION (decl),
29321 "No dispatcher found for the versioning attributes : %s",
29322 attrs_str);
29323 return 0;
29325 else if (predicate_list)
29327 predicate_chain = nreverse (predicate_chain);
29328 *predicate_list = predicate_chain;
29331 return priority;
29334 /* This compares the priority of target features in function DECL1
29335 and DECL2. It returns positive value if DECL1 is higher priority,
29336 negative value if DECL2 is higher priority and 0 if they are the
29337 same. */
29339 static int
29340 ix86_compare_version_priority (tree decl1, tree decl2)
29342 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29343 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29345 return (int)priority1 - (int)priority2;
29348 /* V1 and V2 point to function versions with different priorities
29349 based on the target ISA. This function compares their priorities. */
29351 static int
29352 feature_compare (const void *v1, const void *v2)
29354 typedef struct _function_version_info
29356 tree version_decl;
29357 tree predicate_chain;
29358 unsigned int dispatch_priority;
29359 } function_version_info;
29361 const function_version_info c1 = *(const function_version_info *)v1;
29362 const function_version_info c2 = *(const function_version_info *)v2;
29363 return (c2.dispatch_priority - c1.dispatch_priority);
29366 /* This function generates the dispatch function for
29367 multi-versioned functions. DISPATCH_DECL is the function which will
29368 contain the dispatch logic. FNDECLS are the function choices for
29369 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29370 in DISPATCH_DECL in which the dispatch code is generated. */
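/* A sketch (illustration only) of the dispatch code this function ends up
   building into EMPTY_BB, written as C-like pseudo code:

     __builtin_cpu_init ();
     if (<predicates of the highest-priority version hold>)
       return (void *) &version_1;
     if (<predicates of the next version hold>)
       return (void *) &version_2;
     ...
     return (void *) &default_version;

   The actual conditions are the predicate chains produced by
   get_builtin_code_for_version and emitted by add_condition_to_bb.  */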
29372 static int
29373 dispatch_function_versions (tree dispatch_decl,
29374 void *fndecls_p,
29375 basic_block *empty_bb)
29377 tree default_decl;
29378 gimple ifunc_cpu_init_stmt;
29379 gimple_seq gseq;
29380 int ix;
29381 tree ele;
29382 vec<tree> *fndecls;
29383 unsigned int num_versions = 0;
29384 unsigned int actual_versions = 0;
29385 unsigned int i;
29387 struct _function_version_info
29389 tree version_decl;
29390 tree predicate_chain;
29391 unsigned int dispatch_priority;
29392 }*function_version_info;
29394 gcc_assert (dispatch_decl != NULL
29395 && fndecls_p != NULL
29396 && empty_bb != NULL);
 29398 /* fndecls_p is actually a vector.  */
29399 fndecls = static_cast<vec<tree> *> (fndecls_p);
29401 /* At least one more version other than the default. */
29402 num_versions = fndecls->length ();
29403 gcc_assert (num_versions >= 2);
29405 function_version_info = (struct _function_version_info *)
29406 XNEWVEC (struct _function_version_info, (num_versions - 1));
29408 /* The first version in the vector is the default decl. */
29409 default_decl = (*fndecls)[0];
29411 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29413 gseq = bb_seq (*empty_bb);
29414 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
 29415 constructors, so explicitly call __builtin_cpu_init here. */
29416 ifunc_cpu_init_stmt = gimple_build_call_vec (
29417 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29418 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29419 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29420 set_bb_seq (*empty_bb, gseq);
29422 pop_cfun ();
29425 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29427 tree version_decl = ele;
29428 tree predicate_chain = NULL_TREE;
29429 unsigned int priority;
29430 /* Get attribute string, parse it and find the right predicate decl.
29431 The predicate function could be a lengthy combination of many
29432 features, like arch-type and various isa-variants. */
29433 priority = get_builtin_code_for_version (version_decl,
29434 &predicate_chain);
29436 if (predicate_chain == NULL_TREE)
29437 continue;
29439 function_version_info [actual_versions].version_decl = version_decl;
29440 function_version_info [actual_versions].predicate_chain
29441 = predicate_chain;
29442 function_version_info [actual_versions].dispatch_priority = priority;
29443 actual_versions++;
29446 /* Sort the versions according to descending order of dispatch priority. The
29447 priority is based on the ISA. This is not a perfect solution. There
29448 could still be ambiguity. If more than one function version is suitable
29449 to execute, which one should be dispatched? In future, allow the user
29450 to specify a dispatch priority next to the version. */
29451 qsort (function_version_info, actual_versions,
29452 sizeof (struct _function_version_info), feature_compare);
29454 for (i = 0; i < actual_versions; ++i)
29455 *empty_bb = add_condition_to_bb (dispatch_decl,
29456 function_version_info[i].version_decl,
29457 function_version_info[i].predicate_chain,
29458 *empty_bb);
 29460 /* Dispatch the default version at the end. */
29461 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29462 NULL, *empty_bb);
29464 free (function_version_info);
29465 return 0;
29468 /* Comparator function to be used in qsort routine to sort attribute
29469 specification strings to "target". */
29471 static int
29472 attr_strcmp (const void *v1, const void *v2)
29474 const char *c1 = *(char *const*)v1;
29475 const char *c2 = *(char *const*)v2;
29476 return strcmp (c1, c2);
29479 /* ARGLIST is the argument to target attribute. This function tokenizes
29480 the comma separated arguments, sorts them and returns a string which
29481 is a unique identifier for the comma separated arguments. It also
29482 replaces non-identifier characters "=,-" with "_". */
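/* Example with illustrative values only: for target ("avx,arch=core2") the
   argument string "avx,arch=core2" first has '=' and '-' rewritten to '_'
   ("avx,arch_core2"), is then split on ',' into "avx" and "arch_core2",
   the tokens are sorted, and the returned identifier is "arch_core2_avx".  */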
29484 static char *
29485 sorted_attr_string (tree arglist)
29487 tree arg;
29488 size_t str_len_sum = 0;
29489 char **args = NULL;
29490 char *attr_str, *ret_str;
29491 char *attr = NULL;
29492 unsigned int argnum = 1;
29493 unsigned int i;
29495 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29497 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29498 size_t len = strlen (str);
29499 str_len_sum += len + 1;
29500 if (arg != arglist)
29501 argnum++;
29502 for (i = 0; i < strlen (str); i++)
29503 if (str[i] == ',')
29504 argnum++;
29507 attr_str = XNEWVEC (char, str_len_sum);
29508 str_len_sum = 0;
29509 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29511 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29512 size_t len = strlen (str);
29513 memcpy (attr_str + str_len_sum, str, len);
29514 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29515 str_len_sum += len + 1;
29518 /* Replace "=,-" with "_". */
29519 for (i = 0; i < strlen (attr_str); i++)
29520 if (attr_str[i] == '=' || attr_str[i]== '-')
29521 attr_str[i] = '_';
29523 if (argnum == 1)
29524 return attr_str;
29526 args = XNEWVEC (char *, argnum);
29528 i = 0;
29529 attr = strtok (attr_str, ",");
29530 while (attr != NULL)
29532 args[i] = attr;
29533 i++;
29534 attr = strtok (NULL, ",");
29537 qsort (args, argnum, sizeof (char *), attr_strcmp);
29539 ret_str = XNEWVEC (char, str_len_sum);
29540 str_len_sum = 0;
29541 for (i = 0; i < argnum; i++)
29543 size_t len = strlen (args[i]);
29544 memcpy (ret_str + str_len_sum, args[i], len);
29545 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29546 str_len_sum += len + 1;
29549 XDELETEVEC (args);
29550 XDELETEVEC (attr_str);
29551 return ret_str;
29554 /* This function changes the assembler name for functions that are
29555 versions. If DECL is a function version and has a "target"
29556 attribute, it appends the attribute string to its assembler name. */
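/* Illustration with a hypothetical decl: a version

     int foo (void) __attribute__ ((target ("arch=core2,avx")));

   whose assembler name is "foo" gets the sorted_attr_string suffix appended,
   giving an assembler name like "foo.arch_core2_avx"; the "default" version
   keeps its original name.  */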
29558 static tree
29559 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29561 tree version_attr;
29562 const char *orig_name, *version_string;
29563 char *attr_str, *assembler_name;
29565 if (DECL_DECLARED_INLINE_P (decl)
29566 && lookup_attribute ("gnu_inline",
29567 DECL_ATTRIBUTES (decl)))
29568 error_at (DECL_SOURCE_LOCATION (decl),
29569 "Function versions cannot be marked as gnu_inline,"
29570 " bodies have to be generated");
29572 if (DECL_VIRTUAL_P (decl)
29573 || DECL_VINDEX (decl))
29574 sorry ("Virtual function multiversioning not supported");
29576 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29578 /* target attribute string cannot be NULL. */
29579 gcc_assert (version_attr != NULL_TREE);
29581 orig_name = IDENTIFIER_POINTER (id);
29582 version_string
29583 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29585 if (strcmp (version_string, "default") == 0)
29586 return id;
29588 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29589 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29591 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29593 /* Allow assembler name to be modified if already set. */
29594 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29595 SET_DECL_RTL (decl, NULL);
29597 tree ret = get_identifier (assembler_name);
29598 XDELETEVEC (attr_str);
29599 XDELETEVEC (assembler_name);
29600 return ret;
29603 /* This function returns true if FN1 and FN2 are versions of the same function,
29604 that is, the target strings of the function decls are different. This assumes
29605 that FN1 and FN2 have the same signature. */
29607 static bool
29608 ix86_function_versions (tree fn1, tree fn2)
29610 tree attr1, attr2;
29611 char *target1, *target2;
29612 bool result;
29614 if (TREE_CODE (fn1) != FUNCTION_DECL
29615 || TREE_CODE (fn2) != FUNCTION_DECL)
29616 return false;
29618 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29619 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29621 /* At least one function decl should have the target attribute specified. */
29622 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29623 return false;
29625 /* Diagnose missing target attribute if one of the decls is already
29626 multi-versioned. */
29627 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29629 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29631 if (attr2 != NULL_TREE)
29633 tree tem = fn1;
29634 fn1 = fn2;
29635 fn2 = tem;
29636 attr1 = attr2;
29638 error_at (DECL_SOURCE_LOCATION (fn2),
29639 "missing %<target%> attribute for multi-versioned %D",
29640 fn2);
29641 inform (DECL_SOURCE_LOCATION (fn1),
29642 "previous declaration of %D", fn1);
29643 /* Prevent diagnosing of the same error multiple times. */
29644 DECL_ATTRIBUTES (fn2)
29645 = tree_cons (get_identifier ("target"),
29646 copy_node (TREE_VALUE (attr1)),
29647 DECL_ATTRIBUTES (fn2));
29649 return false;
29652 target1 = sorted_attr_string (TREE_VALUE (attr1));
29653 target2 = sorted_attr_string (TREE_VALUE (attr2));
29655 /* The sorted target strings must be different for fn1 and fn2
29656 to be versions. */
29657 if (strcmp (target1, target2) == 0)
29658 result = false;
29659 else
29660 result = true;
29662 XDELETEVEC (target1);
29663 XDELETEVEC (target2);
29665 return result;
29668 static tree
29669 ix86_mangle_decl_assembler_name (tree decl, tree id)
29671 /* For function version, add the target suffix to the assembler name. */
29672 if (TREE_CODE (decl) == FUNCTION_DECL
29673 && DECL_FUNCTION_VERSIONED (decl))
29674 id = ix86_mangle_function_version_assembler_name (decl, id);
29675 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29676 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29677 #endif
29679 return id;
29682 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29683 is true, append the full path name of the source file. */
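/* Illustration (made-up names): for a decl whose assembler name is "foo",
   make_name (decl, "resolver", false) returns "foo.resolver", while
   make_name (decl, "resolver", true) returns something like
   "foo.<unique-file-id>.resolver", where the middle component comes from
   get_file_function_name and differs per translation unit.  */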
29685 static char *
29686 make_name (tree decl, const char *suffix, bool make_unique)
29688 char *global_var_name;
29689 int name_len;
29690 const char *name;
29691 const char *unique_name = NULL;
29693 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29695 /* Get a unique name that can be used globally without any chances
29696 of collision at link time. */
29697 if (make_unique)
29698 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29700 name_len = strlen (name) + strlen (suffix) + 2;
29702 if (make_unique)
29703 name_len += strlen (unique_name) + 1;
29704 global_var_name = XNEWVEC (char, name_len);
29706 /* Use '.' to concatenate names as it is demangler friendly. */
29707 if (make_unique)
29708 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29709 suffix);
29710 else
29711 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29713 return global_var_name;
29716 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29718 /* Make a dispatcher declaration for the multi-versioned function DECL.
29719 Calls to DECL function will be replaced with calls to the dispatcher
29720 by the front-end. Return the decl created. */
29722 static tree
29723 make_dispatcher_decl (const tree decl)
29725 tree func_decl;
29726 char *func_name;
29727 tree fn_type, func_type;
29728 bool is_uniq = false;
29730 if (TREE_PUBLIC (decl) == 0)
29731 is_uniq = true;
29733 func_name = make_name (decl, "ifunc", is_uniq);
29735 fn_type = TREE_TYPE (decl);
29736 func_type = build_function_type (TREE_TYPE (fn_type),
29737 TYPE_ARG_TYPES (fn_type));
29739 func_decl = build_fn_decl (func_name, func_type);
29740 XDELETEVEC (func_name);
29741 TREE_USED (func_decl) = 1;
29742 DECL_CONTEXT (func_decl) = NULL_TREE;
29743 DECL_INITIAL (func_decl) = error_mark_node;
29744 DECL_ARTIFICIAL (func_decl) = 1;
29745 /* Mark this func as external, the resolver will flip it again if
29746 it gets generated. */
29747 DECL_EXTERNAL (func_decl) = 1;
 29748 /* IFUNCs have to be externally visible, so make this decl public. */
29749 TREE_PUBLIC (func_decl) = 1;
29751 return func_decl;
29754 #endif
29756 /* Returns true if decl is multi-versioned and DECL is the default function,
 29757 that is, it is not tagged with a target-specific optimization. */
29759 static bool
29760 is_function_default_version (const tree decl)
29762 if (TREE_CODE (decl) != FUNCTION_DECL
29763 || !DECL_FUNCTION_VERSIONED (decl))
29764 return false;
29765 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29766 gcc_assert (attr);
29767 attr = TREE_VALUE (TREE_VALUE (attr));
29768 return (TREE_CODE (attr) == STRING_CST
29769 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29772 /* Make a dispatcher declaration for the multi-versioned function DECL.
29773 Calls to DECL function will be replaced with calls to the dispatcher
29774 by the front-end. Returns the decl of the dispatcher function. */
29776 static tree
29777 ix86_get_function_versions_dispatcher (void *decl)
29779 tree fn = (tree) decl;
29780 struct cgraph_node *node = NULL;
29781 struct cgraph_node *default_node = NULL;
29782 struct cgraph_function_version_info *node_v = NULL;
29783 struct cgraph_function_version_info *first_v = NULL;
29785 tree dispatch_decl = NULL;
29787 struct cgraph_function_version_info *default_version_info = NULL;
29789 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29791 node = cgraph_get_node (fn);
29792 gcc_assert (node != NULL);
29794 node_v = get_cgraph_node_version (node);
29795 gcc_assert (node_v != NULL);
29797 if (node_v->dispatcher_resolver != NULL)
29798 return node_v->dispatcher_resolver;
29800 /* Find the default version and make it the first node. */
29801 first_v = node_v;
29802 /* Go to the beginning of the chain. */
29803 while (first_v->prev != NULL)
29804 first_v = first_v->prev;
29805 default_version_info = first_v;
29806 while (default_version_info != NULL)
29808 if (is_function_default_version
29809 (default_version_info->this_node->symbol.decl))
29810 break;
29811 default_version_info = default_version_info->next;
29814 /* If there is no default node, just return NULL. */
29815 if (default_version_info == NULL)
29816 return NULL;
29818 /* Make default info the first node. */
29819 if (first_v != default_version_info)
29821 default_version_info->prev->next = default_version_info->next;
29822 if (default_version_info->next)
29823 default_version_info->next->prev = default_version_info->prev;
29824 first_v->prev = default_version_info;
29825 default_version_info->next = first_v;
29826 default_version_info->prev = NULL;
29829 default_node = default_version_info->this_node;
29831 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29832 if (targetm.has_ifunc_p ())
29834 struct cgraph_function_version_info *it_v = NULL;
29835 struct cgraph_node *dispatcher_node = NULL;
29836 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29838 /* Right now, the dispatching is done via ifunc. */
29839 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29841 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29842 gcc_assert (dispatcher_node != NULL);
29843 dispatcher_node->dispatcher_function = 1;
29844 dispatcher_version_info
29845 = insert_new_cgraph_node_version (dispatcher_node);
29846 dispatcher_version_info->next = default_version_info;
29847 dispatcher_node->symbol.definition = 1;
29849 /* Set the dispatcher for all the versions. */
29850 it_v = default_version_info;
29851 while (it_v != NULL)
29853 it_v->dispatcher_resolver = dispatch_decl;
29854 it_v = it_v->next;
29857 else
29858 #endif
29860 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29861 "multiversioning needs ifunc which is not supported "
29862 "on this target");
29865 return dispatch_decl;
29868 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29869 it to CHAIN. */
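/* Illustration: make_attribute ("ifunc", "foo.resolver", chain) builds the
   tree form of __attribute__ ((ifunc ("foo.resolver"))) and links it in
   front of CHAIN; the name "foo.resolver" is only an example.  */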
29871 static tree
29872 make_attribute (const char *name, const char *arg_name, tree chain)
29874 tree attr_name;
29875 tree attr_arg_name;
29876 tree attr_args;
29877 tree attr;
29879 attr_name = get_identifier (name);
29880 attr_arg_name = build_string (strlen (arg_name), arg_name);
29881 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29882 attr = tree_cons (attr_name, attr_args, chain);
29883 return attr;
29886 /* Make the resolver function decl to dispatch the versions of
29887 a multi-versioned function, DEFAULT_DECL. Create an
29888 empty basic block in the resolver and store the pointer in
29889 EMPTY_BB. Return the decl of the resolver function. */
29891 static tree
29892 make_resolver_func (const tree default_decl,
29893 const tree dispatch_decl,
29894 basic_block *empty_bb)
29896 char *resolver_name;
29897 tree decl, type, decl_name, t;
29898 bool is_uniq = false;
29900 /* IFUNC's have to be globally visible. So, if the default_decl is
29901 not, then the name of the IFUNC should be made unique. */
29902 if (TREE_PUBLIC (default_decl) == 0)
29903 is_uniq = true;
29905 /* Append the filename to the resolver function if the versions are
29906 not externally visible. This is because the resolver function has
29907 to be externally visible for the loader to find it. So, appending
29908 the filename will prevent conflicts with a resolver function from
29909 another module which is based on the same version name. */
29910 resolver_name = make_name (default_decl, "resolver", is_uniq);
29912 /* The resolver function should return a (void *). */
29913 type = build_function_type_list (ptr_type_node, NULL_TREE);
29915 decl = build_fn_decl (resolver_name, type);
29916 decl_name = get_identifier (resolver_name);
29917 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29919 DECL_NAME (decl) = decl_name;
29920 TREE_USED (decl) = 1;
29921 DECL_ARTIFICIAL (decl) = 1;
29922 DECL_IGNORED_P (decl) = 0;
29923 /* IFUNC resolvers have to be externally visible. */
29924 TREE_PUBLIC (decl) = 1;
29925 DECL_UNINLINABLE (decl) = 0;
29927 /* Resolver is not external, body is generated. */
29928 DECL_EXTERNAL (decl) = 0;
29929 DECL_EXTERNAL (dispatch_decl) = 0;
29931 DECL_CONTEXT (decl) = NULL_TREE;
29932 DECL_INITIAL (decl) = make_node (BLOCK);
29933 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29935 if (DECL_COMDAT_GROUP (default_decl)
29936 || TREE_PUBLIC (default_decl))
29938 /* In this case, each translation unit with a call to this
29939 versioned function will put out a resolver. Ensure it
29940 is comdat to keep just one copy. */
29941 DECL_COMDAT (decl) = 1;
29942 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29944 /* Build result decl and add to function_decl. */
29945 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29946 DECL_ARTIFICIAL (t) = 1;
29947 DECL_IGNORED_P (t) = 1;
29948 DECL_RESULT (decl) = t;
29950 gimplify_function_tree (decl);
29951 push_cfun (DECL_STRUCT_FUNCTION (decl));
29952 *empty_bb = init_lowered_empty_function (decl, false);
29954 cgraph_add_new_function (decl, true);
29955 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29957 pop_cfun ();
29959 gcc_assert (dispatch_decl != NULL);
29960 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29961 DECL_ATTRIBUTES (dispatch_decl)
29962 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29964 /* Create the alias for dispatch to resolver here. */
29965 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29966 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29967 XDELETEVEC (resolver_name);
29968 return decl;
29971 /* Generate the dispatching code body to dispatch multi-versioned function
29972 DECL. The target hook is called to process the "target" attributes and
29973 provide the code to dispatch the right function at run-time. NODE points
29974 to the dispatcher decl whose body will be created. */
29976 static tree
29977 ix86_generate_version_dispatcher_body (void *node_p)
29979 tree resolver_decl;
29980 basic_block empty_bb;
29981 vec<tree> fn_ver_vec = vNULL;
29982 tree default_ver_decl;
29983 struct cgraph_node *versn;
29984 struct cgraph_node *node;
29986 struct cgraph_function_version_info *node_version_info = NULL;
29987 struct cgraph_function_version_info *versn_info = NULL;
29989 node = (cgraph_node *)node_p;
29991 node_version_info = get_cgraph_node_version (node);
29992 gcc_assert (node->dispatcher_function
29993 && node_version_info != NULL);
29995 if (node_version_info->dispatcher_resolver)
29996 return node_version_info->dispatcher_resolver;
29998 /* The first version in the chain corresponds to the default version. */
29999 default_ver_decl = node_version_info->next->this_node->symbol.decl;
30001 /* node is going to be an alias, so remove the finalized bit. */
30002 node->symbol.definition = false;
30004 resolver_decl = make_resolver_func (default_ver_decl,
30005 node->symbol.decl, &empty_bb);
30007 node_version_info->dispatcher_resolver = resolver_decl;
30009 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30011 fn_ver_vec.create (2);
30013 for (versn_info = node_version_info->next; versn_info;
30014 versn_info = versn_info->next)
30016 versn = versn_info->this_node;
30017 /* Check for virtual functions here again, as by this time it should
30018 have been determined if this function needs a vtable index or
30019 not. This happens for methods in derived classes that override
30020 virtual methods in base classes but are not explicitly marked as
30021 virtual. */
30022 if (DECL_VINDEX (versn->symbol.decl))
30023 sorry ("Virtual function multiversioning not supported");
30025 fn_ver_vec.safe_push (versn->symbol.decl);
30028 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30029 fn_ver_vec.release ();
30030 rebuild_cgraph_edges ();
30031 pop_cfun ();
30032 return resolver_decl;
30034 /* This builds the processor_model struct type defined in
30035 libgcc/config/i386/cpuinfo.c */
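/* For reference, the record built here is meant to mirror roughly the
   following layout (a paraphrase; see libgcc/config/i386/cpuinfo.c for the
   authoritative definition):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */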
30037 static tree
30038 build_processor_model_struct (void)
30040 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30041 "__cpu_features"};
30042 tree field = NULL_TREE, field_chain = NULL_TREE;
30043 int i;
30044 tree type = make_node (RECORD_TYPE);
30046 /* The first 3 fields are unsigned int. */
30047 for (i = 0; i < 3; ++i)
30049 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30050 get_identifier (field_name[i]), unsigned_type_node);
30051 if (field_chain != NULL_TREE)
30052 DECL_CHAIN (field) = field_chain;
30053 field_chain = field;
30056 /* The last field is an array of unsigned integers of size one. */
30057 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30058 get_identifier (field_name[3]),
30059 build_array_type (unsigned_type_node,
30060 build_index_type (size_one_node)));
30061 if (field_chain != NULL_TREE)
30062 DECL_CHAIN (field) = field_chain;
30063 field_chain = field;
30065 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30066 return type;
 30069 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30071 static tree
30072 make_var_decl (tree type, const char *name)
30074 tree new_decl;
30076 new_decl = build_decl (UNKNOWN_LOCATION,
30077 VAR_DECL,
 30078 get_identifier (name),
30079 type);
30081 DECL_EXTERNAL (new_decl) = 1;
30082 TREE_STATIC (new_decl) = 1;
30083 TREE_PUBLIC (new_decl) = 1;
30084 DECL_INITIAL (new_decl) = 0;
30085 DECL_ARTIFICIAL (new_decl) = 0;
30086 DECL_PRESERVE_P (new_decl) = 1;
30088 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30089 assemble_variable (new_decl, 0, 0, 0);
30091 return new_decl;
30094 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30095 into an integer defined in libgcc/config/i386/cpuinfo.c */
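/* Sketch of the folds produced below, using example argument strings:

     __builtin_cpu_is ("corei7")
       -> (int) (__cpu_model.__cpu_type == <value for corei7>)

     __builtin_cpu_supports ("avx")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX))

   "corei7" and "avx" are just sample entries from the tables below.  */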
30097 static tree
30098 fold_builtin_cpu (tree fndecl, tree *args)
30100 unsigned int i;
30101 enum ix86_builtins fn_code = (enum ix86_builtins)
30102 DECL_FUNCTION_CODE (fndecl);
30103 tree param_string_cst = NULL;
30105 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30106 enum processor_features
30108 F_CMOV = 0,
30109 F_MMX,
30110 F_POPCNT,
30111 F_SSE,
30112 F_SSE2,
30113 F_SSE3,
30114 F_SSSE3,
30115 F_SSE4_1,
30116 F_SSE4_2,
30117 F_AVX,
30118 F_AVX2,
30119 F_MAX
30122 /* These are the values for vendor types and cpu types and subtypes
30123 in cpuinfo.c. Cpu types and subtypes should be subtracted by
30124 the corresponding start value. */
30125 enum processor_model
30127 M_INTEL = 1,
30128 M_AMD,
30129 M_CPU_TYPE_START,
30130 M_INTEL_ATOM,
30131 M_INTEL_CORE2,
30132 M_INTEL_COREI7,
30133 M_AMDFAM10H,
30134 M_AMDFAM15H,
30135 M_INTEL_SLM,
30136 M_CPU_SUBTYPE_START,
30137 M_INTEL_COREI7_NEHALEM,
30138 M_INTEL_COREI7_WESTMERE,
30139 M_INTEL_COREI7_SANDYBRIDGE,
30140 M_AMDFAM10H_BARCELONA,
30141 M_AMDFAM10H_SHANGHAI,
30142 M_AMDFAM10H_ISTANBUL,
30143 M_AMDFAM15H_BDVER1,
30144 M_AMDFAM15H_BDVER2,
30145 M_AMDFAM15H_BDVER3
30148 static struct _arch_names_table
30150 const char *const name;
30151 const enum processor_model model;
30153 const arch_names_table[] =
30155 {"amd", M_AMD},
30156 {"intel", M_INTEL},
30157 {"atom", M_INTEL_ATOM},
30158 {"slm", M_INTEL_SLM},
30159 {"core2", M_INTEL_CORE2},
30160 {"corei7", M_INTEL_COREI7},
30161 {"nehalem", M_INTEL_COREI7_NEHALEM},
30162 {"westmere", M_INTEL_COREI7_WESTMERE},
30163 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30164 {"amdfam10h", M_AMDFAM10H},
30165 {"barcelona", M_AMDFAM10H_BARCELONA},
30166 {"shanghai", M_AMDFAM10H_SHANGHAI},
30167 {"istanbul", M_AMDFAM10H_ISTANBUL},
30168 {"amdfam15h", M_AMDFAM15H},
30169 {"bdver1", M_AMDFAM15H_BDVER1},
30170 {"bdver2", M_AMDFAM15H_BDVER2},
30171 {"bdver3", M_AMDFAM15H_BDVER3},
30174 static struct _isa_names_table
30176 const char *const name;
30177 const enum processor_features feature;
30179 const isa_names_table[] =
30181 {"cmov", F_CMOV},
30182 {"mmx", F_MMX},
30183 {"popcnt", F_POPCNT},
30184 {"sse", F_SSE},
30185 {"sse2", F_SSE2},
30186 {"sse3", F_SSE3},
30187 {"ssse3", F_SSSE3},
30188 {"sse4.1", F_SSE4_1},
30189 {"sse4.2", F_SSE4_2},
30190 {"avx", F_AVX},
30191 {"avx2", F_AVX2}
30194 tree __processor_model_type = build_processor_model_struct ();
30195 tree __cpu_model_var = make_var_decl (__processor_model_type,
30196 "__cpu_model");
30199 varpool_add_new_variable (__cpu_model_var);
30201 gcc_assert ((args != NULL) && (*args != NULL));
30203 param_string_cst = *args;
30204 while (param_string_cst
30205 && TREE_CODE (param_string_cst) != STRING_CST)
 30207 /* *args must be an expr that can contain other EXPRs leading to a
 30208 STRING_CST. */
30209 if (!EXPR_P (param_string_cst))
30211 error ("Parameter to builtin must be a string constant or literal");
30212 return integer_zero_node;
30214 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30217 gcc_assert (param_string_cst);
30219 if (fn_code == IX86_BUILTIN_CPU_IS)
30221 tree ref;
30222 tree field;
30223 tree final;
30225 unsigned int field_val = 0;
30226 unsigned int NUM_ARCH_NAMES
30227 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30229 for (i = 0; i < NUM_ARCH_NAMES; i++)
30230 if (strcmp (arch_names_table[i].name,
30231 TREE_STRING_POINTER (param_string_cst)) == 0)
30232 break;
30234 if (i == NUM_ARCH_NAMES)
30236 error ("Parameter to builtin not valid: %s",
30237 TREE_STRING_POINTER (param_string_cst));
30238 return integer_zero_node;
30241 field = TYPE_FIELDS (__processor_model_type);
30242 field_val = arch_names_table[i].model;
30244 /* CPU types are stored in the next field. */
30245 if (field_val > M_CPU_TYPE_START
30246 && field_val < M_CPU_SUBTYPE_START)
30248 field = DECL_CHAIN (field);
30249 field_val -= M_CPU_TYPE_START;
30252 /* CPU subtypes are stored in the next field. */
30253 if (field_val > M_CPU_SUBTYPE_START)
 30255 field = DECL_CHAIN (DECL_CHAIN (field));
30256 field_val -= M_CPU_SUBTYPE_START;
30259 /* Get the appropriate field in __cpu_model. */
30260 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30261 field, NULL_TREE);
30263 /* Check the value. */
30264 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30265 build_int_cstu (unsigned_type_node, field_val));
30266 return build1 (CONVERT_EXPR, integer_type_node, final);
30268 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30270 tree ref;
30271 tree array_elt;
30272 tree field;
30273 tree final;
30275 unsigned int field_val = 0;
30276 unsigned int NUM_ISA_NAMES
30277 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30279 for (i = 0; i < NUM_ISA_NAMES; i++)
30280 if (strcmp (isa_names_table[i].name,
30281 TREE_STRING_POINTER (param_string_cst)) == 0)
30282 break;
30284 if (i == NUM_ISA_NAMES)
30286 error ("Parameter to builtin not valid: %s",
30287 TREE_STRING_POINTER (param_string_cst));
30288 return integer_zero_node;
30291 field = TYPE_FIELDS (__processor_model_type);
30292 /* Get the last field, which is __cpu_features. */
30293 while (DECL_CHAIN (field))
30294 field = DECL_CHAIN (field);
30296 /* Get the appropriate field: __cpu_model.__cpu_features */
30297 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30298 field, NULL_TREE);
30300 /* Access the 0th element of __cpu_features array. */
30301 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30302 integer_zero_node, NULL_TREE, NULL_TREE);
30304 field_val = (1 << isa_names_table[i].feature);
30305 /* Return __cpu_model.__cpu_features[0] & field_val */
30306 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30307 build_int_cstu (unsigned_type_node, field_val));
30308 return build1 (CONVERT_EXPR, integer_type_node, final);
30310 gcc_unreachable ();
30313 static tree
30314 ix86_fold_builtin (tree fndecl, int n_args,
30315 tree *args, bool ignore ATTRIBUTE_UNUSED)
30317 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30319 enum ix86_builtins fn_code = (enum ix86_builtins)
30320 DECL_FUNCTION_CODE (fndecl);
30321 if (fn_code == IX86_BUILTIN_CPU_IS
30322 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30324 gcc_assert (n_args == 1);
30325 return fold_builtin_cpu (fndecl, args);
30329 #ifdef SUBTARGET_FOLD_BUILTIN
30330 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30331 #endif
30333 return NULL_TREE;
30336 /* Make builtins to detect cpu type and features supported. NAME is
30337 the builtin name, CODE is the builtin code, and FTYPE is the function
30338 type of the builtin. */
30340 static void
30341 make_cpu_type_builtin (const char* name, int code,
30342 enum ix86_builtin_func_type ftype, bool is_const)
30344 tree decl;
30345 tree type;
30347 type = ix86_get_builtin_func_type (ftype);
30348 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30349 NULL, NULL_TREE);
30350 gcc_assert (decl != NULL_TREE);
30351 ix86_builtins[(int) code] = decl;
30352 TREE_READONLY (decl) = is_const;
30355 /* Make builtins to get CPU type and features supported. The created
30356 builtins are :
30358 __builtin_cpu_init (), to detect cpu type and features,
30359 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30360 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30363 static void
30364 ix86_init_platform_type_builtins (void)
30366 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30367 INT_FTYPE_VOID, false);
30368 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30369 INT_FTYPE_PCCHAR, true);
30370 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30371 INT_FTYPE_PCCHAR, true);
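/* Hypothetical user-level usage of the builtins registered above
   (illustration only; use_corei7_path and use_sse42_path are made-up names):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       use_corei7_path ();
     if (__builtin_cpu_supports ("sse4.2"))
       use_sse42_path ();  */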
30374 /* Internal method for ix86_init_builtins. */
30376 static void
30377 ix86_init_builtins_va_builtins_abi (void)
30379 tree ms_va_ref, sysv_va_ref;
30380 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30381 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30382 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30383 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30385 if (!TARGET_64BIT)
30386 return;
30387 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30388 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30389 ms_va_ref = build_reference_type (ms_va_list_type_node);
30390 sysv_va_ref =
30391 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30393 fnvoid_va_end_ms =
30394 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30395 fnvoid_va_start_ms =
30396 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30397 fnvoid_va_end_sysv =
30398 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30399 fnvoid_va_start_sysv =
30400 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30401 NULL_TREE);
30402 fnvoid_va_copy_ms =
30403 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30404 NULL_TREE);
30405 fnvoid_va_copy_sysv =
30406 build_function_type_list (void_type_node, sysv_va_ref,
30407 sysv_va_ref, NULL_TREE);
30409 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30410 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30411 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30412 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30413 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30414 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30415 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30416 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30417 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30418 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30419 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30420 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30423 static void
30424 ix86_init_builtin_types (void)
30426 tree float128_type_node, float80_type_node;
30428 /* The __float80 type. */
30429 float80_type_node = long_double_type_node;
30430 if (TYPE_MODE (float80_type_node) != XFmode)
 30432 /* long double is not 80 bits wide here, so build a separate type. */
30433 float80_type_node = make_node (REAL_TYPE);
30435 TYPE_PRECISION (float80_type_node) = 80;
30436 layout_type (float80_type_node);
30438 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30440 /* The __float128 type. */
30441 float128_type_node = make_node (REAL_TYPE);
30442 TYPE_PRECISION (float128_type_node) = 128;
30443 layout_type (float128_type_node);
30444 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30446 /* This macro is built by i386-builtin-types.awk. */
30447 DEFINE_BUILTIN_PRIMITIVE_TYPES;
30450 static void
30451 ix86_init_builtins (void)
30453 tree t;
30455 ix86_init_builtin_types ();
30457 /* Builtins to get CPU type and features. */
30458 ix86_init_platform_type_builtins ();
30460 /* TFmode support builtins. */
30461 def_builtin_const (0, "__builtin_infq",
30462 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30463 def_builtin_const (0, "__builtin_huge_valq",
30464 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
 30466 /* We will expand them to a normal call if SSE isn't available, since
 30467 they are used by libgcc. */
30468 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30469 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30470 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30471 TREE_READONLY (t) = 1;
30472 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30474 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30475 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30476 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30477 TREE_READONLY (t) = 1;
30478 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30480 ix86_init_tm_builtins ();
30481 ix86_init_mmx_sse_builtins ();
30483 if (TARGET_LP64)
30484 ix86_init_builtins_va_builtins_abi ();
30486 #ifdef SUBTARGET_INIT_BUILTINS
30487 SUBTARGET_INIT_BUILTINS;
30488 #endif
30491 /* Return the ix86 builtin for CODE. */
30493 static tree
30494 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30496 if (code >= IX86_BUILTIN_MAX)
30497 return error_mark_node;
30499 return ix86_builtins[code];
30502 /* Errors in the source file can cause expand_expr to return const0_rtx
30503 where we expect a vector. To avoid crashing, use one of the vector
30504 clear instructions. */
30505 static rtx
30506 safe_vector_operand (rtx x, enum machine_mode mode)
30508 if (x == const0_rtx)
30509 x = CONST0_RTX (mode);
30510 return x;
30513 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30515 static rtx
30516 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30518 rtx pat;
30519 tree arg0 = CALL_EXPR_ARG (exp, 0);
30520 tree arg1 = CALL_EXPR_ARG (exp, 1);
30521 rtx op0 = expand_normal (arg0);
30522 rtx op1 = expand_normal (arg1);
30523 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30524 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30525 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30527 if (VECTOR_MODE_P (mode0))
30528 op0 = safe_vector_operand (op0, mode0);
30529 if (VECTOR_MODE_P (mode1))
30530 op1 = safe_vector_operand (op1, mode1);
30532 if (optimize || !target
30533 || GET_MODE (target) != tmode
30534 || !insn_data[icode].operand[0].predicate (target, tmode))
30535 target = gen_reg_rtx (tmode);
30537 if (GET_MODE (op1) == SImode && mode1 == TImode)
30539 rtx x = gen_reg_rtx (V4SImode);
30540 emit_insn (gen_sse2_loadd (x, op1));
30541 op1 = gen_lowpart (TImode, x);
30544 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30545 op0 = copy_to_mode_reg (mode0, op0);
30546 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30547 op1 = copy_to_mode_reg (mode1, op1);
30549 pat = GEN_FCN (icode) (target, op0, op1);
30550 if (! pat)
30551 return 0;
30553 emit_insn (pat);
30555 return target;
30558 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30560 static rtx
30561 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30562 enum ix86_builtin_func_type m_type,
30563 enum rtx_code sub_code)
30565 rtx pat;
30566 int i;
30567 int nargs;
30568 bool comparison_p = false;
30569 bool tf_p = false;
30570 bool last_arg_constant = false;
30571 int num_memory = 0;
30572 struct {
30573 rtx op;
30574 enum machine_mode mode;
30575 } args[4];
30577 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30579 switch (m_type)
30581 case MULTI_ARG_4_DF2_DI_I:
30582 case MULTI_ARG_4_DF2_DI_I1:
30583 case MULTI_ARG_4_SF2_SI_I:
30584 case MULTI_ARG_4_SF2_SI_I1:
30585 nargs = 4;
30586 last_arg_constant = true;
30587 break;
30589 case MULTI_ARG_3_SF:
30590 case MULTI_ARG_3_DF:
30591 case MULTI_ARG_3_SF2:
30592 case MULTI_ARG_3_DF2:
30593 case MULTI_ARG_3_DI:
30594 case MULTI_ARG_3_SI:
30595 case MULTI_ARG_3_SI_DI:
30596 case MULTI_ARG_3_HI:
30597 case MULTI_ARG_3_HI_SI:
30598 case MULTI_ARG_3_QI:
30599 case MULTI_ARG_3_DI2:
30600 case MULTI_ARG_3_SI2:
30601 case MULTI_ARG_3_HI2:
30602 case MULTI_ARG_3_QI2:
30603 nargs = 3;
30604 break;
30606 case MULTI_ARG_2_SF:
30607 case MULTI_ARG_2_DF:
30608 case MULTI_ARG_2_DI:
30609 case MULTI_ARG_2_SI:
30610 case MULTI_ARG_2_HI:
30611 case MULTI_ARG_2_QI:
30612 nargs = 2;
30613 break;
30615 case MULTI_ARG_2_DI_IMM:
30616 case MULTI_ARG_2_SI_IMM:
30617 case MULTI_ARG_2_HI_IMM:
30618 case MULTI_ARG_2_QI_IMM:
30619 nargs = 2;
30620 last_arg_constant = true;
30621 break;
30623 case MULTI_ARG_1_SF:
30624 case MULTI_ARG_1_DF:
30625 case MULTI_ARG_1_SF2:
30626 case MULTI_ARG_1_DF2:
30627 case MULTI_ARG_1_DI:
30628 case MULTI_ARG_1_SI:
30629 case MULTI_ARG_1_HI:
30630 case MULTI_ARG_1_QI:
30631 case MULTI_ARG_1_SI_DI:
30632 case MULTI_ARG_1_HI_DI:
30633 case MULTI_ARG_1_HI_SI:
30634 case MULTI_ARG_1_QI_DI:
30635 case MULTI_ARG_1_QI_SI:
30636 case MULTI_ARG_1_QI_HI:
30637 nargs = 1;
30638 break;
30640 case MULTI_ARG_2_DI_CMP:
30641 case MULTI_ARG_2_SI_CMP:
30642 case MULTI_ARG_2_HI_CMP:
30643 case MULTI_ARG_2_QI_CMP:
30644 nargs = 2;
30645 comparison_p = true;
30646 break;
30648 case MULTI_ARG_2_SF_TF:
30649 case MULTI_ARG_2_DF_TF:
30650 case MULTI_ARG_2_DI_TF:
30651 case MULTI_ARG_2_SI_TF:
30652 case MULTI_ARG_2_HI_TF:
30653 case MULTI_ARG_2_QI_TF:
30654 nargs = 2;
30655 tf_p = true;
30656 break;
30658 default:
30659 gcc_unreachable ();
30662 if (optimize || !target
30663 || GET_MODE (target) != tmode
30664 || !insn_data[icode].operand[0].predicate (target, tmode))
30665 target = gen_reg_rtx (tmode);
30667 gcc_assert (nargs <= 4);
30669 for (i = 0; i < nargs; i++)
30671 tree arg = CALL_EXPR_ARG (exp, i);
30672 rtx op = expand_normal (arg);
30673 int adjust = (comparison_p) ? 1 : 0;
30674 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30676 if (last_arg_constant && i == nargs - 1)
30678 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30680 enum insn_code new_icode = icode;
30681 switch (icode)
30683 case CODE_FOR_xop_vpermil2v2df3:
30684 case CODE_FOR_xop_vpermil2v4sf3:
30685 case CODE_FOR_xop_vpermil2v4df3:
30686 case CODE_FOR_xop_vpermil2v8sf3:
30687 error ("the last argument must be a 2-bit immediate");
30688 return gen_reg_rtx (tmode);
30689 case CODE_FOR_xop_rotlv2di3:
30690 new_icode = CODE_FOR_rotlv2di3;
30691 goto xop_rotl;
30692 case CODE_FOR_xop_rotlv4si3:
30693 new_icode = CODE_FOR_rotlv4si3;
30694 goto xop_rotl;
30695 case CODE_FOR_xop_rotlv8hi3:
30696 new_icode = CODE_FOR_rotlv8hi3;
30697 goto xop_rotl;
30698 case CODE_FOR_xop_rotlv16qi3:
30699 new_icode = CODE_FOR_rotlv16qi3;
30700 xop_rotl:
30701 if (CONST_INT_P (op))
30703 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30704 op = GEN_INT (INTVAL (op) & mask);
30705 gcc_checking_assert
30706 (insn_data[icode].operand[i + 1].predicate (op, mode));
30708 else
30710 gcc_checking_assert
30711 (nargs == 2
30712 && insn_data[new_icode].operand[0].mode == tmode
30713 && insn_data[new_icode].operand[1].mode == tmode
30714 && insn_data[new_icode].operand[2].mode == mode
30715 && insn_data[new_icode].operand[0].predicate
30716 == insn_data[icode].operand[0].predicate
30717 && insn_data[new_icode].operand[1].predicate
30718 == insn_data[icode].operand[1].predicate);
30719 icode = new_icode;
30720 goto non_constant;
30722 break;
30723 default:
30724 gcc_unreachable ();
30728 else
30730 non_constant:
30731 if (VECTOR_MODE_P (mode))
30732 op = safe_vector_operand (op, mode);
30734 /* If we aren't optimizing, only allow one memory operand to be
30735 generated. */
30736 if (memory_operand (op, mode))
30737 num_memory++;
30739 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30741 if (optimize
30742 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30743 || num_memory > 1)
30744 op = force_reg (mode, op);
30747 args[i].op = op;
30748 args[i].mode = mode;
30751 switch (nargs)
30753 case 1:
30754 pat = GEN_FCN (icode) (target, args[0].op);
30755 break;
30757 case 2:
30758 if (tf_p)
30759 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30760 GEN_INT ((int)sub_code));
30761 else if (! comparison_p)
30762 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30763 else
30765 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30766 args[0].op,
30767 args[1].op);
30769 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30771 break;
30773 case 3:
30774 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30775 break;
30777 case 4:
30778 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30779 break;
30781 default:
30782 gcc_unreachable ();
30785 if (! pat)
30786 return 0;
30788 emit_insn (pat);
30789 return target;
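/* The multi-arg path serves the XOP/FMA4-style builtins.  For example, an
   XOP rotate-by-immediate builtin (roughly what _mm_roti_epi32 in xopintrin.h
   uses) comes in as MULTI_ARG_2_SI_IMM; when the "immediate" turns out not to
   be a constant, the switch above falls back to the generic rotl pattern via
   the xop_rotl label instead of failing.  */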
30792 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30793 insns with vec_merge. */
30795 static rtx
30796 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30797 rtx target)
30799 rtx pat;
30800 tree arg0 = CALL_EXPR_ARG (exp, 0);
30801 rtx op1, op0 = expand_normal (arg0);
30802 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30803 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30805 if (optimize || !target
30806 || GET_MODE (target) != tmode
30807 || !insn_data[icode].operand[0].predicate (target, tmode))
30808 target = gen_reg_rtx (tmode);
30810 if (VECTOR_MODE_P (mode0))
30811 op0 = safe_vector_operand (op0, mode0);
30813 if ((optimize && !register_operand (op0, mode0))
30814 || !insn_data[icode].operand[1].predicate (op0, mode0))
30815 op0 = copy_to_mode_reg (mode0, op0);
30817 op1 = op0;
30818 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30819 op1 = copy_to_mode_reg (mode0, op1);
30821 pat = GEN_FCN (icode) (target, op0, op1);
30822 if (! pat)
30823 return 0;
30824 emit_insn (pat);
30825 return target;
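/* Scalar unary SSE operations such as rcpss/rsqrtss (e.g. the builtin behind
   _mm_rcp_ss) use this helper: the single source is duplicated into both
   operands of the vec_merge pattern, so the untouched upper elements are
   taken from the same register.  */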
30828 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30830 static rtx
30831 ix86_expand_sse_compare (const struct builtin_description *d,
30832 tree exp, rtx target, bool swap)
30834 rtx pat;
30835 tree arg0 = CALL_EXPR_ARG (exp, 0);
30836 tree arg1 = CALL_EXPR_ARG (exp, 1);
30837 rtx op0 = expand_normal (arg0);
30838 rtx op1 = expand_normal (arg1);
30839 rtx op2;
30840 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30841 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30842 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30843 enum rtx_code comparison = d->comparison;
30845 if (VECTOR_MODE_P (mode0))
30846 op0 = safe_vector_operand (op0, mode0);
30847 if (VECTOR_MODE_P (mode1))
30848 op1 = safe_vector_operand (op1, mode1);
30850 /* Swap operands if we have a comparison that isn't available in
30851 hardware. */
30852 if (swap)
30854 rtx tmp = gen_reg_rtx (mode1);
30855 emit_move_insn (tmp, op1);
30856 op1 = op0;
30857 op0 = tmp;
30860 if (optimize || !target
30861 || GET_MODE (target) != tmode
30862 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30863 target = gen_reg_rtx (tmode);
30865 if ((optimize && !register_operand (op0, mode0))
30866 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30867 op0 = copy_to_mode_reg (mode0, op0);
30868 if ((optimize && !register_operand (op1, mode1))
30869 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30870 op1 = copy_to_mode_reg (mode1, op1);
30872 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30873 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30874 if (! pat)
30875 return 0;
30876 emit_insn (pat);
30877 return target;
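/* This is how the masked-compare builtins reach cmpps/cmppd.  For example, a
   "greater than" builtin (roughly __builtin_ia32_cmpgtps behind _mm_cmpgt_ps)
   is described with the LT rtx code plus a ..._SWAP function type, so the
   operands are exchanged here and the comparison the hardware actually
   provides is emitted.  */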
30880 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30882 static rtx
30883 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30884 rtx target)
30886 rtx pat;
30887 tree arg0 = CALL_EXPR_ARG (exp, 0);
30888 tree arg1 = CALL_EXPR_ARG (exp, 1);
30889 rtx op0 = expand_normal (arg0);
30890 rtx op1 = expand_normal (arg1);
30891 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30892 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30893 enum rtx_code comparison = d->comparison;
30895 if (VECTOR_MODE_P (mode0))
30896 op0 = safe_vector_operand (op0, mode0);
30897 if (VECTOR_MODE_P (mode1))
30898 op1 = safe_vector_operand (op1, mode1);
30900 /* Swap operands if we have a comparison that isn't available in
30901 hardware. */
30902 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30904 rtx tmp = op1;
30905 op1 = op0;
30906 op0 = tmp;
30909 target = gen_reg_rtx (SImode);
30910 emit_move_insn (target, const0_rtx);
30911 target = gen_rtx_SUBREG (QImode, target, 0);
30913 if ((optimize && !register_operand (op0, mode0))
30914 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30915 op0 = copy_to_mode_reg (mode0, op0);
30916 if ((optimize && !register_operand (op1, mode1))
30917 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30918 op1 = copy_to_mode_reg (mode1, op1);
30920 pat = GEN_FCN (d->icode) (op0, op1);
30921 if (! pat)
30922 return 0;
30923 emit_insn (pat);
30924 emit_insn (gen_rtx_SET (VOIDmode,
30925 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30926 gen_rtx_fmt_ee (comparison, QImode,
30927 SET_DEST (pat),
30928 const0_rtx)));
30930 return SUBREG_REG (target);
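/* The comi helper covers the scalar ordered/unordered compares such as the
   builtin behind _mm_comieq_ss.  Note the idiom above: the insn only sets the
   flags, so the boolean result is materialized by zeroing an SImode pseudo
   and setting its low QImode part from the flags register.  */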
30933 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30935 static rtx
30936 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30937 rtx target)
30939 rtx pat;
30940 tree arg0 = CALL_EXPR_ARG (exp, 0);
30941 rtx op1, op0 = expand_normal (arg0);
30942 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30943 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30945 if (optimize || target == 0
30946 || GET_MODE (target) != tmode
30947 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30948 target = gen_reg_rtx (tmode);
30950 if (VECTOR_MODE_P (mode0))
30951 op0 = safe_vector_operand (op0, mode0);
30953 if ((optimize && !register_operand (op0, mode0))
30954 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30955 op0 = copy_to_mode_reg (mode0, op0);
30957 op1 = GEN_INT (d->comparison);
30959 pat = GEN_FCN (d->icode) (target, op0, op1);
30960 if (! pat)
30961 return 0;
30962 emit_insn (pat);
30963 return target;
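/* Round builtins such as __builtin_ia32_floorps (used, e.g., when vectorizing
   floor/ceil/rint) carry their rounding-mode immediate in d->comparison
   rather than as a call argument.  The routine below handles the
   ..._ROUND_VEC_PACK_SFIX variants, which round two DF vectors and pack the
   results into a single integer vector.  */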
30966 static rtx
30967 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30968 tree exp, rtx target)
30970 rtx pat;
30971 tree arg0 = CALL_EXPR_ARG (exp, 0);
30972 tree arg1 = CALL_EXPR_ARG (exp, 1);
30973 rtx op0 = expand_normal (arg0);
30974 rtx op1 = expand_normal (arg1);
30975 rtx op2;
30976 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30977 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30978 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30980 if (optimize || target == 0
30981 || GET_MODE (target) != tmode
30982 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30983 target = gen_reg_rtx (tmode);
30985 op0 = safe_vector_operand (op0, mode0);
30986 op1 = safe_vector_operand (op1, mode1);
30988 if ((optimize && !register_operand (op0, mode0))
30989 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30990 op0 = copy_to_mode_reg (mode0, op0);
30991 if ((optimize && !register_operand (op1, mode1))
30992 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30993 op1 = copy_to_mode_reg (mode1, op1);
30995 op2 = GEN_INT (d->comparison);
30997 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30998 if (! pat)
30999 return 0;
31000 emit_insn (pat);
31001 return target;
31004 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31006 static rtx
31007 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31008 rtx target)
31010 rtx pat;
31011 tree arg0 = CALL_EXPR_ARG (exp, 0);
31012 tree arg1 = CALL_EXPR_ARG (exp, 1);
31013 rtx op0 = expand_normal (arg0);
31014 rtx op1 = expand_normal (arg1);
31015 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31016 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31017 enum rtx_code comparison = d->comparison;
31019 if (VECTOR_MODE_P (mode0))
31020 op0 = safe_vector_operand (op0, mode0);
31021 if (VECTOR_MODE_P (mode1))
31022 op1 = safe_vector_operand (op1, mode1);
31024 target = gen_reg_rtx (SImode);
31025 emit_move_insn (target, const0_rtx);
31026 target = gen_rtx_SUBREG (QImode, target, 0);
31028 if ((optimize && !register_operand (op0, mode0))
31029 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31030 op0 = copy_to_mode_reg (mode0, op0);
31031 if ((optimize && !register_operand (op1, mode1))
31032 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31033 op1 = copy_to_mode_reg (mode1, op1);
31035 pat = GEN_FCN (d->icode) (op0, op1);
31036 if (! pat)
31037 return 0;
31038 emit_insn (pat);
31039 emit_insn (gen_rtx_SET (VOIDmode,
31040 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31041 gen_rtx_fmt_ee (comparison, QImode,
31042 SET_DEST (pat),
31043 const0_rtx)));
31045 return SUBREG_REG (target);
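/* Used for the ptest/vtestp[sd] builtins, e.g. __builtin_ia32_ptestz128
   behind _mm_testz_si128.  As with the comi case, the insn sets only the
   flags; d->comparison selects which flag (EQ for the ...z variants) is
   turned into the 0/1 result through the QImode low-part store above.  */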
31048 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31050 static rtx
31051 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31052 tree exp, rtx target)
31054 rtx pat;
31055 tree arg0 = CALL_EXPR_ARG (exp, 0);
31056 tree arg1 = CALL_EXPR_ARG (exp, 1);
31057 tree arg2 = CALL_EXPR_ARG (exp, 2);
31058 tree arg3 = CALL_EXPR_ARG (exp, 3);
31059 tree arg4 = CALL_EXPR_ARG (exp, 4);
31060 rtx scratch0, scratch1;
31061 rtx op0 = expand_normal (arg0);
31062 rtx op1 = expand_normal (arg1);
31063 rtx op2 = expand_normal (arg2);
31064 rtx op3 = expand_normal (arg3);
31065 rtx op4 = expand_normal (arg4);
31066 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31068 tmode0 = insn_data[d->icode].operand[0].mode;
31069 tmode1 = insn_data[d->icode].operand[1].mode;
31070 modev2 = insn_data[d->icode].operand[2].mode;
31071 modei3 = insn_data[d->icode].operand[3].mode;
31072 modev4 = insn_data[d->icode].operand[4].mode;
31073 modei5 = insn_data[d->icode].operand[5].mode;
31074 modeimm = insn_data[d->icode].operand[6].mode;
31076 if (VECTOR_MODE_P (modev2))
31077 op0 = safe_vector_operand (op0, modev2);
31078 if (VECTOR_MODE_P (modev4))
31079 op2 = safe_vector_operand (op2, modev4);
31081 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31082 op0 = copy_to_mode_reg (modev2, op0);
31083 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31084 op1 = copy_to_mode_reg (modei3, op1);
31085 if ((optimize && !register_operand (op2, modev4))
31086 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31087 op2 = copy_to_mode_reg (modev4, op2);
31088 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31089 op3 = copy_to_mode_reg (modei5, op3);
31091 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31093 error ("the fifth argument must be an 8-bit immediate");
31094 return const0_rtx;
31097 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31099 if (optimize || !target
31100 || GET_MODE (target) != tmode0
31101 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31102 target = gen_reg_rtx (tmode0);
31104 scratch1 = gen_reg_rtx (tmode1);
31106 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31108 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31110 if (optimize || !target
31111 || GET_MODE (target) != tmode1
31112 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31113 target = gen_reg_rtx (tmode1);
31115 scratch0 = gen_reg_rtx (tmode0);
31117 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31119 else
31121 gcc_assert (d->flag);
31123 scratch0 = gen_reg_rtx (tmode0);
31124 scratch1 = gen_reg_rtx (tmode1);
31126 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31129 if (! pat)
31130 return 0;
31132 emit_insn (pat);
31134 if (d->flag)
31136 target = gen_reg_rtx (SImode);
31137 emit_move_insn (target, const0_rtx);
31138 target = gen_rtx_SUBREG (QImode, target, 0);
31140 emit_insn
31141 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31142 gen_rtx_fmt_ee (EQ, QImode,
31143 gen_rtx_REG ((enum machine_mode) d->flag,
31144 FLAGS_REG),
31145 const0_rtx)));
31146 return SUBREG_REG (target);
31148 else
31149 return target;
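/* The pcmpestr expander serves both the index/mask forms (_mm_cmpestri,
   _mm_cmpestrm) and, if those mappings are right, the flag-reading forms
   such as _mm_cmpestra.  For the flag-reading forms, d->flag holds the mode
   of the flags register to test, and the result is again extracted via the
   QImode SUBREG idiom.  The pcmpistr expander below follows the same shape
   with three arguments instead of five.  */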
31153 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31155 static rtx
31156 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31157 tree exp, rtx target)
31159 rtx pat;
31160 tree arg0 = CALL_EXPR_ARG (exp, 0);
31161 tree arg1 = CALL_EXPR_ARG (exp, 1);
31162 tree arg2 = CALL_EXPR_ARG (exp, 2);
31163 rtx scratch0, scratch1;
31164 rtx op0 = expand_normal (arg0);
31165 rtx op1 = expand_normal (arg1);
31166 rtx op2 = expand_normal (arg2);
31167 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31169 tmode0 = insn_data[d->icode].operand[0].mode;
31170 tmode1 = insn_data[d->icode].operand[1].mode;
31171 modev2 = insn_data[d->icode].operand[2].mode;
31172 modev3 = insn_data[d->icode].operand[3].mode;
31173 modeimm = insn_data[d->icode].operand[4].mode;
31175 if (VECTOR_MODE_P (modev2))
31176 op0 = safe_vector_operand (op0, modev2);
31177 if (VECTOR_MODE_P (modev3))
31178 op1 = safe_vector_operand (op1, modev3);
31180 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31181 op0 = copy_to_mode_reg (modev2, op0);
31182 if ((optimize && !register_operand (op1, modev3))
31183 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31184 op1 = copy_to_mode_reg (modev3, op1);
31186 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31188 error ("the third argument must be an 8-bit immediate");
31189 return const0_rtx;
31192 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31194 if (optimize || !target
31195 || GET_MODE (target) != tmode0
31196 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31197 target = gen_reg_rtx (tmode0);
31199 scratch1 = gen_reg_rtx (tmode1);
31201 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31203 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31205 if (optimize || !target
31206 || GET_MODE (target) != tmode1
31207 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31208 target = gen_reg_rtx (tmode1);
31210 scratch0 = gen_reg_rtx (tmode0);
31212 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31214 else
31216 gcc_assert (d->flag);
31218 scratch0 = gen_reg_rtx (tmode0);
31219 scratch1 = gen_reg_rtx (tmode1);
31221 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31224 if (! pat)
31225 return 0;
31227 emit_insn (pat);
31229 if (d->flag)
31231 target = gen_reg_rtx (SImode);
31232 emit_move_insn (target, const0_rtx);
31233 target = gen_rtx_SUBREG (QImode, target, 0);
31235 emit_insn
31236 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31237 gen_rtx_fmt_ee (EQ, QImode,
31238 gen_rtx_REG ((enum machine_mode) d->flag,
31239 FLAGS_REG),
31240 const0_rtx)));
31241 return SUBREG_REG (target);
31243 else
31244 return target;
31247 /* Subroutine of ix86_expand_builtin to take care of insns with
31248 a variable number of operands. */
31250 static rtx
31251 ix86_expand_args_builtin (const struct builtin_description *d,
31252 tree exp, rtx target)
31254 rtx pat, real_target;
31255 unsigned int i, nargs;
31256 unsigned int nargs_constant = 0;
31257 int num_memory = 0;
31258 struct
31260 rtx op;
31261 enum machine_mode mode;
31262 } args[4];
31263 bool last_arg_count = false;
31264 enum insn_code icode = d->icode;
31265 const struct insn_data_d *insn_p = &insn_data[icode];
31266 enum machine_mode tmode = insn_p->operand[0].mode;
31267 enum machine_mode rmode = VOIDmode;
31268 bool swap = false;
31269 enum rtx_code comparison = d->comparison;
31271 switch ((enum ix86_builtin_func_type) d->flag)
31273 case V2DF_FTYPE_V2DF_ROUND:
31274 case V4DF_FTYPE_V4DF_ROUND:
31275 case V4SF_FTYPE_V4SF_ROUND:
31276 case V8SF_FTYPE_V8SF_ROUND:
31277 case V4SI_FTYPE_V4SF_ROUND:
31278 case V8SI_FTYPE_V8SF_ROUND:
31279 return ix86_expand_sse_round (d, exp, target);
31280 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31281 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31282 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31283 case INT_FTYPE_V8SF_V8SF_PTEST:
31284 case INT_FTYPE_V4DI_V4DI_PTEST:
31285 case INT_FTYPE_V4DF_V4DF_PTEST:
31286 case INT_FTYPE_V4SF_V4SF_PTEST:
31287 case INT_FTYPE_V2DI_V2DI_PTEST:
31288 case INT_FTYPE_V2DF_V2DF_PTEST:
31289 return ix86_expand_sse_ptest (d, exp, target);
31290 case FLOAT128_FTYPE_FLOAT128:
31291 case FLOAT_FTYPE_FLOAT:
31292 case INT_FTYPE_INT:
31293 case UINT64_FTYPE_INT:
31294 case UINT16_FTYPE_UINT16:
31295 case INT64_FTYPE_INT64:
31296 case INT64_FTYPE_V4SF:
31297 case INT64_FTYPE_V2DF:
31298 case INT_FTYPE_V16QI:
31299 case INT_FTYPE_V8QI:
31300 case INT_FTYPE_V8SF:
31301 case INT_FTYPE_V4DF:
31302 case INT_FTYPE_V4SF:
31303 case INT_FTYPE_V2DF:
31304 case INT_FTYPE_V32QI:
31305 case V16QI_FTYPE_V16QI:
31306 case V8SI_FTYPE_V8SF:
31307 case V8SI_FTYPE_V4SI:
31308 case V8HI_FTYPE_V8HI:
31309 case V8HI_FTYPE_V16QI:
31310 case V8QI_FTYPE_V8QI:
31311 case V8SF_FTYPE_V8SF:
31312 case V8SF_FTYPE_V8SI:
31313 case V8SF_FTYPE_V4SF:
31314 case V8SF_FTYPE_V8HI:
31315 case V4SI_FTYPE_V4SI:
31316 case V4SI_FTYPE_V16QI:
31317 case V4SI_FTYPE_V4SF:
31318 case V4SI_FTYPE_V8SI:
31319 case V4SI_FTYPE_V8HI:
31320 case V4SI_FTYPE_V4DF:
31321 case V4SI_FTYPE_V2DF:
31322 case V4HI_FTYPE_V4HI:
31323 case V4DF_FTYPE_V4DF:
31324 case V4DF_FTYPE_V4SI:
31325 case V4DF_FTYPE_V4SF:
31326 case V4DF_FTYPE_V2DF:
31327 case V4SF_FTYPE_V4SF:
31328 case V4SF_FTYPE_V4SI:
31329 case V4SF_FTYPE_V8SF:
31330 case V4SF_FTYPE_V4DF:
31331 case V4SF_FTYPE_V8HI:
31332 case V4SF_FTYPE_V2DF:
31333 case V2DI_FTYPE_V2DI:
31334 case V2DI_FTYPE_V16QI:
31335 case V2DI_FTYPE_V8HI:
31336 case V2DI_FTYPE_V4SI:
31337 case V2DF_FTYPE_V2DF:
31338 case V2DF_FTYPE_V4SI:
31339 case V2DF_FTYPE_V4DF:
31340 case V2DF_FTYPE_V4SF:
31341 case V2DF_FTYPE_V2SI:
31342 case V2SI_FTYPE_V2SI:
31343 case V2SI_FTYPE_V4SF:
31344 case V2SI_FTYPE_V2SF:
31345 case V2SI_FTYPE_V2DF:
31346 case V2SF_FTYPE_V2SF:
31347 case V2SF_FTYPE_V2SI:
31348 case V32QI_FTYPE_V32QI:
31349 case V32QI_FTYPE_V16QI:
31350 case V16HI_FTYPE_V16HI:
31351 case V16HI_FTYPE_V8HI:
31352 case V8SI_FTYPE_V8SI:
31353 case V16HI_FTYPE_V16QI:
31354 case V8SI_FTYPE_V16QI:
31355 case V4DI_FTYPE_V16QI:
31356 case V8SI_FTYPE_V8HI:
31357 case V4DI_FTYPE_V8HI:
31358 case V4DI_FTYPE_V4SI:
31359 case V4DI_FTYPE_V2DI:
31360 nargs = 1;
31361 break;
31362 case V4SF_FTYPE_V4SF_VEC_MERGE:
31363 case V2DF_FTYPE_V2DF_VEC_MERGE:
31364 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31365 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31366 case V16QI_FTYPE_V16QI_V16QI:
31367 case V16QI_FTYPE_V8HI_V8HI:
31368 case V8QI_FTYPE_V8QI_V8QI:
31369 case V8QI_FTYPE_V4HI_V4HI:
31370 case V8HI_FTYPE_V8HI_V8HI:
31371 case V8HI_FTYPE_V16QI_V16QI:
31372 case V8HI_FTYPE_V4SI_V4SI:
31373 case V8SF_FTYPE_V8SF_V8SF:
31374 case V8SF_FTYPE_V8SF_V8SI:
31375 case V4SI_FTYPE_V4SI_V4SI:
31376 case V4SI_FTYPE_V8HI_V8HI:
31377 case V4SI_FTYPE_V4SF_V4SF:
31378 case V4SI_FTYPE_V2DF_V2DF:
31379 case V4HI_FTYPE_V4HI_V4HI:
31380 case V4HI_FTYPE_V8QI_V8QI:
31381 case V4HI_FTYPE_V2SI_V2SI:
31382 case V4DF_FTYPE_V4DF_V4DF:
31383 case V4DF_FTYPE_V4DF_V4DI:
31384 case V4SF_FTYPE_V4SF_V4SF:
31385 case V4SF_FTYPE_V4SF_V4SI:
31386 case V4SF_FTYPE_V4SF_V2SI:
31387 case V4SF_FTYPE_V4SF_V2DF:
31388 case V4SF_FTYPE_V4SF_DI:
31389 case V4SF_FTYPE_V4SF_SI:
31390 case V2DI_FTYPE_V2DI_V2DI:
31391 case V2DI_FTYPE_V16QI_V16QI:
31392 case V2DI_FTYPE_V4SI_V4SI:
31393 case V2UDI_FTYPE_V4USI_V4USI:
31394 case V2DI_FTYPE_V2DI_V16QI:
31395 case V2DI_FTYPE_V2DF_V2DF:
31396 case V2SI_FTYPE_V2SI_V2SI:
31397 case V2SI_FTYPE_V4HI_V4HI:
31398 case V2SI_FTYPE_V2SF_V2SF:
31399 case V2DF_FTYPE_V2DF_V2DF:
31400 case V2DF_FTYPE_V2DF_V4SF:
31401 case V2DF_FTYPE_V2DF_V2DI:
31402 case V2DF_FTYPE_V2DF_DI:
31403 case V2DF_FTYPE_V2DF_SI:
31404 case V2SF_FTYPE_V2SF_V2SF:
31405 case V1DI_FTYPE_V1DI_V1DI:
31406 case V1DI_FTYPE_V8QI_V8QI:
31407 case V1DI_FTYPE_V2SI_V2SI:
31408 case V32QI_FTYPE_V16HI_V16HI:
31409 case V16HI_FTYPE_V8SI_V8SI:
31410 case V32QI_FTYPE_V32QI_V32QI:
31411 case V16HI_FTYPE_V32QI_V32QI:
31412 case V16HI_FTYPE_V16HI_V16HI:
31413 case V8SI_FTYPE_V4DF_V4DF:
31414 case V8SI_FTYPE_V8SI_V8SI:
31415 case V8SI_FTYPE_V16HI_V16HI:
31416 case V4DI_FTYPE_V4DI_V4DI:
31417 case V4DI_FTYPE_V8SI_V8SI:
31418 case V4UDI_FTYPE_V8USI_V8USI:
31419 if (comparison == UNKNOWN)
31420 return ix86_expand_binop_builtin (icode, exp, target);
31421 nargs = 2;
31422 break;
31423 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31424 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31425 gcc_assert (comparison != UNKNOWN);
31426 nargs = 2;
31427 swap = true;
31428 break;
31429 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31430 case V16HI_FTYPE_V16HI_SI_COUNT:
31431 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31432 case V8SI_FTYPE_V8SI_SI_COUNT:
31433 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31434 case V4DI_FTYPE_V4DI_INT_COUNT:
31435 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31436 case V8HI_FTYPE_V8HI_SI_COUNT:
31437 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31438 case V4SI_FTYPE_V4SI_SI_COUNT:
31439 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31440 case V4HI_FTYPE_V4HI_SI_COUNT:
31441 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31442 case V2DI_FTYPE_V2DI_SI_COUNT:
31443 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31444 case V2SI_FTYPE_V2SI_SI_COUNT:
31445 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31446 case V1DI_FTYPE_V1DI_SI_COUNT:
31447 nargs = 2;
31448 last_arg_count = true;
31449 break;
31450 case UINT64_FTYPE_UINT64_UINT64:
31451 case UINT_FTYPE_UINT_UINT:
31452 case UINT_FTYPE_UINT_USHORT:
31453 case UINT_FTYPE_UINT_UCHAR:
31454 case UINT16_FTYPE_UINT16_INT:
31455 case UINT8_FTYPE_UINT8_INT:
31456 nargs = 2;
31457 break;
31458 case V2DI_FTYPE_V2DI_INT_CONVERT:
31459 nargs = 2;
31460 rmode = V1TImode;
31461 nargs_constant = 1;
31462 break;
31463 case V4DI_FTYPE_V4DI_INT_CONVERT:
31464 nargs = 2;
31465 rmode = V2TImode;
31466 nargs_constant = 1;
31467 break;
31468 case V8HI_FTYPE_V8HI_INT:
31469 case V8HI_FTYPE_V8SF_INT:
31470 case V8HI_FTYPE_V4SF_INT:
31471 case V8SF_FTYPE_V8SF_INT:
31472 case V4SI_FTYPE_V4SI_INT:
31473 case V4SI_FTYPE_V8SI_INT:
31474 case V4HI_FTYPE_V4HI_INT:
31475 case V4DF_FTYPE_V4DF_INT:
31476 case V4SF_FTYPE_V4SF_INT:
31477 case V4SF_FTYPE_V8SF_INT:
31478 case V2DI_FTYPE_V2DI_INT:
31479 case V2DF_FTYPE_V2DF_INT:
31480 case V2DF_FTYPE_V4DF_INT:
31481 case V16HI_FTYPE_V16HI_INT:
31482 case V8SI_FTYPE_V8SI_INT:
31483 case V4DI_FTYPE_V4DI_INT:
31484 case V2DI_FTYPE_V4DI_INT:
31485 nargs = 2;
31486 nargs_constant = 1;
31487 break;
31488 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31489 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31490 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31491 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31492 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31493 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31494 nargs = 3;
31495 break;
31496 case V32QI_FTYPE_V32QI_V32QI_INT:
31497 case V16HI_FTYPE_V16HI_V16HI_INT:
31498 case V16QI_FTYPE_V16QI_V16QI_INT:
31499 case V4DI_FTYPE_V4DI_V4DI_INT:
31500 case V8HI_FTYPE_V8HI_V8HI_INT:
31501 case V8SI_FTYPE_V8SI_V8SI_INT:
31502 case V8SI_FTYPE_V8SI_V4SI_INT:
31503 case V8SF_FTYPE_V8SF_V8SF_INT:
31504 case V8SF_FTYPE_V8SF_V4SF_INT:
31505 case V4SI_FTYPE_V4SI_V4SI_INT:
31506 case V4DF_FTYPE_V4DF_V4DF_INT:
31507 case V4DF_FTYPE_V4DF_V2DF_INT:
31508 case V4SF_FTYPE_V4SF_V4SF_INT:
31509 case V2DI_FTYPE_V2DI_V2DI_INT:
31510 case V4DI_FTYPE_V4DI_V2DI_INT:
31511 case V2DF_FTYPE_V2DF_V2DF_INT:
31512 nargs = 3;
31513 nargs_constant = 1;
31514 break;
31515 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31516 nargs = 3;
31517 rmode = V4DImode;
31518 nargs_constant = 1;
31519 break;
31520 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31521 nargs = 3;
31522 rmode = V2DImode;
31523 nargs_constant = 1;
31524 break;
31525 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31526 nargs = 3;
31527 rmode = DImode;
31528 nargs_constant = 1;
31529 break;
31530 case V2DI_FTYPE_V2DI_UINT_UINT:
31531 nargs = 3;
31532 nargs_constant = 2;
31533 break;
31534 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31535 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31536 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31537 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31538 nargs = 4;
31539 nargs_constant = 1;
31540 break;
31541 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31542 nargs = 4;
31543 nargs_constant = 2;
31544 break;
31545 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31546 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31547 nargs = 4;
31548 break;
31549 default:
31550 gcc_unreachable ();
31553 gcc_assert (nargs <= ARRAY_SIZE (args));
31555 if (comparison != UNKNOWN)
31557 gcc_assert (nargs == 2);
31558 return ix86_expand_sse_compare (d, exp, target, swap);
31561 if (rmode == VOIDmode || rmode == tmode)
31563 if (optimize
31564 || target == 0
31565 || GET_MODE (target) != tmode
31566 || !insn_p->operand[0].predicate (target, tmode))
31567 target = gen_reg_rtx (tmode);
31568 real_target = target;
31570 else
31572 target = gen_reg_rtx (rmode);
31573 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31576 for (i = 0; i < nargs; i++)
31578 tree arg = CALL_EXPR_ARG (exp, i);
31579 rtx op = expand_normal (arg);
31580 enum machine_mode mode = insn_p->operand[i + 1].mode;
31581 bool match = insn_p->operand[i + 1].predicate (op, mode);
31583 if (last_arg_count && (i + 1) == nargs)
31585 /* SIMD shift insns take either an 8-bit immediate or a
31586 register as the count, but the builtin functions take an int
31587 as the count. If the count does not match, put it in a register. */
31588 if (!match)
31590 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31591 if (!insn_p->operand[i + 1].predicate (op, mode))
31592 op = copy_to_reg (op);
31595 else if ((nargs - i) <= nargs_constant)
31597 if (!match)
31598 switch (icode)
31600 case CODE_FOR_avx2_inserti128:
31601 case CODE_FOR_avx2_extracti128:
31602 error ("the last argument must be a 1-bit immediate");
31603 return const0_rtx;
31605 case CODE_FOR_sse4_1_roundsd:
31606 case CODE_FOR_sse4_1_roundss:
31608 case CODE_FOR_sse4_1_roundpd:
31609 case CODE_FOR_sse4_1_roundps:
31610 case CODE_FOR_avx_roundpd256:
31611 case CODE_FOR_avx_roundps256:
31613 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31614 case CODE_FOR_sse4_1_roundps_sfix:
31615 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31616 case CODE_FOR_avx_roundps_sfix256:
31618 case CODE_FOR_sse4_1_blendps:
31619 case CODE_FOR_avx_blendpd256:
31620 case CODE_FOR_avx_vpermilv4df:
31621 error ("the last argument must be a 4-bit immediate");
31622 return const0_rtx;
31624 case CODE_FOR_sse4_1_blendpd:
31625 case CODE_FOR_avx_vpermilv2df:
31626 case CODE_FOR_xop_vpermil2v2df3:
31627 case CODE_FOR_xop_vpermil2v4sf3:
31628 case CODE_FOR_xop_vpermil2v4df3:
31629 case CODE_FOR_xop_vpermil2v8sf3:
31630 error ("the last argument must be a 2-bit immediate");
31631 return const0_rtx;
31633 case CODE_FOR_avx_vextractf128v4df:
31634 case CODE_FOR_avx_vextractf128v8sf:
31635 case CODE_FOR_avx_vextractf128v8si:
31636 case CODE_FOR_avx_vinsertf128v4df:
31637 case CODE_FOR_avx_vinsertf128v8sf:
31638 case CODE_FOR_avx_vinsertf128v8si:
31639 error ("the last argument must be a 1-bit immediate");
31640 return const0_rtx;
31642 case CODE_FOR_avx_vmcmpv2df3:
31643 case CODE_FOR_avx_vmcmpv4sf3:
31644 case CODE_FOR_avx_cmpv2df3:
31645 case CODE_FOR_avx_cmpv4sf3:
31646 case CODE_FOR_avx_cmpv4df3:
31647 case CODE_FOR_avx_cmpv8sf3:
31648 error ("the last argument must be a 5-bit immediate");
31649 return const0_rtx;
31651 default:
31652 switch (nargs_constant)
31654 case 2:
31655 if ((nargs - i) == nargs_constant)
31657 error ("the next to last argument must be an 8-bit immediate");
31658 break;
31660 case 1:
31661 error ("the last argument must be an 8-bit immediate");
31662 break;
31663 default:
31664 gcc_unreachable ();
31666 return const0_rtx;
31669 else
31671 if (VECTOR_MODE_P (mode))
31672 op = safe_vector_operand (op, mode);
31674 /* If we aren't optimizing, only allow one memory operand to
31675 be generated. */
31676 if (memory_operand (op, mode))
31677 num_memory++;
31679 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31681 if (optimize || !match || num_memory > 1)
31682 op = copy_to_mode_reg (mode, op);
31684 else
31686 op = copy_to_reg (op);
31687 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31691 args[i].op = op;
31692 args[i].mode = mode;
31695 switch (nargs)
31697 case 1:
31698 pat = GEN_FCN (icode) (real_target, args[0].op);
31699 break;
31700 case 2:
31701 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31702 break;
31703 case 3:
31704 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31705 args[2].op);
31706 break;
31707 case 4:
31708 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31709 args[2].op, args[3].op);
31710 break;
31711 default:
31712 gcc_unreachable ();
31715 if (! pat)
31716 return 0;
31718 emit_insn (pat);
31719 return target;
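/* Most one-to-three operand SSE/AVX builtins funnel through the routine
   above.  For example, a shuffle-by-immediate builtin such as
   __builtin_ia32_pshufd (behind _mm_shuffle_epi32) is classified as
   V4SI_FTYPE_V4SI_INT with nargs_constant == 1, so its last argument must
   already satisfy the immediate predicate or one of the errors above is
   issued.  */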
31722 /* Subroutine of ix86_expand_builtin to take care of special insns
31723 with a variable number of operands. */
31725 static rtx
31726 ix86_expand_special_args_builtin (const struct builtin_description *d,
31727 tree exp, rtx target)
31729 tree arg;
31730 rtx pat, op;
31731 unsigned int i, nargs, arg_adjust, memory;
31732 struct
31734 rtx op;
31735 enum machine_mode mode;
31736 } args[3];
31737 enum insn_code icode = d->icode;
31738 bool last_arg_constant = false;
31739 const struct insn_data_d *insn_p = &insn_data[icode];
31740 enum machine_mode tmode = insn_p->operand[0].mode;
31741 enum { load, store } klass;
31743 switch ((enum ix86_builtin_func_type) d->flag)
31745 case VOID_FTYPE_VOID:
31746 emit_insn (GEN_FCN (icode) (target));
31747 return 0;
31748 case VOID_FTYPE_UINT64:
31749 case VOID_FTYPE_UNSIGNED:
31750 nargs = 0;
31751 klass = store;
31752 memory = 0;
31753 break;
31755 case INT_FTYPE_VOID:
31756 case UINT64_FTYPE_VOID:
31757 case UNSIGNED_FTYPE_VOID:
31758 nargs = 0;
31759 klass = load;
31760 memory = 0;
31761 break;
31762 case UINT64_FTYPE_PUNSIGNED:
31763 case V2DI_FTYPE_PV2DI:
31764 case V4DI_FTYPE_PV4DI:
31765 case V32QI_FTYPE_PCCHAR:
31766 case V16QI_FTYPE_PCCHAR:
31767 case V8SF_FTYPE_PCV4SF:
31768 case V8SF_FTYPE_PCFLOAT:
31769 case V4SF_FTYPE_PCFLOAT:
31770 case V4DF_FTYPE_PCV2DF:
31771 case V4DF_FTYPE_PCDOUBLE:
31772 case V2DF_FTYPE_PCDOUBLE:
31773 case VOID_FTYPE_PVOID:
31774 nargs = 1;
31775 klass = load;
31776 memory = 0;
31777 break;
31778 case VOID_FTYPE_PV2SF_V4SF:
31779 case VOID_FTYPE_PV4DI_V4DI:
31780 case VOID_FTYPE_PV2DI_V2DI:
31781 case VOID_FTYPE_PCHAR_V32QI:
31782 case VOID_FTYPE_PCHAR_V16QI:
31783 case VOID_FTYPE_PFLOAT_V8SF:
31784 case VOID_FTYPE_PFLOAT_V4SF:
31785 case VOID_FTYPE_PDOUBLE_V4DF:
31786 case VOID_FTYPE_PDOUBLE_V2DF:
31787 case VOID_FTYPE_PLONGLONG_LONGLONG:
31788 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31789 case VOID_FTYPE_PINT_INT:
31790 nargs = 1;
31791 klass = store;
31792 /* Reserve memory operand for target. */
31793 memory = ARRAY_SIZE (args);
31794 break;
31795 case V4SF_FTYPE_V4SF_PCV2SF:
31796 case V2DF_FTYPE_V2DF_PCDOUBLE:
31797 nargs = 2;
31798 klass = load;
31799 memory = 1;
31800 break;
31801 case V8SF_FTYPE_PCV8SF_V8SI:
31802 case V4DF_FTYPE_PCV4DF_V4DI:
31803 case V4SF_FTYPE_PCV4SF_V4SI:
31804 case V2DF_FTYPE_PCV2DF_V2DI:
31805 case V8SI_FTYPE_PCV8SI_V8SI:
31806 case V4DI_FTYPE_PCV4DI_V4DI:
31807 case V4SI_FTYPE_PCV4SI_V4SI:
31808 case V2DI_FTYPE_PCV2DI_V2DI:
31809 nargs = 2;
31810 klass = load;
31811 memory = 0;
31812 break;
31813 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31814 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31815 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31816 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31817 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31818 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31819 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31820 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31821 nargs = 2;
31822 klass = store;
31823 /* Reserve memory operand for target. */
31824 memory = ARRAY_SIZE (args);
31825 break;
31826 case VOID_FTYPE_UINT_UINT_UINT:
31827 case VOID_FTYPE_UINT64_UINT_UINT:
31828 case UCHAR_FTYPE_UINT_UINT_UINT:
31829 case UCHAR_FTYPE_UINT64_UINT_UINT:
31830 nargs = 3;
31831 klass = load;
31832 memory = ARRAY_SIZE (args);
31833 last_arg_constant = true;
31834 break;
31835 default:
31836 gcc_unreachable ();
31839 gcc_assert (nargs <= ARRAY_SIZE (args));
31841 if (klass == store)
31843 arg = CALL_EXPR_ARG (exp, 0);
31844 op = expand_normal (arg);
31845 gcc_assert (target == 0);
31846 if (memory)
31848 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31849 target = gen_rtx_MEM (tmode, op);
31851 else
31852 target = force_reg (tmode, op);
31853 arg_adjust = 1;
31855 else
31857 arg_adjust = 0;
31858 if (optimize
31859 || target == 0
31860 || !register_operand (target, tmode)
31861 || GET_MODE (target) != tmode)
31862 target = gen_reg_rtx (tmode);
31865 for (i = 0; i < nargs; i++)
31867 enum machine_mode mode = insn_p->operand[i + 1].mode;
31868 bool match;
31870 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31871 op = expand_normal (arg);
31872 match = insn_p->operand[i + 1].predicate (op, mode);
31874 if (last_arg_constant && (i + 1) == nargs)
31876 if (!match)
31878 if (icode == CODE_FOR_lwp_lwpvalsi3
31879 || icode == CODE_FOR_lwp_lwpinssi3
31880 || icode == CODE_FOR_lwp_lwpvaldi3
31881 || icode == CODE_FOR_lwp_lwpinsdi3)
31882 error ("the last argument must be a 32-bit immediate");
31883 else
31884 error ("the last argument must be an 8-bit immediate");
31885 return const0_rtx;
31888 else
31890 if (i == memory)
31892 /* This must be the memory operand. */
31893 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31894 op = gen_rtx_MEM (mode, op);
31895 gcc_assert (GET_MODE (op) == mode
31896 || GET_MODE (op) == VOIDmode);
31898 else
31900 /* This must be a register. */
31901 if (VECTOR_MODE_P (mode))
31902 op = safe_vector_operand (op, mode);
31904 gcc_assert (GET_MODE (op) == mode
31905 || GET_MODE (op) == VOIDmode);
31906 op = copy_to_mode_reg (mode, op);
31910 args[i].op = op;
31911 args[i].mode = mode;
31914 switch (nargs)
31916 case 0:
31917 pat = GEN_FCN (icode) (target);
31918 break;
31919 case 1:
31920 pat = GEN_FCN (icode) (target, args[0].op);
31921 break;
31922 case 2:
31923 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31924 break;
31925 case 3:
31926 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31927 break;
31928 default:
31929 gcc_unreachable ();
31932 if (! pat)
31933 return 0;
31934 emit_insn (pat);
31935 return klass == store ? 0 : target;
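/* The special-args path handles builtins with memory side effects.  A
   non-temporal store such as the builtin behind _mm_stream_ps, for instance,
   is VOID_FTYPE_PFLOAT_V4SF: klass == store, the pointer argument becomes the
   MEM "target", and the routine deliberately returns 0 because there is no
   value to return.  */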
31938 /* Return the integer constant in ARG. Constrain it to be in the range
31939 of the subparts of VEC_TYPE; issue an error if not. */
31941 static int
31942 get_element_number (tree vec_type, tree arg)
31944 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31946 if (!host_integerp (arg, 1)
31947 || (elt = tree_low_cst (arg, 1), elt > max))
31949 error ("selector must be an integer constant in the range 0..%wi", max);
31950 return 0;
31953 return elt;
31956 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31957 ix86_expand_vector_init. We DO have language-level syntax for this, in
31958 the form of (type){ init-list }. Except that since we can't place emms
31959 instructions from inside the compiler, we can't allow the use of MMX
31960 registers unless the user explicitly asks for it. So we do *not* define
31961 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31962 we have builtins invoked by mmintrin.h that give us license to emit
31963 these sorts of instructions. */
31965 static rtx
31966 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31968 enum machine_mode tmode = TYPE_MODE (type);
31969 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31970 int i, n_elt = GET_MODE_NUNITS (tmode);
31971 rtvec v = rtvec_alloc (n_elt);
31973 gcc_assert (VECTOR_MODE_P (tmode));
31974 gcc_assert (call_expr_nargs (exp) == n_elt);
31976 for (i = 0; i < n_elt; ++i)
31978 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31979 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31982 if (!target || !register_operand (target, tmode))
31983 target = gen_reg_rtx (tmode);
31985 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31986 return target;
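/* mmintrin.h-style constructors (e.g. _mm_setr_pi32, assuming it still maps
   to __builtin_ia32_vec_init_v2si) reach this wrapper; each call argument
   becomes one element of the PARALLEL handed to ix86_expand_vector_init.  */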
31989 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31990 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31991 had a language-level syntax for referencing vector elements. */
31993 static rtx
31994 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31996 enum machine_mode tmode, mode0;
31997 tree arg0, arg1;
31998 int elt;
31999 rtx op0;
32001 arg0 = CALL_EXPR_ARG (exp, 0);
32002 arg1 = CALL_EXPR_ARG (exp, 1);
32004 op0 = expand_normal (arg0);
32005 elt = get_element_number (TREE_TYPE (arg0), arg1);
32007 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32008 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32009 gcc_assert (VECTOR_MODE_P (mode0));
32011 op0 = force_reg (mode0, op0);
32013 if (optimize || !target || !register_operand (target, tmode))
32014 target = gen_reg_rtx (tmode);
32016 ix86_expand_vector_extract (true, target, op0, elt);
32018 return target;
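/* Element extraction builtins (e.g. the one behind _mm_extract_epi16) come
   through here; the selector must be a constant in range, as enforced by
   get_element_number above.  */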
32021 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32022 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32023 a language-level syntax for referencing vector elements. */
32025 static rtx
32026 ix86_expand_vec_set_builtin (tree exp)
32028 enum machine_mode tmode, mode1;
32029 tree arg0, arg1, arg2;
32030 int elt;
32031 rtx op0, op1, target;
32033 arg0 = CALL_EXPR_ARG (exp, 0);
32034 arg1 = CALL_EXPR_ARG (exp, 1);
32035 arg2 = CALL_EXPR_ARG (exp, 2);
32037 tmode = TYPE_MODE (TREE_TYPE (arg0));
32038 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32039 gcc_assert (VECTOR_MODE_P (tmode));
32041 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32042 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32043 elt = get_element_number (TREE_TYPE (arg0), arg2);
32045 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32046 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32048 op0 = force_reg (tmode, op0);
32049 op1 = force_reg (mode1, op1);
32051 /* OP0 is the source operand of these builtin functions and should not be
32052 modified. Create a copy, operate on the copy, and return it as the target. */
32053 target = gen_reg_rtx (tmode);
32054 emit_move_insn (target, op0);
32055 ix86_expand_vector_set (true, target, op1, elt);
32057 return target;
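/* Element insertion builtins (e.g. the one behind _mm_insert_epi16) use this
   wrapper.  Note that the source vector is copied first, so the builtin has
   value semantics: the original operand is never modified in place.  */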
32060 /* Expand an expression EXP that calls a built-in function,
32061 with result going to TARGET if that's convenient
32062 (and in mode MODE if that's convenient).
32063 SUBTARGET may be used as the target for computing one of EXP's operands.
32064 IGNORE is nonzero if the value is to be ignored. */
32066 static rtx
32067 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
32068 enum machine_mode mode ATTRIBUTE_UNUSED,
32069 int ignore ATTRIBUTE_UNUSED)
32071 const struct builtin_description *d;
32072 size_t i;
32073 enum insn_code icode;
32074 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32075 tree arg0, arg1, arg2, arg3, arg4;
32076 rtx op0, op1, op2, op3, op4, pat, insn;
32077 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32078 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32080 /* For CPU builtins that can be folded, fold first and expand the fold. */
32081 switch (fcode)
32083 case IX86_BUILTIN_CPU_INIT:
32085 /* Make it call __cpu_indicator_init in libgcc. */
32086 tree call_expr, fndecl, type;
32087 type = build_function_type_list (integer_type_node, NULL_TREE);
32088 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32089 call_expr = build_call_expr (fndecl, 0);
32090 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32092 case IX86_BUILTIN_CPU_IS:
32093 case IX86_BUILTIN_CPU_SUPPORTS:
32095 tree arg0 = CALL_EXPR_ARG (exp, 0);
32096 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32097 gcc_assert (fold_expr != NULL_TREE);
32098 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32102 /* Determine whether the builtin function is available under the current ISA.
32103 Originally the builtin was not created if it wasn't applicable to the
32104 current ISA based on the command line switches. With function specific
32105 options, we need to check in the context of the function making the call
32106 whether it is supported. */
32107 if (ix86_builtins_isa[fcode].isa
32108 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32110 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32111 NULL, (enum fpmath_unit) 0, false);
32113 if (!opts)
32114 error ("%qE needs unknown isa option", fndecl);
32115 else
32117 gcc_assert (opts != NULL);
32118 error ("%qE needs isa option %s", fndecl, opts);
32119 free (opts);
32121 return const0_rtx;
32124 switch (fcode)
32126 case IX86_BUILTIN_MASKMOVQ:
32127 case IX86_BUILTIN_MASKMOVDQU:
32128 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32129 ? CODE_FOR_mmx_maskmovq
32130 : CODE_FOR_sse2_maskmovdqu);
32131 /* Note the arg order is different from the operand order. */
32132 arg1 = CALL_EXPR_ARG (exp, 0);
32133 arg2 = CALL_EXPR_ARG (exp, 1);
32134 arg0 = CALL_EXPR_ARG (exp, 2);
32135 op0 = expand_normal (arg0);
32136 op1 = expand_normal (arg1);
32137 op2 = expand_normal (arg2);
32138 mode0 = insn_data[icode].operand[0].mode;
32139 mode1 = insn_data[icode].operand[1].mode;
32140 mode2 = insn_data[icode].operand[2].mode;
32142 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32143 op0 = gen_rtx_MEM (mode1, op0);
32145 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32146 op0 = copy_to_mode_reg (mode0, op0);
32147 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32148 op1 = copy_to_mode_reg (mode1, op1);
32149 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32150 op2 = copy_to_mode_reg (mode2, op2);
32151 pat = GEN_FCN (icode) (op0, op1, op2);
32152 if (! pat)
32153 return 0;
32154 emit_insn (pat);
32155 return 0;
32157 case IX86_BUILTIN_LDMXCSR:
32158 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32159 target = assign_386_stack_local (SImode, SLOT_TEMP);
32160 emit_move_insn (target, op0);
32161 emit_insn (gen_sse_ldmxcsr (target));
32162 return 0;
32164 case IX86_BUILTIN_STMXCSR:
32165 target = assign_386_stack_local (SImode, SLOT_TEMP);
32166 emit_insn (gen_sse_stmxcsr (target));
32167 return copy_to_mode_reg (SImode, target);
32169 case IX86_BUILTIN_CLFLUSH:
32170 arg0 = CALL_EXPR_ARG (exp, 0);
32171 op0 = expand_normal (arg0);
32172 icode = CODE_FOR_sse2_clflush;
32173 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32174 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32176 emit_insn (gen_sse2_clflush (op0));
32177 return 0;
32179 case IX86_BUILTIN_MONITOR:
32180 arg0 = CALL_EXPR_ARG (exp, 0);
32181 arg1 = CALL_EXPR_ARG (exp, 1);
32182 arg2 = CALL_EXPR_ARG (exp, 2);
32183 op0 = expand_normal (arg0);
32184 op1 = expand_normal (arg1);
32185 op2 = expand_normal (arg2);
32186 if (!REG_P (op0))
32187 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32188 if (!REG_P (op1))
32189 op1 = copy_to_mode_reg (SImode, op1);
32190 if (!REG_P (op2))
32191 op2 = copy_to_mode_reg (SImode, op2);
32192 emit_insn (ix86_gen_monitor (op0, op1, op2));
32193 return 0;
32195 case IX86_BUILTIN_MWAIT:
32196 arg0 = CALL_EXPR_ARG (exp, 0);
32197 arg1 = CALL_EXPR_ARG (exp, 1);
32198 op0 = expand_normal (arg0);
32199 op1 = expand_normal (arg1);
32200 if (!REG_P (op0))
32201 op0 = copy_to_mode_reg (SImode, op0);
32202 if (!REG_P (op1))
32203 op1 = copy_to_mode_reg (SImode, op1);
32204 emit_insn (gen_sse3_mwait (op0, op1));
32205 return 0;
32207 case IX86_BUILTIN_VEC_INIT_V2SI:
32208 case IX86_BUILTIN_VEC_INIT_V4HI:
32209 case IX86_BUILTIN_VEC_INIT_V8QI:
32210 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32212 case IX86_BUILTIN_VEC_EXT_V2DF:
32213 case IX86_BUILTIN_VEC_EXT_V2DI:
32214 case IX86_BUILTIN_VEC_EXT_V4SF:
32215 case IX86_BUILTIN_VEC_EXT_V4SI:
32216 case IX86_BUILTIN_VEC_EXT_V8HI:
32217 case IX86_BUILTIN_VEC_EXT_V2SI:
32218 case IX86_BUILTIN_VEC_EXT_V4HI:
32219 case IX86_BUILTIN_VEC_EXT_V16QI:
32220 return ix86_expand_vec_ext_builtin (exp, target);
32222 case IX86_BUILTIN_VEC_SET_V2DI:
32223 case IX86_BUILTIN_VEC_SET_V4SF:
32224 case IX86_BUILTIN_VEC_SET_V4SI:
32225 case IX86_BUILTIN_VEC_SET_V8HI:
32226 case IX86_BUILTIN_VEC_SET_V4HI:
32227 case IX86_BUILTIN_VEC_SET_V16QI:
32228 return ix86_expand_vec_set_builtin (exp);
32230 case IX86_BUILTIN_INFQ:
32231 case IX86_BUILTIN_HUGE_VALQ:
32233 REAL_VALUE_TYPE inf;
32234 rtx tmp;
32236 real_inf (&inf);
32237 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32239 tmp = validize_mem (force_const_mem (mode, tmp));
32241 if (target == 0)
32242 target = gen_reg_rtx (mode);
32244 emit_move_insn (target, tmp);
32245 return target;
32248 case IX86_BUILTIN_RDPMC:
32249 case IX86_BUILTIN_RDTSC:
32250 case IX86_BUILTIN_RDTSCP:
32252 op0 = gen_reg_rtx (DImode);
32253 op1 = gen_reg_rtx (DImode);
32255 if (fcode == IX86_BUILTIN_RDPMC)
32257 arg0 = CALL_EXPR_ARG (exp, 0);
32258 op2 = expand_normal (arg0);
32259 if (!register_operand (op2, SImode))
32260 op2 = copy_to_mode_reg (SImode, op2);
32262 insn = (TARGET_64BIT
32263 ? gen_rdpmc_rex64 (op0, op1, op2)
32264 : gen_rdpmc (op0, op2));
32265 emit_insn (insn);
32267 else if (fcode == IX86_BUILTIN_RDTSC)
32269 insn = (TARGET_64BIT
32270 ? gen_rdtsc_rex64 (op0, op1)
32271 : gen_rdtsc (op0));
32272 emit_insn (insn);
32274 else
32276 op2 = gen_reg_rtx (SImode);
32278 insn = (TARGET_64BIT
32279 ? gen_rdtscp_rex64 (op0, op1, op2)
32280 : gen_rdtscp (op0, op2));
32281 emit_insn (insn);
32283 arg0 = CALL_EXPR_ARG (exp, 0);
32284 op4 = expand_normal (arg0);
32285 if (!address_operand (op4, VOIDmode))
32287 op4 = convert_memory_address (Pmode, op4);
32288 op4 = copy_addr_to_reg (op4);
32290 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32293 if (target == 0)
32295 /* mode is VOIDmode if __builtin_rd* has been called
32296 without an lhs. */
32297 if (mode == VOIDmode)
32298 return target;
32299 target = gen_reg_rtx (mode);
32302 if (TARGET_64BIT)
32304 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32305 op1, 1, OPTAB_DIRECT);
32306 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32307 op0, 1, OPTAB_DIRECT);
32310 emit_move_insn (target, op0);
32311 return target;
32313 case IX86_BUILTIN_FXSAVE:
32314 case IX86_BUILTIN_FXRSTOR:
32315 case IX86_BUILTIN_FXSAVE64:
32316 case IX86_BUILTIN_FXRSTOR64:
32317 switch (fcode)
32319 case IX86_BUILTIN_FXSAVE:
32320 icode = CODE_FOR_fxsave;
32321 break;
32322 case IX86_BUILTIN_FXRSTOR:
32323 icode = CODE_FOR_fxrstor;
32324 break;
32325 case IX86_BUILTIN_FXSAVE64:
32326 icode = CODE_FOR_fxsave64;
32327 break;
32328 case IX86_BUILTIN_FXRSTOR64:
32329 icode = CODE_FOR_fxrstor64;
32330 break;
32331 default:
32332 gcc_unreachable ();
32335 arg0 = CALL_EXPR_ARG (exp, 0);
32336 op0 = expand_normal (arg0);
32338 if (!address_operand (op0, VOIDmode))
32340 op0 = convert_memory_address (Pmode, op0);
32341 op0 = copy_addr_to_reg (op0);
32343 op0 = gen_rtx_MEM (BLKmode, op0);
32345 pat = GEN_FCN (icode) (op0);
32346 if (pat)
32347 emit_insn (pat);
32348 return 0;
32350 case IX86_BUILTIN_XSAVE:
32351 case IX86_BUILTIN_XRSTOR:
32352 case IX86_BUILTIN_XSAVE64:
32353 case IX86_BUILTIN_XRSTOR64:
32354 case IX86_BUILTIN_XSAVEOPT:
32355 case IX86_BUILTIN_XSAVEOPT64:
32356 arg0 = CALL_EXPR_ARG (exp, 0);
32357 arg1 = CALL_EXPR_ARG (exp, 1);
32358 op0 = expand_normal (arg0);
32359 op1 = expand_normal (arg1);
32361 if (!address_operand (op0, VOIDmode))
32363 op0 = convert_memory_address (Pmode, op0);
32364 op0 = copy_addr_to_reg (op0);
32366 op0 = gen_rtx_MEM (BLKmode, op0);
32368 op1 = force_reg (DImode, op1);
32370 if (TARGET_64BIT)
32372 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32373 NULL, 1, OPTAB_DIRECT);
32374 switch (fcode)
32376 case IX86_BUILTIN_XSAVE:
32377 icode = CODE_FOR_xsave_rex64;
32378 break;
32379 case IX86_BUILTIN_XRSTOR:
32380 icode = CODE_FOR_xrstor_rex64;
32381 break;
32382 case IX86_BUILTIN_XSAVE64:
32383 icode = CODE_FOR_xsave64;
32384 break;
32385 case IX86_BUILTIN_XRSTOR64:
32386 icode = CODE_FOR_xrstor64;
32387 break;
32388 case IX86_BUILTIN_XSAVEOPT:
32389 icode = CODE_FOR_xsaveopt_rex64;
32390 break;
32391 case IX86_BUILTIN_XSAVEOPT64:
32392 icode = CODE_FOR_xsaveopt64;
32393 break;
32394 default:
32395 gcc_unreachable ();
32398 op2 = gen_lowpart (SImode, op2);
32399 op1 = gen_lowpart (SImode, op1);
32400 pat = GEN_FCN (icode) (op0, op1, op2);
32402 else
32404 switch (fcode)
32406 case IX86_BUILTIN_XSAVE:
32407 icode = CODE_FOR_xsave;
32408 break;
32409 case IX86_BUILTIN_XRSTOR:
32410 icode = CODE_FOR_xrstor;
32411 break;
32412 case IX86_BUILTIN_XSAVEOPT:
32413 icode = CODE_FOR_xsaveopt;
32414 break;
32415 default:
32416 gcc_unreachable ();
32418 pat = GEN_FCN (icode) (op0, op1);
32421 if (pat)
32422 emit_insn (pat);
32423 return 0;
32425 case IX86_BUILTIN_LLWPCB:
32426 arg0 = CALL_EXPR_ARG (exp, 0);
32427 op0 = expand_normal (arg0);
32428 icode = CODE_FOR_lwp_llwpcb;
32429 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32430 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32431 emit_insn (gen_lwp_llwpcb (op0));
32432 return 0;
32434 case IX86_BUILTIN_SLWPCB:
32435 icode = CODE_FOR_lwp_slwpcb;
32436 if (!target
32437 || !insn_data[icode].operand[0].predicate (target, Pmode))
32438 target = gen_reg_rtx (Pmode);
32439 emit_insn (gen_lwp_slwpcb (target));
32440 return target;
32442 case IX86_BUILTIN_BEXTRI32:
32443 case IX86_BUILTIN_BEXTRI64:
32444 arg0 = CALL_EXPR_ARG (exp, 0);
32445 arg1 = CALL_EXPR_ARG (exp, 1);
32446 op0 = expand_normal (arg0);
32447 op1 = expand_normal (arg1);
32448 icode = (fcode == IX86_BUILTIN_BEXTRI32
32449 ? CODE_FOR_tbm_bextri_si
32450 : CODE_FOR_tbm_bextri_di);
32451 if (!CONST_INT_P (op1))
32453 error ("the last argument must be an immediate");
32454 return const0_rtx;
32456 else
32458 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32459 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32460 op1 = GEN_INT (length);
32461 op2 = GEN_INT (lsb_index);
32462 pat = GEN_FCN (icode) (target, op0, op1, op2);
32463 if (pat)
32464 emit_insn (pat);
32465 return target;
32468 case IX86_BUILTIN_RDRAND16_STEP:
32469 icode = CODE_FOR_rdrandhi_1;
32470 mode0 = HImode;
32471 goto rdrand_step;
32473 case IX86_BUILTIN_RDRAND32_STEP:
32474 icode = CODE_FOR_rdrandsi_1;
32475 mode0 = SImode;
32476 goto rdrand_step;
32478 case IX86_BUILTIN_RDRAND64_STEP:
32479 icode = CODE_FOR_rdranddi_1;
32480 mode0 = DImode;
32482 rdrand_step:
32483 op0 = gen_reg_rtx (mode0);
32484 emit_insn (GEN_FCN (icode) (op0));
32486 arg0 = CALL_EXPR_ARG (exp, 0);
32487 op1 = expand_normal (arg0);
32488 if (!address_operand (op1, VOIDmode))
32490 op1 = convert_memory_address (Pmode, op1);
32491 op1 = copy_addr_to_reg (op1);
32493 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32495 op1 = gen_reg_rtx (SImode);
32496 emit_move_insn (op1, CONST1_RTX (SImode));
32498 /* Emit SImode conditional move. */
32499 if (mode0 == HImode)
32501 op2 = gen_reg_rtx (SImode);
32502 emit_insn (gen_zero_extendhisi2 (op2, op0));
32504 else if (mode0 == SImode)
32505 op2 = op0;
32506 else
32507 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32509 if (target == 0)
32510 target = gen_reg_rtx (SImode);
32512 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32513 const0_rtx);
32514 emit_insn (gen_rtx_SET (VOIDmode, target,
32515 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32516 return target;
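      /* The rdrand*_step builtins (e.g. _rdrand32_step from immintrin.h)
	 store the random value through the pointer argument and return a
	 success flag.  The conditional move above turns the carry flag set
	 by the rdrand pattern into that 0/1 return value, relying on the
	 hardware zeroing the destination when the instruction fails.  */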
32518 case IX86_BUILTIN_RDSEED16_STEP:
32519 icode = CODE_FOR_rdseedhi_1;
32520 mode0 = HImode;
32521 goto rdseed_step;
32523 case IX86_BUILTIN_RDSEED32_STEP:
32524 icode = CODE_FOR_rdseedsi_1;
32525 mode0 = SImode;
32526 goto rdseed_step;
32528 case IX86_BUILTIN_RDSEED64_STEP:
32529 icode = CODE_FOR_rdseeddi_1;
32530 mode0 = DImode;
32532 rdseed_step:
32533 op0 = gen_reg_rtx (mode0);
32534 emit_insn (GEN_FCN (icode) (op0));
32536 arg0 = CALL_EXPR_ARG (exp, 0);
32537 op1 = expand_normal (arg0);
32538 if (!address_operand (op1, VOIDmode))
32540 op1 = convert_memory_address (Pmode, op1);
32541 op1 = copy_addr_to_reg (op1);
32543 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32545 op2 = gen_reg_rtx (QImode);
32547 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32548 const0_rtx);
32549 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32551 if (target == 0)
32552 target = gen_reg_rtx (SImode);
32554 emit_insn (gen_zero_extendqisi2 (target, op2));
32555 return target;
32557 case IX86_BUILTIN_ADDCARRYX32:
32558 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32559 mode0 = SImode;
32560 goto addcarryx;
32562 case IX86_BUILTIN_ADDCARRYX64:
32563 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32564 mode0 = DImode;
32566 addcarryx:
32567 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32568 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32569 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32570 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32572 op0 = gen_reg_rtx (QImode);
32574 /* Generate CF from input operand. */
32575 op1 = expand_normal (arg0);
32576 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32577 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32579 /* Generate an ADCX (or ADC) instruction to compute X + Y + CF. */
32580 op2 = expand_normal (arg1);
32581 op3 = expand_normal (arg2);
32583 if (!REG_P (op2))
32584 op2 = copy_to_mode_reg (mode0, op2);
32585 if (!REG_P (op3))
32586 op3 = copy_to_mode_reg (mode0, op3);
32588 op0 = gen_reg_rtx (mode0);
32590 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32591 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32592 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32594 /* Store the result. */
32595 op4 = expand_normal (arg3);
32596 if (!address_operand (op4, VOIDmode))
32598 op4 = convert_memory_address (Pmode, op4);
32599 op4 = copy_addr_to_reg (op4);
32601 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32603 /* Return current CF value. */
32604 if (target == 0)
32605 target = gen_reg_rtx (QImode);
32607 PUT_MODE (pat, QImode);
32608 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32609 return target;
32611 case IX86_BUILTIN_GATHERSIV2DF:
32612 icode = CODE_FOR_avx2_gathersiv2df;
32613 goto gather_gen;
32614 case IX86_BUILTIN_GATHERSIV4DF:
32615 icode = CODE_FOR_avx2_gathersiv4df;
32616 goto gather_gen;
32617 case IX86_BUILTIN_GATHERDIV2DF:
32618 icode = CODE_FOR_avx2_gatherdiv2df;
32619 goto gather_gen;
32620 case IX86_BUILTIN_GATHERDIV4DF:
32621 icode = CODE_FOR_avx2_gatherdiv4df;
32622 goto gather_gen;
32623 case IX86_BUILTIN_GATHERSIV4SF:
32624 icode = CODE_FOR_avx2_gathersiv4sf;
32625 goto gather_gen;
32626 case IX86_BUILTIN_GATHERSIV8SF:
32627 icode = CODE_FOR_avx2_gathersiv8sf;
32628 goto gather_gen;
32629 case IX86_BUILTIN_GATHERDIV4SF:
32630 icode = CODE_FOR_avx2_gatherdiv4sf;
32631 goto gather_gen;
32632 case IX86_BUILTIN_GATHERDIV8SF:
32633 icode = CODE_FOR_avx2_gatherdiv8sf;
32634 goto gather_gen;
32635 case IX86_BUILTIN_GATHERSIV2DI:
32636 icode = CODE_FOR_avx2_gathersiv2di;
32637 goto gather_gen;
32638 case IX86_BUILTIN_GATHERSIV4DI:
32639 icode = CODE_FOR_avx2_gathersiv4di;
32640 goto gather_gen;
32641 case IX86_BUILTIN_GATHERDIV2DI:
32642 icode = CODE_FOR_avx2_gatherdiv2di;
32643 goto gather_gen;
32644 case IX86_BUILTIN_GATHERDIV4DI:
32645 icode = CODE_FOR_avx2_gatherdiv4di;
32646 goto gather_gen;
32647 case IX86_BUILTIN_GATHERSIV4SI:
32648 icode = CODE_FOR_avx2_gathersiv4si;
32649 goto gather_gen;
32650 case IX86_BUILTIN_GATHERSIV8SI:
32651 icode = CODE_FOR_avx2_gathersiv8si;
32652 goto gather_gen;
32653 case IX86_BUILTIN_GATHERDIV4SI:
32654 icode = CODE_FOR_avx2_gatherdiv4si;
32655 goto gather_gen;
32656 case IX86_BUILTIN_GATHERDIV8SI:
32657 icode = CODE_FOR_avx2_gatherdiv8si;
32658 goto gather_gen;
32659 case IX86_BUILTIN_GATHERALTSIV4DF:
32660 icode = CODE_FOR_avx2_gathersiv4df;
32661 goto gather_gen;
32662 case IX86_BUILTIN_GATHERALTDIV8SF:
32663 icode = CODE_FOR_avx2_gatherdiv8sf;
32664 goto gather_gen;
32665 case IX86_BUILTIN_GATHERALTSIV4DI:
32666 icode = CODE_FOR_avx2_gathersiv4di;
32667 goto gather_gen;
32668 case IX86_BUILTIN_GATHERALTDIV8SI:
32669 icode = CODE_FOR_avx2_gatherdiv8si;
32670 goto gather_gen;
32672 gather_gen:
32673 arg0 = CALL_EXPR_ARG (exp, 0);
32674 arg1 = CALL_EXPR_ARG (exp, 1);
32675 arg2 = CALL_EXPR_ARG (exp, 2);
32676 arg3 = CALL_EXPR_ARG (exp, 3);
32677 arg4 = CALL_EXPR_ARG (exp, 4);
32678 op0 = expand_normal (arg0);
32679 op1 = expand_normal (arg1);
32680 op2 = expand_normal (arg2);
32681 op3 = expand_normal (arg3);
32682 op4 = expand_normal (arg4);
32683 /* Note the arg order is different from the operand order. */
32684 mode0 = insn_data[icode].operand[1].mode;
32685 mode2 = insn_data[icode].operand[3].mode;
32686 mode3 = insn_data[icode].operand[4].mode;
32687 mode4 = insn_data[icode].operand[5].mode;
32689 if (target == NULL_RTX
32690 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32691 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32692 else
32693 subtarget = target;
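/* The GATHERALT variants receive 256-bit vectors for operands of which the
   instruction pattern only uses 128 bits: the index vector for the
   ALTSIV4DF/ALTSIV4DI forms, and the source and mask vectors for the
   ALTDIV8SF/ALTDIV8SI forms.  Extract the low halves here.  */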
32695 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32696 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32698 rtx half = gen_reg_rtx (V4SImode);
32699 if (!nonimmediate_operand (op2, V8SImode))
32700 op2 = copy_to_mode_reg (V8SImode, op2);
32701 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32702 op2 = half;
32704 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32705 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32707 rtx (*gen) (rtx, rtx);
32708 rtx half = gen_reg_rtx (mode0);
32709 if (mode0 == V4SFmode)
32710 gen = gen_vec_extract_lo_v8sf;
32711 else
32712 gen = gen_vec_extract_lo_v8si;
32713 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32714 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32715 emit_insn (gen (half, op0));
32716 op0 = half;
32717 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32718 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32719 emit_insn (gen (half, op3));
32720 op3 = half;
32723 /* Force the memory operand to use only a base register here, but
32724 don't do this for the memory operands of other builtin
32725 functions. */
32726 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32728 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32729 op0 = copy_to_mode_reg (mode0, op0);
32730 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32731 op1 = copy_to_mode_reg (Pmode, op1);
32732 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32733 op2 = copy_to_mode_reg (mode2, op2);
32734 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32735 op3 = copy_to_mode_reg (mode3, op3);
32736 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32738 error ("last argument must be scale 1, 2, 4, 8");
32739 return const0_rtx;
32742 /* Optimize. If mask is known to have all high bits set,
32743 replace op0 with pc_rtx to signal that the instruction
32744 overwrites the whole destination and doesn't use its
32745 previous contents. */
32746 if (optimize)
32748 if (TREE_CODE (arg3) == VECTOR_CST)
32750 unsigned int negative = 0;
32751 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32753 tree cst = VECTOR_CST_ELT (arg3, i);
32754 if (TREE_CODE (cst) == INTEGER_CST
32755 && tree_int_cst_sign_bit (cst))
32756 negative++;
32757 else if (TREE_CODE (cst) == REAL_CST
32758 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32759 negative++;
32761 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32762 op0 = pc_rtx;
32764 else if (TREE_CODE (arg3) == SSA_NAME)
32766 /* Recognize also when mask is like:
32767 __v2df src = _mm_setzero_pd ();
32768 __v2df mask = _mm_cmpeq_pd (src, src);
32770 __v8sf src = _mm256_setzero_ps ();
32771 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32772 as that is a cheaper way to load all ones into
32773 a register than having to load a constant from
32774 memory. */
32775 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32776 if (is_gimple_call (def_stmt))
32778 tree fndecl = gimple_call_fndecl (def_stmt);
32779 if (fndecl
32780 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32781 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32783 case IX86_BUILTIN_CMPPD:
32784 case IX86_BUILTIN_CMPPS:
32785 case IX86_BUILTIN_CMPPD256:
32786 case IX86_BUILTIN_CMPPS256:
32787 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32788 break;
32789 /* FALLTHRU */
32790 case IX86_BUILTIN_CMPEQPD:
32791 case IX86_BUILTIN_CMPEQPS:
32792 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32793 && initializer_zerop (gimple_call_arg (def_stmt,
32794 1)))
32795 op0 = pc_rtx;
32796 break;
32797 default:
32798 break;
32804 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32805 if (! pat)
32806 return const0_rtx;
32807 emit_insn (pat);
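/* The DIV8SF/DIV8SI gathers use four 64-bit indexes, so only four elements
   of the 256-bit pattern result are meaningful; extract the low 128-bit
   half into TARGET.  */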
32809 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32810 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32812 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32813 ? V4SFmode : V4SImode;
32814 if (target == NULL_RTX)
32815 target = gen_reg_rtx (tmode);
32816 if (tmode == V4SFmode)
32817 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32818 else
32819 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32821 else
32822 target = subtarget;
32824 return target;
32826 case IX86_BUILTIN_XABORT:
32827 icode = CODE_FOR_xabort;
32828 arg0 = CALL_EXPR_ARG (exp, 0);
32829 op0 = expand_normal (arg0);
32830 mode0 = insn_data[icode].operand[0].mode;
32831 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32833 error ("the xabort's argument must be an 8-bit immediate");
32834 return const0_rtx;
32836 emit_insn (gen_xabort (op0));
32837 return 0;
32839 default:
32840 break;
32843 for (i = 0, d = bdesc_special_args;
32844 i < ARRAY_SIZE (bdesc_special_args);
32845 i++, d++)
32846 if (d->code == fcode)
32847 return ix86_expand_special_args_builtin (d, exp, target);
32849 for (i = 0, d = bdesc_args;
32850 i < ARRAY_SIZE (bdesc_args);
32851 i++, d++)
32852 if (d->code == fcode)
32853 switch (fcode)
32855 case IX86_BUILTIN_FABSQ:
32856 case IX86_BUILTIN_COPYSIGNQ:
32857 if (!TARGET_SSE)
32858 /* Emit a normal call if SSE isn't available. */
32859 return expand_call (exp, target, ignore);
32860 default:
32861 return ix86_expand_args_builtin (d, exp, target);
32864 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32865 if (d->code == fcode)
32866 return ix86_expand_sse_comi (d, exp, target);
32868 for (i = 0, d = bdesc_pcmpestr;
32869 i < ARRAY_SIZE (bdesc_pcmpestr);
32870 i++, d++)
32871 if (d->code == fcode)
32872 return ix86_expand_sse_pcmpestr (d, exp, target);
32874 for (i = 0, d = bdesc_pcmpistr;
32875 i < ARRAY_SIZE (bdesc_pcmpistr);
32876 i++, d++)
32877 if (d->code == fcode)
32878 return ix86_expand_sse_pcmpistr (d, exp, target);
32880 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32881 if (d->code == fcode)
32882 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32883 (enum ix86_builtin_func_type)
32884 d->flag, d->comparison);
32886 gcc_unreachable ();
32889 /* Returns a function decl for a vectorized version of the builtin function
32890 FNDECL, with result vector type TYPE_OUT and argument vector type TYPE_IN,
32891 or NULL_TREE if it is not available. */
32893 static tree
32894 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32895 tree type_in)
32897 enum machine_mode in_mode, out_mode;
32898 int in_n, out_n;
32899 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32901 if (TREE_CODE (type_out) != VECTOR_TYPE
32902 || TREE_CODE (type_in) != VECTOR_TYPE
32903 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32904 return NULL_TREE;
32906 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32907 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32908 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32909 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32911 switch (fn)
32913 case BUILT_IN_SQRT:
32914 if (out_mode == DFmode && in_mode == DFmode)
32916 if (out_n == 2 && in_n == 2)
32917 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32918 else if (out_n == 4 && in_n == 4)
32919 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32921 break;
32923 case BUILT_IN_SQRTF:
32924 if (out_mode == SFmode && in_mode == SFmode)
32926 if (out_n == 4 && in_n == 4)
32927 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32928 else if (out_n == 8 && in_n == 8)
32929 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32931 break;
32933 case BUILT_IN_IFLOOR:
32934 case BUILT_IN_LFLOOR:
32935 case BUILT_IN_LLFLOOR:
32936 /* The round insn does not trap on denormals. */
32937 if (flag_trapping_math || !TARGET_ROUND)
32938 break;
32940 if (out_mode == SImode && in_mode == DFmode)
32942 if (out_n == 4 && in_n == 2)
32943 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32944 else if (out_n == 8 && in_n == 4)
32945 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32947 break;
32949 case BUILT_IN_IFLOORF:
32950 case BUILT_IN_LFLOORF:
32951 case BUILT_IN_LLFLOORF:
32952 /* The round insn does not trap on denormals. */
32953 if (flag_trapping_math || !TARGET_ROUND)
32954 break;
32956 if (out_mode == SImode && in_mode == SFmode)
32958 if (out_n == 4 && in_n == 4)
32959 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32960 else if (out_n == 8 && in_n == 8)
32961 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32963 break;
32965 case BUILT_IN_ICEIL:
32966 case BUILT_IN_LCEIL:
32967 case BUILT_IN_LLCEIL:
32968 /* The round insn does not trap on denormals. */
32969 if (flag_trapping_math || !TARGET_ROUND)
32970 break;
32972 if (out_mode == SImode && in_mode == DFmode)
32974 if (out_n == 4 && in_n == 2)
32975 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32976 else if (out_n == 8 && in_n == 4)
32977 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32979 break;
32981 case BUILT_IN_ICEILF:
32982 case BUILT_IN_LCEILF:
32983 case BUILT_IN_LLCEILF:
32984 /* The round insn does not trap on denormals. */
32985 if (flag_trapping_math || !TARGET_ROUND)
32986 break;
32988 if (out_mode == SImode && in_mode == SFmode)
32990 if (out_n == 4 && in_n == 4)
32991 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32992 else if (out_n == 8 && in_n == 8)
32993 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32995 break;
32997 case BUILT_IN_IRINT:
32998 case BUILT_IN_LRINT:
32999 case BUILT_IN_LLRINT:
33000 if (out_mode == SImode && in_mode == DFmode)
33002 if (out_n == 4 && in_n == 2)
33003 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33004 else if (out_n == 8 && in_n == 4)
33005 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33007 break;
33009 case BUILT_IN_IRINTF:
33010 case BUILT_IN_LRINTF:
33011 case BUILT_IN_LLRINTF:
33012 if (out_mode == SImode && in_mode == SFmode)
33014 if (out_n == 4 && in_n == 4)
33015 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33016 else if (out_n == 8 && in_n == 8)
33017 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33019 break;
33021 case BUILT_IN_IROUND:
33022 case BUILT_IN_LROUND:
33023 case BUILT_IN_LLROUND:
33024 /* The round insn does not trap on denormals. */
33025 if (flag_trapping_math || !TARGET_ROUND)
33026 break;
33028 if (out_mode == SImode && in_mode == DFmode)
33030 if (out_n == 4 && in_n == 2)
33031 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33032 else if (out_n == 8 && in_n == 4)
33033 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33035 break;
33037 case BUILT_IN_IROUNDF:
33038 case BUILT_IN_LROUNDF:
33039 case BUILT_IN_LLROUNDF:
33040 /* The round insn does not trap on denormals. */
33041 if (flag_trapping_math || !TARGET_ROUND)
33042 break;
33044 if (out_mode == SImode && in_mode == SFmode)
33046 if (out_n == 4 && in_n == 4)
33047 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33048 else if (out_n == 8 && in_n == 8)
33049 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33051 break;
33053 case BUILT_IN_COPYSIGN:
33054 if (out_mode == DFmode && in_mode == DFmode)
33056 if (out_n == 2 && in_n == 2)
33057 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33058 else if (out_n == 4 && in_n == 4)
33059 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33061 break;
33063 case BUILT_IN_COPYSIGNF:
33064 if (out_mode == SFmode && in_mode == SFmode)
33066 if (out_n == 4 && in_n == 4)
33067 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33068 else if (out_n == 8 && in_n == 8)
33069 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33071 break;
33073 case BUILT_IN_FLOOR:
33074 /* The round insn does not trap on denormals. */
33075 if (flag_trapping_math || !TARGET_ROUND)
33076 break;
33078 if (out_mode == DFmode && in_mode == DFmode)
33080 if (out_n == 2 && in_n == 2)
33081 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33082 else if (out_n == 4 && in_n == 4)
33083 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33085 break;
33087 case BUILT_IN_FLOORF:
33088 /* The round insn does not trap on denormals. */
33089 if (flag_trapping_math || !TARGET_ROUND)
33090 break;
33092 if (out_mode == SFmode && in_mode == SFmode)
33094 if (out_n == 4 && in_n == 4)
33095 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33096 else if (out_n == 8 && in_n == 8)
33097 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33099 break;
33101 case BUILT_IN_CEIL:
33102 /* The round insn does not trap on denormals. */
33103 if (flag_trapping_math || !TARGET_ROUND)
33104 break;
33106 if (out_mode == DFmode && in_mode == DFmode)
33108 if (out_n == 2 && in_n == 2)
33109 return ix86_builtins[IX86_BUILTIN_CEILPD];
33110 else if (out_n == 4 && in_n == 4)
33111 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33113 break;
33115 case BUILT_IN_CEILF:
33116 /* The round insn does not trap on denormals. */
33117 if (flag_trapping_math || !TARGET_ROUND)
33118 break;
33120 if (out_mode == SFmode && in_mode == SFmode)
33122 if (out_n == 4 && in_n == 4)
33123 return ix86_builtins[IX86_BUILTIN_CEILPS];
33124 else if (out_n == 8 && in_n == 8)
33125 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33127 break;
33129 case BUILT_IN_TRUNC:
33130 /* The round insn does not trap on denormals. */
33131 if (flag_trapping_math || !TARGET_ROUND)
33132 break;
33134 if (out_mode == DFmode && in_mode == DFmode)
33136 if (out_n == 2 && in_n == 2)
33137 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33138 else if (out_n == 4 && in_n == 4)
33139 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33141 break;
33143 case BUILT_IN_TRUNCF:
33144 /* The round insn does not trap on denormals. */
33145 if (flag_trapping_math || !TARGET_ROUND)
33146 break;
33148 if (out_mode == SFmode && in_mode == SFmode)
33150 if (out_n == 4 && in_n == 4)
33151 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33152 else if (out_n == 8 && in_n == 8)
33153 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33155 break;
33157 case BUILT_IN_RINT:
33158 /* The round insn does not trap on denormals. */
33159 if (flag_trapping_math || !TARGET_ROUND)
33160 break;
33162 if (out_mode == DFmode && in_mode == DFmode)
33164 if (out_n == 2 && in_n == 2)
33165 return ix86_builtins[IX86_BUILTIN_RINTPD];
33166 else if (out_n == 4 && in_n == 4)
33167 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33169 break;
33171 case BUILT_IN_RINTF:
33172 /* The round insn does not trap on denormals. */
33173 if (flag_trapping_math || !TARGET_ROUND)
33174 break;
33176 if (out_mode == SFmode && in_mode == SFmode)
33178 if (out_n == 4 && in_n == 4)
33179 return ix86_builtins[IX86_BUILTIN_RINTPS];
33180 else if (out_n == 8 && in_n == 8)
33181 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33183 break;
33185 case BUILT_IN_ROUND:
33186 /* The round insn does not trap on denormals. */
33187 if (flag_trapping_math || !TARGET_ROUND)
33188 break;
33190 if (out_mode == DFmode && in_mode == DFmode)
33192 if (out_n == 2 && in_n == 2)
33193 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33194 else if (out_n == 4 && in_n == 4)
33195 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33197 break;
33199 case BUILT_IN_ROUNDF:
33200 /* The round insn does not trap on denormals. */
33201 if (flag_trapping_math || !TARGET_ROUND)
33202 break;
33204 if (out_mode == SFmode && in_mode == SFmode)
33206 if (out_n == 4 && in_n == 4)
33207 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33208 else if (out_n == 8 && in_n == 8)
33209 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33211 break;
33213 case BUILT_IN_FMA:
33214 if (out_mode == DFmode && in_mode == DFmode)
33216 if (out_n == 2 && in_n == 2)
33217 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33218 if (out_n == 4 && in_n == 4)
33219 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33221 break;
33223 case BUILT_IN_FMAF:
33224 if (out_mode == SFmode && in_mode == SFmode)
33226 if (out_n == 4 && in_n == 4)
33227 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33228 if (out_n == 8 && in_n == 8)
33229 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33231 break;
33233 default:
33234 break;
33237 /* Dispatch to a handler for a vectorization library. */
33238 if (ix86_veclib_handler)
33239 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33240 type_in);
33242 return NULL_TREE;
33245 /* Handler for an SVML-style interface to
33246 a library with vectorized intrinsics. */
33248 static tree
33249 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33251 char name[20];
33252 tree fntype, new_fndecl, args;
33253 unsigned arity;
33254 const char *bname;
33255 enum machine_mode el_mode, in_mode;
33256 int n, in_n;
33258 /* SVML is suitable for unsafe math only. */
33259 if (!flag_unsafe_math_optimizations)
33260 return NULL_TREE;
33262 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33263 n = TYPE_VECTOR_SUBPARTS (type_out);
33264 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33265 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33266 if (el_mode != in_mode
33267 || n != in_n)
33268 return NULL_TREE;
33270 switch (fn)
33272 case BUILT_IN_EXP:
33273 case BUILT_IN_LOG:
33274 case BUILT_IN_LOG10:
33275 case BUILT_IN_POW:
33276 case BUILT_IN_TANH:
33277 case BUILT_IN_TAN:
33278 case BUILT_IN_ATAN:
33279 case BUILT_IN_ATAN2:
33280 case BUILT_IN_ATANH:
33281 case BUILT_IN_CBRT:
33282 case BUILT_IN_SINH:
33283 case BUILT_IN_SIN:
33284 case BUILT_IN_ASINH:
33285 case BUILT_IN_ASIN:
33286 case BUILT_IN_COSH:
33287 case BUILT_IN_COS:
33288 case BUILT_IN_ACOSH:
33289 case BUILT_IN_ACOS:
33290 if (el_mode != DFmode || n != 2)
33291 return NULL_TREE;
33292 break;
33294 case BUILT_IN_EXPF:
33295 case BUILT_IN_LOGF:
33296 case BUILT_IN_LOG10F:
33297 case BUILT_IN_POWF:
33298 case BUILT_IN_TANHF:
33299 case BUILT_IN_TANF:
33300 case BUILT_IN_ATANF:
33301 case BUILT_IN_ATAN2F:
33302 case BUILT_IN_ATANHF:
33303 case BUILT_IN_CBRTF:
33304 case BUILT_IN_SINHF:
33305 case BUILT_IN_SINF:
33306 case BUILT_IN_ASINHF:
33307 case BUILT_IN_ASINF:
33308 case BUILT_IN_COSHF:
33309 case BUILT_IN_COSF:
33310 case BUILT_IN_ACOSHF:
33311 case BUILT_IN_ACOSF:
33312 if (el_mode != SFmode || n != 4)
33313 return NULL_TREE;
33314 break;
33316 default:
33317 return NULL_TREE;
33320 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33322 if (fn == BUILT_IN_LOGF)
33323 strcpy (name, "vmlsLn4");
33324 else if (fn == BUILT_IN_LOG)
33325 strcpy (name, "vmldLn2");
33326 else if (n == 4)
33328 sprintf (name, "vmls%s", bname+10);
33329 name[strlen (name)-1] = '4';
33331 else
33332 sprintf (name, "vmld%s2", bname+10);
33334 /* Convert to uppercase. */
33335 name[4] &= ~0x20;
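/* For example, BUILT_IN_SIN ("__builtin_sin") yields "vmldSin2" and
   BUILT_IN_SINF ("__builtin_sinf") yields "vmlsSin4"; "bname+10" above
   skips the "__builtin_" prefix.  */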
33337 arity = 0;
33338 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33339 args;
33340 args = TREE_CHAIN (args))
33341 arity++;
33343 if (arity == 1)
33344 fntype = build_function_type_list (type_out, type_in, NULL);
33345 else
33346 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33348 /* Build a function declaration for the vectorized function. */
33349 new_fndecl = build_decl (BUILTINS_LOCATION,
33350 FUNCTION_DECL, get_identifier (name), fntype);
33351 TREE_PUBLIC (new_fndecl) = 1;
33352 DECL_EXTERNAL (new_fndecl) = 1;
33353 DECL_IS_NOVOPS (new_fndecl) = 1;
33354 TREE_READONLY (new_fndecl) = 1;
33356 return new_fndecl;
33359 /* Handler for an ACML-style interface to
33360 a library with vectorized intrinsics. */
33362 static tree
33363 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33365 char name[20] = "__vr.._";
33366 tree fntype, new_fndecl, args;
33367 unsigned arity;
33368 const char *bname;
33369 enum machine_mode el_mode, in_mode;
33370 int n, in_n;
33372 /* ACML is 64-bit only and suitable for unsafe math only, as it does
33373 not correctly support the parts of IEEE that require full precision,
33374 such as denormals. */
33375 if (!TARGET_64BIT
33376 || !flag_unsafe_math_optimizations)
33377 return NULL_TREE;
33379 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33380 n = TYPE_VECTOR_SUBPARTS (type_out);
33381 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33382 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33383 if (el_mode != in_mode
33384 || n != in_n)
33385 return NULL_TREE;
33387 switch (fn)
33389 case BUILT_IN_SIN:
33390 case BUILT_IN_COS:
33391 case BUILT_IN_EXP:
33392 case BUILT_IN_LOG:
33393 case BUILT_IN_LOG2:
33394 case BUILT_IN_LOG10:
33395 name[4] = 'd';
33396 name[5] = '2';
33397 if (el_mode != DFmode
33398 || n != 2)
33399 return NULL_TREE;
33400 break;
33402 case BUILT_IN_SINF:
33403 case BUILT_IN_COSF:
33404 case BUILT_IN_EXPF:
33405 case BUILT_IN_POWF:
33406 case BUILT_IN_LOGF:
33407 case BUILT_IN_LOG2F:
33408 case BUILT_IN_LOG10F:
33409 name[4] = 's';
33410 name[5] = '4';
33411 if (el_mode != SFmode
33412 || n != 4)
33413 return NULL_TREE;
33414 break;
33416 default:
33417 return NULL_TREE;
33420 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33421 sprintf (name + 7, "%s", bname+10);
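/* For example, BUILT_IN_SIN becomes "__vrd2_sin" and BUILT_IN_SINF becomes
   "__vrs4_sinf"; as above, "bname+10" skips the "__builtin_" prefix.  */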
33423 arity = 0;
33424 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33425 args;
33426 args = TREE_CHAIN (args))
33427 arity++;
33429 if (arity == 1)
33430 fntype = build_function_type_list (type_out, type_in, NULL);
33431 else
33432 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33434 /* Build a function declaration for the vectorized function. */
33435 new_fndecl = build_decl (BUILTINS_LOCATION,
33436 FUNCTION_DECL, get_identifier (name), fntype);
33437 TREE_PUBLIC (new_fndecl) = 1;
33438 DECL_EXTERNAL (new_fndecl) = 1;
33439 DECL_IS_NOVOPS (new_fndecl) = 1;
33440 TREE_READONLY (new_fndecl) = 1;
33442 return new_fndecl;
33445 /* Returns a decl of a function that implements gather load with
33446 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
33447 Return NULL_TREE if it is not available. */
33449 static tree
33450 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33451 const_tree index_type, int scale)
33453 bool si;
33454 enum ix86_builtins code;
33456 if (! TARGET_AVX2)
33457 return NULL_TREE;
33459 if ((TREE_CODE (index_type) != INTEGER_TYPE
33460 && !POINTER_TYPE_P (index_type))
33461 || (TYPE_MODE (index_type) != SImode
33462 && TYPE_MODE (index_type) != DImode))
33463 return NULL_TREE;
33465 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33466 return NULL_TREE;
33468 /* v*gather* insn sign extends index to pointer mode. */
33469 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33470 && TYPE_UNSIGNED (index_type))
33471 return NULL_TREE;
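/* The scale must be 1, 2, 4 or 8, i.e. a power of two no larger than 8.  */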
33473 if (scale <= 0
33474 || scale > 8
33475 || (scale & (scale - 1)) != 0)
33476 return NULL_TREE;
33478 si = TYPE_MODE (index_type) == SImode;
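/* Pick the builtin matching the element mode of MEM_VECTYPE and the index
   width; the GATHERALT* variants cover the cases where the index vector
   has a different number of elements than the data vector.  */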
33479 switch (TYPE_MODE (mem_vectype))
33481 case V2DFmode:
33482 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33483 break;
33484 case V4DFmode:
33485 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33486 break;
33487 case V2DImode:
33488 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33489 break;
33490 case V4DImode:
33491 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33492 break;
33493 case V4SFmode:
33494 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33495 break;
33496 case V8SFmode:
33497 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33498 break;
33499 case V4SImode:
33500 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33501 break;
33502 case V8SImode:
33503 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33504 break;
33505 default:
33506 return NULL_TREE;
33509 return ix86_builtins[code];
33512 /* Returns a decl of a target-specific builtin that implements the
33513 reciprocal of the function, or NULL_TREE if not available. */
33515 static tree
33516 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33517 bool sqrt ATTRIBUTE_UNUSED)
33519 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33520 && flag_finite_math_only && !flag_trapping_math
33521 && flag_unsafe_math_optimizations))
33522 return NULL_TREE;
33524 if (md_fn)
33525 /* Machine dependent builtins. */
33526 switch (fn)
33528 /* Vectorized version of sqrt to rsqrt conversion. */
33529 case IX86_BUILTIN_SQRTPS_NR:
33530 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33532 case IX86_BUILTIN_SQRTPS_NR256:
33533 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33535 default:
33536 return NULL_TREE;
33538 else
33539 /* Normal builtins. */
33540 switch (fn)
33542 /* Sqrt to rsqrt conversion. */
33543 case BUILT_IN_SQRTF:
33544 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33546 default:
33547 return NULL_TREE;
33551 /* Helper for avx_vpermilps256_operand et al. This is also used by
33552 the expansion functions to turn the parallel back into a mask.
33553 The return value is 0 for no match and the imm8+1 for a match. */
33555 int
33556 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33558 unsigned i, nelt = GET_MODE_NUNITS (mode);
33559 unsigned mask = 0;
33560 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33562 if (XVECLEN (par, 0) != (int) nelt)
33563 return 0;
33565 /* Validate that all of the elements are constants, and not totally
33566 out of range. Copy the data into an integral array to make the
33567 subsequent checks easier. */
33568 for (i = 0; i < nelt; ++i)
33570 rtx er = XVECEXP (par, 0, i);
33571 unsigned HOST_WIDE_INT ei;
33573 if (!CONST_INT_P (er))
33574 return 0;
33575 ei = INTVAL (er);
33576 if (ei >= nelt)
33577 return 0;
33578 ipar[i] = ei;
33581 switch (mode)
33583 case V4DFmode:
33584 /* In the 256-bit DFmode case, we can only move elements within
33585 a 128-bit lane. */
33586 for (i = 0; i < 2; ++i)
33588 if (ipar[i] >= 2)
33589 return 0;
33590 mask |= ipar[i] << i;
33592 for (i = 2; i < 4; ++i)
33594 if (ipar[i] < 2)
33595 return 0;
33596 mask |= (ipar[i] - 2) << i;
33598 break;
33600 case V8SFmode:
33601 /* In the 256-bit SFmode case, we have full freedom of movement
33602 within the low 128-bit lane, but the high 128-bit lane must
33603 mirror the exact same pattern. */
33604 for (i = 0; i < 4; ++i)
33605 if (ipar[i] + 4 != ipar[i + 4])
33606 return 0;
33607 nelt = 4;
33608 /* FALLTHRU */
33610 case V2DFmode:
33611 case V4SFmode:
33612 /* In the 128-bit case, we have full freedom in the placement of
33613 the elements from the source operand. */
33614 for (i = 0; i < nelt; ++i)
33615 mask |= ipar[i] << (i * (nelt / 2));
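/* For example, a V4SF parallel (1 0 3 2) yields mask 0xb1, the vpermilps
   immediate that swaps adjacent pairs of elements.  */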
33616 break;
33618 default:
33619 gcc_unreachable ();
33622 /* Make sure success has a non-zero value by adding one. */
33623 return mask + 1;
33626 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33627 the expansion functions to turn the parallel back into a mask.
33628 The return value is 0 for no match and the imm8+1 for a match. */
33630 int
33631 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33633 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33634 unsigned mask = 0;
33635 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33637 if (XVECLEN (par, 0) != (int) nelt)
33638 return 0;
33640 /* Validate that all of the elements are constants, and not totally
33641 out of range. Copy the data into an integral array to make the
33642 subsequent checks easier. */
33643 for (i = 0; i < nelt; ++i)
33645 rtx er = XVECEXP (par, 0, i);
33646 unsigned HOST_WIDE_INT ei;
33648 if (!CONST_INT_P (er))
33649 return 0;
33650 ei = INTVAL (er);
33651 if (ei >= 2 * nelt)
33652 return 0;
33653 ipar[i] = ei;
33656 /* Validate that each half of the permute consists of consecutive elements. */
33657 for (i = 0; i < nelt2 - 1; ++i)
33658 if (ipar[i] + 1 != ipar[i + 1])
33659 return 0;
33660 for (i = nelt2; i < nelt - 1; ++i)
33661 if (ipar[i] + 1 != ipar[i + 1])
33662 return 0;
33664 /* Reconstruct the mask. */
33665 for (i = 0; i < 2; ++i)
33667 unsigned e = ipar[i * nelt2];
33668 if (e % nelt2)
33669 return 0;
33670 e /= nelt2;
33671 mask |= e << (i * 4);
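/* For example, a V4DF parallel (2 3 6 7) selects the high 128-bit lane of
   each source operand and yields mask 0x31.  */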
33674 /* Make sure success has a non-zero value by adding one. */
33675 return mask + 1;
33678 /* Store OPERAND to memory after reload is completed. This means
33679 that we can't easily use assign_stack_local. */
33680 rtx
33681 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33683 rtx result;
33685 gcc_assert (reload_completed);
33686 if (ix86_using_red_zone ())
33688 result = gen_rtx_MEM (mode,
33689 gen_rtx_PLUS (Pmode,
33690 stack_pointer_rtx,
33691 GEN_INT (-RED_ZONE_SIZE)));
33692 emit_move_insn (result, operand);
33694 else if (TARGET_64BIT)
33696 switch (mode)
33698 case HImode:
33699 case SImode:
33700 operand = gen_lowpart (DImode, operand);
33701 /* FALLTHRU */
33702 case DImode:
33703 emit_insn (
33704 gen_rtx_SET (VOIDmode,
33705 gen_rtx_MEM (DImode,
33706 gen_rtx_PRE_DEC (DImode,
33707 stack_pointer_rtx)),
33708 operand));
33709 break;
33710 default:
33711 gcc_unreachable ();
33713 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33715 else
33717 switch (mode)
33719 case DImode:
33721 rtx operands[2];
33722 split_double_mode (mode, &operand, 1, operands, operands + 1);
33723 emit_insn (
33724 gen_rtx_SET (VOIDmode,
33725 gen_rtx_MEM (SImode,
33726 gen_rtx_PRE_DEC (Pmode,
33727 stack_pointer_rtx)),
33728 operands[1]));
33729 emit_insn (
33730 gen_rtx_SET (VOIDmode,
33731 gen_rtx_MEM (SImode,
33732 gen_rtx_PRE_DEC (Pmode,
33733 stack_pointer_rtx)),
33734 operands[0]));
33736 break;
33737 case HImode:
33738 /* Store HImodes as SImodes. */
33739 operand = gen_lowpart (SImode, operand);
33740 /* FALLTHRU */
33741 case SImode:
33742 emit_insn (
33743 gen_rtx_SET (VOIDmode,
33744 gen_rtx_MEM (GET_MODE (operand),
33745 gen_rtx_PRE_DEC (SImode,
33746 stack_pointer_rtx)),
33747 operand));
33748 break;
33749 default:
33750 gcc_unreachable ();
33752 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33754 return result;
33757 /* Free the operand from memory. */
33758 void
33759 ix86_free_from_memory (enum machine_mode mode)
33761 if (!ix86_using_red_zone ())
33763 int size;
33765 if (mode == DImode || TARGET_64BIT)
33766 size = 8;
33767 else
33768 size = 4;
33769 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33770 to a pop or add instruction if registers are available. */
33771 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33772 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33773 GEN_INT (size))));
33777 /* Return a register priority for hard reg REGNO. */
33778 static int
33779 ix86_register_priority (int hard_regno)
33781 /* ebp and r13 as the base always want a displacement, and r12 as the
33782 base always wants an index. So discourage their usage in an
33783 address. */
33784 if (hard_regno == R12_REG || hard_regno == R13_REG)
33785 return 0;
33786 if (hard_regno == BP_REG)
33787 return 1;
33788 /* New x86-64 int registers result in bigger code size. Discourage
33789 them. */
33790 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33791 return 2;
33792 /* New x86-64 SSE registers result in bigger code size. Discourage
33793 them. */
33794 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33795 return 2;
33796 /* Usage of AX register results in smaller code. Prefer it. */
33797 if (hard_regno == 0)
33798 return 4;
33799 return 3;
33802 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33804 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33805 QImode must go into class Q_REGS.
33806 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33807 movdf to do mem-to-mem moves through integer regs. */
33809 static reg_class_t
33810 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33812 enum machine_mode mode = GET_MODE (x);
33814 /* We're only allowed to return a subclass of CLASS. Many of the
33815 following checks fail for NO_REGS, so eliminate that early. */
33816 if (regclass == NO_REGS)
33817 return NO_REGS;
33819 /* All classes can load zeros. */
33820 if (x == CONST0_RTX (mode))
33821 return regclass;
33823 /* Force constants into memory if we are loading a (nonzero) constant into
33824 an MMX or SSE register. This is because there are no MMX/SSE instructions
33825 to load from a constant. */
33826 if (CONSTANT_P (x)
33827 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33828 return NO_REGS;
33830 /* Prefer SSE regs only, if we can use them for math. */
33831 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33832 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33834 /* Floating-point constants need more complex checks. */
33835 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33837 /* General regs can load everything. */
33838 if (reg_class_subset_p (regclass, GENERAL_REGS))
33839 return regclass;
33841 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33842 zero above. We only want to wind up preferring 80387 registers if
33843 we plan on doing computation with them. */
33844 if (TARGET_80387
33845 && standard_80387_constant_p (x) > 0)
33847 /* Limit class to non-sse. */
33848 if (regclass == FLOAT_SSE_REGS)
33849 return FLOAT_REGS;
33850 if (regclass == FP_TOP_SSE_REGS)
33851 return FP_TOP_REG;
33852 if (regclass == FP_SECOND_SSE_REGS)
33853 return FP_SECOND_REG;
33854 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33855 return regclass;
33858 return NO_REGS;
33861 /* Generally when we see PLUS here, it's the function invariant
33862 (plus soft-fp const_int), which can only be computed into general
33863 regs. */
33864 if (GET_CODE (x) == PLUS)
33865 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33867 /* QImode constants are easy to load, but non-constant QImode data
33868 must go into Q_REGS. */
33869 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33871 if (reg_class_subset_p (regclass, Q_REGS))
33872 return regclass;
33873 if (reg_class_subset_p (Q_REGS, regclass))
33874 return Q_REGS;
33875 return NO_REGS;
33878 return regclass;
33881 /* Discourage putting floating-point values in SSE registers unless
33882 SSE math is being used, and likewise for the 387 registers. */
33883 static reg_class_t
33884 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33886 enum machine_mode mode = GET_MODE (x);
33888 /* Restrict the output reload class to the register bank that we are doing
33889 math on. If we would like not to return a subset of CLASS, reject this
33890 alternative: if reload cannot do this, it will still use its choice. */
33891 mode = GET_MODE (x);
33892 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33893 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33895 if (X87_FLOAT_MODE_P (mode))
33897 if (regclass == FP_TOP_SSE_REGS)
33898 return FP_TOP_REG;
33899 else if (regclass == FP_SECOND_SSE_REGS)
33900 return FP_SECOND_REG;
33901 else
33902 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33905 return regclass;
33908 static reg_class_t
33909 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33910 enum machine_mode mode, secondary_reload_info *sri)
33912 /* Double-word spills from general registers to non-offsettable memory
33913 references (zero-extended addresses) require special handling. */
33914 if (TARGET_64BIT
33915 && MEM_P (x)
33916 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33917 && INTEGER_CLASS_P (rclass)
33918 && !offsettable_memref_p (x))
33920 sri->icode = (in_p
33921 ? CODE_FOR_reload_noff_load
33922 : CODE_FOR_reload_noff_store);
33923 /* Add the cost of moving address to a temporary. */
33924 sri->extra_cost = 1;
33926 return NO_REGS;
33929 /* QImode spills from non-QI registers require an
33930 intermediate register on 32-bit targets. */
33931 if (!TARGET_64BIT
33932 && !in_p && mode == QImode
33933 && INTEGER_CLASS_P (rclass)
33934 && MAYBE_NON_Q_CLASS_P (rclass))
33936 int regno;
33938 if (REG_P (x))
33939 regno = REGNO (x);
33940 else
33941 regno = -1;
33943 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33944 regno = true_regnum (x);
33946 /* Return Q_REGS if the operand is in memory. */
33947 if (regno == -1)
33948 return Q_REGS;
33951 /* This condition handles corner case where an expression involving
33952 pointers gets vectorized. We're trying to use the address of a
33953 stack slot as a vector initializer.
33955 (set (reg:V2DI 74 [ vect_cst_.2 ])
33956 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33958 Eventually frame gets turned into sp+offset like this:
33960 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33961 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33962 (const_int 392 [0x188]))))
33964 That later gets turned into:
33966 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33967 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33968 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33970 We'll have the following reload recorded:
33972 Reload 0: reload_in (DI) =
33973 (plus:DI (reg/f:DI 7 sp)
33974 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33975 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33976 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33977 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33978 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33979 reload_reg_rtx: (reg:V2DI 22 xmm1)
33981 Which isn't going to work since SSE instructions can't handle scalar
33982 additions. Returning GENERAL_REGS forces the addition into integer
33983 register and reload can handle subsequent reloads without problems. */
33985 if (in_p && GET_CODE (x) == PLUS
33986 && SSE_CLASS_P (rclass)
33987 && SCALAR_INT_MODE_P (mode))
33988 return GENERAL_REGS;
33990 return NO_REGS;
33993 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33995 static bool
33996 ix86_class_likely_spilled_p (reg_class_t rclass)
33998 switch (rclass)
34000 case AREG:
34001 case DREG:
34002 case CREG:
34003 case BREG:
34004 case AD_REGS:
34005 case SIREG:
34006 case DIREG:
34007 case SSE_FIRST_REG:
34008 case FP_TOP_REG:
34009 case FP_SECOND_REG:
34010 return true;
34012 default:
34013 break;
34016 return false;
34019 /* If we are copying between general and FP registers, we need a memory
34020 location. The same is true for SSE and MMX registers.
34022 To optimize register_move_cost performance, allow inline variant.
34024 The macro can't work reliably when one of the CLASSES is a class containing
34025 registers from multiple units (SSE, MMX, integer). We avoid this by never
34026 combining those units in a single alternative in the machine description.
34027 Ensure that this constraint holds to avoid unexpected surprises.
34029 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34030 enforce these sanity checks. */
34032 static inline bool
34033 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34034 enum machine_mode mode, int strict)
34036 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34037 return false;
34038 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34039 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34040 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34041 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34042 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34043 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34045 gcc_assert (!strict || lra_in_progress);
34046 return true;
34049 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34050 return true;
34052 /* ??? This is a lie. We do have moves between mmx/general, and for
34053 mmx/sse2. But by saying we need secondary memory we discourage the
34054 register allocator from using the mmx registers unless needed. */
34055 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34056 return true;
34058 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34060 /* SSE1 doesn't have any direct moves from other classes. */
34061 if (!TARGET_SSE2)
34062 return true;
34064 /* If the target says that inter-unit moves are more expensive
34065 than moving through memory, then don't generate them. */
34066 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34067 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34068 return true;
34070 /* Between SSE and general, we have moves no larger than word size. */
34071 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34072 return true;
34075 return false;
34078 bool
34079 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34080 enum machine_mode mode, int strict)
34082 return inline_secondary_memory_needed (class1, class2, mode, strict);
34085 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34087 On the 80386, this is the size of MODE in words,
34088 except in the FP regs, where a single reg is always enough. */
34090 static unsigned char
34091 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34093 if (MAYBE_INTEGER_CLASS_P (rclass))
34095 if (mode == XFmode)
34096 return (TARGET_64BIT ? 2 : 3);
34097 else if (mode == XCmode)
34098 return (TARGET_64BIT ? 4 : 6);
34099 else
34100 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34102 else
34104 if (COMPLEX_MODE_P (mode))
34105 return 2;
34106 else
34107 return 1;
34111 /* Return true if the registers in CLASS cannot represent the change from
34112 modes FROM to TO. */
34114 bool
34115 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34116 enum reg_class regclass)
34118 if (from == to)
34119 return false;
34121 /* x87 registers can't do subreg at all, as all values are reformatted
34122 to extended precision. */
34123 if (MAYBE_FLOAT_CLASS_P (regclass))
34124 return true;
34126 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34128 /* Vector registers do not support QI or HImode loads. If we don't
34129 disallow a change to these modes, reload will assume it's ok to
34130 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34131 the vec_dupv4hi pattern. */
34132 if (GET_MODE_SIZE (from) < 4)
34133 return true;
34135 /* Vector registers do not support subreg with nonzero offsets, which
34136 are otherwise valid for integer registers. Since we can't see
34137 whether we have a nonzero offset from here, prohibit all
34138 nonparadoxical subregs changing size. */
34139 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34140 return true;
34143 return false;
34146 /* Return the cost of moving data of mode M between a
34147 register and memory. A value of 2 is the default; this cost is
34148 relative to those in `REGISTER_MOVE_COST'.
34150 This function is used extensively by register_move_cost, which is used to
34151 build tables at startup. Make it inline in this case.
34152 When IN is 2, return the maximum of the in and out move costs.
34154 If moving between registers and memory is more expensive than
34155 between two registers, you should define this macro to express the
34156 relative cost.
34158 Also model the increased cost of moving QImode registers in non-Q_REGS
34159 classes. */
34161 static inline int
34162 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34163 int in)
34165 int cost;
34166 if (FLOAT_CLASS_P (regclass))
34168 int index;
34169 switch (mode)
34171 case SFmode:
34172 index = 0;
34173 break;
34174 case DFmode:
34175 index = 1;
34176 break;
34177 case XFmode:
34178 index = 2;
34179 break;
34180 default:
34181 return 100;
34183 if (in == 2)
34184 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34185 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34187 if (SSE_CLASS_P (regclass))
34189 int index;
34190 switch (GET_MODE_SIZE (mode))
34192 case 4:
34193 index = 0;
34194 break;
34195 case 8:
34196 index = 1;
34197 break;
34198 case 16:
34199 index = 2;
34200 break;
34201 default:
34202 return 100;
34204 if (in == 2)
34205 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34206 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34208 if (MMX_CLASS_P (regclass))
34210 int index;
34211 switch (GET_MODE_SIZE (mode))
34213 case 4:
34214 index = 0;
34215 break;
34216 case 8:
34217 index = 1;
34218 break;
34219 default:
34220 return 100;
34222 if (in == 2)
34223 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34224 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34226 switch (GET_MODE_SIZE (mode))
34228 case 1:
34229 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34231 if (!in)
34232 return ix86_cost->int_store[0];
34233 if (TARGET_PARTIAL_REG_DEPENDENCY
34234 && optimize_function_for_speed_p (cfun))
34235 cost = ix86_cost->movzbl_load;
34236 else
34237 cost = ix86_cost->int_load[0];
34238 if (in == 2)
34239 return MAX (cost, ix86_cost->int_store[0]);
34240 return cost;
34242 else
34244 if (in == 2)
34245 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34246 if (in)
34247 return ix86_cost->movzbl_load;
34248 else
34249 return ix86_cost->int_store[0] + 4;
34251 break;
34252 case 2:
34253 if (in == 2)
34254 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34255 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34256 default:
34257 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
34258 if (mode == TFmode)
34259 mode = XFmode;
34260 if (in == 2)
34261 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34262 else if (in)
34263 cost = ix86_cost->int_load[2];
34264 else
34265 cost = ix86_cost->int_store[2];
34266 return (cost * (((int) GET_MODE_SIZE (mode)
34267 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34271 static int
34272 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34273 bool in)
34275 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34279 /* Return the cost of moving data from a register in class CLASS1 to
34280 one in class CLASS2.
34282 It is not required that the cost always equal 2 when FROM is the same as TO;
34283 on some machines it is expensive to move between registers if they are not
34284 general registers. */
34286 static int
34287 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34288 reg_class_t class2_i)
34290 enum reg_class class1 = (enum reg_class) class1_i;
34291 enum reg_class class2 = (enum reg_class) class2_i;
34293 /* In case we require secondary memory, compute the cost of the store followed
34294 by the load. In order to avoid bad register allocation choices, we need
34295 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34297 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34299 int cost = 1;
34301 cost += inline_memory_move_cost (mode, class1, 2);
34302 cost += inline_memory_move_cost (mode, class2, 2);
34304 /* In case of copying from a general-purpose register we may emit multiple
34305 stores followed by a single load, causing a memory-size-mismatch stall.
34306 Count this as an arbitrarily high cost of 20. */
34307 if (targetm.class_max_nregs (class1, mode)
34308 > targetm.class_max_nregs (class2, mode))
34309 cost += 20;
34311 /* In the case of FP/MMX moves, the registers actually overlap, and we
34312 have to switch modes in order to treat them differently. */
34313 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34314 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34315 cost += 20;
34317 return cost;
34320 /* Moves between SSE/MMX and integer unit are expensive. */
34321 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34322 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34324 /* ??? By keeping the returned value relatively high, we limit the number
34325 of moves between integer and MMX/SSE registers for all targets.
34326 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
34327 where integer modes in MMX/SSE registers are not tieable
34328 because of missing QImode and HImode moves to, from or between
34329 MMX/SSE registers. */
34330 return MAX (8, ix86_cost->mmxsse_to_integer);
34332 if (MAYBE_FLOAT_CLASS_P (class1))
34333 return ix86_cost->fp_move;
34334 if (MAYBE_SSE_CLASS_P (class1))
34335 return ix86_cost->sse_move;
34336 if (MAYBE_MMX_CLASS_P (class1))
34337 return ix86_cost->mmx_move;
34338 return 2;
34341 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34342 MODE. */
34344 bool
34345 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34347 /* Flags, and only flags, can hold CCmode values. */
34348 if (CC_REGNO_P (regno))
34349 return GET_MODE_CLASS (mode) == MODE_CC;
34350 if (GET_MODE_CLASS (mode) == MODE_CC
34351 || GET_MODE_CLASS (mode) == MODE_RANDOM
34352 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34353 return false;
34354 if (STACK_REGNO_P (regno))
34355 return VALID_FP_MODE_P (mode);
34356 if (SSE_REGNO_P (regno))
34358 /* We implement the move patterns for all vector modes into and
34359 out of SSE registers, even when no operation instructions
34360 are available. OImode move is available only when AVX is
34361 enabled. */
34362 return ((TARGET_AVX && mode == OImode)
34363 || VALID_AVX256_REG_MODE (mode)
34364 || VALID_SSE_REG_MODE (mode)
34365 || VALID_SSE2_REG_MODE (mode)
34366 || VALID_MMX_REG_MODE (mode)
34367 || VALID_MMX_REG_MODE_3DNOW (mode));
34369 if (MMX_REGNO_P (regno))
34371 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34372 so if the register is available at all, then we can move data of
34373 the given mode into or out of it. */
34374 return (VALID_MMX_REG_MODE (mode)
34375 || VALID_MMX_REG_MODE_3DNOW (mode));
34378 if (mode == QImode)
34380 /* Take care with QImode values - they can be in non-QI regs,
34381 but then they do cause partial register stalls. */
34382 if (ANY_QI_REGNO_P (regno))
34383 return true;
34384 if (!TARGET_PARTIAL_REG_STALL)
34385 return true;
34386 /* LRA checks if the hard register is OK for the given mode.
34387 QImode values can live in non-QI regs, so we allow all
34388 registers here. */
34389 if (lra_in_progress)
34390 return true;
34391 return !can_create_pseudo_p ();
34393 /* We handle both integer and floats in the general purpose registers. */
34394 else if (VALID_INT_MODE_P (mode))
34395 return true;
34396 else if (VALID_FP_MODE_P (mode))
34397 return true;
34398 else if (VALID_DFP_MODE_P (mode))
34399 return true;
34400 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34401 on to use that value in smaller contexts, this can easily force a
34402 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34403 supporting DImode, allow it. */
34404 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34405 return true;
34407 return false;
34410 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34411 tieable integer mode. */
34413 static bool
34414 ix86_tieable_integer_mode_p (enum machine_mode mode)
34416 switch (mode)
34418 case HImode:
34419 case SImode:
34420 return true;
34422 case QImode:
34423 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34425 case DImode:
34426 return TARGET_64BIT;
34428 default:
34429 return false;
34433 /* Return true if MODE1 is accessible in a register that can hold MODE2
34434 without copying. That is, all register classes that can hold MODE2
34435 can also hold MODE1. */
34437 bool
34438 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34440 if (mode1 == mode2)
34441 return true;
34443 if (ix86_tieable_integer_mode_p (mode1)
34444 && ix86_tieable_integer_mode_p (mode2))
34445 return true;
34447 /* MODE2 being XFmode implies fp stack or general regs, which means we
34448 can tie any smaller floating point modes to it. Note that we do not
34449 tie this with TFmode. */
34450 if (mode2 == XFmode)
34451 return mode1 == SFmode || mode1 == DFmode;
34453 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34454 that we can tie it with SFmode. */
34455 if (mode2 == DFmode)
34456 return mode1 == SFmode;
34458 /* If MODE2 is only appropriate for an SSE register, then tie with
34459 any other mode acceptable to SSE registers. */
34460 if (GET_MODE_SIZE (mode2) == 32
34461 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34462 return (GET_MODE_SIZE (mode1) == 32
34463 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34464 if (GET_MODE_SIZE (mode2) == 16
34465 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34466 return (GET_MODE_SIZE (mode1) == 16
34467 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34469 /* If MODE2 is appropriate for an MMX register, then tie
34470 with any other mode acceptable to MMX registers. */
34471 if (GET_MODE_SIZE (mode2) == 8
34472 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34473 return (GET_MODE_SIZE (mode1) == 8
34474 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34476 return false;
34479 /* Return the cost of moving between two registers of mode MODE. */
34481 static int
34482 ix86_set_reg_reg_cost (enum machine_mode mode)
34484 unsigned int units = UNITS_PER_WORD;
34486 switch (GET_MODE_CLASS (mode))
34488 default:
34489 break;
34491 case MODE_CC:
34492 units = GET_MODE_SIZE (CCmode);
34493 break;
34495 case MODE_FLOAT:
34496 if ((TARGET_SSE && mode == TFmode)
34497 || (TARGET_80387 && mode == XFmode)
34498 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34499 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34500 units = GET_MODE_SIZE (mode);
34501 break;
34503 case MODE_COMPLEX_FLOAT:
34504 if ((TARGET_SSE && mode == TCmode)
34505 || (TARGET_80387 && mode == XCmode)
34506 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34507 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34508 units = GET_MODE_SIZE (mode);
34509 break;
34511 case MODE_VECTOR_INT:
34512 case MODE_VECTOR_FLOAT:
34513 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34514 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34515 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34516 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34517 units = GET_MODE_SIZE (mode);
34520 /* Return the cost of moving between two registers of mode MODE,
34521 assuming that the move will be in pieces of at most UNITS bytes. */
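/* For example, a DImode move on a 32-bit target (UNITS == 4) costs
   COSTS_N_INSNS (2).  */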
34522 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34525 /* Compute a (partial) cost for rtx X. Return true if the complete
34526 cost has been computed, and false if subexpressions should be
34527 scanned. In either case, *TOTAL contains the cost result. */
34529 static bool
34530 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34531 bool speed)
34533 enum rtx_code code = (enum rtx_code) code_i;
34534 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34535 enum machine_mode mode = GET_MODE (x);
34536 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34538 switch (code)
34540 case SET:
34541 if (register_operand (SET_DEST (x), VOIDmode)
34542 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34544 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34545 return true;
34547 return false;
34549 case CONST_INT:
34550 case CONST:
34551 case LABEL_REF:
34552 case SYMBOL_REF:
34553 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34554 *total = 3;
34555 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34556 *total = 2;
34557 else if (flag_pic && SYMBOLIC_CONST (x)
34558 && (!TARGET_64BIT
34559 || (GET_CODE (x) != LABEL_REF
34560 && (GET_CODE (x) != SYMBOL_REF
34561 || !SYMBOL_REF_LOCAL_P (x)))))
34562 *total = 1;
34563 else
34564 *total = 0;
34565 return true;
34567 case CONST_DOUBLE:
34568 if (mode == VOIDmode)
34570 *total = 0;
34571 return true;
34573 switch (standard_80387_constant_p (x))
34575 case 1: /* 0.0 */
34576 *total = 1;
34577 return true;
34578 default: /* Other constants */
34579 *total = 2;
34580 return true;
34581 case 0:
34582 case -1:
34583 break;
34585 if (SSE_FLOAT_MODE_P (mode))
34587 case CONST_VECTOR:
34588 switch (standard_sse_constant_p (x))
34590 case 0:
34591 break;
34592 case 1: /* 0: xor eliminates false dependency */
34593 *total = 0;
34594 return true;
34595 default: /* -1: cmp contains false dependency */
34596 *total = 1;
34597 return true;
34600 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34601 it'll probably end up. Add a penalty for size. */
34602 *total = (COSTS_N_INSNS (1)
34603 + (flag_pic != 0 && !TARGET_64BIT)
34604 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34605 return true;
34607 case ZERO_EXTEND:
34608 /* The zero extension is often completely free on x86_64, so make
34609 it as cheap as possible. */
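/* E.g. "movl %eax, %eax" already clears the upper 32 bits of %rax, so no
   separate zero-extension insn is needed.  */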
34610 if (TARGET_64BIT && mode == DImode
34611 && GET_MODE (XEXP (x, 0)) == SImode)
34612 *total = 1;
34613 else if (TARGET_ZERO_EXTEND_WITH_AND)
34614 *total = cost->add;
34615 else
34616 *total = cost->movzx;
34617 return false;
34619 case SIGN_EXTEND:
34620 *total = cost->movsx;
34621 return false;
34623 case ASHIFT:
34624 if (SCALAR_INT_MODE_P (mode)
34625 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34626 && CONST_INT_P (XEXP (x, 1)))
34628 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34629 if (value == 1)
34631 *total = cost->add;
34632 return false;
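/* A left shift by 2 or 3 is a multiply by 4 or 8, which an lea can do in
   one insn (e.g. "leal (,%eax,4), %eax"), so prefer the lea cost when it
   is no more expensive than a constant shift.  */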
34634 if ((value == 2 || value == 3)
34635 && cost->lea <= cost->shift_const)
34637 *total = cost->lea;
34638 return false;
34641 /* FALLTHRU */
34643 case ROTATE:
34644 case ASHIFTRT:
34645 case LSHIFTRT:
34646 case ROTATERT:
34647 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34649 /* ??? Should be SSE vector operation cost. */
34650 /* At least for published AMD latencies, this really is the same
34651 as the latency for a simple fpu operation like fabs. */
34652 /* V*QImode is emulated with 1-11 insns. */
34653 if (mode == V16QImode || mode == V32QImode)
34655 int count = 11;
34656 if (TARGET_XOP && mode == V16QImode)
34658 /* For XOP we use vpshab, which requires a broadcast of the
34659 value to the variable shift insn. For constants this
34660 means a V16QImode const in mem; even when we can perform the
34661 shift with one insn set the cost to prefer paddb. */
34662 if (CONSTANT_P (XEXP (x, 1)))
34664 *total = (cost->fabs
34665 + rtx_cost (XEXP (x, 0), code, 0, speed)
34666 + (speed ? 2 : COSTS_N_BYTES (16)));
34667 return true;
34669 count = 3;
34671 else if (TARGET_SSSE3)
34672 count = 7;
34673 *total = cost->fabs * count;
34675 else
34676 *total = cost->fabs;
34678 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34680 if (CONST_INT_P (XEXP (x, 1)))
34682 if (INTVAL (XEXP (x, 1)) > 32)
34683 *total = cost->shift_const + COSTS_N_INSNS (2);
34684 else
34685 *total = cost->shift_const * 2;
34687 else
34689 if (GET_CODE (XEXP (x, 1)) == AND)
34690 *total = cost->shift_var * 2;
34691 else
34692 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34695 else
34697 if (CONST_INT_P (XEXP (x, 1)))
34698 *total = cost->shift_const;
34699 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34700 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34702 /* Return the cost after shift-and truncation. */
34703 *total = cost->shift_var;
34704 return true;
34706 else
34707 *total = cost->shift_var;
34709 return false;
34711 case FMA:
34713 rtx sub;
34715 gcc_assert (FLOAT_MODE_P (mode));
34716 gcc_assert (TARGET_FMA || TARGET_FMA4);
34718 /* ??? SSE scalar/vector cost should be used here. */
34719 /* ??? Bald assumption that fma has the same cost as fmul. */
34720 *total = cost->fmul;
34721 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34723 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34724 sub = XEXP (x, 0);
34725 if (GET_CODE (sub) == NEG)
34726 sub = XEXP (sub, 0);
34727 *total += rtx_cost (sub, FMA, 0, speed);
34729 sub = XEXP (x, 2);
34730 if (GET_CODE (sub) == NEG)
34731 sub = XEXP (sub, 0);
34732 *total += rtx_cost (sub, FMA, 2, speed);
34733 return true;
34736 case MULT:
34737 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34739 /* ??? SSE scalar cost should be used here. */
34740 *total = cost->fmul;
34741 return false;
34743 else if (X87_FLOAT_MODE_P (mode))
34745 *total = cost->fmul;
34746 return false;
34748 else if (FLOAT_MODE_P (mode))
34750 /* ??? SSE vector cost should be used here. */
34751 *total = cost->fmul;
34752 return false;
34754 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34756 /* V*QImode is emulated with 7-13 insns. */
34757 if (mode == V16QImode || mode == V32QImode)
34759 int extra = 11;
34760 if (TARGET_XOP && mode == V16QImode)
34761 extra = 5;
34762 else if (TARGET_SSSE3)
34763 extra = 6;
34764 *total = cost->fmul * 2 + cost->fabs * extra;
34766 /* V*DImode is emulated with 5-8 insns. */
34767 else if (mode == V2DImode || mode == V4DImode)
34769 if (TARGET_XOP && mode == V2DImode)
34770 *total = cost->fmul * 2 + cost->fabs * 3;
34771 else
34772 *total = cost->fmul * 3 + cost->fabs * 5;
34774 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34775 insns, including two PMULUDQ. */
34776 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34777 *total = cost->fmul * 2 + cost->fabs * 5;
34778 else
34779 *total = cost->fmul;
34780 return false;
34782 else
34784 rtx op0 = XEXP (x, 0);
34785 rtx op1 = XEXP (x, 1);
34786 int nbits;
34787 if (CONST_INT_P (XEXP (x, 1)))
34789 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34790 for (nbits = 0; value != 0; value &= value - 1)
34791 nbits++;
34793 else
34794 /* This is arbitrary. */
34795 nbits = 7;
34797 /* Compute costs correctly for widening multiplication. */
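/* E.g. (mult:DI (sign_extend:DI (reg:SI)) (sign_extend:DI (reg:SI))) maps
   to a single widening imul, so below it is costed as an SImode multiply
   of the inner operands.  */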
34798 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34799 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34800 == GET_MODE_SIZE (mode))
34802 int is_mulwiden = 0;
34803 enum machine_mode inner_mode = GET_MODE (op0);
34805 if (GET_CODE (op0) == GET_CODE (op1))
34806 is_mulwiden = 1, op1 = XEXP (op1, 0);
34807 else if (CONST_INT_P (op1))
34809 if (GET_CODE (op0) == SIGN_EXTEND)
34810 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34811 == INTVAL (op1);
34812 else
34813 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34816 if (is_mulwiden)
34817 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34820 *total = (cost->mult_init[MODE_INDEX (mode)]
34821 + nbits * cost->mult_bit
34822 + rtx_cost (op0, outer_code, opno, speed)
34823 + rtx_cost (op1, outer_code, opno, speed));
34825 return true;
34828 case DIV:
34829 case UDIV:
34830 case MOD:
34831 case UMOD:
34832 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34833 /* ??? SSE cost should be used here. */
34834 *total = cost->fdiv;
34835 else if (X87_FLOAT_MODE_P (mode))
34836 *total = cost->fdiv;
34837 else if (FLOAT_MODE_P (mode))
34838 /* ??? SSE vector cost should be used here. */
34839 *total = cost->fdiv;
34840 else
34841 *total = cost->divide[MODE_INDEX (mode)];
34842 return false;
34844 case PLUS:
34845 if (GET_MODE_CLASS (mode) == MODE_INT
34846 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
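/* The patterns checked below are address computations that a single lea
   can perform, e.g. (plus (plus (mult reg 4) reg) const) corresponds to
   "leal const(%base,%index,4), %dest".  */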
34848 if (GET_CODE (XEXP (x, 0)) == PLUS
34849 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34850 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34851 && CONSTANT_P (XEXP (x, 1)))
34853 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34854 if (val == 2 || val == 4 || val == 8)
34856 *total = cost->lea;
34857 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34858 outer_code, opno, speed);
34859 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34860 outer_code, opno, speed);
34861 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34862 return true;
34865 else if (GET_CODE (XEXP (x, 0)) == MULT
34866 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34868 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34869 if (val == 2 || val == 4 || val == 8)
34871 *total = cost->lea;
34872 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34873 outer_code, opno, speed);
34874 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34875 return true;
34878 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34880 *total = cost->lea;
34881 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34882 outer_code, opno, speed);
34883 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34884 outer_code, opno, speed);
34885 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34886 return true;
34889 /* FALLTHRU */
34891 case MINUS:
34892 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34894 /* ??? SSE cost should be used here. */
34895 *total = cost->fadd;
34896 return false;
34898 else if (X87_FLOAT_MODE_P (mode))
34900 *total = cost->fadd;
34901 return false;
34903 else if (FLOAT_MODE_P (mode))
34905 /* ??? SSE vector cost should be used here. */
34906 *total = cost->fadd;
34907 return false;
34909 /* FALLTHRU */
34911 case AND:
34912 case IOR:
34913 case XOR:
34914 if (GET_MODE_CLASS (mode) == MODE_INT
34915 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34917 *total = (cost->add * 2
34918 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34919 << (GET_MODE (XEXP (x, 0)) != DImode))
34920 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34921 << (GET_MODE (XEXP (x, 1)) != DImode)));
34922 return true;
34924 /* FALLTHRU */
34926 case NEG:
34927 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34929 /* ??? SSE cost should be used here. */
34930 *total = cost->fchs;
34931 return false;
34933 else if (X87_FLOAT_MODE_P (mode))
34935 *total = cost->fchs;
34936 return false;
34938 else if (FLOAT_MODE_P (mode))
34940 /* ??? SSE vector cost should be used here. */
34941 *total = cost->fchs;
34942 return false;
34944 /* FALLTHRU */
34946 case NOT:
34947 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34949 /* ??? Should be SSE vector operation cost. */
34950 /* At least for published AMD latencies, this really is the same
34951 as the latency for a simple fpu operation like fabs. */
34952 *total = cost->fabs;
34954 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34955 *total = cost->add * 2;
34956 else
34957 *total = cost->add;
34958 return false;
34960 case COMPARE:
34961 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34962 && XEXP (XEXP (x, 0), 1) == const1_rtx
34963 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34964 && XEXP (x, 1) == const0_rtx)
34966 /* This kind of construct is implemented using test[bwl].
34967 Treat it as if we had an AND. */
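/* E.g. (compare (zero_extract x (const_int 1) (const_int 5)) (const_int 0))
   is a single-bit test such as "testl $32, x".  */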
34968 *total = (cost->add
34969 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34970 + rtx_cost (const1_rtx, outer_code, opno, speed));
34971 return true;
34973 return false;
34975 case FLOAT_EXTEND:
34976 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34977 *total = 0;
34978 return false;
34980 case ABS:
34981 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34982 /* ??? SSE cost should be used here. */
34983 *total = cost->fabs;
34984 else if (X87_FLOAT_MODE_P (mode))
34985 *total = cost->fabs;
34986 else if (FLOAT_MODE_P (mode))
34987 /* ??? SSE vector cost should be used here. */
34988 *total = cost->fabs;
34989 return false;
34991 case SQRT:
34992 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34993 /* ??? SSE cost should be used here. */
34994 *total = cost->fsqrt;
34995 else if (X87_FLOAT_MODE_P (mode))
34996 *total = cost->fsqrt;
34997 else if (FLOAT_MODE_P (mode))
34998 /* ??? SSE vector cost should be used here. */
34999 *total = cost->fsqrt;
35000 return false;
35002 case UNSPEC:
35003 if (XINT (x, 1) == UNSPEC_TP)
35004 *total = 0;
35005 return false;
35007 case VEC_SELECT:
35008 case VEC_CONCAT:
35009 case VEC_MERGE:
35010 case VEC_DUPLICATE:
35011 /* ??? Assume all of these vector manipulation patterns are
35012 recognizable, in which case they all pretty much have the
35013 same cost. */
35014 *total = cost->fabs;
35015 return true;
35017 default:
35018 return false;
35022 #if TARGET_MACHO
35024 static int current_machopic_label_num;
35026 /* Given a symbol name and its associated stub, write out the
35027 definition of the stub. */
35029 void
35030 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35032 unsigned int length;
35033 char *binder_name, *symbol_name, lazy_ptr_name[32];
35034 int label = ++current_machopic_label_num;
35036 /* For 64-bit we shouldn't get here. */
35037 gcc_assert (!TARGET_64BIT);
35039 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35040 symb = targetm.strip_name_encoding (symb);
35042 length = strlen (stub);
35043 binder_name = XALLOCAVEC (char, length + 32);
35044 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35046 length = strlen (symb);
35047 symbol_name = XALLOCAVEC (char, length + 32);
35048 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35050 sprintf (lazy_ptr_name, "L%d$lz", label);
35052 if (MACHOPIC_ATT_STUB)
35053 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35054 else if (MACHOPIC_PURE)
35055 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35056 else
35057 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35059 fprintf (file, "%s:\n", stub);
35060 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35062 if (MACHOPIC_ATT_STUB)
35064 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35066 else if (MACHOPIC_PURE)
35068 /* PIC stub. */
35069 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35070 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35071 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35072 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35073 label, lazy_ptr_name, label);
35074 fprintf (file, "\tjmp\t*%%ecx\n");
35076 else
35077 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35079 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35080 it needs no stub-binding-helper. */
35081 if (MACHOPIC_ATT_STUB)
35082 return;
35084 fprintf (file, "%s:\n", binder_name);
35086 if (MACHOPIC_PURE)
35088 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35089 fprintf (file, "\tpushl\t%%ecx\n");
35091 else
35092 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35094 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35096 /* N.B. Keep the correspondence of these
35097 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35098 old-pic/new-pic/non-pic stubs; altering this will break
35099 compatibility with existing dylibs. */
35100 if (MACHOPIC_PURE)
35102 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35103 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35105 else
35106 /* 16-byte -mdynamic-no-pic stub. */
35107 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35109 fprintf (file, "%s:\n", lazy_ptr_name);
35110 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35111 fprintf (file, ASM_LONG "%s\n", binder_name);
35113 #endif /* TARGET_MACHO */
35115 /* Order the registers for the register allocator. */
35117 void
35118 x86_order_regs_for_local_alloc (void)
35120 int pos = 0;
35121 int i;
35123 /* First allocate the local general purpose registers. */
35124 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35125 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35126 reg_alloc_order [pos++] = i;
35128 /* Global general purpose registers. */
35129 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35130 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35131 reg_alloc_order [pos++] = i;
35133 /* x87 registers come first in case we are doing FP math
35134 using them. */
35135 if (!TARGET_SSE_MATH)
35136 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35137 reg_alloc_order [pos++] = i;
35139 /* SSE registers. */
35140 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35141 reg_alloc_order [pos++] = i;
35142 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35143 reg_alloc_order [pos++] = i;
35145 /* x87 registers. */
35146 if (TARGET_SSE_MATH)
35147 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35148 reg_alloc_order [pos++] = i;
35150 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35151 reg_alloc_order [pos++] = i;
35153 /* Initialize the rest of the array, as we do not allocate some registers
35154 at all. */
35155 while (pos < FIRST_PSEUDO_REGISTER)
35156 reg_alloc_order [pos++] = 0;
35159 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35160 in struct attribute_spec.handler. */
35161 static tree
35162 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35163 tree args,
35164 int flags ATTRIBUTE_UNUSED,
35165 bool *no_add_attrs)
35167 if (TREE_CODE (*node) != FUNCTION_TYPE
35168 && TREE_CODE (*node) != METHOD_TYPE
35169 && TREE_CODE (*node) != FIELD_DECL
35170 && TREE_CODE (*node) != TYPE_DECL)
35172 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35173 name);
35174 *no_add_attrs = true;
35175 return NULL_TREE;
35177 if (TARGET_64BIT)
35179 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35180 name);
35181 *no_add_attrs = true;
35182 return NULL_TREE;
35184 if (is_attribute_p ("callee_pop_aggregate_return", name))
35186 tree cst;
35188 cst = TREE_VALUE (args);
35189 if (TREE_CODE (cst) != INTEGER_CST)
35191 warning (OPT_Wattributes,
35192 "%qE attribute requires an integer constant argument",
35193 name);
35194 *no_add_attrs = true;
35196 else if (compare_tree_int (cst, 0) != 0
35197 && compare_tree_int (cst, 1) != 0)
35199 warning (OPT_Wattributes,
35200 "argument to %qE attribute is neither zero, nor one",
35201 name);
35202 *no_add_attrs = true;
35205 return NULL_TREE;
35208 return NULL_TREE;
35211 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
35212 struct attribute_spec.handler. */
35213 static tree
35214 ix86_handle_abi_attribute (tree *node, tree name,
35215 tree args ATTRIBUTE_UNUSED,
35216 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35218 if (TREE_CODE (*node) != FUNCTION_TYPE
35219 && TREE_CODE (*node) != METHOD_TYPE
35220 && TREE_CODE (*node) != FIELD_DECL
35221 && TREE_CODE (*node) != TYPE_DECL)
35223 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35224 name);
35225 *no_add_attrs = true;
35226 return NULL_TREE;
35229 /* Can combine regparm with all attributes but fastcall. */
35230 if (is_attribute_p ("ms_abi", name))
35232 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35234 error ("ms_abi and sysv_abi attributes are not compatible");
35237 return NULL_TREE;
35239 else if (is_attribute_p ("sysv_abi", name))
35241 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35243 error ("ms_abi and sysv_abi attributes are not compatible");
35246 return NULL_TREE;
35249 return NULL_TREE;
35252 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35253 struct attribute_spec.handler. */
35254 static tree
35255 ix86_handle_struct_attribute (tree *node, tree name,
35256 tree args ATTRIBUTE_UNUSED,
35257 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35259 tree *type = NULL;
35260 if (DECL_P (*node))
35262 if (TREE_CODE (*node) == TYPE_DECL)
35263 type = &TREE_TYPE (*node);
35265 else
35266 type = node;
35268 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35270 warning (OPT_Wattributes, "%qE attribute ignored",
35271 name);
35272 *no_add_attrs = true;
35275 else if ((is_attribute_p ("ms_struct", name)
35276 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35277 || ((is_attribute_p ("gcc_struct", name)
35278 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35280 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35281 name);
35282 *no_add_attrs = true;
35285 return NULL_TREE;
35288 static tree
35289 ix86_handle_fndecl_attribute (tree *node, tree name,
35290 tree args ATTRIBUTE_UNUSED,
35291 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35293 if (TREE_CODE (*node) != FUNCTION_DECL)
35295 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35296 name);
35297 *no_add_attrs = true;
35299 return NULL_TREE;
35302 static bool
35303 ix86_ms_bitfield_layout_p (const_tree record_type)
35305 return ((TARGET_MS_BITFIELD_LAYOUT
35306 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35307 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35310 /* Returns an expression indicating where the this parameter is
35311 located on entry to the FUNCTION. */
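/* E.g. under the 64-bit SysV ABI this is %rdi, or %rsi when the return
   value is passed by hidden reference; for a 32-bit thiscall method it is
   normally %ecx.  */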
35313 static rtx
35314 x86_this_parameter (tree function)
35316 tree type = TREE_TYPE (function);
35317 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35318 int nregs;
35320 if (TARGET_64BIT)
35322 const int *parm_regs;
35324 if (ix86_function_type_abi (type) == MS_ABI)
35325 parm_regs = x86_64_ms_abi_int_parameter_registers;
35326 else
35327 parm_regs = x86_64_int_parameter_registers;
35328 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35331 nregs = ix86_function_regparm (type, function);
35333 if (nregs > 0 && !stdarg_p (type))
35335 int regno;
35336 unsigned int ccvt = ix86_get_callcvt (type);
35338 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35339 regno = aggr ? DX_REG : CX_REG;
35340 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35342 regno = CX_REG;
35343 if (aggr)
35344 return gen_rtx_MEM (SImode,
35345 plus_constant (Pmode, stack_pointer_rtx, 4));
35347 else
35349 regno = AX_REG;
35350 if (aggr)
35352 regno = DX_REG;
35353 if (nregs == 1)
35354 return gen_rtx_MEM (SImode,
35355 plus_constant (Pmode,
35356 stack_pointer_rtx, 4));
35359 return gen_rtx_REG (SImode, regno);
35362 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35363 aggr ? 8 : 4));
35366 /* Determine whether x86_output_mi_thunk can succeed. */
35368 static bool
35369 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35370 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35371 HOST_WIDE_INT vcall_offset, const_tree function)
35373 /* 64-bit can handle anything. */
35374 if (TARGET_64BIT)
35375 return true;
35377 /* For 32-bit, everything's fine if we have one free register. */
35378 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35379 return true;
35381 /* Need a free register for vcall_offset. */
35382 if (vcall_offset)
35383 return false;
35385 /* Need a free register for GOT references. */
35386 if (flag_pic && !targetm.binds_local_p (function))
35387 return false;
35389 /* Otherwise ok. */
35390 return true;
35393 /* Output the assembler code for a thunk function. THUNK_DECL is the
35394 declaration for the thunk function itself, FUNCTION is the decl for
35395 the target function. DELTA is an immediate constant offset to be
35396 added to THIS. If VCALL_OFFSET is nonzero, the word at
35397 *(*this + vcall_offset) should be added to THIS. */
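/* In pseudo code the emitted thunk is roughly:
     this += DELTA;
     if (VCALL_OFFSET)
       this += *((*this) + VCALL_OFFSET);
     sibcall FUNCTION;  */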
35399 static void
35400 x86_output_mi_thunk (FILE *file,
35401 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35402 HOST_WIDE_INT vcall_offset, tree function)
35404 rtx this_param = x86_this_parameter (function);
35405 rtx this_reg, tmp, fnaddr;
35406 unsigned int tmp_regno;
35408 if (TARGET_64BIT)
35409 tmp_regno = R10_REG;
35410 else
35412 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35413 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35414 tmp_regno = AX_REG;
35415 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35416 tmp_regno = DX_REG;
35417 else
35418 tmp_regno = CX_REG;
35421 emit_note (NOTE_INSN_PROLOGUE_END);
35423 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35424 pull it in now and let DELTA benefit. */
35425 if (REG_P (this_param))
35426 this_reg = this_param;
35427 else if (vcall_offset)
35429 /* Put the this parameter into %eax. */
35430 this_reg = gen_rtx_REG (Pmode, AX_REG);
35431 emit_move_insn (this_reg, this_param);
35433 else
35434 this_reg = NULL_RTX;
35436 /* Adjust the this parameter by a fixed constant. */
35437 if (delta)
35439 rtx delta_rtx = GEN_INT (delta);
35440 rtx delta_dst = this_reg ? this_reg : this_param;
35442 if (TARGET_64BIT)
35444 if (!x86_64_general_operand (delta_rtx, Pmode))
35446 tmp = gen_rtx_REG (Pmode, tmp_regno);
35447 emit_move_insn (tmp, delta_rtx);
35448 delta_rtx = tmp;
35452 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35455 /* Adjust the this parameter by a value stored in the vtable. */
35456 if (vcall_offset)
35458 rtx vcall_addr, vcall_mem, this_mem;
35460 tmp = gen_rtx_REG (Pmode, tmp_regno);
35462 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35463 if (Pmode != ptr_mode)
35464 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35465 emit_move_insn (tmp, this_mem);
35467 /* Adjust the this parameter. */
35468 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35469 if (TARGET_64BIT
35470 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35472 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35473 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35474 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35477 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35478 if (Pmode != ptr_mode)
35479 emit_insn (gen_addsi_1_zext (this_reg,
35480 gen_rtx_REG (ptr_mode,
35481 REGNO (this_reg)),
35482 vcall_mem));
35483 else
35484 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35487 /* If necessary, drop THIS back to its stack slot. */
35488 if (this_reg && this_reg != this_param)
35489 emit_move_insn (this_param, this_reg);
35491 fnaddr = XEXP (DECL_RTL (function), 0);
35492 if (TARGET_64BIT)
35494 if (!flag_pic || targetm.binds_local_p (function)
35495 || TARGET_PECOFF)
35497 else
35499 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35500 tmp = gen_rtx_CONST (Pmode, tmp);
35501 fnaddr = gen_rtx_MEM (Pmode, tmp);
35504 else
35506 if (!flag_pic || targetm.binds_local_p (function))
35508 #if TARGET_MACHO
35509 else if (TARGET_MACHO)
35511 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35512 fnaddr = XEXP (fnaddr, 0);
35514 #endif /* TARGET_MACHO */
35515 else
35517 tmp = gen_rtx_REG (Pmode, CX_REG);
35518 output_set_got (tmp, NULL_RTX);
35520 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35521 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35522 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35526 /* Our sibling call patterns do not allow memories, because we have no
35527 predicate that can distinguish between frame and non-frame memory.
35528 For our purposes here, we can get away with (ab)using a jump pattern,
35529 because we're going to do no optimization. */
35530 if (MEM_P (fnaddr))
35531 emit_jump_insn (gen_indirect_jump (fnaddr));
35532 else
35534 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35535 fnaddr = legitimize_pic_address (fnaddr,
35536 gen_rtx_REG (Pmode, tmp_regno));
35538 if (!sibcall_insn_operand (fnaddr, word_mode))
35540 tmp = gen_rtx_REG (word_mode, tmp_regno);
35541 if (GET_MODE (fnaddr) != word_mode)
35542 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35543 emit_move_insn (tmp, fnaddr);
35544 fnaddr = tmp;
35547 tmp = gen_rtx_MEM (QImode, fnaddr);
35548 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35549 tmp = emit_call_insn (tmp);
35550 SIBLING_CALL_P (tmp) = 1;
35552 emit_barrier ();
35554 /* Emit just enough of rest_of_compilation to get the insns emitted.
35555 Note that use_thunk calls assemble_start_function et al. */
35556 tmp = get_insns ();
35557 shorten_branches (tmp);
35558 final_start_function (tmp, file, 1);
35559 final (tmp, file, 1);
35560 final_end_function ();
35563 static void
35564 x86_file_start (void)
35566 default_file_start ();
35567 #if TARGET_MACHO
35568 darwin_file_start ();
35569 #endif
35570 if (X86_FILE_START_VERSION_DIRECTIVE)
35571 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35572 if (X86_FILE_START_FLTUSED)
35573 fputs ("\t.global\t__fltused\n", asm_out_file);
35574 if (ix86_asm_dialect == ASM_INTEL)
35575 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35579 x86_field_alignment (tree field, int computed)
35581 enum machine_mode mode;
35582 tree type = TREE_TYPE (field);
35584 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35585 return computed;
35586 mode = TYPE_MODE (strip_array_types (type));
35587 if (mode == DFmode || mode == DCmode
35588 || GET_MODE_CLASS (mode) == MODE_INT
35589 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35590 return MIN (32, computed);
35591 return computed;
35594 /* Output assembler code to FILE to increment profiler label # LABELNO
35595 for profiling a function entry. */
35596 void
35597 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35599 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35600 : MCOUNT_NAME);
35602 if (TARGET_64BIT)
35604 #ifndef NO_PROFILE_COUNTERS
35605 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35606 #endif
35608 if (!TARGET_PECOFF && flag_pic)
35609 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35610 else
35611 fprintf (file, "\tcall\t%s\n", mcount_name);
35613 else if (flag_pic)
35615 #ifndef NO_PROFILE_COUNTERS
35616 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35617 LPREFIX, labelno);
35618 #endif
35619 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35621 else
35623 #ifndef NO_PROFILE_COUNTERS
35624 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35625 LPREFIX, labelno);
35626 #endif
35627 fprintf (file, "\tcall\t%s\n", mcount_name);
35631 /* We don't have exact information about the insn sizes, but we may assume
35632 quite safely that we are informed about all 1 byte insns and memory
35633 address sizes. This is enough to eliminate unnecessary padding in
35634 99% of cases. */
35636 static int
35637 min_insn_size (rtx insn)
35639 int l = 0, len;
35641 if (!INSN_P (insn) || !active_insn_p (insn))
35642 return 0;
35644 /* Discard alignments we've emitted and jump instructions. */
35645 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35646 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35647 return 0;
35649 /* Important case - calls are always 5 bytes.
35650 It is common to have many calls in a row. */
35651 if (CALL_P (insn)
35652 && symbolic_reference_mentioned_p (PATTERN (insn))
35653 && !SIBLING_CALL_P (insn))
35654 return 5;
35655 len = get_attr_length (insn);
35656 if (len <= 1)
35657 return 1;
35659 /* For normal instructions we rely on get_attr_length being exact,
35660 with a few exceptions. */
35661 if (!JUMP_P (insn))
35663 enum attr_type type = get_attr_type (insn);
35665 switch (type)
35667 case TYPE_MULTI:
35668 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35669 || asm_noperands (PATTERN (insn)) >= 0)
35670 return 0;
35671 break;
35672 case TYPE_OTHER:
35673 case TYPE_FCMP:
35674 break;
35675 default:
35676 /* Otherwise trust get_attr_length. */
35677 return len;
35680 l = get_attr_length_address (insn);
35681 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35682 l = 4;
35684 if (l)
35685 return 1+l;
35686 else
35687 return 2;
35690 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35692 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
35693 window. */
35695 static void
35696 ix86_avoid_jump_mispredicts (void)
35698 rtx insn, start = get_insns ();
35699 int nbytes = 0, njumps = 0;
35700 int isjump = 0;
35702 /* Look for all minimal intervals of instructions containing 4 jumps.
35703 The intervals are bounded by START and INSN. NBYTES is the total
35704 size of instructions in the interval including INSN and not including
35705 START. When NBYTES is smaller than 16 bytes, it is possible
35706 that the end of START and INSN ends up in the same 16 byte page.
35708 The smallest offset in the page at which INSN can start is the case where
35709 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35710 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN). */
35712 for (insn = start; insn; insn = NEXT_INSN (insn))
35714 int min_size;
35716 if (LABEL_P (insn))
35718 int align = label_to_alignment (insn);
35719 int max_skip = label_to_max_skip (insn);
35721 if (max_skip > 15)
35722 max_skip = 15;
35723 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35724 already in the current 16 byte page, because otherwise
35725 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35726 bytes to reach 16 byte boundary. */
35727 if (align <= 0
35728 || (align <= 3 && max_skip != (1 << align) - 1))
35729 max_skip = 0;
35730 if (dump_file)
35731 fprintf (dump_file, "Label %i with max_skip %i\n",
35732 INSN_UID (insn), max_skip);
35733 if (max_skip)
35735 while (nbytes + max_skip >= 16)
35737 start = NEXT_INSN (start);
35738 if (JUMP_P (start) || CALL_P (start))
35739 njumps--, isjump = 1;
35740 else
35741 isjump = 0;
35742 nbytes -= min_insn_size (start);
35745 continue;
35748 min_size = min_insn_size (insn);
35749 nbytes += min_size;
35750 if (dump_file)
35751 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35752 INSN_UID (insn), min_size);
35753 if (JUMP_P (insn) || CALL_P (insn))
35754 njumps++;
35755 else
35756 continue;
35758 while (njumps > 3)
35760 start = NEXT_INSN (start);
35761 if (JUMP_P (start) || CALL_P (start))
35762 njumps--, isjump = 1;
35763 else
35764 isjump = 0;
35765 nbytes -= min_insn_size (start);
35767 gcc_assert (njumps >= 0);
35768 if (dump_file)
35769 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35770 INSN_UID (start), INSN_UID (insn), nbytes);
35772 if (njumps == 3 && isjump && nbytes < 16)
35774 int padsize = 15 - nbytes + min_insn_size (insn);
35776 if (dump_file)
35777 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35778 INSN_UID (insn), padsize);
35779 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35783 #endif
35785 /* AMD Athlon works faster
35786 when RET is not the destination of a conditional jump or directly preceded
35787 by another jump instruction. We avoid the penalty by inserting a NOP just
35788 before the RET instruction in such cases. */
35789 static void
35790 ix86_pad_returns (void)
35792 edge e;
35793 edge_iterator ei;
35795 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35797 basic_block bb = e->src;
35798 rtx ret = BB_END (bb);
35799 rtx prev;
35800 bool replace = false;
35802 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35803 || optimize_bb_for_size_p (bb))
35804 continue;
35805 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35806 if (active_insn_p (prev) || LABEL_P (prev))
35807 break;
35808 if (prev && LABEL_P (prev))
35810 edge e;
35811 edge_iterator ei;
35813 FOR_EACH_EDGE (e, ei, bb->preds)
35814 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35815 && !(e->flags & EDGE_FALLTHRU))
35817 replace = true;
35818 break;
35821 if (!replace)
35823 prev = prev_active_insn (ret);
35824 if (prev
35825 && ((JUMP_P (prev) && any_condjump_p (prev))
35826 || CALL_P (prev)))
35827 replace = true;
35828 /* Empty functions get a branch mispredict even when
35829 the jump destination is not visible to us. */
35830 if (!prev && !optimize_function_for_size_p (cfun))
35831 replace = true;
35833 if (replace)
35835 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35836 delete_insn (ret);
35841 /* Count the minimum number of instructions in BB. Return 4 if the
35842 number of instructions >= 4. */
35844 static int
35845 ix86_count_insn_bb (basic_block bb)
35847 rtx insn;
35848 int insn_count = 0;
35850 /* Count number of instructions in this block. Return 4 if the number
35851 of instructions >= 4. */
35852 FOR_BB_INSNS (bb, insn)
35854 /* This only happens in exit blocks. */
35855 if (JUMP_P (insn)
35856 && ANY_RETURN_P (PATTERN (insn)))
35857 break;
35859 if (NONDEBUG_INSN_P (insn)
35860 && GET_CODE (PATTERN (insn)) != USE
35861 && GET_CODE (PATTERN (insn)) != CLOBBER)
35863 insn_count++;
35864 if (insn_count >= 4)
35865 return insn_count;
35869 return insn_count;
35873 /* Count the minimum number of instructions in code path in BB.
35874 Return 4 if the number of instructions >= 4. */
35876 static int
35877 ix86_count_insn (basic_block bb)
35879 edge e;
35880 edge_iterator ei;
35881 int min_prev_count;
35883 /* Only bother counting instructions along paths with no
35884 more than 2 basic blocks between entry and exit. Given
35885 that BB has an edge to exit, determine if a predecessor
35886 of BB has an edge from entry. If so, compute the number
35887 of instructions in the predecessor block. If there
35888 happen to be multiple such blocks, compute the minimum. */
35889 min_prev_count = 4;
35890 FOR_EACH_EDGE (e, ei, bb->preds)
35892 edge prev_e;
35893 edge_iterator prev_ei;
35895 if (e->src == ENTRY_BLOCK_PTR)
35897 min_prev_count = 0;
35898 break;
35900 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35902 if (prev_e->src == ENTRY_BLOCK_PTR)
35904 int count = ix86_count_insn_bb (e->src);
35905 if (count < min_prev_count)
35906 min_prev_count = count;
35907 break;
35912 if (min_prev_count < 4)
35913 min_prev_count += ix86_count_insn_bb (bb);
35915 return min_prev_count;
35918 /* Pad short function to 4 instructions. */
35920 static void
35921 ix86_pad_short_function (void)
35923 edge e;
35924 edge_iterator ei;
35926 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35928 rtx ret = BB_END (e->src);
35929 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35931 int insn_count = ix86_count_insn (e->src);
35933 /* Pad short function. */
35934 if (insn_count < 4)
35936 rtx insn = ret;
35938 /* Find epilogue. */
35939 while (insn
35940 && (!NOTE_P (insn)
35941 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35942 insn = PREV_INSN (insn);
35944 if (!insn)
35945 insn = ret;
35947 /* Two NOPs count as one instruction. */
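/* E.g. a body with a single counted insn gets gen_nops (GEN_INT (6)):
   six NOPs count as three insns and bring the total up to four.  */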
35948 insn_count = 2 * (4 - insn_count);
35949 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35955 /* Fix up a Windows system unwinder issue. If an EH region falls through into
35956 the epilogue, the Windows system unwinder will apply epilogue logic and
35957 produce incorrect offsets. This can be avoided by adding a nop between
35958 the last insn that can throw and the first insn of the epilogue. */
35960 static void
35961 ix86_seh_fixup_eh_fallthru (void)
35963 edge e;
35964 edge_iterator ei;
35966 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35968 rtx insn, next;
35970 /* Find the beginning of the epilogue. */
35971 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
35972 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
35973 break;
35974 if (insn == NULL)
35975 continue;
35977 /* We only care about preceding insns that can throw. */
35978 insn = prev_active_insn (insn);
35979 if (insn == NULL || !can_throw_internal (insn))
35980 continue;
35982 /* Do not separate calls from their debug information. */
35983 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
35984 if (NOTE_P (next)
35985 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
35986 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
35987 insn = next;
35988 else
35989 break;
35991 emit_insn_after (gen_nops (const1_rtx), insn);
35995 /* Implement machine specific optimizations. We implement padding of returns
35996 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
35997 static void
35998 ix86_reorg (void)
36000 /* We are freeing block_for_insn in the toplev to keep compatibility
36001 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36002 compute_bb_for_insn ();
36004 if (TARGET_SEH && current_function_has_exception_handlers ())
36005 ix86_seh_fixup_eh_fallthru ();
36007 if (optimize && optimize_function_for_speed_p (cfun))
36009 if (TARGET_PAD_SHORT_FUNCTION)
36010 ix86_pad_short_function ();
36011 else if (TARGET_PAD_RETURNS)
36012 ix86_pad_returns ();
36013 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36014 if (TARGET_FOUR_JUMP_LIMIT)
36015 ix86_avoid_jump_mispredicts ();
36016 #endif
36020 /* Return nonzero when a QImode register that must be represented via a REX
36021 prefix is used. */
36022 bool
36023 x86_extended_QIreg_mentioned_p (rtx insn)
36025 int i;
36026 extract_insn_cached (insn);
36027 for (i = 0; i < recog_data.n_operands; i++)
36028 if (GENERAL_REG_P (recog_data.operand[i])
36029 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36030 return true;
36031 return false;
36034 /* Return nonzero when P points to a register encoded via a REX prefix.
36035 Called via for_each_rtx. */
36036 static int
36037 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36039 unsigned int regno;
36040 if (!REG_P (*p))
36041 return 0;
36042 regno = REGNO (*p);
36043 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36046 /* Return true when INSN mentions a register that must be encoded using a REX
36047 prefix. */
36048 bool
36049 x86_extended_reg_mentioned_p (rtx insn)
36051 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36052 extended_reg_mentioned_1, NULL);
36055 /* If profitable, negate (without causing overflow) integer constant
36056 of mode MODE at location LOC. Return true in this case. */
36057 bool
36058 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36060 HOST_WIDE_INT val;
36062 if (!CONST_INT_P (*loc))
36063 return false;
36065 switch (mode)
36067 case DImode:
36068 /* DImode x86_64 constants must fit in 32 bits. */
36069 gcc_assert (x86_64_immediate_operand (*loc, mode));
36071 mode = SImode;
36072 break;
36074 case SImode:
36075 case HImode:
36076 case QImode:
36077 break;
36079 default:
36080 gcc_unreachable ();
36083 /* Avoid overflows. */
36084 if (mode_signbit_p (mode, *loc))
36085 return false;
36087 val = INTVAL (*loc);
36089 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36090 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
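/* E.g. (plus x 128) is negated so the caller can emit "subl $-128, ..."
   instead: -128 fits in a sign-extended 8-bit immediate while +128 would
   need a full 32-bit one.  */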
36091 if ((val < 0 && val != -128)
36092 || val == 128)
36094 *loc = GEN_INT (-val);
36095 return true;
36098 return false;
36101 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36102 optabs would emit if we didn't have TFmode patterns. */
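/* For a value with the sign bit set, the sequence below converts
   ((in >> 1) | (in & 1)) as a signed number and doubles the result;
   OR-ing in the low bit keeps the final rounding correct.  */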
36104 void
36105 x86_emit_floatuns (rtx operands[2])
36107 rtx neglab, donelab, i0, i1, f0, in, out;
36108 enum machine_mode mode, inmode;
36110 inmode = GET_MODE (operands[1]);
36111 gcc_assert (inmode == SImode || inmode == DImode);
36113 out = operands[0];
36114 in = force_reg (inmode, operands[1]);
36115 mode = GET_MODE (out);
36116 neglab = gen_label_rtx ();
36117 donelab = gen_label_rtx ();
36118 f0 = gen_reg_rtx (mode);
36120 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36122 expand_float (out, in, 0);
36124 emit_jump_insn (gen_jump (donelab));
36125 emit_barrier ();
36127 emit_label (neglab);
36129 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36130 1, OPTAB_DIRECT);
36131 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36132 1, OPTAB_DIRECT);
36133 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36135 expand_float (f0, i0, 0);
36137 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36139 emit_label (donelab);
36142 /* AVX2 does support 32-byte integer vector operations,
36143 thus the longest vector we are faced with is V32QImode. */
36144 #define MAX_VECT_LEN 32
36146 struct expand_vec_perm_d
36148 rtx target, op0, op1;
36149 unsigned char perm[MAX_VECT_LEN];
36150 enum machine_mode vmode;
36151 unsigned char nelt;
36152 bool one_operand_p;
36153 bool testing_p;
36156 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36157 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36158 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36160 /* Get a vector mode of the same size as the original but with elements
36161 twice as wide. This is only guaranteed to apply to integral vectors. */
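/* E.g. V16QImode is widened to V8HImode, and V8HImode to V4SImode.  */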
36163 static inline enum machine_mode
36164 get_mode_wider_vector (enum machine_mode o)
36166 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36167 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36168 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36169 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36170 return n;
36173 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36174 with all elements equal to VAR. Return true if successful. */
36176 static bool
36177 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36178 rtx target, rtx val)
36180 bool ok;
36182 switch (mode)
36184 case V2SImode:
36185 case V2SFmode:
36186 if (!mmx_ok)
36187 return false;
36188 /* FALLTHRU */
36190 case V4DFmode:
36191 case V4DImode:
36192 case V8SFmode:
36193 case V8SImode:
36194 case V2DFmode:
36195 case V2DImode:
36196 case V4SFmode:
36197 case V4SImode:
36199 rtx insn, dup;
36201 /* First attempt to recognize VAL as-is. */
36202 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36203 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36204 if (recog_memoized (insn) < 0)
36206 rtx seq;
36207 /* If that fails, force VAL into a register. */
36209 start_sequence ();
36210 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36211 seq = get_insns ();
36212 end_sequence ();
36213 if (seq)
36214 emit_insn_before (seq, insn);
36216 ok = recog_memoized (insn) >= 0;
36217 gcc_assert (ok);
36220 return true;
36222 case V4HImode:
36223 if (!mmx_ok)
36224 return false;
36225 if (TARGET_SSE || TARGET_3DNOW_A)
36227 rtx x;
36229 val = gen_lowpart (SImode, val);
36230 x = gen_rtx_TRUNCATE (HImode, val);
36231 x = gen_rtx_VEC_DUPLICATE (mode, x);
36232 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36233 return true;
36235 goto widen;
36237 case V8QImode:
36238 if (!mmx_ok)
36239 return false;
36240 goto widen;
36242 case V8HImode:
36243 if (TARGET_SSE2)
36245 struct expand_vec_perm_d dperm;
36246 rtx tmp1, tmp2;
36248 permute:
36249 memset (&dperm, 0, sizeof (dperm));
36250 dperm.target = target;
36251 dperm.vmode = mode;
36252 dperm.nelt = GET_MODE_NUNITS (mode);
36253 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36254 dperm.one_operand_p = true;
36256 /* Extend to SImode using a paradoxical SUBREG. */
36257 tmp1 = gen_reg_rtx (SImode);
36258 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36260 /* Insert the SImode value as low element of a V4SImode vector. */
36261 tmp2 = gen_lowpart (V4SImode, dperm.op0);
36262 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36264 ok = (expand_vec_perm_1 (&dperm)
36265 || expand_vec_perm_broadcast_1 (&dperm));
36266 gcc_assert (ok);
36267 return ok;
36269 goto widen;
36271 case V16QImode:
36272 if (TARGET_SSE2)
36273 goto permute;
36274 goto widen;
36276 widen:
36277 /* Replicate the value once into the next wider mode and recurse. */
36279 enum machine_mode smode, wsmode, wvmode;
36280 rtx x;
36282 smode = GET_MODE_INNER (mode);
36283 wvmode = get_mode_wider_vector (mode);
36284 wsmode = GET_MODE_INNER (wvmode);
36286 val = convert_modes (wsmode, smode, val, true);
36287 x = expand_simple_binop (wsmode, ASHIFT, val,
36288 GEN_INT (GET_MODE_BITSIZE (smode)),
36289 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36290 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36292 x = gen_lowpart (wvmode, target);
36293 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36294 gcc_assert (ok);
36295 return ok;
36298 case V16HImode:
36299 case V32QImode:
36301 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36302 rtx x = gen_reg_rtx (hvmode);
36304 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36305 gcc_assert (ok);
36307 x = gen_rtx_VEC_CONCAT (mode, x, x);
36308 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36310 return true;
36312 default:
36313 return false;
36317 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36318 whose ONE_VAR element is VAR, and other elements are zero. Return true
36319 if successful. */
36321 static bool
36322 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36323 rtx target, rtx var, int one_var)
36325 enum machine_mode vsimode;
36326 rtx new_target;
36327 rtx x, tmp;
36328 bool use_vector_set = false;
36330 switch (mode)
36332 case V2DImode:
36333 /* For SSE4.1, we normally use vector set. But if the second
36334 element is zero and inter-unit moves are OK, we use movq
36335 instead. */
36336 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36337 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36338 && one_var == 0));
36339 break;
36340 case V16QImode:
36341 case V4SImode:
36342 case V4SFmode:
36343 use_vector_set = TARGET_SSE4_1;
36344 break;
36345 case V8HImode:
36346 use_vector_set = TARGET_SSE2;
36347 break;
36348 case V4HImode:
36349 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36350 break;
36351 case V32QImode:
36352 case V16HImode:
36353 case V8SImode:
36354 case V8SFmode:
36355 case V4DFmode:
36356 use_vector_set = TARGET_AVX;
36357 break;
36358 case V4DImode:
36359 /* Use ix86_expand_vector_set in 64bit mode only. */
36360 use_vector_set = TARGET_AVX && TARGET_64BIT;
36361 break;
36362 default:
36363 break;
36366 if (use_vector_set)
36368 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36369 var = force_reg (GET_MODE_INNER (mode), var);
36370 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36371 return true;
36374 switch (mode)
36376 case V2SFmode:
36377 case V2SImode:
36378 if (!mmx_ok)
36379 return false;
36380 /* FALLTHRU */
36382 case V2DFmode:
36383 case V2DImode:
36384 if (one_var != 0)
36385 return false;
36386 var = force_reg (GET_MODE_INNER (mode), var);
36387 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36388 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36389 return true;
36391 case V4SFmode:
36392 case V4SImode:
36393 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36394 new_target = gen_reg_rtx (mode);
36395 else
36396 new_target = target;
36397 var = force_reg (GET_MODE_INNER (mode), var);
36398 x = gen_rtx_VEC_DUPLICATE (mode, var);
36399 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36400 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36401 if (one_var != 0)
36403 /* We need to shuffle the value to the correct position, so
36404 create a new pseudo to store the intermediate result. */
36406 /* With SSE2, we can use the integer shuffle insns. */
36407 if (mode != V4SFmode && TARGET_SSE2)
36409 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36410 const1_rtx,
36411 GEN_INT (one_var == 1 ? 0 : 1),
36412 GEN_INT (one_var == 2 ? 0 : 1),
36413 GEN_INT (one_var == 3 ? 0 : 1)));
36414 if (target != new_target)
36415 emit_move_insn (target, new_target);
36416 return true;
36419 /* Otherwise convert the intermediate result to V4SFmode and
36420 use the SSE1 shuffle instructions. */
36421 if (mode != V4SFmode)
36423 tmp = gen_reg_rtx (V4SFmode);
36424 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36426 else
36427 tmp = new_target;
36429 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36430 const1_rtx,
36431 GEN_INT (one_var == 1 ? 0 : 1),
36432 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36433 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36435 if (mode != V4SFmode)
36436 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36437 else if (tmp != target)
36438 emit_move_insn (target, tmp);
36440 else if (target != new_target)
36441 emit_move_insn (target, new_target);
36442 return true;
36444 case V8HImode:
36445 case V16QImode:
36446 vsimode = V4SImode;
36447 goto widen;
36448 case V4HImode:
36449 case V8QImode:
36450 if (!mmx_ok)
36451 return false;
36452 vsimode = V2SImode;
36453 goto widen;
36454 widen:
36455 if (one_var != 0)
36456 return false;
36458 /* Zero extend the variable element to SImode and recurse. */
36459 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36461 x = gen_reg_rtx (vsimode);
36462 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36463 var, one_var))
36464 gcc_unreachable ();
36466 emit_move_insn (target, gen_lowpart (mode, x));
36467 return true;
36469 default:
36470 return false;
36474 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36475 consisting of the values in VALS. It is known that all elements
36476 except ONE_VAR are constants. Return true if successful. */
36478 static bool
36479 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36480 rtx target, rtx vals, int one_var)
36482 rtx var = XVECEXP (vals, 0, one_var);
36483 enum machine_mode wmode;
36484 rtx const_vec, x;
36486 const_vec = copy_rtx (vals);
36487 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36488 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36490 switch (mode)
36492 case V2DFmode:
36493 case V2DImode:
36494 case V2SFmode:
36495 case V2SImode:
36496 /* For the two element vectors, it's just as easy to use
36497 the general case. */
36498 return false;
36500 case V4DImode:
36501 /* Use ix86_expand_vector_set in 64bit mode only. */
36502 if (!TARGET_64BIT)
36503 return false;
36504 case V4DFmode:
36505 case V8SFmode:
36506 case V8SImode:
36507 case V16HImode:
36508 case V32QImode:
36509 case V4SFmode:
36510 case V4SImode:
36511 case V8HImode:
36512 case V4HImode:
36513 break;
36515 case V16QImode:
36516 if (TARGET_SSE4_1)
36517 break;
36518 wmode = V8HImode;
36519 goto widen;
36520 case V8QImode:
36521 wmode = V4HImode;
36522 goto widen;
36523 widen:
36524 /* There's no way to set one QImode entry easily. Combine
36525 the variable value with its adjacent constant value, and
36526 promote to an HImode set. */
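/* E.g. to set byte 3 of a V16QImode vector, the variable byte is shifted
   left by 8, OR-ed with the constant byte from element 2, and the result
   is inserted as HImode element 1 of the V8HImode copy.  */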
36527 x = XVECEXP (vals, 0, one_var ^ 1);
36528 if (one_var & 1)
36530 var = convert_modes (HImode, QImode, var, true);
36531 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36532 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36533 x = GEN_INT (INTVAL (x) & 0xff);
36535 else
36537 var = convert_modes (HImode, QImode, var, true);
36538 x = gen_int_mode (INTVAL (x) << 8, HImode);
36540 if (x != const0_rtx)
36541 var = expand_simple_binop (HImode, IOR, var, x, var,
36542 1, OPTAB_LIB_WIDEN);
36544 x = gen_reg_rtx (wmode);
36545 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36546 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36548 emit_move_insn (target, gen_lowpart (mode, x));
36549 return true;
36551 default:
36552 return false;
36555 emit_move_insn (target, const_vec);
36556 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36557 return true;
36560 /* A subroutine of ix86_expand_vector_init_general. Use vector
36561 concatenate to handle the most general case: all values variable,
36562 and none identical. */
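/* E.g. a V8SFmode init from eight scalars first builds four V2SFmode
   pairs, concatenates those into two V4SFmode halves, and finally
   concatenates the halves into the V8SFmode target.  */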
36564 static void
36565 ix86_expand_vector_init_concat (enum machine_mode mode,
36566 rtx target, rtx *ops, int n)
36568 enum machine_mode cmode, hmode = VOIDmode;
36569 rtx first[8], second[4];
36570 rtvec v;
36571 int i, j;
36573 switch (n)
36575 case 2:
36576 switch (mode)
36578 case V8SImode:
36579 cmode = V4SImode;
36580 break;
36581 case V8SFmode:
36582 cmode = V4SFmode;
36583 break;
36584 case V4DImode:
36585 cmode = V2DImode;
36586 break;
36587 case V4DFmode:
36588 cmode = V2DFmode;
36589 break;
36590 case V4SImode:
36591 cmode = V2SImode;
36592 break;
36593 case V4SFmode:
36594 cmode = V2SFmode;
36595 break;
36596 case V2DImode:
36597 cmode = DImode;
36598 break;
36599 case V2SImode:
36600 cmode = SImode;
36601 break;
36602 case V2DFmode:
36603 cmode = DFmode;
36604 break;
36605 case V2SFmode:
36606 cmode = SFmode;
36607 break;
36608 default:
36609 gcc_unreachable ();
36612 if (!register_operand (ops[1], cmode))
36613 ops[1] = force_reg (cmode, ops[1]);
36614 if (!register_operand (ops[0], cmode))
36615 ops[0] = force_reg (cmode, ops[0]);
36616 emit_insn (gen_rtx_SET (VOIDmode, target,
36617 gen_rtx_VEC_CONCAT (mode, ops[0],
36618 ops[1])));
36619 break;
36621 case 4:
36622 switch (mode)
36624 case V4DImode:
36625 cmode = V2DImode;
36626 break;
36627 case V4DFmode:
36628 cmode = V2DFmode;
36629 break;
36630 case V4SImode:
36631 cmode = V2SImode;
36632 break;
36633 case V4SFmode:
36634 cmode = V2SFmode;
36635 break;
36636 default:
36637 gcc_unreachable ();
36639 goto half;
36641 case 8:
36642 switch (mode)
36644 case V8SImode:
36645 cmode = V2SImode;
36646 hmode = V4SImode;
36647 break;
36648 case V8SFmode:
36649 cmode = V2SFmode;
36650 hmode = V4SFmode;
36651 break;
36652 default:
36653 gcc_unreachable ();
36655 goto half;
36657 half:
36658 /* FIXME: We process inputs backward to help RA. PR 36222. */
36659 i = n - 1;
36660 j = (n >> 1) - 1;
36661 for (; i > 0; i -= 2, j--)
36663 first[j] = gen_reg_rtx (cmode);
36664 v = gen_rtvec (2, ops[i - 1], ops[i]);
36665 ix86_expand_vector_init (false, first[j],
36666 gen_rtx_PARALLEL (cmode, v));
36669 n >>= 1;
36670 if (n > 2)
36672 gcc_assert (hmode != VOIDmode);
36673 for (i = j = 0; i < n; i += 2, j++)
36675 second[j] = gen_reg_rtx (hmode);
36676 ix86_expand_vector_init_concat (hmode, second [j],
36677 &first [i], 2);
36679 n >>= 1;
36680 ix86_expand_vector_init_concat (mode, target, second, n);
36682 else
36683 ix86_expand_vector_init_concat (mode, target, first, n);
36684 break;
36686 default:
36687 gcc_unreachable ();
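/* An illustrative sketch, not part of this file: model a vector as packed
   8-bit lanes inside an unsigned long long, so that VEC_CONCAT of two
   k-lane vectors is just (hi << (8 * k)) | lo.  The loop mirrors the
   pairwise recursion used above (the real code also walks the inputs back
   to front to help the register allocator); the function name is made up.  */
static unsigned long long
concat_init_sketch (const unsigned char *ops, int n)   /* n = 2, 4 or 8 */
{
  unsigned long long v[8];
  int lanes_per_vec, count, k;

  for (k = 0; k < n; k++)
    v[k] = ops[k];                       /* n one-lane vectors */

  /* Each round concatenates adjacent pairs, halving the number of vectors
     and doubling their width, until one full vector remains.  */
  for (lanes_per_vec = 1, count = n; count > 1;
       lanes_per_vec <<= 1, count >>= 1)
    for (k = 0; k < count / 2; k++)
      v[k] = v[2 * k] | (v[2 * k + 1] << (8 * lanes_per_vec));

  return v[0];
}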
36691 /* A subroutine of ix86_expand_vector_init_general. Use vector
36692 interleave to handle the most general case: all values variable,
36693 and none identical. */
36695 static void
36696 ix86_expand_vector_init_interleave (enum machine_mode mode,
36697 rtx target, rtx *ops, int n)
36699 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36700 int i, j;
36701 rtx op0, op1;
36702 rtx (*gen_load_even) (rtx, rtx, rtx);
36703 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36704 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36706 switch (mode)
36708 case V8HImode:
36709 gen_load_even = gen_vec_setv8hi;
36710 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36711 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36712 inner_mode = HImode;
36713 first_imode = V4SImode;
36714 second_imode = V2DImode;
36715 third_imode = VOIDmode;
36716 break;
36717 case V16QImode:
36718 gen_load_even = gen_vec_setv16qi;
36719 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36720 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36721 inner_mode = QImode;
36722 first_imode = V8HImode;
36723 second_imode = V4SImode;
36724 third_imode = V2DImode;
36725 break;
36726 default:
36727 gcc_unreachable ();
36730 for (i = 0; i < n; i++)
36732 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36733 op0 = gen_reg_rtx (SImode);
36734 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36736 /* Insert the SImode value as low element of V4SImode vector. */
36737 op1 = gen_reg_rtx (V4SImode);
36738 op0 = gen_rtx_VEC_MERGE (V4SImode,
36739 gen_rtx_VEC_DUPLICATE (V4SImode,
36740 op0),
36741 CONST0_RTX (V4SImode),
36742 const1_rtx);
36743 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36745 /* Cast the V4SImode vector back to a vector in the original mode. */
36746 op0 = gen_reg_rtx (mode);
36747 emit_move_insn (op0, gen_lowpart (mode, op1));
36749 /* Load even elements into the second position. */
36750 emit_insn (gen_load_even (op0,
36751 force_reg (inner_mode,
36752 ops [i + i + 1]),
36753 const1_rtx));
36755 /* Cast vector to FIRST_IMODE vector. */
36756 ops[i] = gen_reg_rtx (first_imode);
36757 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36760 /* Interleave low FIRST_IMODE vectors. */
36761 for (i = j = 0; i < n; i += 2, j++)
36763 op0 = gen_reg_rtx (first_imode);
36764 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36766 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36767 ops[j] = gen_reg_rtx (second_imode);
36768 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36771 /* Interleave low SECOND_IMODE vectors. */
36772 switch (second_imode)
36774 case V4SImode:
36775 for (i = j = 0; i < n / 2; i += 2, j++)
36777 op0 = gen_reg_rtx (second_imode);
36778 emit_insn (gen_interleave_second_low (op0, ops[i],
36779 ops[i + 1]));
36781 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36782 vector. */
36783 ops[j] = gen_reg_rtx (third_imode);
36784 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36786 second_imode = V2DImode;
36787 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36788 /* FALLTHRU */
36790 case V2DImode:
36791 op0 = gen_reg_rtx (second_imode);
36792 emit_insn (gen_interleave_second_low (op0, ops[0],
36793 ops[1]));
36795 /* Cast the SECOND_IMODE vector back to a vector in the original
36796 mode. */
36797 emit_insn (gen_rtx_SET (VOIDmode, target,
36798 gen_lowpart (mode, op0)));
36799 break;
36801 default:
36802 gcc_unreachable ();
36806 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36807 all values variable, and none identical. */
36809 static void
36810 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36811 rtx target, rtx vals)
36813 rtx ops[32], op0, op1;
36814 enum machine_mode half_mode = VOIDmode;
36815 int n, i;
36817 switch (mode)
36819 case V2SFmode:
36820 case V2SImode:
36821 if (!mmx_ok && !TARGET_SSE)
36822 break;
36823 /* FALLTHRU */
36825 case V8SFmode:
36826 case V8SImode:
36827 case V4DFmode:
36828 case V4DImode:
36829 case V4SFmode:
36830 case V4SImode:
36831 case V2DFmode:
36832 case V2DImode:
36833 n = GET_MODE_NUNITS (mode);
36834 for (i = 0; i < n; i++)
36835 ops[i] = XVECEXP (vals, 0, i);
36836 ix86_expand_vector_init_concat (mode, target, ops, n);
36837 return;
36839 case V32QImode:
36840 half_mode = V16QImode;
36841 goto half;
36843 case V16HImode:
36844 half_mode = V8HImode;
36845 goto half;
36847 half:
36848 n = GET_MODE_NUNITS (mode);
36849 for (i = 0; i < n; i++)
36850 ops[i] = XVECEXP (vals, 0, i);
36851 op0 = gen_reg_rtx (half_mode);
36852 op1 = gen_reg_rtx (half_mode);
36853 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36854 n >> 2);
36855 ix86_expand_vector_init_interleave (half_mode, op1,
36856 &ops [n >> 1], n >> 2);
36857 emit_insn (gen_rtx_SET (VOIDmode, target,
36858 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36859 return;
36861 case V16QImode:
36862 if (!TARGET_SSE4_1)
36863 break;
36864 /* FALLTHRU */
36866 case V8HImode:
36867 if (!TARGET_SSE2)
36868 break;
36870 /* Don't use ix86_expand_vector_init_interleave if we can't
36871 move from GPR to SSE register directly. */
36872 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
36873 break;
36875 n = GET_MODE_NUNITS (mode);
36876 for (i = 0; i < n; i++)
36877 ops[i] = XVECEXP (vals, 0, i);
36878 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36879 return;
36881 case V4HImode:
36882 case V8QImode:
36883 break;
36885 default:
36886 gcc_unreachable ();
36890 int i, j, n_elts, n_words, n_elt_per_word;
36891 enum machine_mode inner_mode;
36892 rtx words[4], shift;
36894 inner_mode = GET_MODE_INNER (mode);
36895 n_elts = GET_MODE_NUNITS (mode);
36896 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36897 n_elt_per_word = n_elts / n_words;
36898 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36900 for (i = 0; i < n_words; ++i)
36902 rtx word = NULL_RTX;
36904 for (j = 0; j < n_elt_per_word; ++j)
36906 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36907 elt = convert_modes (word_mode, inner_mode, elt, true);
36909 if (j == 0)
36910 word = elt;
36911 else
36913 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36914 word, 1, OPTAB_LIB_WIDEN);
36915 word = expand_simple_binop (word_mode, IOR, word, elt,
36916 word, 1, OPTAB_LIB_WIDEN);
36920 words[i] = word;
36923 if (n_words == 1)
36924 emit_move_insn (target, gen_lowpart (mode, words[0]));
36925 else if (n_words == 2)
36927 rtx tmp = gen_reg_rtx (mode);
36928 emit_clobber (tmp);
36929 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36930 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36931 emit_move_insn (target, tmp);
36933 else if (n_words == 4)
36935 rtx tmp = gen_reg_rtx (V4SImode);
36936 gcc_assert (word_mode == SImode);
36937 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36938 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36939 emit_move_insn (target, gen_lowpart (mode, tmp));
36941 else
36942 gcc_unreachable ();
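/* An illustrative sketch, not part of this file: the fallback above packs
   N_ELT_PER_WORD narrow elements into one word-sized integer with shifts
   and ORs, walking from the highest-indexed element of each word down so
   that element 0 ends up in the low bits.  Modelled here for 16-bit
   elements in a 32-bit word; the helper name is made up.  */
static unsigned int
pack_elements_sketch (const unsigned short *elts, int n_elt_per_word)
{
  unsigned int word = 0;
  int j;

  for (j = 0; j < n_elt_per_word; j++)
    {
      unsigned short elt = elts[n_elt_per_word - j - 1];
      if (j == 0)
        word = elt;
      else
        /* Shift the already-packed elements up, then OR in the next one,
           mirroring the ASHIFT/IOR pair emitted above.  */
        word = (word << 16) | elt;
    }
  return word;
}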
36946 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36947 instructions unless MMX_OK is true. */
36949 void
36950 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36952 enum machine_mode mode = GET_MODE (target);
36953 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36954 int n_elts = GET_MODE_NUNITS (mode);
36955 int n_var = 0, one_var = -1;
36956 bool all_same = true, all_const_zero = true;
36957 int i;
36958 rtx x;
36960 for (i = 0; i < n_elts; ++i)
36962 x = XVECEXP (vals, 0, i);
36963 if (!(CONST_INT_P (x)
36964 || GET_CODE (x) == CONST_DOUBLE
36965 || GET_CODE (x) == CONST_FIXED))
36966 n_var++, one_var = i;
36967 else if (x != CONST0_RTX (inner_mode))
36968 all_const_zero = false;
36969 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36970 all_same = false;
36973 /* Constants are best loaded from the constant pool. */
36974 if (n_var == 0)
36976 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36977 return;
36980 /* If all values are identical, broadcast the value. */
36981 if (all_same
36982 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36983 XVECEXP (vals, 0, 0)))
36984 return;
36986 /* Values where only one field is non-constant are best loaded from
36987 the pool and overwritten via move later. */
36988 if (n_var == 1)
36990 if (all_const_zero
36991 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36992 XVECEXP (vals, 0, one_var),
36993 one_var))
36994 return;
36996 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36997 return;
37000 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37003 void
37004 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37006 enum machine_mode mode = GET_MODE (target);
37007 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37008 enum machine_mode half_mode;
37009 bool use_vec_merge = false;
37010 rtx tmp;
37011 static rtx (*gen_extract[6][2]) (rtx, rtx)
37013 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37014 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37015 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37016 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37017 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37018 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37020 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37022 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37023 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37024 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37025 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37026 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37027 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37029 int i, j, n;
37031 switch (mode)
37033 case V2SFmode:
37034 case V2SImode:
37035 if (mmx_ok)
37037 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37038 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37039 if (elt == 0)
37040 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37041 else
37042 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37043 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37044 return;
37046 break;
37048 case V2DImode:
37049 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37050 if (use_vec_merge)
37051 break;
37053 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37054 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37055 if (elt == 0)
37056 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37057 else
37058 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37059 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37060 return;
37062 case V2DFmode:
37064 rtx op0, op1;
37066 /* For the two element vectors, we implement a VEC_CONCAT with
37067 the extraction of the other element. */
37069 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37070 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37072 if (elt == 0)
37073 op0 = val, op1 = tmp;
37074 else
37075 op0 = tmp, op1 = val;
37077 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37078 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37080 return;
37082 case V4SFmode:
37083 use_vec_merge = TARGET_SSE4_1;
37084 if (use_vec_merge)
37085 break;
37087 switch (elt)
37089 case 0:
37090 use_vec_merge = true;
37091 break;
37093 case 1:
37094 /* tmp = target = A B C D */
37095 tmp = copy_to_reg (target);
37096 /* target = A A B B */
37097 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37098 /* target = X A B B */
37099 ix86_expand_vector_set (false, target, val, 0);
37100 /* target = A X C D */
37101 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37102 const1_rtx, const0_rtx,
37103 GEN_INT (2+4), GEN_INT (3+4)));
37104 return;
37106 case 2:
37107 /* tmp = target = A B C D */
37108 tmp = copy_to_reg (target);
37109 /* tmp = X B C D */
37110 ix86_expand_vector_set (false, tmp, val, 0);
37111 /* target = A B X D */
37112 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37113 const0_rtx, const1_rtx,
37114 GEN_INT (0+4), GEN_INT (3+4)));
37115 return;
37117 case 3:
37118 /* tmp = target = A B C D */
37119 tmp = copy_to_reg (target);
37120 /* tmp = X B C D */
37121 ix86_expand_vector_set (false, tmp, val, 0);
37122 /* target = A B C X */
37123 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37124 const0_rtx, const1_rtx,
37125 GEN_INT (2+4), GEN_INT (0+4)));
37126 return;
37128 default:
37129 gcc_unreachable ();
37131 break;
37133 case V4SImode:
37134 use_vec_merge = TARGET_SSE4_1;
37135 if (use_vec_merge)
37136 break;
37138 /* Element 0 handled by vec_merge below. */
37139 if (elt == 0)
37141 use_vec_merge = true;
37142 break;
37145 if (TARGET_SSE2)
37147 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37148 store into element 0, then shuffle them back. */
37150 rtx order[4];
37152 order[0] = GEN_INT (elt);
37153 order[1] = const1_rtx;
37154 order[2] = const2_rtx;
37155 order[3] = GEN_INT (3);
37156 order[elt] = const0_rtx;
37158 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37159 order[1], order[2], order[3]));
37161 ix86_expand_vector_set (false, target, val, 0);
37163 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37164 order[1], order[2], order[3]));
37166 else
37168 /* For SSE1, we have to reuse the V4SF code. */
37169 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
37170 gen_lowpart (SFmode, val), elt);
37172 return;
37174 case V8HImode:
37175 use_vec_merge = TARGET_SSE2;
37176 break;
37177 case V4HImode:
37178 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37179 break;
37181 case V16QImode:
37182 use_vec_merge = TARGET_SSE4_1;
37183 break;
37185 case V8QImode:
37186 break;
37188 case V32QImode:
37189 half_mode = V16QImode;
37190 j = 0;
37191 n = 16;
37192 goto half;
37194 case V16HImode:
37195 half_mode = V8HImode;
37196 j = 1;
37197 n = 8;
37198 goto half;
37200 case V8SImode:
37201 half_mode = V4SImode;
37202 j = 2;
37203 n = 4;
37204 goto half;
37206 case V4DImode:
37207 half_mode = V2DImode;
37208 j = 3;
37209 n = 2;
37210 goto half;
37212 case V8SFmode:
37213 half_mode = V4SFmode;
37214 j = 4;
37215 n = 4;
37216 goto half;
37218 case V4DFmode:
37219 half_mode = V2DFmode;
37220 j = 5;
37221 n = 2;
37222 goto half;
37224 half:
37225 /* Compute offset. */
37226 i = elt / n;
37227 elt %= n;
37229 gcc_assert (i <= 1);
37231 /* Extract the half. */
37232 tmp = gen_reg_rtx (half_mode);
37233 emit_insn (gen_extract[j][i] (tmp, target));
37235 /* Put val in tmp at elt. */
37236 ix86_expand_vector_set (false, tmp, val, elt);
37238 /* Put it back. */
37239 emit_insn (gen_insert[j][i] (target, target, tmp));
37240 return;
37242 default:
37243 break;
37246 if (use_vec_merge)
37248 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37249 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37250 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37252 else
37254 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37256 emit_move_insn (mem, target);
37258 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37259 emit_move_insn (tmp, val);
37261 emit_move_insn (target, mem);
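/* An illustrative sketch, not part of this file: the generic fallback above
   spills the whole vector to a stack temporary, overwrites the selected
   element in memory, and reloads the vector.  A plain array of floats
   stands in for a V4SF value; the function name is made up.  */
static void
set_element_via_memory_sketch (float *vec4, int elt, float val)
{
  float mem[4];
  int i;

  for (i = 0; i < 4; i++)       /* emit_move_insn (mem, target) */
    mem[i] = vec4[i];
  mem[elt] = val;               /* store VAL at offset elt * element size */
  for (i = 0; i < 4; i++)       /* emit_move_insn (target, mem) */
    vec4[i] = mem[i];
}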
37265 void
37266 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37268 enum machine_mode mode = GET_MODE (vec);
37269 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37270 bool use_vec_extr = false;
37271 rtx tmp;
37273 switch (mode)
37275 case V2SImode:
37276 case V2SFmode:
37277 if (!mmx_ok)
37278 break;
37279 /* FALLTHRU */
37281 case V2DFmode:
37282 case V2DImode:
37283 use_vec_extr = true;
37284 break;
37286 case V4SFmode:
37287 use_vec_extr = TARGET_SSE4_1;
37288 if (use_vec_extr)
37289 break;
37291 switch (elt)
37293 case 0:
37294 tmp = vec;
37295 break;
37297 case 1:
37298 case 3:
37299 tmp = gen_reg_rtx (mode);
37300 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37301 GEN_INT (elt), GEN_INT (elt),
37302 GEN_INT (elt+4), GEN_INT (elt+4)));
37303 break;
37305 case 2:
37306 tmp = gen_reg_rtx (mode);
37307 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37308 break;
37310 default:
37311 gcc_unreachable ();
37313 vec = tmp;
37314 use_vec_extr = true;
37315 elt = 0;
37316 break;
37318 case V4SImode:
37319 use_vec_extr = TARGET_SSE4_1;
37320 if (use_vec_extr)
37321 break;
37323 if (TARGET_SSE2)
37325 switch (elt)
37327 case 0:
37328 tmp = vec;
37329 break;
37331 case 1:
37332 case 3:
37333 tmp = gen_reg_rtx (mode);
37334 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37335 GEN_INT (elt), GEN_INT (elt),
37336 GEN_INT (elt), GEN_INT (elt)));
37337 break;
37339 case 2:
37340 tmp = gen_reg_rtx (mode);
37341 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37342 break;
37344 default:
37345 gcc_unreachable ();
37347 vec = tmp;
37348 use_vec_extr = true;
37349 elt = 0;
37351 else
37353 /* For SSE1, we have to reuse the V4SF code. */
37354 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37355 gen_lowpart (V4SFmode, vec), elt);
37356 return;
37358 break;
37360 case V8HImode:
37361 use_vec_extr = TARGET_SSE2;
37362 break;
37363 case V4HImode:
37364 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37365 break;
37367 case V16QImode:
37368 use_vec_extr = TARGET_SSE4_1;
37369 break;
37371 case V8SFmode:
37372 if (TARGET_AVX)
37374 tmp = gen_reg_rtx (V4SFmode);
37375 if (elt < 4)
37376 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37377 else
37378 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37379 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37380 return;
37382 break;
37384 case V4DFmode:
37385 if (TARGET_AVX)
37387 tmp = gen_reg_rtx (V2DFmode);
37388 if (elt < 2)
37389 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37390 else
37391 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37392 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37393 return;
37395 break;
37397 case V32QImode:
37398 if (TARGET_AVX)
37400 tmp = gen_reg_rtx (V16QImode);
37401 if (elt < 16)
37402 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37403 else
37404 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37405 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37406 return;
37408 break;
37410 case V16HImode:
37411 if (TARGET_AVX)
37413 tmp = gen_reg_rtx (V8HImode);
37414 if (elt < 8)
37415 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37416 else
37417 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37418 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37419 return;
37421 break;
37423 case V8SImode:
37424 if (TARGET_AVX)
37426 tmp = gen_reg_rtx (V4SImode);
37427 if (elt < 4)
37428 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37429 else
37430 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37431 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37432 return;
37434 break;
37436 case V4DImode:
37437 if (TARGET_AVX)
37439 tmp = gen_reg_rtx (V2DImode);
37440 if (elt < 2)
37441 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37442 else
37443 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37444 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37445 return;
37447 break;
37449 case V8QImode:
37450 /* ??? Could extract the appropriate HImode element and shift. */
37451 default:
37452 break;
37455 if (use_vec_extr)
37457 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37458 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37460 /* Let the rtl optimizers know about the zero extension performed. */
37461 if (inner_mode == QImode || inner_mode == HImode)
37463 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37464 target = gen_lowpart (SImode, target);
37467 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37469 else
37471 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37473 emit_move_insn (mem, vec);
37475 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37476 emit_move_insn (target, tmp);
37480 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37481 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37482 The upper bits of DEST are undefined, though they shouldn't cause
37483 exceptions (some bits from src or all zeros are ok). */
37485 static void
37486 emit_reduc_half (rtx dest, rtx src, int i)
37488 rtx tem;
37489 switch (GET_MODE (src))
37491 case V4SFmode:
37492 if (i == 128)
37493 tem = gen_sse_movhlps (dest, src, src);
37494 else
37495 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37496 GEN_INT (1 + 4), GEN_INT (1 + 4));
37497 break;
37498 case V2DFmode:
37499 tem = gen_vec_interleave_highv2df (dest, src, src);
37500 break;
37501 case V16QImode:
37502 case V8HImode:
37503 case V4SImode:
37504 case V2DImode:
37505 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37506 gen_lowpart (V1TImode, src),
37507 GEN_INT (i / 2));
37508 break;
37509 case V8SFmode:
37510 if (i == 256)
37511 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37512 else
37513 tem = gen_avx_shufps256 (dest, src, src,
37514 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37515 break;
37516 case V4DFmode:
37517 if (i == 256)
37518 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37519 else
37520 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37521 break;
37522 case V32QImode:
37523 case V16HImode:
37524 case V8SImode:
37525 case V4DImode:
37526 if (i == 256)
37527 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37528 gen_lowpart (V4DImode, src),
37529 gen_lowpart (V4DImode, src),
37530 const1_rtx);
37531 else
37532 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37533 gen_lowpart (V2TImode, src),
37534 GEN_INT (i / 2));
37535 break;
37536 default:
37537 gcc_unreachable ();
37539 emit_insn (tem);
37542 /* Expand a vector reduction. FN is the binary pattern to reduce;
37543 DEST is the destination; IN is the input vector. */
37545 void
37546 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37548 rtx half, dst, vec = in;
37549 enum machine_mode mode = GET_MODE (in);
37550 int i;
37552 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37553 if (TARGET_SSE4_1
37554 && mode == V8HImode
37555 && fn == gen_uminv8hi3)
37557 emit_insn (gen_sse4_1_phminposuw (dest, in));
37558 return;
37561 for (i = GET_MODE_BITSIZE (mode);
37562 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37563 i >>= 1)
37565 half = gen_reg_rtx (mode);
37566 emit_reduc_half (half, vec, i);
37567 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37568 dst = dest;
37569 else
37570 dst = gen_reg_rtx (mode);
37571 emit_insn (fn (dst, half, vec));
37572 vec = dst;
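/* An illustrative sketch, not part of this file: the loop above reduces a
   vector by repeatedly combining it with a copy whose upper half has been
   moved into the lower half, halving the active width each round until
   element 0 holds the result.  Modelled here for addition on an array of
   ints; the function name is made up.  */
static int
reduce_by_halves_sketch (int *v, int nelts)     /* nelts is a power of two */
{
  int width, i;

  for (width = nelts; width > 1; width >>= 1)
    /* emit_reduc_half brings elements [width/2, width) down; the binary
       operation (addition here) then combines the two halves.  */
    for (i = 0; i < width / 2; i++)
      v[i] = v[i] + v[i + width / 2];

  return v[0];
}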
37576 /* Target hook for scalar_mode_supported_p. */
37577 static bool
37578 ix86_scalar_mode_supported_p (enum machine_mode mode)
37580 if (DECIMAL_FLOAT_MODE_P (mode))
37581 return default_decimal_float_supported_p ();
37582 else if (mode == TFmode)
37583 return true;
37584 else
37585 return default_scalar_mode_supported_p (mode);
37588 /* Implements target hook vector_mode_supported_p. */
37589 static bool
37590 ix86_vector_mode_supported_p (enum machine_mode mode)
37592 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37593 return true;
37594 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37595 return true;
37596 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37597 return true;
37598 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37599 return true;
37600 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37601 return true;
37602 return false;
37605 /* Target hook for c_mode_for_suffix. */
37606 static enum machine_mode
37607 ix86_c_mode_for_suffix (char suffix)
37609 if (suffix == 'q')
37610 return TFmode;
37611 if (suffix == 'w')
37612 return XFmode;
37614 return VOIDmode;
37617 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37619 We do this in the new i386 backend to maintain source compatibility
37620 with the old cc0-based compiler. */
37622 static tree
37623 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37624 tree inputs ATTRIBUTE_UNUSED,
37625 tree clobbers)
37627 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37628 clobbers);
37629 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37630 clobbers);
37631 return clobbers;
37634 /* Implements target vector targetm.asm.encode_section_info. */
37636 static void ATTRIBUTE_UNUSED
37637 ix86_encode_section_info (tree decl, rtx rtl, int first)
37639 default_encode_section_info (decl, rtl, first);
37641 if (TREE_CODE (decl) == VAR_DECL
37642 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37643 && ix86_in_large_data_p (decl))
37644 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37647 /* Worker function for REVERSE_CONDITION. */
37649 enum rtx_code
37650 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37652 return (mode != CCFPmode && mode != CCFPUmode
37653 ? reverse_condition (code)
37654 : reverse_condition_maybe_unordered (code));
37657 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37658 to OPERANDS[0]. */
37660 const char *
37661 output_387_reg_move (rtx insn, rtx *operands)
37663 if (REG_P (operands[0]))
37665 if (REG_P (operands[1])
37666 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37668 if (REGNO (operands[0]) == FIRST_STACK_REG)
37669 return output_387_ffreep (operands, 0);
37670 return "fstp\t%y0";
37672 if (STACK_TOP_P (operands[0]))
37673 return "fld%Z1\t%y1";
37674 return "fst\t%y0";
37676 else if (MEM_P (operands[0]))
37678 gcc_assert (REG_P (operands[1]));
37679 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37680 return "fstp%Z0\t%y0";
37681 else
37683 /* There is no non-popping store to memory for XFmode.
37684 So if we need one, follow the store with a load. */
37685 if (GET_MODE (operands[0]) == XFmode)
37686 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37687 else
37688 return "fst%Z0\t%y0";
37691 else
37692 gcc_unreachable();
37695 /* Output code to perform a conditional jump to LABEL if the C2 flag in
37696 the FP status register is set. */
37698 void
37699 ix86_emit_fp_unordered_jump (rtx label)
37701 rtx reg = gen_reg_rtx (HImode);
37702 rtx temp;
37704 emit_insn (gen_x86_fnstsw_1 (reg));
37706 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37708 emit_insn (gen_x86_sahf_1 (reg));
37710 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37711 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37713 else
37715 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37717 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37718 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37721 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37722 gen_rtx_LABEL_REF (VOIDmode, label),
37723 pc_rtx);
37724 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37726 emit_jump_insn (temp);
37727 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37730 /* Output code to perform a log1p XFmode calculation. */
37732 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37734 rtx label1 = gen_label_rtx ();
37735 rtx label2 = gen_label_rtx ();
37737 rtx tmp = gen_reg_rtx (XFmode);
37738 rtx tmp2 = gen_reg_rtx (XFmode);
37739 rtx test;
37741 emit_insn (gen_absxf2 (tmp, op1));
37742 test = gen_rtx_GE (VOIDmode, tmp,
37743 CONST_DOUBLE_FROM_REAL_VALUE (
37744 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37745 XFmode));
37746 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37748 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37749 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37750 emit_jump (label2);
37752 emit_label (label1);
37753 emit_move_insn (tmp, CONST1_RTX (XFmode));
37754 emit_insn (gen_addxf3 (tmp, op1, tmp));
37755 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37756 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37758 emit_label (label2);
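/* An illustrative sketch, not part of this file: the branch above selects
   between fyl2xp1 (accurate only for small arguments) and fyl2x applied to
   1 + x.  The constant 0.2928932... is 1 - sqrt(2)/2, the documented limit
   of fyl2xp1's input range.  The function-pointer parameters are
   hypothetical stand-ins for the two x87 instruction sequences.  */
static long double
log1p_sketch (long double x,
              long double (*via_fyl2xp1) (long double),
              long double (*via_fyl2x) (long double))
{
  const long double limit = 0.29289321881345247561810596348408353L;

  if (x > -limit && x < limit)          /* |x| < 1 - sqrt(2)/2 */
    return via_fyl2xp1 (x);             /* log1p computed directly */
  return via_fyl2x (1.0L + x);          /* fall back to log (1 + x) */
}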
37761 /* Emit code for round calculation. */
37762 void ix86_emit_i387_round (rtx op0, rtx op1)
37764 enum machine_mode inmode = GET_MODE (op1);
37765 enum machine_mode outmode = GET_MODE (op0);
37766 rtx e1, e2, res, tmp, tmp1, half;
37767 rtx scratch = gen_reg_rtx (HImode);
37768 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37769 rtx jump_label = gen_label_rtx ();
37770 rtx insn;
37771 rtx (*gen_abs) (rtx, rtx);
37772 rtx (*gen_neg) (rtx, rtx);
37774 switch (inmode)
37776 case SFmode:
37777 gen_abs = gen_abssf2;
37778 break;
37779 case DFmode:
37780 gen_abs = gen_absdf2;
37781 break;
37782 case XFmode:
37783 gen_abs = gen_absxf2;
37784 break;
37785 default:
37786 gcc_unreachable ();
37789 switch (outmode)
37791 case SFmode:
37792 gen_neg = gen_negsf2;
37793 break;
37794 case DFmode:
37795 gen_neg = gen_negdf2;
37796 break;
37797 case XFmode:
37798 gen_neg = gen_negxf2;
37799 break;
37800 case HImode:
37801 gen_neg = gen_neghi2;
37802 break;
37803 case SImode:
37804 gen_neg = gen_negsi2;
37805 break;
37806 case DImode:
37807 gen_neg = gen_negdi2;
37808 break;
37809 default:
37810 gcc_unreachable ();
37813 e1 = gen_reg_rtx (inmode);
37814 e2 = gen_reg_rtx (inmode);
37815 res = gen_reg_rtx (outmode);
37817 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37819 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37821 /* scratch = fxam(op1) */
37822 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37823 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37824 UNSPEC_FXAM)));
37825 /* e1 = fabs(op1) */
37826 emit_insn (gen_abs (e1, op1));
37828 /* e2 = e1 + 0.5 */
37829 half = force_reg (inmode, half);
37830 emit_insn (gen_rtx_SET (VOIDmode, e2,
37831 gen_rtx_PLUS (inmode, e1, half)));
37833 /* res = floor(e2) */
37834 if (inmode != XFmode)
37836 tmp1 = gen_reg_rtx (XFmode);
37838 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37839 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37841 else
37842 tmp1 = e2;
37844 switch (outmode)
37846 case SFmode:
37847 case DFmode:
37849 rtx tmp0 = gen_reg_rtx (XFmode);
37851 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37853 emit_insn (gen_rtx_SET (VOIDmode, res,
37854 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37855 UNSPEC_TRUNC_NOOP)));
37857 break;
37858 case XFmode:
37859 emit_insn (gen_frndintxf2_floor (res, tmp1));
37860 break;
37861 case HImode:
37862 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37863 break;
37864 case SImode:
37865 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37866 break;
37867 case DImode:
37868 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37869 break;
37870 default:
37871 gcc_unreachable ();
37874 /* flags = signbit(a) */
37875 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37877 /* if (flags) then res = -res */
37878 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37879 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37880 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37881 pc_rtx);
37882 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37883 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37884 JUMP_LABEL (insn) = jump_label;
37886 emit_insn (gen_neg (res, res));
37888 emit_label (jump_label);
37889 LABEL_NUSES (jump_label) = 1;
37891 emit_move_insn (op0, res);
37894 /* Output code to perform a Newton-Raphson approximation of a single precision
37895 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37897 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37899 rtx x0, x1, e0, e1;
37901 x0 = gen_reg_rtx (mode);
37902 e0 = gen_reg_rtx (mode);
37903 e1 = gen_reg_rtx (mode);
37904 x1 = gen_reg_rtx (mode);
37906 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37908 b = force_reg (mode, b);
37910 /* x0 = rcp(b) estimate */
37911 emit_insn (gen_rtx_SET (VOIDmode, x0,
37912 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37913 UNSPEC_RCP)));
37914 /* e0 = x0 * b */
37915 emit_insn (gen_rtx_SET (VOIDmode, e0,
37916 gen_rtx_MULT (mode, x0, b)));
37918 /* e0 = x0 * e0 */
37919 emit_insn (gen_rtx_SET (VOIDmode, e0,
37920 gen_rtx_MULT (mode, x0, e0)));
37922 /* e1 = x0 + x0 */
37923 emit_insn (gen_rtx_SET (VOIDmode, e1,
37924 gen_rtx_PLUS (mode, x0, x0)));
37926 /* x1 = e1 - e0 */
37927 emit_insn (gen_rtx_SET (VOIDmode, x1,
37928 gen_rtx_MINUS (mode, e1, e0)));
37930 /* res = a * x1 */
37931 emit_insn (gen_rtx_SET (VOIDmode, res,
37932 gen_rtx_MULT (mode, a, x1)));
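/* An illustrative sketch, not part of this file: one Newton-Raphson step
   for the reciprocal, as emitted above.  X0 is the hardware rcpss estimate
   of 1/b, passed in as a parameter since plain C has no such instruction;
   the function name is made up.  */
static float
swdiv_sketch (float a, float b, float x0 /* ~ 1/b */)
{
  float e0 = x0 * b;            /* e0 = x0 * b           */
  float e1 = x0 + x0;           /* e1 = 2 * x0           */
  float x1;

  e0 = x0 * e0;                 /* e0 = b * x0 * x0      */
  x1 = e1 - e0;                 /* x1 = 2*x0 - b*x0*x0   */
  return a * x1;                /* res = a * x1 ~ a / b  */
}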
37935 /* Output code to perform a Newton-Raphson approximation of a
37936 single precision floating point [reciprocal] square root. */
37938 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37939 bool recip)
37941 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37942 REAL_VALUE_TYPE r;
37944 x0 = gen_reg_rtx (mode);
37945 e0 = gen_reg_rtx (mode);
37946 e1 = gen_reg_rtx (mode);
37947 e2 = gen_reg_rtx (mode);
37948 e3 = gen_reg_rtx (mode);
37950 real_from_integer (&r, VOIDmode, -3, -1, 0);
37951 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37953 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37954 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37956 if (VECTOR_MODE_P (mode))
37958 mthree = ix86_build_const_vector (mode, true, mthree);
37959 mhalf = ix86_build_const_vector (mode, true, mhalf);
37962 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37963 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37965 a = force_reg (mode, a);
37967 /* x0 = rsqrt(a) estimate */
37968 emit_insn (gen_rtx_SET (VOIDmode, x0,
37969 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37970 UNSPEC_RSQRT)));
37972 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
37973 if (!recip)
37975 rtx zero, mask;
37977 zero = gen_reg_rtx (mode);
37978 mask = gen_reg_rtx (mode);
37980 zero = force_reg (mode, CONST0_RTX(mode));
37981 emit_insn (gen_rtx_SET (VOIDmode, mask,
37982 gen_rtx_NE (mode, zero, a)));
37984 emit_insn (gen_rtx_SET (VOIDmode, x0,
37985 gen_rtx_AND (mode, x0, mask)));
37988 /* e0 = x0 * a */
37989 emit_insn (gen_rtx_SET (VOIDmode, e0,
37990 gen_rtx_MULT (mode, x0, a)));
37991 /* e1 = e0 * x0 */
37992 emit_insn (gen_rtx_SET (VOIDmode, e1,
37993 gen_rtx_MULT (mode, e0, x0)));
37995 /* e2 = e1 - 3. */
37996 mthree = force_reg (mode, mthree);
37997 emit_insn (gen_rtx_SET (VOIDmode, e2,
37998 gen_rtx_PLUS (mode, e1, mthree)));
38000 mhalf = force_reg (mode, mhalf);
38001 if (recip)
38002 /* e3 = -.5 * x0 */
38003 emit_insn (gen_rtx_SET (VOIDmode, e3,
38004 gen_rtx_MULT (mode, x0, mhalf)));
38005 else
38006 /* e3 = -.5 * e0 */
38007 emit_insn (gen_rtx_SET (VOIDmode, e3,
38008 gen_rtx_MULT (mode, e0, mhalf)));
38009 /* ret = e2 * e3 */
38010 emit_insn (gen_rtx_SET (VOIDmode, res,
38011 gen_rtx_MULT (mode, e2, e3)));
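/* An illustrative sketch, not part of this file: one Newton-Raphson step
   for rsqrt/sqrt as emitted above.  X0 is the hardware rsqrtss estimate of
   1/sqrt(a), passed in as a parameter since plain C has no equivalent; the
   zero-input masking done above for the sqrt case is omitted here, and the
   function name is made up.  */
static float
swsqrt_sketch (float a, float x0 /* ~ 1/sqrt(a) */, int recip)
{
  float e0 = x0 * a;                     /* e0 = a * x0              */
  float e1 = e0 * x0;                    /* e1 = a * x0 * x0         */
  float e2 = e1 - 3.0f;                  /* e2 = a*x0*x0 - 3         */
  float e3 = (recip ? x0 : e0) * -0.5f;  /* e3 = -.5*x0 or -.5*a*x0  */

  /* rsqrt(a) ~ -0.5 * x0 * (a*x0*x0 - 3);  sqrt(a) ~ -0.5 * a*x0 * (a*x0*x0 - 3).  */
  return e2 * e3;
}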
38014 #ifdef TARGET_SOLARIS
38015 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38017 static void
38018 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38019 tree decl)
38021 /* With Binutils 2.15, the "@unwind" marker must be specified on
38022 every occurrence of the ".eh_frame" section, not just the first
38023 one. */
38024 if (TARGET_64BIT
38025 && strcmp (name, ".eh_frame") == 0)
38027 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38028 flags & SECTION_WRITE ? "aw" : "a");
38029 return;
38032 #ifndef USE_GAS
38033 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38035 solaris_elf_asm_comdat_section (name, flags, decl);
38036 return;
38038 #endif
38040 default_elf_asm_named_section (name, flags, decl);
38042 #endif /* TARGET_SOLARIS */
38044 /* Return the mangling of TYPE if it is an extended fundamental type. */
38046 static const char *
38047 ix86_mangle_type (const_tree type)
38049 type = TYPE_MAIN_VARIANT (type);
38051 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38052 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38053 return NULL;
38055 switch (TYPE_MODE (type))
38057 case TFmode:
38058 /* __float128 is "g". */
38059 return "g";
38060 case XFmode:
38061 /* "long double" or __float80 is "e". */
38062 return "e";
38063 default:
38064 return NULL;
38068 /* For 32-bit code we can save PIC register setup by using
38069 __stack_chk_fail_local hidden function instead of calling
38070 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38071 register, so it is better to call __stack_chk_fail directly. */
38073 static tree ATTRIBUTE_UNUSED
38074 ix86_stack_protect_fail (void)
38076 return TARGET_64BIT
38077 ? default_external_stack_protect_fail ()
38078 : default_hidden_stack_protect_fail ();
38081 /* Select a format to encode pointers in exception handling data. CODE
38082 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38083 true if the symbol may be affected by dynamic relocations.
38085 ??? All x86 object file formats are capable of representing this.
38086 After all, the relocation needed is the same as for the call insn.
38087 Whether or not a particular assembler allows us to enter such, I
38088 guess we'll have to see. */
38090 asm_preferred_eh_data_format (int code, int global)
38092 if (flag_pic)
38094 int type = DW_EH_PE_sdata8;
38095 if (!TARGET_64BIT
38096 || ix86_cmodel == CM_SMALL_PIC
38097 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38098 type = DW_EH_PE_sdata4;
38099 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38101 if (ix86_cmodel == CM_SMALL
38102 || (ix86_cmodel == CM_MEDIUM && code))
38103 return DW_EH_PE_udata4;
38104 return DW_EH_PE_absptr;
38107 /* Expand copysign: copy the sign of SIGN onto the positive value ABS_VALUE,
38108 storing the result in RESULT. If MASK is non-null, it is a mask used to
38109 mask out the sign bit. */
38110 static void
38111 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38113 enum machine_mode mode = GET_MODE (sign);
38114 rtx sgn = gen_reg_rtx (mode);
38115 if (mask == NULL_RTX)
38117 enum machine_mode vmode;
38119 if (mode == SFmode)
38120 vmode = V4SFmode;
38121 else if (mode == DFmode)
38122 vmode = V2DFmode;
38123 else
38124 vmode = mode;
38126 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38127 if (!VECTOR_MODE_P (mode))
38129 /* We need to generate a scalar mode mask in this case. */
38130 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38131 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38132 mask = gen_reg_rtx (mode);
38133 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38136 else
38137 mask = gen_rtx_NOT (mode, mask);
38138 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38139 gen_rtx_AND (mode, mask, sign)));
38140 emit_insn (gen_rtx_SET (VOIDmode, result,
38141 gen_rtx_IOR (mode, abs_value, sgn)));
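/* An illustrative sketch, not part of this file: the AND/OR sequence above
   is the usual copysign bit trick - take the sign bit from SIGN and OR it
   into the (already positive) magnitude.  Shown here on the bit pattern of
   a double; the helper name is made up and assumes a 64-bit
   unsigned long long.  */
static double
copysign_to_positive_sketch (double abs_value, double sign)
{
  unsigned long long a, s;

  __builtin_memcpy (&a, &abs_value, sizeof a);
  __builtin_memcpy (&s, &sign, sizeof s);
  a |= s & 0x8000000000000000ULL;       /* sgn = mask & sign; res = abs | sgn */
  __builtin_memcpy (&abs_value, &a, sizeof abs_value);
  return abs_value;
}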
38144 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38145 mask for masking out the sign-bit is stored in *SMASK, if that is
38146 non-null. */
38147 static rtx
38148 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38150 enum machine_mode vmode, mode = GET_MODE (op0);
38151 rtx xa, mask;
38153 xa = gen_reg_rtx (mode);
38154 if (mode == SFmode)
38155 vmode = V4SFmode;
38156 else if (mode == DFmode)
38157 vmode = V2DFmode;
38158 else
38159 vmode = mode;
38160 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38161 if (!VECTOR_MODE_P (mode))
38163 /* We need to generate a scalar mode mask in this case. */
38164 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38165 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38166 mask = gen_reg_rtx (mode);
38167 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38169 emit_insn (gen_rtx_SET (VOIDmode, xa,
38170 gen_rtx_AND (mode, op0, mask)));
38172 if (smask)
38173 *smask = mask;
38175 return xa;
38178 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38179 swapping the operands if SWAP_OPERANDS is true. The expanded
38180 code is a forward jump to a newly created label in case the
38181 comparison is true. The generated label rtx is returned. */
38182 static rtx
38183 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38184 bool swap_operands)
38186 rtx label, tmp;
38188 if (swap_operands)
38190 tmp = op0;
38191 op0 = op1;
38192 op1 = tmp;
38195 label = gen_label_rtx ();
38196 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
38197 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38198 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
38199 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38200 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38201 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38202 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38203 JUMP_LABEL (tmp) = label;
38205 return label;
38208 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38209 using comparison code CODE. Operands are swapped for the comparison if
38210 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38211 static rtx
38212 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38213 bool swap_operands)
38215 rtx (*insn)(rtx, rtx, rtx, rtx);
38216 enum machine_mode mode = GET_MODE (op0);
38217 rtx mask = gen_reg_rtx (mode);
38219 if (swap_operands)
38221 rtx tmp = op0;
38222 op0 = op1;
38223 op1 = tmp;
38226 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38228 emit_insn (insn (mask, op0, op1,
38229 gen_rtx_fmt_ee (code, mode, op0, op1)));
38230 return mask;
38233 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38234 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
38235 static rtx
38236 ix86_gen_TWO52 (enum machine_mode mode)
38238 REAL_VALUE_TYPE TWO52r;
38239 rtx TWO52;
38241 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38242 TWO52 = const_double_from_real_value (TWO52r, mode);
38243 TWO52 = force_reg (mode, TWO52);
38245 return TWO52;
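/* An illustrative sketch, not part of this file: adding and then
   subtracting 2**52 forces a double already known to be below 2**52 in
   magnitude to an integer value, rounded by the FPU's current rounding
   mode; that is what the callers below rely on.  The sketch assumes no
   excess precision (SSE arithmetic, not x87); the volatile qualifier only
   keeps the trick from being folded away in this standalone sketch.  */
static double
round_via_two52_sketch (double xa)      /* requires 0 <= xa < 2**52 */
{
  volatile double t = xa + 4503599627370496.0;  /* xa + 2**52 */
  return t - 4503599627370496.0;                /* xa rounded to an integer */
}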
38248 /* Expand SSE sequence for computing lround from OP1 storing
38249 into OP0. */
38250 void
38251 ix86_expand_lround (rtx op0, rtx op1)
38253 /* C code for the stuff we're doing below:
38254 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38255 return (long)tmp;
38257 enum machine_mode mode = GET_MODE (op1);
38258 const struct real_format *fmt;
38259 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38260 rtx adj;
38262 /* load nextafter (0.5, 0.0) */
38263 fmt = REAL_MODE_FORMAT (mode);
38264 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38265 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38267 /* adj = copysign (0.5, op1) */
38268 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38269 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38271 /* adj = op1 + adj */
38272 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38274 /* op0 = (imode)adj */
38275 expand_fix (op0, adj, 0);
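/* An illustrative sketch, not part of this file: the constant loaded above
   is nextafter (0.5, 0.0), the largest double strictly below 0.5.  Adding
   exactly 0.5 would make lround (0.49999999999999994) return 1, because
   that addition rounds up to 1.0 under round-to-nearest-even; the slightly
   smaller constant avoids pushing such values over.  The helper name is
   made up and assumes C99 hex float literals.  */
static double
add_pred_half_sketch (double x)
{
  const double pred_half = 0x1.fffffffffffffp-2;  /* nextafter (0.5, 0.0) */
  return x + pred_half;         /* then converted to an integer (lround) */
}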
38278 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
38279 into OPERAND0. */
38280 void
38281 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38283 /* C code for the stuff we're doing below (for do_floor):
38284 xi = (long)op1;
38285 xi -= (double)xi > op1 ? 1 : 0;
38286 return xi;
38288 enum machine_mode fmode = GET_MODE (op1);
38289 enum machine_mode imode = GET_MODE (op0);
38290 rtx ireg, freg, label, tmp;
38292 /* reg = (long)op1 */
38293 ireg = gen_reg_rtx (imode);
38294 expand_fix (ireg, op1, 0);
38296 /* freg = (double)reg */
38297 freg = gen_reg_rtx (fmode);
38298 expand_float (freg, ireg, 0);
38300 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38301 label = ix86_expand_sse_compare_and_jump (UNLE,
38302 freg, op1, !do_floor);
38303 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38304 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38305 emit_move_insn (ireg, tmp);
38307 emit_label (label);
38308 LABEL_NUSES (label) = 1;
38310 emit_move_insn (op0, ireg);
38313 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38314 result in OPERAND0. */
38315 void
38316 ix86_expand_rint (rtx operand0, rtx operand1)
38318 /* C code for the stuff we're doing below:
38319 xa = fabs (operand1);
38320 if (!isless (xa, 2**52))
38321 return operand1;
38322 xa = xa + 2**52 - 2**52;
38323 return copysign (xa, operand1);
38325 enum machine_mode mode = GET_MODE (operand0);
38326 rtx res, xa, label, TWO52, mask;
38328 res = gen_reg_rtx (mode);
38329 emit_move_insn (res, operand1);
38331 /* xa = abs (operand1) */
38332 xa = ix86_expand_sse_fabs (res, &mask);
38334 /* if (!isless (xa, TWO52)) goto label; */
38335 TWO52 = ix86_gen_TWO52 (mode);
38336 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38338 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38339 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38341 ix86_sse_copysign_to_positive (res, xa, res, mask);
38343 emit_label (label);
38344 LABEL_NUSES (label) = 1;
38346 emit_move_insn (operand0, res);
38349 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38350 into OPERAND0. */
38351 void
38352 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38354 /* C code for the stuff we expand below.
38355 double xa = fabs (x), x2;
38356 if (!isless (xa, TWO52))
38357 return x;
38358 xa = xa + TWO52 - TWO52;
38359 x2 = copysign (xa, x);
38360 Compensate. Floor:
38361 if (x2 > x)
38362 x2 -= 1;
38363 Compensate. Ceil:
38364 if (x2 < x)
38365 x2 -= -1;
38366 return x2;
38368 enum machine_mode mode = GET_MODE (operand0);
38369 rtx xa, TWO52, tmp, label, one, res, mask;
38371 TWO52 = ix86_gen_TWO52 (mode);
38373 /* Temporary for holding the result, initialized to the input
38374 operand to ease control flow. */
38375 res = gen_reg_rtx (mode);
38376 emit_move_insn (res, operand1);
38378 /* xa = abs (operand1) */
38379 xa = ix86_expand_sse_fabs (res, &mask);
38381 /* if (!isless (xa, TWO52)) goto label; */
38382 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38384 /* xa = xa + TWO52 - TWO52; */
38385 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38386 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38388 /* xa = copysign (xa, operand1) */
38389 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38391 /* generate 1.0 or -1.0 */
38392 one = force_reg (mode,
38393 const_double_from_real_value (do_floor
38394 ? dconst1 : dconstm1, mode));
38396 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38397 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38398 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38399 gen_rtx_AND (mode, one, tmp)));
38400 /* We always need to subtract here to preserve signed zero. */
38401 tmp = expand_simple_binop (mode, MINUS,
38402 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38403 emit_move_insn (res, tmp);
38405 emit_label (label);
38406 LABEL_NUSES (label) = 1;
38408 emit_move_insn (operand0, res);
38411 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38412 into OPERAND0. */
38413 void
38414 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38416 /* C code for the stuff we expand below.
38417 double xa = fabs (x), x2;
38418 if (!isless (xa, TWO52))
38419 return x;
38420 x2 = (double)(long)x;
38421 Compensate. Floor:
38422 if (x2 > x)
38423 x2 -= 1;
38424 Compensate. Ceil:
38425 if (x2 < x)
38426 x2 += 1;
38427 if (HONOR_SIGNED_ZEROS (mode))
38428 return copysign (x2, x);
38429 return x2;
38431 enum machine_mode mode = GET_MODE (operand0);
38432 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38434 TWO52 = ix86_gen_TWO52 (mode);
38436 /* Temporary for holding the result, initialized to the input
38437 operand to ease control flow. */
38438 res = gen_reg_rtx (mode);
38439 emit_move_insn (res, operand1);
38441 /* xa = abs (operand1) */
38442 xa = ix86_expand_sse_fabs (res, &mask);
38444 /* if (!isless (xa, TWO52)) goto label; */
38445 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38447 /* xa = (double)(long)x */
38448 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38449 expand_fix (xi, res, 0);
38450 expand_float (xa, xi, 0);
38452 /* generate 1.0 */
38453 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38455 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38456 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38457 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38458 gen_rtx_AND (mode, one, tmp)));
38459 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38460 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38461 emit_move_insn (res, tmp);
38463 if (HONOR_SIGNED_ZEROS (mode))
38464 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38466 emit_label (label);
38467 LABEL_NUSES (label) = 1;
38469 emit_move_insn (operand0, res);
38472 /* Expand SSE sequence for computing round from OPERAND1 storing
38473 into OPERAND0. A sequence that works without relying on DImode truncation
38474 via cvttsd2siq, which is only available on 64-bit targets. */
38475 void
38476 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38478 /* C code for the stuff we expand below.
38479 double xa = fabs (x), xa2, x2;
38480 if (!isless (xa, TWO52))
38481 return x;
38482 Using the absolute value and copying back sign makes
38483 -0.0 -> -0.0 correct.
38484 xa2 = xa + TWO52 - TWO52;
38485 Compensate.
38486 dxa = xa2 - xa;
38487 if (dxa <= -0.5)
38488 xa2 += 1;
38489 else if (dxa > 0.5)
38490 xa2 -= 1;
38491 x2 = copysign (xa2, x);
38492 return x2;
38494 enum machine_mode mode = GET_MODE (operand0);
38495 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38497 TWO52 = ix86_gen_TWO52 (mode);
38499 /* Temporary for holding the result, initialized to the input
38500 operand to ease control flow. */
38501 res = gen_reg_rtx (mode);
38502 emit_move_insn (res, operand1);
38504 /* xa = abs (operand1) */
38505 xa = ix86_expand_sse_fabs (res, &mask);
38507 /* if (!isless (xa, TWO52)) goto label; */
38508 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38510 /* xa2 = xa + TWO52 - TWO52; */
38511 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38512 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38514 /* dxa = xa2 - xa; */
38515 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38517 /* generate 0.5, 1.0 and -0.5 */
38518 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38519 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38520 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38521 0, OPTAB_DIRECT);
38523 /* Compensate. */
38524 tmp = gen_reg_rtx (mode);
38525 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38526 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38527 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38528 gen_rtx_AND (mode, one, tmp)));
38529 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38530 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38531 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38532 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38533 gen_rtx_AND (mode, one, tmp)));
38534 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38536 /* res = copysign (xa2, operand1) */
38537 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38539 emit_label (label);
38540 LABEL_NUSES (label) = 1;
38542 emit_move_insn (operand0, res);
38545 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38546 into OPERAND0. */
38547 void
38548 ix86_expand_trunc (rtx operand0, rtx operand1)
38550 /* C code for SSE variant we expand below.
38551 double xa = fabs (x), x2;
38552 if (!isless (xa, TWO52))
38553 return x;
38554 x2 = (double)(long)x;
38555 if (HONOR_SIGNED_ZEROS (mode))
38556 return copysign (x2, x);
38557 return x2;
38559 enum machine_mode mode = GET_MODE (operand0);
38560 rtx xa, xi, TWO52, label, res, mask;
38562 TWO52 = ix86_gen_TWO52 (mode);
38564 /* Temporary for holding the result, initialized to the input
38565 operand to ease control flow. */
38566 res = gen_reg_rtx (mode);
38567 emit_move_insn (res, operand1);
38569 /* xa = abs (operand1) */
38570 xa = ix86_expand_sse_fabs (res, &mask);
38572 /* if (!isless (xa, TWO52)) goto label; */
38573 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38575 /* x = (double)(long)x */
38576 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38577 expand_fix (xi, res, 0);
38578 expand_float (res, xi, 0);
38580 if (HONOR_SIGNED_ZEROS (mode))
38581 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38583 emit_label (label);
38584 LABEL_NUSES (label) = 1;
38586 emit_move_insn (operand0, res);
38589 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38590 into OPERAND0, without relying on DImode truncation (for 32-bit targets). */
38591 void
38592 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38594 enum machine_mode mode = GET_MODE (operand0);
38595 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38597 /* C code for SSE variant we expand below.
38598 double xa = fabs (x), x2;
38599 if (!isless (xa, TWO52))
38600 return x;
38601 xa2 = xa + TWO52 - TWO52;
38602 Compensate:
38603 if (xa2 > xa)
38604 xa2 -= 1.0;
38605 x2 = copysign (xa2, x);
38606 return x2;
38609 TWO52 = ix86_gen_TWO52 (mode);
38611 /* Temporary for holding the result, initialized to the input
38612 operand to ease control flow. */
38613 res = gen_reg_rtx (mode);
38614 emit_move_insn (res, operand1);
38616 /* xa = abs (operand1) */
38617 xa = ix86_expand_sse_fabs (res, &smask);
38619 /* if (!isless (xa, TWO52)) goto label; */
38620 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38622 /* res = xa + TWO52 - TWO52; */
38623 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38624 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38625 emit_move_insn (res, tmp);
38627 /* generate 1.0 */
38628 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38630 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38631 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38632 emit_insn (gen_rtx_SET (VOIDmode, mask,
38633 gen_rtx_AND (mode, mask, one)));
38634 tmp = expand_simple_binop (mode, MINUS,
38635 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38636 emit_move_insn (res, tmp);
38638 /* res = copysign (res, operand1) */
38639 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38641 emit_label (label);
38642 LABEL_NUSES (label) = 1;
38644 emit_move_insn (operand0, res);
38647 /* Expand SSE sequence for computing round from OPERAND1 storing
38648 into OPERAND0. */
38649 void
38650 ix86_expand_round (rtx operand0, rtx operand1)
38652 /* C code for the stuff we're doing below:
38653 double xa = fabs (x);
38654 if (!isless (xa, TWO52))
38655 return x;
38656 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38657 return copysign (xa, x);
38659 enum machine_mode mode = GET_MODE (operand0);
38660 rtx res, TWO52, xa, label, xi, half, mask;
38661 const struct real_format *fmt;
38662 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38664 /* Temporary for holding the result, initialized to the input
38665 operand to ease control flow. */
38666 res = gen_reg_rtx (mode);
38667 emit_move_insn (res, operand1);
38669 TWO52 = ix86_gen_TWO52 (mode);
38670 xa = ix86_expand_sse_fabs (res, &mask);
38671 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38673 /* load nextafter (0.5, 0.0) */
38674 fmt = REAL_MODE_FORMAT (mode);
38675 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38676 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38678 /* xa = xa + 0.5 */
38679 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38680 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38682 /* xa = (double)(int64_t)xa */
38683 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38684 expand_fix (xi, xa, 0);
38685 expand_float (xa, xi, 0);
38687 /* res = copysign (xa, operand1) */
38688 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38690 emit_label (label);
38691 LABEL_NUSES (label) = 1;
38693 emit_move_insn (operand0, res);
38696 /* Expand SSE sequence for computing round
38697 from OP1 storing into OP0 using sse4 round insn. */
38698 void
38699 ix86_expand_round_sse4 (rtx op0, rtx op1)
38701 enum machine_mode mode = GET_MODE (op0);
38702 rtx e1, e2, res, half;
38703 const struct real_format *fmt;
38704 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38705 rtx (*gen_copysign) (rtx, rtx, rtx);
38706 rtx (*gen_round) (rtx, rtx, rtx);
38708 switch (mode)
38710 case SFmode:
38711 gen_copysign = gen_copysignsf3;
38712 gen_round = gen_sse4_1_roundsf2;
38713 break;
38714 case DFmode:
38715 gen_copysign = gen_copysigndf3;
38716 gen_round = gen_sse4_1_rounddf2;
38717 break;
38718 default:
38719 gcc_unreachable ();
38722 /* round (a) = trunc (a + copysign (0.5, a)) */
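/* A worked example of this identity, with pred_half (the predecessor of
   0.5, loaded below) playing the role of 0.5 and assuming the default
   round-to-nearest mode:
   round (2.3): trunc (2.3 + 0.4999...) = trunc (2.7999...) = 2.0
   round (-2.5): -2.5 + (-0.4999...) rounds to -3.0 in the addition and
   trunc (-3.0) = -3.0, so halfway cases round away from zero.  */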
38724 /* load nextafter (0.5, 0.0) */
38725 fmt = REAL_MODE_FORMAT (mode);
38726 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38727 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38728 half = const_double_from_real_value (pred_half, mode);
38730 /* e1 = copysign (0.5, op1) */
38731 e1 = gen_reg_rtx (mode);
38732 emit_insn (gen_copysign (e1, half, op1));
38734 /* e2 = op1 + e1 */
38735 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38737 /* res = trunc (e2) */
38738 res = gen_reg_rtx (mode);
38739 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38741 emit_move_insn (op0, res);
38745 /* Table of valid machine attributes. */
38746 static const struct attribute_spec ix86_attribute_table[] =
38748 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38749 affects_type_identity } */
38750 /* Stdcall attribute says callee is responsible for popping arguments
38751 if they are not variable. */
38752 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38753 true },
38754 /* Fastcall attribute says callee is responsible for popping arguments
38755 if they are not variable. */
38756 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38757 true },
38758 /* Thiscall attribute says callee is responsible for popping arguments
38759 if they are not variable. */
38760 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38761 true },
38762 /* Cdecl attribute says the callee is a normal C declaration */
38763 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38764 true },
38765 /* Regparm attribute specifies how many integer arguments are to be
38766 passed in registers. */
38767 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38768 true },
38769 /* Sseregparm attribute says we are using x86_64 calling conventions
38770 for FP arguments. */
38771 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38772 true },
38773 /* The transactional memory builtins are implicitly regparm or fastcall
38774 depending on the ABI. Override the generic do-nothing attribute that
38775 these builtins were declared with. */
38776 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38777 true },
38778 /* force_align_arg_pointer says this function realigns the stack at entry. */
38779 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38780 false, true, true, ix86_handle_cconv_attribute, false },
38781 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38782 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38783 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38784 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38785 false },
38786 #endif
38787 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38788 false },
38789 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38790 false },
38791 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38792 SUBTARGET_ATTRIBUTE_TABLE,
38793 #endif
38794 /* ms_abi and sysv_abi calling convention function attributes. */
38795 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38796 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38797 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38798 false },
38799 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38800 ix86_handle_callee_pop_aggregate_return, true },
38801 /* End element. */
38802 { NULL, 0, 0, false, false, false, NULL, false }
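/* Illustrative usage (not part of the table): these attributes are applied
   to function or type declarations in user code, e.g.

     int __attribute__((regparm (2))) f (int a, int b);
     int __attribute__((fastcall)) g (int a, int b);
     struct s { char c; int i; } __attribute__((ms_struct));

   The handler functions listed above validate the arguments and record the
   information on the decl or type.  */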
38805 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38806 static int
38807 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38808 tree vectype,
38809 int misalign ATTRIBUTE_UNUSED)
38811 unsigned elements;
38813 switch (type_of_cost)
38815 case scalar_stmt:
38816 return ix86_cost->scalar_stmt_cost;
38818 case scalar_load:
38819 return ix86_cost->scalar_load_cost;
38821 case scalar_store:
38822 return ix86_cost->scalar_store_cost;
38824 case vector_stmt:
38825 return ix86_cost->vec_stmt_cost;
38827 case vector_load:
38828 return ix86_cost->vec_align_load_cost;
38830 case vector_store:
38831 return ix86_cost->vec_store_cost;
38833 case vec_to_scalar:
38834 return ix86_cost->vec_to_scalar_cost;
38836 case scalar_to_vec:
38837 return ix86_cost->scalar_to_vec_cost;
38839 case unaligned_load:
38840 case unaligned_store:
38841 return ix86_cost->vec_unalign_load_cost;
38843 case cond_branch_taken:
38844 return ix86_cost->cond_taken_branch_cost;
38846 case cond_branch_not_taken:
38847 return ix86_cost->cond_not_taken_branch_cost;
38849 case vec_perm:
38850 case vec_promote_demote:
38851 return ix86_cost->vec_stmt_cost;
38853 case vec_construct:
38854 elements = TYPE_VECTOR_SUBPARTS (vectype);
38855 return elements / 2 + 1;
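/* E.g. constructing a V4SF from four scalars is costed as 4 / 2 + 1 = 3,
   a rough stand-in for the short chain of insert/unpack instructions such
   a construction typically needs.  */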
38857 default:
38858 gcc_unreachable ();
38862 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38863 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38864 insn every time. */
38866 static GTY(()) rtx vselect_insn;
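/* How the cache is used (see expand_vselect below): the PARALLEL, the
   operands and the mode of this insn are overwritten in place for each
   candidate permutation, recog_memoized then reports whether some pattern
   in the active ISA matches, a copy of the pattern is emitted only when we
   are not merely testing, and finally the placeholders are restored and
   INSN_CODE reset so the insn can be reused.  */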
38868 /* Initialize vselect_insn. */
38870 static void
38871 init_vselect_insn (void)
38873 unsigned i;
38874 rtx x;
38876 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38877 for (i = 0; i < MAX_VECT_LEN; ++i)
38878 XVECEXP (x, 0, i) = const0_rtx;
38879 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38880 const0_rtx), x);
38881 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38882 start_sequence ();
38883 vselect_insn = emit_insn (x);
38884 end_sequence ();
38887 /* Construct (set target (vec_select op0 (parallel perm))) and
38888 return true if that's a valid instruction in the active ISA. */
38890 static bool
38891 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38892 unsigned nelt, bool testing_p)
38894 unsigned int i;
38895 rtx x, save_vconcat;
38896 int icode;
38898 if (vselect_insn == NULL_RTX)
38899 init_vselect_insn ();
38901 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38902 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38903 for (i = 0; i < nelt; ++i)
38904 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38905 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38906 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38907 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38908 SET_DEST (PATTERN (vselect_insn)) = target;
38909 icode = recog_memoized (vselect_insn);
38911 if (icode >= 0 && !testing_p)
38912 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38914 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38915 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38916 INSN_CODE (vselect_insn) = -1;
38918 return icode >= 0;
38921 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38923 static bool
38924 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38925 const unsigned char *perm, unsigned nelt,
38926 bool testing_p)
38928 enum machine_mode v2mode;
38929 rtx x;
38930 bool ok;
38932 if (vselect_insn == NULL_RTX)
38933 init_vselect_insn ();
38935 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38936 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38937 PUT_MODE (x, v2mode);
38938 XEXP (x, 0) = op0;
38939 XEXP (x, 1) = op1;
38940 ok = expand_vselect (target, x, perm, nelt, testing_p);
38941 XEXP (x, 0) = const0_rtx;
38942 XEXP (x, 1) = const0_rtx;
38943 return ok;
38946 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38947 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38949 static bool
38950 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38952 enum machine_mode vmode = d->vmode;
38953 unsigned i, mask, nelt = d->nelt;
38954 rtx target, op0, op1, x;
38955 rtx rperm[32], vperm;
38957 if (d->one_operand_p)
38958 return false;
38959 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38961 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38963 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38965 else
38966 return false;
38968 /* This is a blend, not a permute. Elements must stay in their
38969 respective lanes. */
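/* For example, the V8HImode permutation { 0, 9, 2, 11, 4, 13, 6, 15 }
   qualifies: element i is taken from position i of either op0 or op1,
   and the loop below turns it into the pblendw immediate 0xaa.  */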
38970 for (i = 0; i < nelt; ++i)
38972 unsigned e = d->perm[i];
38973 if (!(e == i || e == i + nelt))
38974 return false;
38977 if (d->testing_p)
38978 return true;
38980 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38981 decision should be extracted elsewhere, so that we only try that
38982 sequence once all budget==3 options have been tried. */
38983 target = d->target;
38984 op0 = d->op0;
38985 op1 = d->op1;
38986 mask = 0;
38988 switch (vmode)
38990 case V4DFmode:
38991 case V8SFmode:
38992 case V2DFmode:
38993 case V4SFmode:
38994 case V8HImode:
38995 case V8SImode:
38996 for (i = 0; i < nelt; ++i)
38997 mask |= (d->perm[i] >= nelt) << i;
38998 break;
39000 case V2DImode:
39001 for (i = 0; i < 2; ++i)
39002 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39003 vmode = V8HImode;
39004 goto do_subreg;
39006 case V4SImode:
39007 for (i = 0; i < 4; ++i)
39008 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39009 vmode = V8HImode;
39010 goto do_subreg;
39012 case V16QImode:
39013 /* See if bytes move in pairs so we can use pblendw with
39014 an immediate argument, rather than pblendvb with a vector
39015 argument. */
39016 for (i = 0; i < 16; i += 2)
39017 if (d->perm[i] + 1 != d->perm[i + 1])
39019 use_pblendvb:
39020 for (i = 0; i < nelt; ++i)
39021 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39023 finish_pblendvb:
39024 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39025 vperm = force_reg (vmode, vperm);
39027 if (GET_MODE_SIZE (vmode) == 16)
39028 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39029 else
39030 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39031 return true;
39034 for (i = 0; i < 8; ++i)
39035 mask |= (d->perm[i * 2] >= 16) << i;
39036 vmode = V8HImode;
39037 /* FALLTHRU */
39039 do_subreg:
39040 target = gen_lowpart (vmode, target);
39041 op0 = gen_lowpart (vmode, op0);
39042 op1 = gen_lowpart (vmode, op1);
39043 break;
39045 case V32QImode:
39046 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39047 for (i = 0; i < 32; i += 2)
39048 if (d->perm[i] + 1 != d->perm[i + 1])
39049 goto use_pblendvb;
39050 /* See if bytes move in quadruplets. If yes, vpblendd
39051 with immediate can be used. */
39052 for (i = 0; i < 32; i += 4)
39053 if (d->perm[i] + 2 != d->perm[i + 2])
39054 break;
39055 if (i < 32)
39057 /* See if bytes move the same in both lanes. If yes,
39058 vpblendw with immediate can be used. */
39059 for (i = 0; i < 16; i += 2)
39060 if (d->perm[i] + 16 != d->perm[i + 16])
39061 goto use_pblendvb;
39063 /* Use vpblendw. */
39064 for (i = 0; i < 16; ++i)
39065 mask |= (d->perm[i * 2] >= 32) << i;
39066 vmode = V16HImode;
39067 goto do_subreg;
39070 /* Use vpblendd. */
39071 for (i = 0; i < 8; ++i)
39072 mask |= (d->perm[i * 4] >= 32) << i;
39073 vmode = V8SImode;
39074 goto do_subreg;
39076 case V16HImode:
39077 /* See if words move in pairs. If yes, vpblendd can be used. */
39078 for (i = 0; i < 16; i += 2)
39079 if (d->perm[i] + 1 != d->perm[i + 1])
39080 break;
39081 if (i < 16)
39083 /* See if words move the same in both lanes. If not,
39084 vpblendvb must be used. */
39085 for (i = 0; i < 8; i++)
39086 if (d->perm[i] + 8 != d->perm[i + 8])
39088 /* Use vpblendvb. */
39089 for (i = 0; i < 32; ++i)
39090 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39092 vmode = V32QImode;
39093 nelt = 32;
39094 target = gen_lowpart (vmode, target);
39095 op0 = gen_lowpart (vmode, op0);
39096 op1 = gen_lowpart (vmode, op1);
39097 goto finish_pblendvb;
39100 /* Use vpblendw. */
39101 for (i = 0; i < 16; ++i)
39102 mask |= (d->perm[i] >= 16) << i;
39103 break;
39106 /* Use vpblendd. */
39107 for (i = 0; i < 8; ++i)
39108 mask |= (d->perm[i * 2] >= 16) << i;
39109 vmode = V8SImode;
39110 goto do_subreg;
39112 case V4DImode:
39113 /* Use vpblendd. */
39114 for (i = 0; i < 4; ++i)
39115 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39116 vmode = V8SImode;
39117 goto do_subreg;
39119 default:
39120 gcc_unreachable ();
39123 /* This matches five different patterns with the different modes. */
39124 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39125 x = gen_rtx_SET (VOIDmode, target, x);
39126 emit_insn (x);
39128 return true;
39131 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39132 in terms of the variable form of vpermilps.
39134 Note that we will have already failed the immediate input vpermilps,
39135 which requires that the high and low part shuffle be identical; the
39136 variable form doesn't require that. */
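/* The control vector built below gives each element an in-lane selector;
   the variable vpermilps form consults only the low two bits of each
   control dword, so every result element is taken from within its own
   128-bit lane.  */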
39138 static bool
39139 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39141 rtx rperm[8], vperm;
39142 unsigned i;
39144 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39145 return false;
39147 /* We can only permute within the 128-bit lane. */
39148 for (i = 0; i < 8; ++i)
39150 unsigned e = d->perm[i];
39151 if (i < 4 ? e >= 4 : e < 4)
39152 return false;
39155 if (d->testing_p)
39156 return true;
39158 for (i = 0; i < 8; ++i)
39160 unsigned e = d->perm[i];
39162 /* Within each 128-bit lane, the elements of op0 are numbered
39163 from 0 and the elements of op1 are numbered from 4. */
39164 if (e >= 8 + 4)
39165 e -= 8;
39166 else if (e >= 4)
39167 e -= 4;
39169 rperm[i] = GEN_INT (e);
39172 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39173 vperm = force_reg (V8SImode, vperm);
39174 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39176 return true;
39179 /* Return true if permutation D can be performed as VMODE permutation
39180 instead. */
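/* For example, the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3,
   12, 13, 14, 15, 8, 9, 10, 11 } moves whole aligned 4-byte chunks and is
   therefore also expressible as the V4SImode permutation { 1, 0, 3, 2 }.  */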
39182 static bool
39183 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39185 unsigned int i, j, chunk;
39187 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39188 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39189 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39190 return false;
39192 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39193 return true;
39195 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39196 for (i = 0; i < d->nelt; i += chunk)
39197 if (d->perm[i] & (chunk - 1))
39198 return false;
39199 else
39200 for (j = 1; j < chunk; ++j)
39201 if (d->perm[i] + j != d->perm[i + j])
39202 return false;
39204 return true;
39207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39208 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39210 static bool
39211 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39213 unsigned i, nelt, eltsz, mask;
39214 unsigned char perm[32];
39215 enum machine_mode vmode = V16QImode;
39216 rtx rperm[32], vperm, target, op0, op1;
39218 nelt = d->nelt;
39220 if (!d->one_operand_p)
39222 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39224 if (TARGET_AVX2
39225 && valid_perm_using_mode_p (V2TImode, d))
39227 if (d->testing_p)
39228 return true;
39230 /* Use vperm2i128 insn. The pattern uses
39231 V4DImode instead of V2TImode. */
39232 target = gen_lowpart (V4DImode, d->target);
39233 op0 = gen_lowpart (V4DImode, d->op0);
39234 op1 = gen_lowpart (V4DImode, d->op1);
39235 rperm[0]
39236 = GEN_INT ((d->perm[0] / (nelt / 2))
39237 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
39238 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39239 return true;
39241 return false;
39244 else
39246 if (GET_MODE_SIZE (d->vmode) == 16)
39248 if (!TARGET_SSSE3)
39249 return false;
39251 else if (GET_MODE_SIZE (d->vmode) == 32)
39253 if (!TARGET_AVX2)
39254 return false;
39256 /* V4DImode should be already handled through
39257 expand_vselect by vpermq instruction. */
39258 gcc_assert (d->vmode != V4DImode);
39260 vmode = V32QImode;
39261 if (d->vmode == V8SImode
39262 || d->vmode == V16HImode
39263 || d->vmode == V32QImode)
39265 /* First see if vpermq can be used for
39266 V8SImode/V16HImode/V32QImode. */
39267 if (valid_perm_using_mode_p (V4DImode, d))
39269 for (i = 0; i < 4; i++)
39270 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39271 if (d->testing_p)
39272 return true;
39273 return expand_vselect (gen_lowpart (V4DImode, d->target),
39274 gen_lowpart (V4DImode, d->op0),
39275 perm, 4, false);
39278 /* Next see if vpermd can be used. */
39279 if (valid_perm_using_mode_p (V8SImode, d))
39280 vmode = V8SImode;
39282 /* Or if vpermps can be used. */
39283 else if (d->vmode == V8SFmode)
39284 vmode = V8SImode;
39286 if (vmode == V32QImode)
39288 /* vpshufb only works intra-lane; it is not
39289 possible to shuffle bytes between the lanes. */
39290 for (i = 0; i < nelt; ++i)
39291 if ((d->perm[i] ^ i) & (nelt / 2))
39292 return false;
39295 else
39296 return false;
39299 if (d->testing_p)
39300 return true;
39302 if (vmode == V8SImode)
39303 for (i = 0; i < 8; ++i)
39304 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39305 else
39307 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39308 if (!d->one_operand_p)
39309 mask = 2 * nelt - 1;
39310 else if (vmode == V16QImode)
39311 mask = nelt - 1;
39312 else
39313 mask = nelt / 2 - 1;
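/* The mask keeps only the index bits the chosen shuffle can use: with two
   operands (XOP vpperm) indices run up to 2*nelt-1, a one-operand 16-byte
   pshufb uses 0..nelt-1, and for the 32-byte forms only the intra-lane
   index matters (cross-lane moves were rejected above), hence nelt/2-1.  */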
39315 for (i = 0; i < nelt; ++i)
39317 unsigned j, e = d->perm[i] & mask;
39318 for (j = 0; j < eltsz; ++j)
39319 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39323 vperm = gen_rtx_CONST_VECTOR (vmode,
39324 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39325 vperm = force_reg (vmode, vperm);
39327 target = gen_lowpart (vmode, d->target);
39328 op0 = gen_lowpart (vmode, d->op0);
39329 if (d->one_operand_p)
39331 if (vmode == V16QImode)
39332 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39333 else if (vmode == V32QImode)
39334 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39335 else if (vmode == V8SFmode)
39336 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39337 else
39338 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39340 else
39342 op1 = gen_lowpart (vmode, d->op1);
39343 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39346 return true;
39349 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39350 in a single instruction. */
39352 static bool
39353 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39355 unsigned i, nelt = d->nelt;
39356 unsigned char perm2[MAX_VECT_LEN];
39358 /* Check plain VEC_SELECT first, because AVX has instructions that could
39359 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39360 input where SEL+CONCAT may not. */
39361 if (d->one_operand_p)
39363 int mask = nelt - 1;
39364 bool identity_perm = true;
39365 bool broadcast_perm = true;
39367 for (i = 0; i < nelt; i++)
39369 perm2[i] = d->perm[i] & mask;
39370 if (perm2[i] != i)
39371 identity_perm = false;
39372 if (perm2[i])
39373 broadcast_perm = false;
39376 if (identity_perm)
39378 if (!d->testing_p)
39379 emit_move_insn (d->target, d->op0);
39380 return true;
39382 else if (broadcast_perm && TARGET_AVX2)
39384 /* Use vpbroadcast{b,w,d}. */
39385 rtx (*gen) (rtx, rtx) = NULL;
39386 switch (d->vmode)
39388 case V32QImode:
39389 gen = gen_avx2_pbroadcastv32qi_1;
39390 break;
39391 case V16HImode:
39392 gen = gen_avx2_pbroadcastv16hi_1;
39393 break;
39394 case V8SImode:
39395 gen = gen_avx2_pbroadcastv8si_1;
39396 break;
39397 case V16QImode:
39398 gen = gen_avx2_pbroadcastv16qi;
39399 break;
39400 case V8HImode:
39401 gen = gen_avx2_pbroadcastv8hi;
39402 break;
39403 case V8SFmode:
39404 gen = gen_avx2_vec_dupv8sf_1;
39405 break;
39406 /* For other modes prefer other shuffles this function creates. */
39407 default: break;
39409 if (gen != NULL)
39411 if (!d->testing_p)
39412 emit_insn (gen (d->target, d->op0));
39413 return true;
39417 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39418 return true;
39420 /* There are plenty of patterns in sse.md that are written for
39421 SEL+CONCAT and are not replicated for a single op. Perhaps
39422 that should be changed, to avoid the nastiness here. */
39424 /* Recognize interleave style patterns, which means incrementing
39425 every other permutation operand. */
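/* E.g. the one-operand V8HImode permutation { 0, 0, 1, 1, 2, 2, 3, 3 }
   becomes perm2 = { 0, 8, 1, 9, 2, 10, 3, 11 }, which matches punpcklwd
   of the register with itself.  */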
39426 for (i = 0; i < nelt; i += 2)
39428 perm2[i] = d->perm[i] & mask;
39429 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39431 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39432 d->testing_p))
39433 return true;
39435 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39436 if (nelt >= 4)
39438 for (i = 0; i < nelt; i += 4)
39440 perm2[i + 0] = d->perm[i + 0] & mask;
39441 perm2[i + 1] = d->perm[i + 1] & mask;
39442 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39443 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39446 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39447 d->testing_p))
39448 return true;
39452 /* Finally, try the fully general two operand permute. */
39453 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39454 d->testing_p))
39455 return true;
39457 /* Recognize interleave style patterns with reversed operands. */
39458 if (!d->one_operand_p)
39460 for (i = 0; i < nelt; ++i)
39462 unsigned e = d->perm[i];
39463 if (e >= nelt)
39464 e -= nelt;
39465 else
39466 e += nelt;
39467 perm2[i] = e;
39470 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39471 d->testing_p))
39472 return true;
39475 /* Try the SSE4.1 blend variable merge instructions. */
39476 if (expand_vec_perm_blend (d))
39477 return true;
39479 /* Try one of the AVX vpermil variable permutations. */
39480 if (expand_vec_perm_vpermil (d))
39481 return true;
39483 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39484 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39485 if (expand_vec_perm_pshufb (d))
39486 return true;
39488 return false;
39491 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39492 in terms of a pair of pshuflw + pshufhw instructions. */
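/* E.g. the V8HImode permutation { 2, 0, 3, 1, 7, 5, 6, 4 } is done as
   pshuflw with { 2, 0, 3, 1, 4, 5, 6, 7 } followed by pshufhw with
   { 0, 1, 2, 3, 7, 5, 6, 4 }.  */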
39494 static bool
39495 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39497 unsigned char perm2[MAX_VECT_LEN];
39498 unsigned i;
39499 bool ok;
39501 if (d->vmode != V8HImode || !d->one_operand_p)
39502 return false;
39504 /* The two permutations only operate in 64-bit lanes. */
39505 for (i = 0; i < 4; ++i)
39506 if (d->perm[i] >= 4)
39507 return false;
39508 for (i = 4; i < 8; ++i)
39509 if (d->perm[i] < 4)
39510 return false;
39512 if (d->testing_p)
39513 return true;
39515 /* Emit the pshuflw. */
39516 memcpy (perm2, d->perm, 4);
39517 for (i = 4; i < 8; ++i)
39518 perm2[i] = i;
39519 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39520 gcc_assert (ok);
39522 /* Emit the pshufhw. */
39523 memcpy (perm2 + 4, d->perm + 4, 4);
39524 for (i = 0; i < 4; ++i)
39525 perm2[i] = i;
39526 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39527 gcc_assert (ok);
39529 return true;
39532 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39533 the permutation using the SSSE3 palignr instruction. This succeeds
39534 when all of the elements in PERM fit within one vector and we merely
39535 need to shift them down so that a single vector permutation has a
39536 chance to succeed. */
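/* E.g. for V16QImode with perm = { 2, 3, ..., 17 } every index falls in
   the window [2, 18), so shifting the op1:op0 concatenation down by
   min = 2 byte positions leaves the wanted bytes at positions 0..15 and
   the remaining permutation is the identity.  */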
39538 static bool
39539 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39541 unsigned i, nelt = d->nelt;
39542 unsigned min, max;
39543 bool in_order, ok;
39544 rtx shift;
39546 /* Even with AVX, palignr only operates on 128-bit vectors. */
39547 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39548 return false;
39550 min = nelt, max = 0;
39551 for (i = 0; i < nelt; ++i)
39553 unsigned e = d->perm[i];
39554 if (e < min)
39555 min = e;
39556 if (e > max)
39557 max = e;
39559 if (min == 0 || max - min >= nelt)
39560 return false;
39562 /* Given that we have SSSE3, we know we'll be able to implement the
39563 single operand permutation after the palignr with pshufb. */
39564 if (d->testing_p)
39565 return true;
39567 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39568 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39569 gen_lowpart (TImode, d->op1),
39570 gen_lowpart (TImode, d->op0), shift));
39572 d->op0 = d->op1 = d->target;
39573 d->one_operand_p = true;
39575 in_order = true;
39576 for (i = 0; i < nelt; ++i)
39578 unsigned e = d->perm[i] - min;
39579 if (e != i)
39580 in_order = false;
39581 d->perm[i] = e;
39584 /* Test for the degenerate case where the alignment by itself
39585 produces the desired permutation. */
39586 if (in_order)
39587 return true;
39589 ok = expand_vec_perm_1 (d);
39590 gcc_assert (ok);
39592 return ok;
39595 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39597 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39598 a two vector permutation into a single vector permutation by using
39599 an interleave operation to merge the vectors. */
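/* E.g. the two-operand V4SImode permutation { 1, 5, 0, 4 } draws only on
   the low halves of both inputs: dremap becomes the interleave
   { 0, 4, 1, 5 } (punpckldq) and dfinal the one-operand shuffle
   { 2, 3, 0, 1 } of that result, which is a single pshufd.  */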
39601 static bool
39602 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39604 struct expand_vec_perm_d dremap, dfinal;
39605 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39606 unsigned HOST_WIDE_INT contents;
39607 unsigned char remap[2 * MAX_VECT_LEN];
39608 rtx seq;
39609 bool ok, same_halves = false;
39611 if (GET_MODE_SIZE (d->vmode) == 16)
39613 if (d->one_operand_p)
39614 return false;
39616 else if (GET_MODE_SIZE (d->vmode) == 32)
39618 if (!TARGET_AVX)
39619 return false;
39620 /* For 32-byte modes allow even d->one_operand_p.
39621 The lack of cross-lane shuffling in some instructions
39622 might prevent a single insn shuffle. */
39623 dfinal = *d;
39624 dfinal.testing_p = true;
39625 /* If expand_vec_perm_interleave3 can expand this into
39626 a 3 insn sequence, give up and let it be expanded as
39627 a 3 insn sequence. While that is one insn longer,
39628 it doesn't need a memory operand, and in the common
39629 case where the interleave low and interleave high
39630 permutations with the same operands are adjacent, it
39631 needs only 4 insns for both after CSE. */
39632 if (expand_vec_perm_interleave3 (&dfinal))
39633 return false;
39635 else
39636 return false;
39638 /* Examine from whence the elements come. */
39639 contents = 0;
39640 for (i = 0; i < nelt; ++i)
39641 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39643 memset (remap, 0xff, sizeof (remap));
39644 dremap = *d;
39646 if (GET_MODE_SIZE (d->vmode) == 16)
39648 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39650 /* Split the two input vectors into 4 halves. */
39651 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39652 h2 = h1 << nelt2;
39653 h3 = h2 << nelt2;
39654 h4 = h3 << nelt2;
39656 /* If the elements all come from the low halves, use interleave low;
39657 similarly for interleave high. If the elements are from mismatched
39658 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39659 if ((contents & (h1 | h3)) == contents)
39661 /* punpckl* */
39662 for (i = 0; i < nelt2; ++i)
39664 remap[i] = i * 2;
39665 remap[i + nelt] = i * 2 + 1;
39666 dremap.perm[i * 2] = i;
39667 dremap.perm[i * 2 + 1] = i + nelt;
39669 if (!TARGET_SSE2 && d->vmode == V4SImode)
39670 dremap.vmode = V4SFmode;
39672 else if ((contents & (h2 | h4)) == contents)
39674 /* punpckh* */
39675 for (i = 0; i < nelt2; ++i)
39677 remap[i + nelt2] = i * 2;
39678 remap[i + nelt + nelt2] = i * 2 + 1;
39679 dremap.perm[i * 2] = i + nelt2;
39680 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39682 if (!TARGET_SSE2 && d->vmode == V4SImode)
39683 dremap.vmode = V4SFmode;
39685 else if ((contents & (h1 | h4)) == contents)
39687 /* shufps */
39688 for (i = 0; i < nelt2; ++i)
39690 remap[i] = i;
39691 remap[i + nelt + nelt2] = i + nelt2;
39692 dremap.perm[i] = i;
39693 dremap.perm[i + nelt2] = i + nelt + nelt2;
39695 if (nelt != 4)
39697 /* shufpd */
39698 dremap.vmode = V2DImode;
39699 dremap.nelt = 2;
39700 dremap.perm[0] = 0;
39701 dremap.perm[1] = 3;
39704 else if ((contents & (h2 | h3)) == contents)
39706 /* shufps */
39707 for (i = 0; i < nelt2; ++i)
39709 remap[i + nelt2] = i;
39710 remap[i + nelt] = i + nelt2;
39711 dremap.perm[i] = i + nelt2;
39712 dremap.perm[i + nelt2] = i + nelt;
39714 if (nelt != 4)
39716 /* shufpd */
39717 dremap.vmode = V2DImode;
39718 dremap.nelt = 2;
39719 dremap.perm[0] = 1;
39720 dremap.perm[1] = 2;
39723 else
39724 return false;
39726 else
39728 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39729 unsigned HOST_WIDE_INT q[8];
39730 unsigned int nonzero_halves[4];
39732 /* Split the two input vectors into 8 quarters. */
39733 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39734 for (i = 1; i < 8; ++i)
39735 q[i] = q[0] << (nelt4 * i);
39736 for (i = 0; i < 4; ++i)
39737 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39739 nonzero_halves[nzcnt] = i;
39740 ++nzcnt;
39743 if (nzcnt == 1)
39745 gcc_assert (d->one_operand_p);
39746 nonzero_halves[1] = nonzero_halves[0];
39747 same_halves = true;
39749 else if (d->one_operand_p)
39751 gcc_assert (nonzero_halves[0] == 0);
39752 gcc_assert (nonzero_halves[1] == 1);
39755 if (nzcnt <= 2)
39757 if (d->perm[0] / nelt2 == nonzero_halves[1])
39759 /* Attempt to increase the likelihood that dfinal
39760 shuffle will be intra-lane. */
39761 char tmph = nonzero_halves[0];
39762 nonzero_halves[0] = nonzero_halves[1];
39763 nonzero_halves[1] = tmph;
39766 /* vperm2f128 or vperm2i128. */
39767 for (i = 0; i < nelt2; ++i)
39769 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39770 remap[i + nonzero_halves[0] * nelt2] = i;
39771 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39772 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39775 if (d->vmode != V8SFmode
39776 && d->vmode != V4DFmode
39777 && d->vmode != V8SImode)
39779 dremap.vmode = V8SImode;
39780 dremap.nelt = 8;
39781 for (i = 0; i < 4; ++i)
39783 dremap.perm[i] = i + nonzero_halves[0] * 4;
39784 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39788 else if (d->one_operand_p)
39789 return false;
39790 else if (TARGET_AVX2
39791 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39793 /* vpunpckl* */
39794 for (i = 0; i < nelt4; ++i)
39796 remap[i] = i * 2;
39797 remap[i + nelt] = i * 2 + 1;
39798 remap[i + nelt2] = i * 2 + nelt2;
39799 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39800 dremap.perm[i * 2] = i;
39801 dremap.perm[i * 2 + 1] = i + nelt;
39802 dremap.perm[i * 2 + nelt2] = i + nelt2;
39803 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39806 else if (TARGET_AVX2
39807 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39809 /* vpunpckh* */
39810 for (i = 0; i < nelt4; ++i)
39812 remap[i + nelt4] = i * 2;
39813 remap[i + nelt + nelt4] = i * 2 + 1;
39814 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39815 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39816 dremap.perm[i * 2] = i + nelt4;
39817 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39818 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39819 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39822 else
39823 return false;
39826 /* Use the remapping array set up above to move the elements from their
39827 swizzled locations into their final destinations. */
39828 dfinal = *d;
39829 for (i = 0; i < nelt; ++i)
39831 unsigned e = remap[d->perm[i]];
39832 gcc_assert (e < nelt);
39833 /* If same_halves is true, both halves of the remapped vector are the
39834 same. Avoid cross-lane accesses if possible. */
39835 if (same_halves && i >= nelt2)
39837 gcc_assert (e < nelt2);
39838 dfinal.perm[i] = e + nelt2;
39840 else
39841 dfinal.perm[i] = e;
39843 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39844 dfinal.op1 = dfinal.op0;
39845 dfinal.one_operand_p = true;
39846 dremap.target = dfinal.op0;
39848 /* Test if the final remap can be done with a single insn. For V4SFmode or
39849 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39850 start_sequence ();
39851 ok = expand_vec_perm_1 (&dfinal);
39852 seq = get_insns ();
39853 end_sequence ();
39855 if (!ok)
39856 return false;
39858 if (d->testing_p)
39859 return true;
39861 if (dremap.vmode != dfinal.vmode)
39863 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39864 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39865 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39868 ok = expand_vec_perm_1 (&dremap);
39869 gcc_assert (ok);
39871 emit_insn (seq);
39872 return true;
39875 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39876 a single vector cross-lane permutation into vpermq followed
39877 by any of the single insn permutations. */
39879 static bool
39880 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39882 struct expand_vec_perm_d dremap, dfinal;
39883 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39884 unsigned contents[2];
39885 bool ok;
39887 if (!(TARGET_AVX2
39888 && (d->vmode == V32QImode || d->vmode == V16HImode)
39889 && d->one_operand_p))
39890 return false;
39892 contents[0] = 0;
39893 contents[1] = 0;
39894 for (i = 0; i < nelt2; ++i)
39896 contents[0] |= 1u << (d->perm[i] / nelt4);
39897 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39900 for (i = 0; i < 2; ++i)
39902 unsigned int cnt = 0;
39903 for (j = 0; j < 4; ++j)
39904 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39905 return false;
39908 if (d->testing_p)
39909 return true;
39911 dremap = *d;
39912 dremap.vmode = V4DImode;
39913 dremap.nelt = 4;
39914 dremap.target = gen_reg_rtx (V4DImode);
39915 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39916 dremap.op1 = dremap.op0;
39917 dremap.one_operand_p = true;
39918 for (i = 0; i < 2; ++i)
39920 unsigned int cnt = 0;
39921 for (j = 0; j < 4; ++j)
39922 if ((contents[i] & (1u << j)) != 0)
39923 dremap.perm[2 * i + cnt++] = j;
39924 for (; cnt < 2; ++cnt)
39925 dremap.perm[2 * i + cnt] = 0;
39928 dfinal = *d;
39929 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39930 dfinal.op1 = dfinal.op0;
39931 dfinal.one_operand_p = true;
39932 for (i = 0, j = 0; i < nelt; ++i)
39934 if (i == nelt2)
39935 j = 2;
39936 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39937 if ((d->perm[i] / nelt4) == dremap.perm[j])
39939 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39940 dfinal.perm[i] |= nelt4;
39941 else
39942 gcc_unreachable ();
39945 ok = expand_vec_perm_1 (&dremap);
39946 gcc_assert (ok);
39948 ok = expand_vec_perm_1 (&dfinal);
39949 gcc_assert (ok);
39951 return true;
39954 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39955 a vector permutation using two instructions, vperm2f128 resp.
39956 vperm2i128 followed by any single in-lane permutation. */
39958 static bool
39959 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39961 struct expand_vec_perm_d dfirst, dsecond;
39962 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39963 bool ok;
39965 if (!TARGET_AVX
39966 || GET_MODE_SIZE (d->vmode) != 32
39967 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39968 return false;
39970 dsecond = *d;
39971 dsecond.one_operand_p = false;
39972 dsecond.testing_p = true;
39974 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39975 immediate. For perm < 16 the second permutation uses
39976 d->op0 as first operand, for perm >= 16 it uses d->op1
39977 as first operand. The second operand is the result of
39978 vperm2[fi]128. */
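/* E.g. perm = 6 encodes low-lane selector 2 (low lane of d->op1) and
   high-lane selector 1 (high lane of d->op0); ((6 << 2) | 6) & 0x33 = 0x12
   is the corresponding vperm2[fi]128 immediate.  */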
39979 for (perm = 0; perm < 32; perm++)
39981 /* Ignore permutations which do not move anything cross-lane. */
39982 if (perm < 16)
39984 /* The second shuffle for e.g. V4DFmode has
39985 0123 and ABCD operands.
39986 Ignore AB23, as 23 is already in the second lane
39987 of the first operand. */
39988 if ((perm & 0xc) == (1 << 2)) continue;
39989 /* And 01CD, as 01 is in the first lane of the first
39990 operand. */
39991 if ((perm & 3) == 0) continue;
39992 /* And 4567, as then the vperm2[fi]128 doesn't change
39993 anything on the original 4567 second operand. */
39994 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39996 else
39998 /* The second shuffle for e.g. V4DFmode has
39999 4567 and ABCD operands.
40000 Ignore AB67, as 67 is already in the second lane
40001 of the first operand. */
40002 if ((perm & 0xc) == (3 << 2)) continue;
40003 /* And 45CD, as 45 is in the first lane of the first
40004 operand. */
40005 if ((perm & 3) == 2) continue;
40006 /* And 0123, as then the vperm2[fi]128 doesn't change
40007 anything on the original 0123 first operand. */
40008 if ((perm & 0xf) == (1 << 2)) continue;
40011 for (i = 0; i < nelt; i++)
40013 j = d->perm[i] / nelt2;
40014 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40015 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40016 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40017 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40018 else
40019 break;
40022 if (i == nelt)
40024 start_sequence ();
40025 ok = expand_vec_perm_1 (&dsecond);
40026 end_sequence ();
40028 else
40029 ok = false;
40031 if (ok)
40033 if (d->testing_p)
40034 return true;
40036 /* Found a usable second shuffle. dfirst will be
40037 vperm2f128 on d->op0 and d->op1. */
40038 dsecond.testing_p = false;
40039 dfirst = *d;
40040 dfirst.target = gen_reg_rtx (d->vmode);
40041 for (i = 0; i < nelt; i++)
40042 dfirst.perm[i] = (i & (nelt2 - 1))
40043 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40045 ok = expand_vec_perm_1 (&dfirst);
40046 gcc_assert (ok);
40048 /* And dsecond is some single insn shuffle, taking
40049 d->op0 and result of vperm2f128 (if perm < 16) or
40050 d->op1 and result of vperm2f128 (otherwise). */
40051 dsecond.op1 = dfirst.target;
40052 if (perm >= 16)
40053 dsecond.op0 = dfirst.op1;
40055 ok = expand_vec_perm_1 (&dsecond);
40056 gcc_assert (ok);
40058 return true;
40061 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40062 if (d->one_operand_p)
40063 return false;
40066 return false;
40069 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40070 a two vector permutation using 2 intra-lane interleave insns
40071 and cross-lane shuffle for 32-byte vectors. */
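/* The checks below accept exactly the full interleave-low and
   interleave-high patterns, e.g. for V8SImode { 0, 8, 1, 9, 2, 10, 3, 11 }
   (low) or { 4, 12, 5, 13, 6, 14, 7, 15 } (high).  */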
40073 static bool
40074 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40076 unsigned i, nelt;
40077 rtx (*gen) (rtx, rtx, rtx);
40079 if (d->one_operand_p)
40080 return false;
40081 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40083 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40085 else
40086 return false;
40088 nelt = d->nelt;
40089 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40090 return false;
40091 for (i = 0; i < nelt; i += 2)
40092 if (d->perm[i] != d->perm[0] + i / 2
40093 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40094 return false;
40096 if (d->testing_p)
40097 return true;
40099 switch (d->vmode)
40101 case V32QImode:
40102 if (d->perm[0])
40103 gen = gen_vec_interleave_highv32qi;
40104 else
40105 gen = gen_vec_interleave_lowv32qi;
40106 break;
40107 case V16HImode:
40108 if (d->perm[0])
40109 gen = gen_vec_interleave_highv16hi;
40110 else
40111 gen = gen_vec_interleave_lowv16hi;
40112 break;
40113 case V8SImode:
40114 if (d->perm[0])
40115 gen = gen_vec_interleave_highv8si;
40116 else
40117 gen = gen_vec_interleave_lowv8si;
40118 break;
40119 case V4DImode:
40120 if (d->perm[0])
40121 gen = gen_vec_interleave_highv4di;
40122 else
40123 gen = gen_vec_interleave_lowv4di;
40124 break;
40125 case V8SFmode:
40126 if (d->perm[0])
40127 gen = gen_vec_interleave_highv8sf;
40128 else
40129 gen = gen_vec_interleave_lowv8sf;
40130 break;
40131 case V4DFmode:
40132 if (d->perm[0])
40133 gen = gen_vec_interleave_highv4df;
40134 else
40135 gen = gen_vec_interleave_lowv4df;
40136 break;
40137 default:
40138 gcc_unreachable ();
40141 emit_insn (gen (d->target, d->op0, d->op1));
40142 return true;
40145 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40146 a single vector permutation using a single intra-lane vector
40147 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40148 the non-swapped and swapped vectors together. */
40150 static bool
40151 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40153 struct expand_vec_perm_d dfirst, dsecond;
40154 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40155 rtx seq;
40156 bool ok;
40157 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40159 if (!TARGET_AVX
40160 || TARGET_AVX2
40161 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40162 || !d->one_operand_p)
40163 return false;
40165 dfirst = *d;
40166 for (i = 0; i < nelt; i++)
40167 dfirst.perm[i] = 0xff;
40168 for (i = 0, msk = 0; i < nelt; i++)
40170 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40171 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40172 return false;
40173 dfirst.perm[j] = d->perm[i];
40174 if (j != i)
40175 msk |= (1 << i);
40177 for (i = 0; i < nelt; i++)
40178 if (dfirst.perm[i] == 0xff)
40179 dfirst.perm[i] = i;
40181 if (!d->testing_p)
40182 dfirst.target = gen_reg_rtx (dfirst.vmode);
40184 start_sequence ();
40185 ok = expand_vec_perm_1 (&dfirst);
40186 seq = get_insns ();
40187 end_sequence ();
40189 if (!ok)
40190 return false;
40192 if (d->testing_p)
40193 return true;
40195 emit_insn (seq);
40197 dsecond = *d;
40198 dsecond.op0 = dfirst.target;
40199 dsecond.op1 = dfirst.target;
40200 dsecond.one_operand_p = true;
40201 dsecond.target = gen_reg_rtx (dsecond.vmode);
40202 for (i = 0; i < nelt; i++)
40203 dsecond.perm[i] = i ^ nelt2;
40205 ok = expand_vec_perm_1 (&dsecond);
40206 gcc_assert (ok);
40208 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40209 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40210 return true;
40213 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40214 permutation using two vperm2f128, followed by a vshufpd insn blending
40215 the two vectors together. */
40217 static bool
40218 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40220 struct expand_vec_perm_d dfirst, dsecond, dthird;
40221 bool ok;
40223 if (!TARGET_AVX || (d->vmode != V4DFmode))
40224 return false;
40226 if (d->testing_p)
40227 return true;
40229 dfirst = *d;
40230 dsecond = *d;
40231 dthird = *d;
40233 dfirst.perm[0] = (d->perm[0] & ~1);
40234 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40235 dfirst.perm[2] = (d->perm[2] & ~1);
40236 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40237 dsecond.perm[0] = (d->perm[1] & ~1);
40238 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40239 dsecond.perm[2] = (d->perm[3] & ~1);
40240 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40241 dthird.perm[0] = (d->perm[0] % 2);
40242 dthird.perm[1] = (d->perm[1] % 2) + 4;
40243 dthird.perm[2] = (d->perm[2] % 2) + 2;
40244 dthird.perm[3] = (d->perm[3] % 2) + 6;
40246 dfirst.target = gen_reg_rtx (dfirst.vmode);
40247 dsecond.target = gen_reg_rtx (dsecond.vmode);
40248 dthird.op0 = dfirst.target;
40249 dthird.op1 = dsecond.target;
40250 dthird.one_operand_p = false;
40252 canonicalize_perm (&dfirst);
40253 canonicalize_perm (&dsecond);
40255 ok = expand_vec_perm_1 (&dfirst)
40256 && expand_vec_perm_1 (&dsecond)
40257 && expand_vec_perm_1 (&dthird);
40259 gcc_assert (ok);
40261 return true;
40264 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40265 permutation with two pshufb insns and an ior. We should have already
40266 failed all two instruction sequences. */
40268 static bool
40269 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40271 rtx rperm[2][16], vperm, l, h, op, m128;
40272 unsigned int i, nelt, eltsz;
40274 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40275 return false;
40276 gcc_assert (!d->one_operand_p);
40278 nelt = d->nelt;
40279 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40281 /* Generate two permutation masks. If the required element is within
40282 the given vector it is shuffled into the proper lane. If the required
40283 element is in the other vector, force a zero into the lane by setting
40284 bit 7 in the permutation mask. */
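/* This relies on pshufb treating a control byte with bit 7 set as "write a
   zero byte": each destination byte is produced by exactly one of the two
   shuffles, and the final por below merges the two results.  */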
40285 m128 = GEN_INT (-128);
40286 for (i = 0; i < nelt; ++i)
40288 unsigned j, e = d->perm[i];
40289 unsigned which = (e >= nelt);
40290 if (e >= nelt)
40291 e -= nelt;
40293 for (j = 0; j < eltsz; ++j)
40295 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40296 rperm[1-which][i*eltsz + j] = m128;
40300 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40301 vperm = force_reg (V16QImode, vperm);
40303 l = gen_reg_rtx (V16QImode);
40304 op = gen_lowpart (V16QImode, d->op0);
40305 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40307 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40308 vperm = force_reg (V16QImode, vperm);
40310 h = gen_reg_rtx (V16QImode);
40311 op = gen_lowpart (V16QImode, d->op1);
40312 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40314 op = gen_lowpart (V16QImode, d->target);
40315 emit_insn (gen_iorv16qi3 (op, l, h));
40317 return true;
40320 /* Implement arbitrary permutation of one V32QImode or V16QImode operand
40321 with two vpshufb insns, vpermq and vpor. We should have already failed
40322 all two or three instruction sequences. */
40324 static bool
40325 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40327 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40328 unsigned int i, nelt, eltsz;
40330 if (!TARGET_AVX2
40331 || !d->one_operand_p
40332 || (d->vmode != V32QImode && d->vmode != V16HImode))
40333 return false;
40335 if (d->testing_p)
40336 return true;
40338 nelt = d->nelt;
40339 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40341 /* Generate two permutation masks. If the required element is within
40342 the same lane, it is shuffled in. If the required element is from the
40343 other lane, force a zero by setting bit 7 in the permutation mask.
40344 The other mask has non-negative elements where an element is
40345 requested from the other lane, but those are also moved to the other
40346 lane, so that the result of vpshufb can have its two V2TImode halves
40347 swapped. */
40348 m128 = GEN_INT (-128);
40349 for (i = 0; i < nelt; ++i)
40351 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40352 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40354 for (j = 0; j < eltsz; ++j)
40356 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40357 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40361 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40362 vperm = force_reg (V32QImode, vperm);
40364 h = gen_reg_rtx (V32QImode);
40365 op = gen_lowpart (V32QImode, d->op0);
40366 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40368 /* Swap the 128-bit lanes of h into hp. */
40369 hp = gen_reg_rtx (V4DImode);
40370 op = gen_lowpart (V4DImode, h);
40371 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40372 const1_rtx));
40374 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40375 vperm = force_reg (V32QImode, vperm);
40377 l = gen_reg_rtx (V32QImode);
40378 op = gen_lowpart (V32QImode, d->op0);
40379 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40381 op = gen_lowpart (V32QImode, d->target);
40382 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40384 return true;
40387 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40388 and extract-odd permutations of two V32QImode or V16QImode operands
40389 with two vpshufb insns, vpor and vpermq. We should have already
40390 failed all two or three instruction sequences. */
40392 static bool
40393 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40395 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40396 unsigned int i, nelt, eltsz;
40398 if (!TARGET_AVX2
40399 || d->one_operand_p
40400 || (d->vmode != V32QImode && d->vmode != V16HImode))
40401 return false;
40403 for (i = 0; i < d->nelt; ++i)
40404 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40405 return false;
40407 if (d->testing_p)
40408 return true;
40410 nelt = d->nelt;
40411 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40413 /* Generate two permutation masks. In the first permutation mask
40414 the first quarter will contain indexes for the first half
40415 of the op0, the second quarter will contain bit 7 set, third quarter
40416 will contain indexes for the second half of the op0 and the
40417 last quarter bit 7 set. In the second permutation mask
40418 the first quarter will contain bit 7 set, the second quarter
40419 indexes for the first half of the op1, the third quarter bit 7 set
40420 and last quarter indexes for the second half of the op1.
40421 I.e. the first mask e.g. for V32QImode extract even will be:
40422 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40423 (all values masked with 0xf except for -128) and second mask
40424 for extract even will be
40425 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40426 m128 = GEN_INT (-128);
40427 for (i = 0; i < nelt; ++i)
40429 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40430 unsigned which = d->perm[i] >= nelt;
40431 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40433 for (j = 0; j < eltsz; ++j)
40435 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40436 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40440 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40441 vperm = force_reg (V32QImode, vperm);
40443 l = gen_reg_rtx (V32QImode);
40444 op = gen_lowpart (V32QImode, d->op0);
40445 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40447 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40448 vperm = force_reg (V32QImode, vperm);
40450 h = gen_reg_rtx (V32QImode);
40451 op = gen_lowpart (V32QImode, d->op1);
40452 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40454 ior = gen_reg_rtx (V32QImode);
40455 emit_insn (gen_iorv32qi3 (ior, l, h));
40457 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40458 op = gen_lowpart (V4DImode, d->target);
40459 ior = gen_lowpart (V4DImode, ior);
40460 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40461 const1_rtx, GEN_INT (3)));
40463 return true;
40466 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40467 and extract-odd permutations. */
40469 static bool
40470 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40472 rtx t1, t2, t3;
40474 switch (d->vmode)
40476 case V4DFmode:
40477 t1 = gen_reg_rtx (V4DFmode);
40478 t2 = gen_reg_rtx (V4DFmode);
40480 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40481 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40482 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40484 /* Now an unpck[lh]pd will produce the result required. */
40485 if (odd)
40486 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40487 else
40488 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40489 emit_insn (t3);
40490 break;
40492 case V8SFmode:
40494 int mask = odd ? 0xdd : 0x88;
40496 t1 = gen_reg_rtx (V8SFmode);
40497 t2 = gen_reg_rtx (V8SFmode);
40498 t3 = gen_reg_rtx (V8SFmode);
40500 /* Shuffle within the 128-bit lanes to produce:
40501 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40502 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40503 GEN_INT (mask)));
40505 /* Shuffle the lanes around to produce:
40506 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40507 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40508 GEN_INT (0x3)));
40510 /* Shuffle within the 128-bit lanes to produce:
40511 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40512 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40514 /* Shuffle within the 128-bit lanes to produce:
40515 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40516 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40518 /* Shuffle the lanes around to produce:
40519 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40520 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40521 GEN_INT (0x20)));
40523 break;
40525 case V2DFmode:
40526 case V4SFmode:
40527 case V2DImode:
40528 case V4SImode:
40529 /* These are always directly implementable by expand_vec_perm_1. */
40530 gcc_unreachable ();
40532 case V8HImode:
40533 if (TARGET_SSSE3)
40534 return expand_vec_perm_pshufb2 (d);
40535 else
40537 /* We need 2*log2(N)-1 operations to achieve odd/even
40538 with interleave. */
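/* For V8HImode that is 2 * log2 (8) - 1 = 5 interleaves, matching the
   five emit_insn calls below.  */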
40539 t1 = gen_reg_rtx (V8HImode);
40540 t2 = gen_reg_rtx (V8HImode);
40541 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40542 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40543 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40544 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40545 if (odd)
40546 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40547 else
40548 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40549 emit_insn (t3);
40551 break;
40553 case V16QImode:
40554 if (TARGET_SSSE3)
40555 return expand_vec_perm_pshufb2 (d);
40556 else
40558 t1 = gen_reg_rtx (V16QImode);
40559 t2 = gen_reg_rtx (V16QImode);
40560 t3 = gen_reg_rtx (V16QImode);
40561 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40562 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40563 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40564 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40565 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40566 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40567 if (odd)
40568 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40569 else
40570 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40571 emit_insn (t3);
40573 break;
40575 case V16HImode:
40576 case V32QImode:
40577 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40579 case V4DImode:
40580 if (!TARGET_AVX2)
40582 struct expand_vec_perm_d d_copy = *d;
40583 d_copy.vmode = V4DFmode;
40584 d_copy.target = gen_lowpart (V4DFmode, d->target);
40585 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40586 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40587 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40590 t1 = gen_reg_rtx (V4DImode);
40591 t2 = gen_reg_rtx (V4DImode);
40593 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40594 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40595 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40597 /* Now an vpunpck[lh]qdq will produce the result required. */
40598 if (odd)
40599 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40600 else
40601 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40602 emit_insn (t3);
40603 break;
40605 case V8SImode:
40606 if (!TARGET_AVX2)
40608 struct expand_vec_perm_d d_copy = *d;
40609 d_copy.vmode = V8SFmode;
40610 d_copy.target = gen_lowpart (V8SFmode, d->target);
40611 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40612 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40613 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40616 t1 = gen_reg_rtx (V8SImode);
40617 t2 = gen_reg_rtx (V8SImode);
40619 /* Shuffle the lanes around into
40620 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40621 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40622 gen_lowpart (V4DImode, d->op0),
40623 gen_lowpart (V4DImode, d->op1),
40624 GEN_INT (0x20)));
40625 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40626 gen_lowpart (V4DImode, d->op0),
40627 gen_lowpart (V4DImode, d->op1),
40628 GEN_INT (0x31)));
40630 /* Swap the 2nd and 3rd position in each lane into
40631 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40632 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40633 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40634 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40635 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40637 /* Now an vpunpck[lh]qdq will produce
40638 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40639 if (odd)
40640 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40641 gen_lowpart (V4DImode, t1),
40642 gen_lowpart (V4DImode, t2));
40643 else
40644 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40645 gen_lowpart (V4DImode, t1),
40646 gen_lowpart (V4DImode, t2));
40647 emit_insn (t3);
40648 break;
40650 default:
40651 gcc_unreachable ();
40654 return true;
40657 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40658 extract-even and extract-odd permutations. */
40660 static bool
40661 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40663 unsigned i, odd, nelt = d->nelt;
40665 odd = d->perm[0];
40666 if (odd != 0 && odd != 1)
40667 return false;
40669 for (i = 1; i < nelt; ++i)
40670 if (d->perm[i] != 2 * i + odd)
40671 return false;
40673 return expand_vec_perm_even_odd_1 (d, odd);
40676 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40677 permutations. We assume that expand_vec_perm_1 has already failed. */
40679 static bool
40680 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40682 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40683 enum machine_mode vmode = d->vmode;
40684 unsigned char perm2[4];
40685 rtx op0 = d->op0;
40686 bool ok;
40688 switch (vmode)
40690 case V4DFmode:
40691 case V8SFmode:
40692 /* These are special-cased in sse.md so that we can optionally
40693 use the vbroadcast instruction. They expand to two insns
40694 if the input happens to be in a register. */
40695 gcc_unreachable ();
40697 case V2DFmode:
40698 case V2DImode:
40699 case V4SFmode:
40700 case V4SImode:
40701 /* These are always implementable using standard shuffle patterns. */
40702 gcc_unreachable ();
40704 case V8HImode:
40705 case V16QImode:
40706 /* These can be implemented via interleave. We save one insn by
40707 stopping once we have promoted to V4SImode and then use pshufd. */
40710 rtx dest;
40711 rtx (*gen) (rtx, rtx, rtx)
40712 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40713 : gen_vec_interleave_lowv8hi;
40715 if (elt >= nelt2)
40717 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40718 : gen_vec_interleave_highv8hi;
40719 elt -= nelt2;
40721 nelt2 /= 2;
40723 dest = gen_reg_rtx (vmode);
40724 emit_insn (gen (dest, op0, op0));
40725 vmode = get_mode_wider_vector (vmode);
40726 op0 = gen_lowpart (vmode, dest);
40728 while (vmode != V4SImode);
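/* At this point OP0 is viewed as V4SImode and ELT has been reduced to
   the index of the dword holding the value to broadcast; the vselect
   below is then normally a single pshufd. */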
40730 memset (perm2, elt, 4);
40731 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40732 d->testing_p);
40733 gcc_assert (ok);
40734 return true;
40736 case V32QImode:
40737 case V16HImode:
40738 case V8SImode:
40739 case V4DImode:
40740 /* For AVX2 broadcasts of the first element vpbroadcast* or
40741 vpermq should be used by expand_vec_perm_1. */
40742 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40743 return false;
40745 default:
40746 gcc_unreachable ();
40750 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40751 broadcast permutations. */
40753 static bool
40754 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40756 unsigned i, elt, nelt = d->nelt;
40758 if (!d->one_operand_p)
40759 return false;
40761 elt = d->perm[0];
40762 for (i = 1; i < nelt; ++i)
40763 if (d->perm[i] != elt)
40764 return false;
40766 return expand_vec_perm_broadcast_1 (d);
40769 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
40770 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40771 all the shorter instruction sequences. */
40773 static bool
40774 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40776 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40777 unsigned int i, nelt, eltsz;
40778 bool used[4];
40780 if (!TARGET_AVX2
40781 || d->one_operand_p
40782 || (d->vmode != V32QImode && d->vmode != V16HImode))
40783 return false;
40785 if (d->testing_p)
40786 return true;
40788 nelt = d->nelt;
40789 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40791 /* Generate 4 permutation masks. If the required element is within
40792 the same lane, it is shuffled in. If the required element comes
40793 from the other lane, force a zero by setting bit 7 in the
40794 permutation mask. In the other mask, the element is non-negative
40795 when it is requested from the other lane, but it is also moved to
40796 the other lane, so that the result of vpshufb can have its two
40797 V2TImode halves swapped. */
40798 m128 = GEN_INT (-128);
40799 for (i = 0; i < 32; ++i)
40801 rperm[0][i] = m128;
40802 rperm[1][i] = m128;
40803 rperm[2][i] = m128;
40804 rperm[3][i] = m128;
40806 used[0] = false;
40807 used[1] = false;
40808 used[2] = false;
40809 used[3] = false;
40810 for (i = 0; i < nelt; ++i)
40812 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40813 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40814 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40816 for (j = 0; j < eltsz; ++j)
40817 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40818 used[which] = true;
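/* Masks 0 and 1 shuffle bytes taken from op0, masks 2 and 3 bytes taken
   from op1; within each pair the even mask handles elements that stay in
   their 128-bit lane and the odd mask handles elements that must cross a
   lane (written pre-swapped so the later lane swap lands them in place). */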
40821 for (i = 0; i < 2; ++i)
40823 if (!used[2 * i + 1])
40825 h[i] = NULL_RTX;
40826 continue;
40828 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40829 gen_rtvec_v (32, rperm[2 * i + 1]));
40830 vperm = force_reg (V32QImode, vperm);
40831 h[i] = gen_reg_rtx (V32QImode);
40832 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40833 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40836 /* Swap the 128-bit lanes of h[X]. */
40837 for (i = 0; i < 2; ++i)
40839 if (h[i] == NULL_RTX)
40840 continue;
40841 op = gen_reg_rtx (V4DImode);
40842 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40843 const2_rtx, GEN_INT (3), const0_rtx,
40844 const1_rtx));
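/* The { 2, 3, 0, 1 } qword selection above is simply a swap of the two
   128-bit halves of h[i]. */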
40845 h[i] = gen_lowpart (V32QImode, op);
40848 for (i = 0; i < 2; ++i)
40850 if (!used[2 * i])
40852 l[i] = NULL_RTX;
40853 continue;
40855 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40856 vperm = force_reg (V32QImode, vperm);
40857 l[i] = gen_reg_rtx (V32QImode);
40858 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40859 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40862 for (i = 0; i < 2; ++i)
40864 if (h[i] && l[i])
40866 op = gen_reg_rtx (V32QImode);
40867 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40868 l[i] = op;
40870 else if (h[i])
40871 l[i] = h[i];
40874 gcc_assert (l[0] && l[1]);
40875 op = gen_lowpart (V32QImode, d->target);
40876 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40877 return true;
40880 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40881 With all of the interface bits taken care of, perform the expansion
40882 in D and return true on success. */
40884 static bool
40885 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40887 /* Try a single instruction expansion. */
40888 if (expand_vec_perm_1 (d))
40889 return true;
40891 /* Try sequences of two instructions. */
40893 if (expand_vec_perm_pshuflw_pshufhw (d))
40894 return true;
40896 if (expand_vec_perm_palignr (d))
40897 return true;
40899 if (expand_vec_perm_interleave2 (d))
40900 return true;
40902 if (expand_vec_perm_broadcast (d))
40903 return true;
40905 if (expand_vec_perm_vpermq_perm_1 (d))
40906 return true;
40908 if (expand_vec_perm_vperm2f128 (d))
40909 return true;
40911 /* Try sequences of three instructions. */
40913 if (expand_vec_perm_2vperm2f128_vshuf (d))
40914 return true;
40916 if (expand_vec_perm_pshufb2 (d))
40917 return true;
40919 if (expand_vec_perm_interleave3 (d))
40920 return true;
40922 if (expand_vec_perm_vperm2f128_vblend (d))
40923 return true;
40925 /* Try sequences of four instructions. */
40927 if (expand_vec_perm_vpshufb2_vpermq (d))
40928 return true;
40930 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40931 return true;
40933 /* ??? Look for narrow permutations whose element orderings would
40934 allow the promotion to a wider mode. */
40936 /* ??? Look for sequences of interleave or a wider permute that place
40937 the data into the correct lanes for a half-vector shuffle like
40938 pshuf[lh]w or vpermilps. */
40940 /* ??? Look for sequences of interleave that produce the desired results.
40941 The combinatorics of punpck[lh] get pretty ugly... */
40943 if (expand_vec_perm_even_odd (d))
40944 return true;
40946 /* Even longer sequences. */
40947 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40948 return true;
40950 return false;
40953 /* If a permutation only uses one operand, make it clear. Returns true
40954 if the permutation references both operands. */
40956 static bool
40957 canonicalize_perm (struct expand_vec_perm_d *d)
40959 int i, which, nelt = d->nelt;
40961 for (i = which = 0; i < nelt; ++i)
40962 which |= (d->perm[i] < nelt ? 1 : 2);
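/* WHICH is now a bitmask: bit 0 is set if any element comes from op0,
   bit 1 if any element comes from op1. */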
40964 d->one_operand_p = true;
40965 switch (which)
40967 default:
40968 gcc_unreachable ();
40970 case 3:
40971 if (!rtx_equal_p (d->op0, d->op1))
40973 d->one_operand_p = false;
40974 break;
40976 /* The elements of PERM do not suggest that only the first operand
40977 is used, but both operands are identical. Allow easier matching
40978 of the permutation by folding the permutation into the single
40979 input vector. */
40980 /* FALLTHRU */
40982 case 2:
40983 for (i = 0; i < nelt; ++i)
40984 d->perm[i] &= nelt - 1;
40985 d->op0 = d->op1;
40986 break;
40988 case 1:
40989 d->op1 = d->op0;
40990 break;
40993 return (which == 3);
40996 bool
40997 ix86_expand_vec_perm_const (rtx operands[4])
40999 struct expand_vec_perm_d d;
41000 unsigned char perm[MAX_VECT_LEN];
41001 int i, nelt;
41002 bool two_args;
41003 rtx sel;
41005 d.target = operands[0];
41006 d.op0 = operands[1];
41007 d.op1 = operands[2];
41008 sel = operands[3];
41010 d.vmode = GET_MODE (d.target);
41011 gcc_assert (VECTOR_MODE_P (d.vmode));
41012 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41013 d.testing_p = false;
41015 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41016 gcc_assert (XVECLEN (sel, 0) == nelt);
41017 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
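/* Copy the selector into D.PERM, reducing each entry modulo 2*NELT so
   out-of-range indices wrap; PERM keeps a copy that the canonicalization
   below will not flatten, for the two-operand retry further down. */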
41019 for (i = 0; i < nelt; ++i)
41021 rtx e = XVECEXP (sel, 0, i);
41022 int ei = INTVAL (e) & (2 * nelt - 1);
41023 d.perm[i] = ei;
41024 perm[i] = ei;
41027 two_args = canonicalize_perm (&d);
41029 if (ix86_expand_vec_perm_const_1 (&d))
41030 return true;
41032 /* If the selector says both arguments are needed, but the operands are the
41033 same, the above tried to expand with one_operand_p and flattened selector.
41034 If that didn't work, retry without one_operand_p; we succeeded with that
41035 during testing. */
41036 if (two_args && d.one_operand_p)
41038 d.one_operand_p = false;
41039 memcpy (d.perm, perm, sizeof (perm));
41040 return ix86_expand_vec_perm_const_1 (&d);
41043 return false;
41046 /* Implement targetm.vectorize.vec_perm_const_ok. */
41048 static bool
41049 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41050 const unsigned char *sel)
41052 struct expand_vec_perm_d d;
41053 unsigned int i, nelt, which;
41054 bool ret;
41056 d.vmode = vmode;
41057 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41058 d.testing_p = true;
41060 /* Given sufficient ISA support we can just return true here
41061 for selected vector modes. */
41062 if (GET_MODE_SIZE (d.vmode) == 16)
41064 /* All implementable with a single vpperm insn. */
41065 if (TARGET_XOP)
41066 return true;
41067 /* All implementable with 2 pshufb + 1 ior. */
41068 if (TARGET_SSSE3)
41069 return true;
41070 /* All implementable with shufpd or unpck[lh]pd. */
41071 if (d.nelt == 2)
41072 return true;
41075 /* Extract the values from the vector CST into the permutation
41076 array in D. */
41077 memcpy (d.perm, sel, nelt);
41078 for (i = which = 0; i < nelt; ++i)
41080 unsigned char e = d.perm[i];
41081 gcc_assert (e < 2 * nelt);
41082 which |= (e < nelt ? 1 : 2);
41085 /* For all elements from second vector, fold the elements to first. */
41086 if (which == 2)
41087 for (i = 0; i < nelt; ++i)
41088 d.perm[i] -= nelt;
41090 /* Check whether the mask can be applied to the vector type. */
41091 d.one_operand_p = (which != 3);
41093 /* Implementable with shufps or pshufd. */
41094 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41095 return true;
41097 /* Otherwise we have to go through the motions and see if we can
41098 figure out how to generate the requested permutation. */
41099 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41100 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41101 if (!d.one_operand_p)
41102 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41104 start_sequence ();
41105 ret = ix86_expand_vec_perm_const_1 (&d);
41106 end_sequence ();
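/* With d.testing_p set, most expanders return before emitting anything;
   the start/end_sequence pair ensures any RTL that does get created
   while probing is thrown away. */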
41108 return ret;
41111 void
41112 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41114 struct expand_vec_perm_d d;
41115 unsigned i, nelt;
41117 d.target = targ;
41118 d.op0 = op0;
41119 d.op1 = op1;
41120 d.vmode = GET_MODE (targ);
41121 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41122 d.one_operand_p = false;
41123 d.testing_p = false;
41125 for (i = 0; i < nelt; ++i)
41126 d.perm[i] = i * 2 + odd;
41128 /* We'll either be able to implement the permutation directly... */
41129 if (expand_vec_perm_1 (&d))
41130 return;
41132 /* ... or we use the special-case patterns. */
41133 expand_vec_perm_even_odd_1 (&d, odd);
41136 static void
41137 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41139 struct expand_vec_perm_d d;
41140 unsigned i, nelt, base;
41141 bool ok;
41143 d.target = targ;
41144 d.op0 = op0;
41145 d.op1 = op1;
41146 d.vmode = GET_MODE (targ);
41147 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41148 d.one_operand_p = false;
41149 d.testing_p = false;
41151 base = high_p ? nelt / 2 : 0;
41152 for (i = 0; i < nelt / 2; ++i)
41154 d.perm[i * 2] = i + base;
41155 d.perm[i * 2 + 1] = i + base + nelt;
41158 /* Note that for AVX this isn't one instruction. */
41159 ok = ix86_expand_vec_perm_const_1 (&d);
41160 gcc_assert (ok);
41164 /* Expand a vector operation CODE for a V*QImode in terms of the
41165 same operation on V*HImode. */
41167 void
41168 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41170 enum machine_mode qimode = GET_MODE (dest);
41171 enum machine_mode himode;
41172 rtx (*gen_il) (rtx, rtx, rtx);
41173 rtx (*gen_ih) (rtx, rtx, rtx);
41174 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41175 struct expand_vec_perm_d d;
41176 bool ok, full_interleave;
41177 bool uns_p = false;
41178 int i;
41180 switch (qimode)
41182 case V16QImode:
41183 himode = V8HImode;
41184 gen_il = gen_vec_interleave_lowv16qi;
41185 gen_ih = gen_vec_interleave_highv16qi;
41186 break;
41187 case V32QImode:
41188 himode = V16HImode;
41189 gen_il = gen_avx2_interleave_lowv32qi;
41190 gen_ih = gen_avx2_interleave_highv32qi;
41191 break;
41192 default:
41193 gcc_unreachable ();
41196 op2_l = op2_h = op2;
41197 switch (code)
41199 case MULT:
41200 /* Unpack data such that we've got a source byte in each low byte of
41201 each word. We don't care what goes into the high byte of each word.
41202 Rather than trying to get zero in there, most convenient is to let
41203 it be a copy of the low byte. */
41204 op2_l = gen_reg_rtx (qimode);
41205 op2_h = gen_reg_rtx (qimode);
41206 emit_insn (gen_il (op2_l, op2, op2));
41207 emit_insn (gen_ih (op2_h, op2, op2));
41208 /* FALLTHRU */
41210 op1_l = gen_reg_rtx (qimode);
41211 op1_h = gen_reg_rtx (qimode);
41212 emit_insn (gen_il (op1_l, op1, op1));
41213 emit_insn (gen_ih (op1_h, op1, op1));
41214 full_interleave = qimode == V16QImode;
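/* The SSE interleaves used for V16QImode span the whole vector, whereas
   the AVX2 ones for V32QImode stay within 128-bit lanes; FULL_INTERLEAVE
   records which case we are in so the merge below can compensate. */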
41215 break;
41217 case ASHIFT:
41218 case LSHIFTRT:
41219 uns_p = true;
41220 /* FALLTHRU */
41221 case ASHIFTRT:
41222 op1_l = gen_reg_rtx (himode);
41223 op1_h = gen_reg_rtx (himode);
41224 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41225 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41226 full_interleave = true;
41227 break;
41228 default:
41229 gcc_unreachable ();
41232 /* Perform the operation. */
41233 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41234 1, OPTAB_DIRECT);
41235 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41236 1, OPTAB_DIRECT);
41237 gcc_assert (res_l && res_h);
41239 /* Merge the data back into the right place. */
41240 d.target = dest;
41241 d.op0 = gen_lowpart (qimode, res_l);
41242 d.op1 = gen_lowpart (qimode, res_h);
41243 d.vmode = qimode;
41244 d.nelt = GET_MODE_NUNITS (qimode);
41245 d.one_operand_p = false;
41246 d.testing_p = false;
41248 if (full_interleave)
41250 /* For SSE2, we used a full interleave, so the desired
41251 results are in the even elements. */
41252 for (i = 0; i < 32; ++i)
41253 d.perm[i] = i * 2;
41255 else
41257 /* For AVX, the interleave used above was not cross-lane. So the
41258 extraction is evens but with the second and third quarter swapped.
41259 Happily, that is even one insn shorter than even extraction. */
41260 for (i = 0; i < 32; ++i)
41261 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
41264 ok = ix86_expand_vec_perm_const_1 (&d);
41265 gcc_assert (ok);
41267 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41268 gen_rtx_fmt_ee (code, qimode, op1, op2));
41271 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41272 if op is CONST_VECTOR with all odd elements equal to their
41273 preceding element. */
41275 static bool
41276 const_vector_equal_evenodd_p (rtx op)
41278 enum machine_mode mode = GET_MODE (op);
41279 int i, nunits = GET_MODE_NUNITS (mode);
41280 if (GET_CODE (op) != CONST_VECTOR
41281 || nunits != CONST_VECTOR_NUNITS (op))
41282 return false;
41283 for (i = 0; i < nunits; i += 2)
41284 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41285 return false;
41286 return true;
41289 void
41290 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41291 bool uns_p, bool odd_p)
41293 enum machine_mode mode = GET_MODE (op1);
41294 enum machine_mode wmode = GET_MODE (dest);
41295 rtx x;
41296 rtx orig_op1 = op1, orig_op2 = op2;
41298 if (!nonimmediate_operand (op1, mode))
41299 op1 = force_reg (mode, op1);
41300 if (!nonimmediate_operand (op2, mode))
41301 op2 = force_reg (mode, op2);
41303 /* We only play even/odd games with vectors of SImode. */
41304 gcc_assert (mode == V4SImode || mode == V8SImode);
41306 /* If we're looking for the odd results, shift those members down to
41307 the even slots. For some cpus this is faster than a PSHUFD. */
41308 if (odd_p)
41310 /* For XOP use vpmacsdqh, but only for smult, as it is only
41311 signed. */
41312 if (TARGET_XOP && mode == V4SImode && !uns_p)
41314 x = force_reg (wmode, CONST0_RTX (wmode));
41315 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41316 return;
41319 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41320 if (!const_vector_equal_evenodd_p (orig_op1))
41321 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41322 x, NULL, 1, OPTAB_DIRECT);
41323 if (!const_vector_equal_evenodd_p (orig_op2))
41324 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41325 x, NULL, 1, OPTAB_DIRECT);
41326 op1 = gen_lowpart (mode, op1);
41327 op2 = gen_lowpart (mode, op2);
41330 if (mode == V8SImode)
41332 if (uns_p)
41333 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41334 else
41335 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41337 else if (uns_p)
41338 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41339 else if (TARGET_SSE4_1)
41340 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41341 else
41343 rtx s1, s2, t0, t1, t2;
41345 /* The easiest way to implement this without PMULDQ is to go through
41346 the motions as if we are performing a full 64-bit multiply. With
41347 the exception that we need to do less shuffling of the elements. */
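/* Roughly: with S1 = (OP1 < 0 ? -1 : 0) and S2 likewise, the signed
   product is (S1*2^32 + OP1) * (S2*2^32 + OP2), which modulo 2^64 equals
   OP1*OP2 + ((S1*OP2 + S2*OP1) << 32); the S1*S2*2^64 term vanishes. */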
41349 /* Compute the sign-extension, aka highparts, of the two operands. */
41350 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41351 op1, pc_rtx, pc_rtx);
41352 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41353 op2, pc_rtx, pc_rtx);
41355 /* Multiply LO(A) * HI(B), and vice-versa. */
41356 t1 = gen_reg_rtx (wmode);
41357 t2 = gen_reg_rtx (wmode);
41358 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41359 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41361 /* Multiply LO(A) * LO(B). */
41362 t0 = gen_reg_rtx (wmode);
41363 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41365 /* Combine and shift the highparts into place. */
41366 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41367 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41368 1, OPTAB_DIRECT);
41370 /* Combine high and low parts. */
41371 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41372 return;
41374 emit_insn (x);
41377 void
41378 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41379 bool uns_p, bool high_p)
41381 enum machine_mode wmode = GET_MODE (dest);
41382 enum machine_mode mode = GET_MODE (op1);
41383 rtx t1, t2, t3, t4, mask;
41385 switch (mode)
41387 case V4SImode:
41388 t1 = gen_reg_rtx (mode);
41389 t2 = gen_reg_rtx (mode);
41390 if (TARGET_XOP && !uns_p)
41392 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41393 shuffle the elements once so that all elements are in the right
41394 place for immediate use: { A C B D }. */
41395 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41396 const1_rtx, GEN_INT (3)));
41397 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41398 const1_rtx, GEN_INT (3)));
41400 else
41402 /* Put the elements into place for the multiply. */
41403 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41404 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41405 high_p = false;
41407 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41408 break;
41410 case V8SImode:
41411 /* Shuffle the elements between the lanes. After this we
41412 have { A B E F | C D G H } for each operand. */
41413 t1 = gen_reg_rtx (V4DImode);
41414 t2 = gen_reg_rtx (V4DImode);
41415 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41416 const0_rtx, const2_rtx,
41417 const1_rtx, GEN_INT (3)));
41418 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41419 const0_rtx, const2_rtx,
41420 const1_rtx, GEN_INT (3)));
41422 /* Shuffle the elements within the lanes. After this we
41423 have { A A B B | C C D D } or { E E F F | G G H H }. */
41424 t3 = gen_reg_rtx (V8SImode);
41425 t4 = gen_reg_rtx (V8SImode);
41426 mask = GEN_INT (high_p
41427 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41428 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
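/* These evaluate to 0xfa and 0x50, i.e. the per-lane pshufd selectors
   { 2, 2, 3, 3 } and { 0, 0, 1, 1 } that duplicate the upper or lower
   pair of dwords. */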
41429 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41430 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41432 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41433 break;
41435 case V8HImode:
41436 case V16HImode:
41437 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41438 uns_p, OPTAB_DIRECT);
41439 t2 = expand_binop (mode,
41440 uns_p ? umul_highpart_optab : smul_highpart_optab,
41441 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41442 gcc_assert (t1 && t2);
41444 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41445 break;
41447 case V16QImode:
41448 case V32QImode:
41449 t1 = gen_reg_rtx (wmode);
41450 t2 = gen_reg_rtx (wmode);
41451 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41452 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41454 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41455 break;
41457 default:
41458 gcc_unreachable ();
41462 void
41463 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41465 rtx res_1, res_2;
41467 res_1 = gen_reg_rtx (V4SImode);
41468 res_2 = gen_reg_rtx (V4SImode);
41469 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41470 op1, op2, true, false);
41471 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41472 op1, op2, true, true);
41474 /* Move the results in element 2 down to element 1; we don't care
41475 what goes in elements 2 and 3. Then we can merge the parts
41476 back together with an interleave.
41478 Note that two other sequences were tried:
41479 (1) Use interleaves at the start instead of psrldq, which allows
41480 us to use a single shufps to merge things back at the end.
41481 (2) Use shufps here to combine the two vectors, then pshufd to
41482 put the elements in the correct order.
41483 In both cases the cost of the reformatting stall was too high
41484 and the overall sequence slower. */
41486 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41487 const0_rtx, const0_rtx));
41488 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41489 const0_rtx, const0_rtx));
41490 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41492 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41495 void
41496 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41498 enum machine_mode mode = GET_MODE (op0);
41499 rtx t1, t2, t3, t4, t5, t6;
41501 if (TARGET_XOP && mode == V2DImode)
41503 /* op1: A,B,C,D, op2: E,F,G,H */
41504 op1 = gen_lowpart (V4SImode, op1);
41505 op2 = gen_lowpart (V4SImode, op2);
41507 t1 = gen_reg_rtx (V4SImode);
41508 t2 = gen_reg_rtx (V4SImode);
41509 t3 = gen_reg_rtx (V2DImode);
41510 t4 = gen_reg_rtx (V2DImode);
41512 /* t1: B,A,D,C */
41513 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41514 GEN_INT (1),
41515 GEN_INT (0),
41516 GEN_INT (3),
41517 GEN_INT (2)));
41519 /* t2: (B*E),(A*F),(D*G),(C*H) */
41520 emit_insn (gen_mulv4si3 (t2, t1, op2));
41522 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41523 emit_insn (gen_xop_phadddq (t3, t2));
41525 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41526 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41528 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41529 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41531 else
41533 enum machine_mode nmode;
41534 rtx (*umul) (rtx, rtx, rtx);
41536 if (mode == V2DImode)
41538 umul = gen_vec_widen_umult_even_v4si;
41539 nmode = V4SImode;
41541 else if (mode == V4DImode)
41543 umul = gen_vec_widen_umult_even_v8si;
41544 nmode = V8SImode;
41546 else
41547 gcc_unreachable ();
41550 /* Multiply low parts. */
41551 t1 = gen_reg_rtx (mode);
41552 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41554 /* Shift input vectors right 32 bits so we can multiply high parts. */
41555 t6 = GEN_INT (32);
41556 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41557 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41559 /* Multiply high parts by low parts. */
41560 t4 = gen_reg_rtx (mode);
41561 t5 = gen_reg_rtx (mode);
41562 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41563 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41565 /* Combine and shift the highparts back. */
41566 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41567 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41569 /* Combine high and low parts. */
41570 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
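/* OP0 now holds LO(op1)*LO(op2) + ((HI(op1)*LO(op2) + HI(op2)*LO(op1))
   << 32), the low 64 bits of each full product; the HI*HI term only
   affects bits above bit 63 and is dropped. */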
41573 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41574 gen_rtx_MULT (mode, op1, op2));
41577 /* Expand an insert into a vector register through pinsr insn.
41578 Return true if successful. */
41580 bool
41581 ix86_expand_pinsr (rtx *operands)
41583 rtx dst = operands[0];
41584 rtx src = operands[3];
41586 unsigned int size = INTVAL (operands[1]);
41587 unsigned int pos = INTVAL (operands[2]);
41589 if (GET_CODE (dst) == SUBREG)
41591 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41592 dst = SUBREG_REG (dst);
41595 if (GET_CODE (src) == SUBREG)
41596 src = SUBREG_REG (src);
41598 switch (GET_MODE (dst))
41600 case V16QImode:
41601 case V8HImode:
41602 case V4SImode:
41603 case V2DImode:
41605 enum machine_mode srcmode, dstmode;
41606 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41608 srcmode = mode_for_size (size, MODE_INT, 0);
41610 switch (srcmode)
41612 case QImode:
41613 if (!TARGET_SSE4_1)
41614 return false;
41615 dstmode = V16QImode;
41616 pinsr = gen_sse4_1_pinsrb;
41617 break;
41619 case HImode:
41620 if (!TARGET_SSE2)
41621 return false;
41622 dstmode = V8HImode;
41623 pinsr = gen_sse2_pinsrw;
41624 break;
41626 case SImode:
41627 if (!TARGET_SSE4_1)
41628 return false;
41629 dstmode = V4SImode;
41630 pinsr = gen_sse4_1_pinsrd;
41631 break;
41633 case DImode:
41634 gcc_assert (TARGET_64BIT);
41635 if (!TARGET_SSE4_1)
41636 return false;
41637 dstmode = V2DImode;
41638 pinsr = gen_sse4_1_pinsrq;
41639 break;
41641 default:
41642 return false;
41645 dst = gen_lowpart (dstmode, dst);
41646 src = gen_lowpart (srcmode, src);
41648 pos /= size;
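/* Note that these pinsr patterns are modeled as a vec_merge, so the last
   operand below is a one-hot merge mask (1 << pos) selecting the element
   to replace, not the element index itself. */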
41650 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41651 return true;
41654 default:
41655 return false;
41659 /* This function returns the calling abi specific va_list type node.
41660 It returns the FNDECL specific va_list type. */
41662 static tree
41663 ix86_fn_abi_va_list (tree fndecl)
41665 if (!TARGET_64BIT)
41666 return va_list_type_node;
41667 gcc_assert (fndecl != NULL_TREE);
41669 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41670 return ms_va_list_type_node;
41671 else
41672 return sysv_va_list_type_node;
41675 /* Returns the canonical va_list type specified by TYPE. If there
41676 is no valid TYPE provided, it return NULL_TREE. */
41678 static tree
41679 ix86_canonical_va_list_type (tree type)
41681 tree wtype, htype;
41683 /* Resolve references and pointers to va_list type. */
41684 if (TREE_CODE (type) == MEM_REF)
41685 type = TREE_TYPE (type);
41686 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE (type)))
41687 type = TREE_TYPE (type);
41688 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41689 type = TREE_TYPE (type);
41691 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41693 wtype = va_list_type_node;
41694 gcc_assert (wtype != NULL_TREE);
41695 htype = type;
41696 if (TREE_CODE (wtype) == ARRAY_TYPE)
41698 /* If va_list is an array type, the argument may have decayed
41699 to a pointer type, e.g. by being passed to another function.
41700 In that case, unwrap both types so that we can compare the
41701 underlying records. */
41702 if (TREE_CODE (htype) == ARRAY_TYPE
41703 || POINTER_TYPE_P (htype))
41705 wtype = TREE_TYPE (wtype);
41706 htype = TREE_TYPE (htype);
41709 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41710 return va_list_type_node;
41711 wtype = sysv_va_list_type_node;
41712 gcc_assert (wtype != NULL_TREE);
41713 htype = type;
41714 if (TREE_CODE (wtype) == ARRAY_TYPE)
41716 /* If va_list is an array type, the argument may have decayed
41717 to a pointer type, e.g. by being passed to another function.
41718 In that case, unwrap both types so that we can compare the
41719 underlying records. */
41720 if (TREE_CODE (htype) == ARRAY_TYPE
41721 || POINTER_TYPE_P (htype))
41723 wtype = TREE_TYPE (wtype);
41724 htype = TREE_TYPE (htype);
41727 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41728 return sysv_va_list_type_node;
41729 wtype = ms_va_list_type_node;
41730 gcc_assert (wtype != NULL_TREE);
41731 htype = type;
41732 if (TREE_CODE (wtype) == ARRAY_TYPE)
41734 /* If va_list is an array type, the argument may have decayed
41735 to a pointer type, e.g. by being passed to another function.
41736 In that case, unwrap both types so that we can compare the
41737 underlying records. */
41738 if (TREE_CODE (htype) == ARRAY_TYPE
41739 || POINTER_TYPE_P (htype))
41741 wtype = TREE_TYPE (wtype);
41742 htype = TREE_TYPE (htype);
41745 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41746 return ms_va_list_type_node;
41747 return NULL_TREE;
41749 return std_canonical_va_list_type (type);
41752 /* Iterate through the target-specific builtin types for va_list.
41753 IDX denotes the iterator, *PTREE is set to the result type of
41754 the va_list builtin, and *PNAME to its internal type.
41755 Returns zero if there is no element for this index, otherwise
41756 IDX should be increased upon the next call.
41757 Note, do not iterate a base builtin's name like __builtin_va_list.
41758 Used from c_common_nodes_and_builtins. */
41760 static int
41761 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41763 if (TARGET_64BIT)
41765 switch (idx)
41767 default:
41768 break;
41770 case 0:
41771 *ptree = ms_va_list_type_node;
41772 *pname = "__builtin_ms_va_list";
41773 return 1;
41775 case 1:
41776 *ptree = sysv_va_list_type_node;
41777 *pname = "__builtin_sysv_va_list";
41778 return 1;
41782 return 0;
41785 #undef TARGET_SCHED_DISPATCH
41786 #define TARGET_SCHED_DISPATCH has_dispatch
41787 #undef TARGET_SCHED_DISPATCH_DO
41788 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41789 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41790 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41791 #undef TARGET_SCHED_REORDER
41792 #define TARGET_SCHED_REORDER ix86_sched_reorder
41793 #undef TARGET_SCHED_ADJUST_PRIORITY
41794 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41795 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41796 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41797 ix86_dependencies_evaluation_hook
41799 /* The size of the dispatch window is the total number of bytes of
41800 object code allowed in a window. */
41801 #define DISPATCH_WINDOW_SIZE 16
41803 /* Number of dispatch windows considered for scheduling. */
41804 #define MAX_DISPATCH_WINDOWS 3
41806 /* Maximum number of instructions in a window. */
41807 #define MAX_INSN 4
41809 /* Maximum number of immediate operands in a window. */
41810 #define MAX_IMM 4
41812 /* Maximum number of immediate bits allowed in a window. */
41813 #define MAX_IMM_SIZE 128
41815 /* Maximum number of 32 bit immediates allowed in a window. */
41816 #define MAX_IMM_32 4
41818 /* Maximum number of 64 bit immediates allowed in a window. */
41819 #define MAX_IMM_64 2
41821 /* Maximum total of loads or prefetches allowed in a window. */
41822 #define MAX_LOAD 2
41824 /* Maximum total of stores allowed in a window. */
41825 #define MAX_STORE 1
41827 #undef BIG
41828 #define BIG 100
41831 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41832 enum dispatch_group {
41833 disp_no_group = 0,
41834 disp_load,
41835 disp_store,
41836 disp_load_store,
41837 disp_prefetch,
41838 disp_imm,
41839 disp_imm_32,
41840 disp_imm_64,
41841 disp_branch,
41842 disp_cmp,
41843 disp_jcc,
41844 disp_last
41847 /* Number of allowable groups in a dispatch window. It is an array
41848 indexed by dispatch_group enum. 100 is used as a big number,
41849 because the number of these kinds of operations does not have any
41850 effect in the dispatch window, but we need them for other reasons in
41851 the table. */
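/* In enum order: no_group 0, load 2, store 1, load_store 1, prefetch 2,
   imm 4, imm_32 4, imm_64 2, branch 1, cmp BIG, jcc BIG. */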
41852 static unsigned int num_allowable_groups[disp_last] = {
41853 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41856 char group_name[disp_last + 1][16] = {
41857 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41858 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41859 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41862 /* Instruction path. */
41863 enum insn_path {
41864 no_path = 0,
41865 path_single, /* Single micro op. */
41866 path_double, /* Double micro op. */
41867 path_multi, /* Instructions with more than 2 micro ops. */
41868 last_path
41871 /* sched_insn_info defines a window to the instructions scheduled in
41872 the basic block. It contains a pointer to the insn_info table and
41873 the instruction scheduled.
41875 Windows are allocated for each basic block and are linked
41876 together. */
41877 typedef struct sched_insn_info_s {
41878 rtx insn;
41879 enum dispatch_group group;
41880 enum insn_path path;
41881 int byte_len;
41882 int imm_bytes;
41883 } sched_insn_info;
41885 /* Linked list of dispatch windows. This is a two way list of
41886 dispatch windows of a basic block. It contains information about
41887 the number of uops in the window and the total number of
41888 instructions and of bytes in the object code for this dispatch
41889 window. */
41890 typedef struct dispatch_windows_s {
41891 int num_insn; /* Number of insn in the window. */
41892 int num_uops; /* Number of uops in the window. */
41893 int window_size; /* Number of bytes in the window. */
41894 int window_num; /* Window number, either 0 or 1. */
41895 int num_imm; /* Number of immediates in an insn. */
41896 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41897 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41898 int imm_size; /* Total immediates in the window. */
41899 int num_loads; /* Total memory loads in the window. */
41900 int num_stores; /* Total memory stores in the window. */
41901 int violation; /* Violation exists in window. */
41902 sched_insn_info *window; /* Pointer to the window. */
41903 struct dispatch_windows_s *next;
41904 struct dispatch_windows_s *prev;
41905 } dispatch_windows;
41907 /* Immediate values used in an insn. */
41908 typedef struct imm_info_s
41910 int imm;
41911 int imm32;
41912 int imm64;
41913 } imm_info;
41915 static dispatch_windows *dispatch_window_list;
41916 static dispatch_windows *dispatch_window_list1;
41918 /* Get dispatch group of insn. */
41920 static enum dispatch_group
41921 get_mem_group (rtx insn)
41923 enum attr_memory memory;
41925 if (INSN_CODE (insn) < 0)
41926 return disp_no_group;
41927 memory = get_attr_memory (insn);
41928 if (memory == MEMORY_STORE)
41929 return disp_store;
41931 if (memory == MEMORY_LOAD)
41932 return disp_load;
41934 if (memory == MEMORY_BOTH)
41935 return disp_load_store;
41937 return disp_no_group;
41940 /* Return true if insn is a compare instruction. */
41942 static bool
41943 is_cmp (rtx insn)
41945 enum attr_type type;
41947 type = get_attr_type (insn);
41948 return (type == TYPE_TEST
41949 || type == TYPE_ICMP
41950 || type == TYPE_FCMP
41951 || GET_CODE (PATTERN (insn)) == COMPARE);
41954 /* Return true if a dispatch violation was encountered. */
41956 static bool
41957 dispatch_violation (void)
41959 if (dispatch_window_list->next)
41960 return dispatch_window_list->next->violation;
41961 return dispatch_window_list->violation;
41964 /* Return true if insn is a branch instruction. */
41966 static bool
41967 is_branch (rtx insn)
41969 return (CALL_P (insn) || JUMP_P (insn));
41972 /* Return true if insn is a prefetch instruction. */
41974 static bool
41975 is_prefetch (rtx insn)
41977 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41980 /* This function initializes a dispatch window and the list container holding a
41981 pointer to the window. */
41983 static void
41984 init_window (int window_num)
41986 int i;
41987 dispatch_windows *new_list;
41989 if (window_num == 0)
41990 new_list = dispatch_window_list;
41991 else
41992 new_list = dispatch_window_list1;
41994 new_list->num_insn = 0;
41995 new_list->num_uops = 0;
41996 new_list->window_size = 0;
41997 new_list->next = NULL;
41998 new_list->prev = NULL;
41999 new_list->window_num = window_num;
42000 new_list->num_imm = 0;
42001 new_list->num_imm_32 = 0;
42002 new_list->num_imm_64 = 0;
42003 new_list->imm_size = 0;
42004 new_list->num_loads = 0;
42005 new_list->num_stores = 0;
42006 new_list->violation = false;
42008 for (i = 0; i < MAX_INSN; i++)
42010 new_list->window[i].insn = NULL;
42011 new_list->window[i].group = disp_no_group;
42012 new_list->window[i].path = no_path;
42013 new_list->window[i].byte_len = 0;
42014 new_list->window[i].imm_bytes = 0;
42016 return;
42019 /* This function allocates and initializes a dispatch window and the
42020 list container holding a pointer to the window. */
42022 static dispatch_windows *
42023 allocate_window (void)
42025 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42026 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42028 return new_list;
42031 /* This routine initializes the dispatch scheduling information. It
42032 initiates building dispatch scheduler tables and constructs the
42033 first dispatch window. */
42035 static void
42036 init_dispatch_sched (void)
42038 /* Allocate a dispatch list and a window. */
42039 dispatch_window_list = allocate_window ();
42040 dispatch_window_list1 = allocate_window ();
42041 init_window (0);
42042 init_window (1);
42045 /* This function returns true if a branch is detected. End of a basic block
42046 does not have to be a branch, but here we assume only branches end a
42047 window. */
42049 static bool
42050 is_end_basic_block (enum dispatch_group group)
42052 return group == disp_branch;
42055 /* This function is called when the end of a window's processing is reached. */
42057 static void
42058 process_end_window (void)
42060 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42061 if (dispatch_window_list->next)
42063 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42064 gcc_assert (dispatch_window_list->window_size
42065 + dispatch_window_list1->window_size <= 48);
42066 init_window (1);
42068 init_window (0);
42071 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42072 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42073 for 48 bytes of instructions. Note that these windows are not dispatch
42074 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42076 static dispatch_windows *
42077 allocate_next_window (int window_num)
42079 if (window_num == 0)
42081 if (dispatch_window_list->next)
42082 init_window (1);
42083 init_window (0);
42084 return dispatch_window_list;
42087 dispatch_window_list->next = dispatch_window_list1;
42088 dispatch_window_list1->prev = dispatch_window_list;
42090 return dispatch_window_list1;
42093 /* Increment the number of immediate operands of an instruction. */
42095 static int
42096 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42098 if (*in_rtx == 0)
42099 return 0;
42101 switch ( GET_CODE (*in_rtx))
42103 case CONST:
42104 case SYMBOL_REF:
42105 case CONST_INT:
42106 (imm_values->imm)++;
42107 if (x86_64_immediate_operand (*in_rtx, SImode))
42108 (imm_values->imm32)++;
42109 else
42110 (imm_values->imm64)++;
42111 break;
42113 case CONST_DOUBLE:
42114 (imm_values->imm)++;
42115 (imm_values->imm64)++;
42116 break;
42118 case CODE_LABEL:
42119 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42121 (imm_values->imm)++;
42122 (imm_values->imm32)++;
42124 break;
42126 default:
42127 break;
42130 return 0;
42133 /* Compute number of immediate operands of an instruction. */
42135 static void
42136 find_constant (rtx in_rtx, imm_info *imm_values)
42138 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42139 (rtx_function) find_constant_1, (void *) imm_values);
42142 /* Return total size of immediate operands of an instruction along with number
42143 of corresponding immediate-operands. It initializes its parameters to zero
42144 before calling FIND_CONSTANT.
42145 INSN is the input instruction. IMM is the total of immediates.
42146 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42147 bit immediates. */
42149 static int
42150 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42152 imm_info imm_values = {0, 0, 0};
42154 find_constant (insn, &imm_values);
42155 *imm = imm_values.imm;
42156 *imm32 = imm_values.imm32;
42157 *imm64 = imm_values.imm64;
42158 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
42161 /* This function indicates whether an instruction has any immediate
42162 operand. */
42164 static bool
42165 has_immediate (rtx insn)
42167 int num_imm_operand;
42168 int num_imm32_operand;
42169 int num_imm64_operand;
42171 if (insn)
42172 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42173 &num_imm64_operand);
42174 return false;
42177 /* Return single or double path for instructions. */
42179 static enum insn_path
42180 get_insn_path (rtx insn)
42182 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42184 if ((int)path == 0)
42185 return path_single;
42187 if ((int)path == 1)
42188 return path_double;
42190 return path_multi;
42193 /* Return insn dispatch group. */
42195 static enum dispatch_group
42196 get_insn_group (rtx insn)
42198 enum dispatch_group group = get_mem_group (insn);
42199 if (group)
42200 return group;
42202 if (is_branch (insn))
42203 return disp_branch;
42205 if (is_cmp (insn))
42206 return disp_cmp;
42208 if (has_immediate (insn))
42209 return disp_imm;
42211 if (is_prefetch (insn))
42212 return disp_prefetch;
42214 return disp_no_group;
42217 /* Count number of GROUP restricted instructions in a dispatch
42218 window WINDOW_LIST. */
42220 static int
42221 count_num_restricted (rtx insn, dispatch_windows *window_list)
42223 enum dispatch_group group = get_insn_group (insn);
42224 int imm_size;
42225 int num_imm_operand;
42226 int num_imm32_operand;
42227 int num_imm64_operand;
42229 if (group == disp_no_group)
42230 return 0;
42232 if (group == disp_imm)
42234 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42235 &num_imm64_operand);
42236 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42237 || num_imm_operand + window_list->num_imm > MAX_IMM
42238 || (num_imm32_operand > 0
42239 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42240 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42241 || (num_imm64_operand > 0
42242 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42243 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42244 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42245 && num_imm64_operand > 0
42246 && ((window_list->num_imm_64 > 0
42247 && window_list->num_insn >= 2)
42248 || window_list->num_insn >= 3)))
42249 return BIG;
42251 return 1;
42254 if ((group == disp_load_store
42255 && (window_list->num_loads >= MAX_LOAD
42256 || window_list->num_stores >= MAX_STORE))
42257 || ((group == disp_load
42258 || group == disp_prefetch)
42259 && window_list->num_loads >= MAX_LOAD)
42260 || (group == disp_store
42261 && window_list->num_stores >= MAX_STORE))
42262 return BIG;
42264 return 1;
42267 /* This function returns true if insn satisfies dispatch rules on the
42268 last window scheduled. */
42270 static bool
42271 fits_dispatch_window (rtx insn)
42273 dispatch_windows *window_list = dispatch_window_list;
42274 dispatch_windows *window_list_next = dispatch_window_list->next;
42275 unsigned int num_restrict;
42276 enum dispatch_group group = get_insn_group (insn);
42277 enum insn_path path = get_insn_path (insn);
42278 int sum;
42280 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
42281 instructions should be given the lowest priority in the
42282 scheduling process in Haifa scheduler to make sure they will be
42283 scheduled in the same dispatch window as the reference to them. */
42284 if (group == disp_jcc || group == disp_cmp)
42285 return false;
42287 /* Check nonrestricted. */
42288 if (group == disp_no_group || group == disp_branch)
42289 return true;
42291 /* Get last dispatch window. */
42292 if (window_list_next)
42293 window_list = window_list_next;
42295 if (window_list->window_num == 1)
42297 sum = window_list->prev->window_size + window_list->window_size;
42299 if (sum == 32
42300 || (min_insn_size (insn) + sum) >= 48)
42301 /* Window 1 is full. Go for next window. */
42302 return true;
42305 num_restrict = count_num_restricted (insn, window_list);
42307 if (num_restrict > num_allowable_groups[group])
42308 return false;
42310 /* See if it fits in the first window. */
42311 if (window_list->window_num == 0)
42313 /* The first window should have only single and double path
42314 uops. */
42315 if (path == path_double
42316 && (window_list->num_uops + 2) > MAX_INSN)
42317 return false;
42318 else if (path != path_single)
42319 return false;
42321 return true;
42324 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42325 dispatch window WINDOW_LIST. */
42327 static void
42328 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42330 int byte_len = min_insn_size (insn);
42331 int num_insn = window_list->num_insn;
42332 int imm_size;
42333 sched_insn_info *window = window_list->window;
42334 enum dispatch_group group = get_insn_group (insn);
42335 enum insn_path path = get_insn_path (insn);
42336 int num_imm_operand;
42337 int num_imm32_operand;
42338 int num_imm64_operand;
42340 if (!window_list->violation && group != disp_cmp
42341 && !fits_dispatch_window (insn))
42342 window_list->violation = true;
42344 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42345 &num_imm64_operand);
42347 /* Initialize window with new instruction. */
42348 window[num_insn].insn = insn;
42349 window[num_insn].byte_len = byte_len;
42350 window[num_insn].group = group;
42351 window[num_insn].path = path;
42352 window[num_insn].imm_bytes = imm_size;
42354 window_list->window_size += byte_len;
42355 window_list->num_insn = num_insn + 1;
42356 window_list->num_uops = window_list->num_uops + num_uops;
42357 window_list->imm_size += imm_size;
42358 window_list->num_imm += num_imm_operand;
42359 window_list->num_imm_32 += num_imm32_operand;
42360 window_list->num_imm_64 += num_imm64_operand;
42362 if (group == disp_store)
42363 window_list->num_stores += 1;
42364 else if (group == disp_load
42365 || group == disp_prefetch)
42366 window_list->num_loads += 1;
42367 else if (group == disp_load_store)
42369 window_list->num_stores += 1;
42370 window_list->num_loads += 1;
42374 /* Adds a scheduled instruction, INSN, to the current dispatch window.
42375 If the total bytes of instructions or the number of instructions in
42376 the window exceed the allowed maximum, it allocates a new window. */
42378 static void
42379 add_to_dispatch_window (rtx insn)
42381 int byte_len;
42382 dispatch_windows *window_list;
42383 dispatch_windows *next_list;
42384 dispatch_windows *window0_list;
42385 enum insn_path path;
42386 enum dispatch_group insn_group;
42387 bool insn_fits;
42388 int num_insn;
42389 int num_uops;
42390 int window_num;
42391 int insn_num_uops;
42392 int sum;
42394 if (INSN_CODE (insn) < 0)
42395 return;
42397 byte_len = min_insn_size (insn);
42398 window_list = dispatch_window_list;
42399 next_list = window_list->next;
42400 path = get_insn_path (insn);
42401 insn_group = get_insn_group (insn);
42403 /* Get the last dispatch window. */
42404 if (next_list)
42405 window_list = dispatch_window_list->next;
42407 if (path == path_single)
42408 insn_num_uops = 1;
42409 else if (path == path_double)
42410 insn_num_uops = 2;
42411 else
42412 insn_num_uops = (int) path;
42414 /* If current window is full, get a new window.
42415 Window number zero is full, if MAX_INSN uops are scheduled in it.
42416 Window number one is full, if window zero's bytes plus window
42417 one's bytes is 32, or if the bytes of the new instruction added
42418 to the total make it greater than 48, or if it already has MAX_INSN
42419 instructions in it. */
42420 num_insn = window_list->num_insn;
42421 num_uops = window_list->num_uops;
42422 window_num = window_list->window_num;
42423 insn_fits = fits_dispatch_window (insn);
42425 if (num_insn >= MAX_INSN
42426 || num_uops + insn_num_uops > MAX_INSN
42427 || !(insn_fits))
42429 window_num = ~window_num & 1;
42430 window_list = allocate_next_window (window_num);
42433 if (window_num == 0)
42435 add_insn_window (insn, window_list, insn_num_uops);
42436 if (window_list->num_insn >= MAX_INSN
42437 && insn_group == disp_branch)
42439 process_end_window ();
42440 return;
42443 else if (window_num == 1)
42445 window0_list = window_list->prev;
42446 sum = window0_list->window_size + window_list->window_size;
42447 if (sum == 32
42448 || (byte_len + sum) >= 48)
42450 process_end_window ();
42451 window_list = dispatch_window_list;
42454 add_insn_window (insn, window_list, insn_num_uops);
42456 else
42457 gcc_unreachable ();
42459 if (is_end_basic_block (insn_group))
42461 /* End of basic block is reached; do the end-basic-block processing. */
42462 process_end_window ();
42463 return;
42467 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42469 DEBUG_FUNCTION static void
42470 debug_dispatch_window_file (FILE *file, int window_num)
42472 dispatch_windows *list;
42473 int i;
42475 if (window_num == 0)
42476 list = dispatch_window_list;
42477 else
42478 list = dispatch_window_list1;
42480 fprintf (file, "Window #%d:\n", list->window_num);
42481 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42482 list->num_insn, list->num_uops, list->window_size);
42483 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42484 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42486 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42487 list->num_stores);
42488 fprintf (file, " insn info:\n");
42490 for (i = 0; i < MAX_INSN; i++)
42492 if (!list->window[i].insn)
42493 break;
42494 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42495 i, group_name[list->window[i].group],
42496 i, (void *)list->window[i].insn,
42497 i, list->window[i].path,
42498 i, list->window[i].byte_len,
42499 i, list->window[i].imm_bytes);
42503 /* Print to stdout a dispatch window. */
42505 DEBUG_FUNCTION void
42506 debug_dispatch_window (int window_num)
42508 debug_dispatch_window_file (stdout, window_num);
42511 /* Print INSN dispatch information to FILE. */
42513 DEBUG_FUNCTION static void
42514 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42516 int byte_len;
42517 enum insn_path path;
42518 enum dispatch_group group;
42519 int imm_size;
42520 int num_imm_operand;
42521 int num_imm32_operand;
42522 int num_imm64_operand;
42524 if (INSN_CODE (insn) < 0)
42525 return;
42527 byte_len = min_insn_size (insn);
42528 path = get_insn_path (insn);
42529 group = get_insn_group (insn);
42530 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42531 &num_imm64_operand);
42533 fprintf (file, " insn info:\n");
42534 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42535 group_name[group], path, byte_len);
42536 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42537 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42540 /* Print to STDERR the status of the ready list with respect to
42541 dispatch windows. */
42543 DEBUG_FUNCTION void
42544 debug_ready_dispatch (void)
42546 int i;
42547 int no_ready = number_in_ready ();
42549 fprintf (stdout, "Number of ready: %d\n", no_ready);
42551 for (i = 0; i < no_ready; i++)
42552 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42555 /* This routine is the driver of the dispatch scheduler. */
42557 static void
42558 do_dispatch (rtx insn, int mode)
42560 if (mode == DISPATCH_INIT)
42561 init_dispatch_sched ();
42562 else if (mode == ADD_TO_DISPATCH_WINDOW)
42563 add_to_dispatch_window (insn);
42566 /* Return TRUE if Dispatch Scheduling is supported. */
42568 static bool
42569 has_dispatch (rtx insn, int action)
42571 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42572 && flag_dispatch_scheduler)
42573 switch (action)
42575 default:
42576 return false;
42578 case IS_DISPATCH_ON:
42579 return true;
42580 break;
42582 case IS_CMP:
42583 return is_cmp (insn);
42585 case DISPATCH_VIOLATION:
42586 return dispatch_violation ();
42588 case FITS_DISPATCH_WINDOW:
42589 return fits_dispatch_window (insn);
42592 return false;
42595 /* Implementation of reassociation_width target hook used by
42596 reassoc phase to identify parallelism level in reassociated
42597 tree. The statement's tree_code is passed in OPC. The arguments' type
42598 is passed in MODE.
42600 Currently parallel reassociation is enabled for Atom
42601 processors only and we set reassociation width to be 2
42602 because Atom may issue up to 2 instructions per cycle.
42604 Return value should be fixed if parallel reassociation is
42605 enabled for other processors. */
42607 static int
42608 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42609 enum machine_mode mode)
42611 int res = 1;
42613 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42614 res = 2;
42615 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42616 res = 2;
42618 return res;
42621 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42622 place emms and femms instructions. */
42624 static enum machine_mode
42625 ix86_preferred_simd_mode (enum machine_mode mode)
42627 if (!TARGET_SSE)
42628 return word_mode;
42630 switch (mode)
42632 case QImode:
42633 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42634 case HImode:
42635 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42636 case SImode:
42637 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42638 case DImode:
42639 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42641 case SFmode:
42642 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42643 return V8SFmode;
42644 else
42645 return V4SFmode;
42647 case DFmode:
42648 if (!TARGET_VECTORIZE_DOUBLE)
42649 return word_mode;
42650 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42651 return V4DFmode;
42652 else if (TARGET_SSE2)
42653 return V2DFmode;
42654 /* FALLTHRU */
42656 default:
42657 return word_mode;
42661 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
42662 vectors. */
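/* The value returned is a bitmask of candidate vector sizes in bytes;
   returning zero tells the vectorizer to stick to the preferred SIMD
   mode alone. */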
42664 static unsigned int
42665 ix86_autovectorize_vector_sizes (void)
42667 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
/* Return the class of registers which could be used for a pseudo of
   MODE and of class RCLASS for spilling instead of memory.  Return
   NO_REGS if it is not possible or not profitable.  */
static reg_class_t
ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
{
  if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
      && (mode == SImode || (TARGET_64BIT && mode == DImode))
      && INTEGER_CLASS_P (rclass))
    return SSE_REGS;
  return NO_REGS;
}
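/* Note (illustrative, not in the original file): when this hook returns
   SSE_REGS, the register allocator may spill a SImode or DImode pseudo
   into an xmm register (a movd/movq between a general register and
   %xmmN) instead of into a stack slot, avoiding a memory round trip.  */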
/* Implement targetm.vectorize.init_cost.  */

static void *
ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
{
  unsigned *cost = XNEWVEC (unsigned, 3);
  cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
  return cost;
}
/* Implement targetm.vectorize.add_stmt_cost.  */

static unsigned
ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
                    struct _stmt_vec_info *stmt_info, int misalign,
                    enum vect_cost_model_location where)
{
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
         arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
        count *= 50;  /* FIXME.  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
    }

  return retval;
}
/* Implement targetm.vectorize.finish_cost.  */

static void
ix86_finish_cost (void *data, unsigned *prologue_cost,
                  unsigned *body_cost, unsigned *epilogue_cost)
{
  unsigned *cost = (unsigned *) data;
  *prologue_cost = cost[vect_prologue];
  *body_cost = cost[vect_body];
  *epilogue_cost = cost[vect_epilogue];
}
/* Implement targetm.vectorize.destroy_cost_data.  */

static void
ix86_destroy_cost_data (void *data)
{
  free (data);
}
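/* Illustrative sketch (not part of the original file) of how the
   vectorizer drives the four cost hooks above for one candidate loop;
   DATA, STMT_INFO, NCOPIES and the three *_COST variables are
   schematic:

     void *data = targetm.vectorize.init_cost (loop);
     targetm.vectorize.add_stmt_cost (data, 1, scalar_to_vec,
                                      stmt_info, 0, vect_prologue);
     targetm.vectorize.add_stmt_cost (data, ncopies, vector_stmt,
                                      stmt_info, 0, vect_body);
     targetm.vectorize.finish_cost (data, &prologue_cost, &body_cost,
                                    &epilogue_cost);
     targetm.vectorize.destroy_cost_data (data);

   The accumulated prologue/body/epilogue costs are then weighed
   against the scalar loop cost to decide whether vectorizing is
   profitable.  */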
/* Validate target specific memory model bits in VAL.  */

static unsigned HOST_WIDE_INT
ix86_memmodel_check (unsigned HOST_WIDE_INT val)
{
  unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
  bool strong;

  if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
                                      |MEMMODEL_MASK)
      || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
    {
      warning (OPT_Winvalid_memory_model,
               "Unknown architecture specific memory model");
      return MEMMODEL_SEQ_CST;
    }
  strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
  if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
    {
      warning (OPT_Winvalid_memory_model,
               "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
      return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
    }
  if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
    {
      warning (OPT_Winvalid_memory_model,
               "HLE_RELEASE not used with RELEASE or stronger memory model");
      return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
    }
  return val;
}
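/* Illustrative sketch (not part of the original file): the extra bits
   validated above come from user code that ORs an x86 hardware lock
   elision hint into a C11 memory model, for example

     int lockvar;

     void
     unlock (void)
     {
       __atomic_store_n (&lockvar, 0,
                         __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
     }

   Combining an HLE bit with an incompatible model, or setting bits
   outside MEMMODEL_MASK and the two HLE bits, is diagnosed by the
   warnings above and demoted to a sequentially consistent model.  */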
/* Initialize the GCC target structure.  */
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
#define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
# undef TARGET_MERGE_DECL_ATTRIBUTES
# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
#endif

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS ix86_init_builtins
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL ix86_builtin_decl
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  ix86_builtin_vectorized_function

#undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
#define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load

#undef TARGET_VECTORIZE_BUILTIN_TM_STORE
#define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store

#undef TARGET_VECTORIZE_BUILTIN_GATHER
#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

#undef TARGET_ENCODE_SECTION_INFO
#ifndef SUBTARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
#else
#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
#endif

#undef TARGET_ASM_OPEN_PAREN
#define TARGET_ASM_OPEN_PAREN ""
#undef TARGET_ASM_CLOSE_PAREN
#define TARGET_ASM_CLOSE_PAREN ""

#undef TARGET_ASM_BYTE_OP
#define TARGET_ASM_BYTE_OP ASM_BYTE

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
#ifdef ASM_QUAD
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
#endif

#undef TARGET_PROFILE_BEFORE_PROLOGUE
#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue

#undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
#define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name

#undef TARGET_ASM_UNALIGNED_HI_OP
#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND ix86_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra

#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers

#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"