gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "gimplify.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63 #include "dumpfile.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
70 static rtx legitimize_pe_coff_symbol (rtx, bool);
72 #ifndef CHECK_STACK_LIMIT
73 #define CHECK_STACK_LIMIT (-1)
74 #endif
76 /* Return index of given mode in mult and division cost tables. */
77 #define MODE_INDEX(mode) \
78 ((mode) == QImode ? 0 \
79 : (mode) == HImode ? 1 \
80 : (mode) == SImode ? 2 \
81 : (mode) == DImode ? 3 \
82 : 4)
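/* A minimal sketch (added for illustration; the helper name is made up and
   not part of this file) of how MODE_INDEX is meant to be used: it maps
   QI/HI/SI/DImode to indices 0-3 (4 for anything else) so the five-element
   per-mode arrays in the cost tables below can be indexed directly.
   Assuming a pointer to one of those tables and the multiply/divide array
   fields that the initializers below populate:

     static int
     example_mult_start_cost (const struct processor_costs *cost,
                              enum machine_mode mode)
     {
       return cost->mult_init[MODE_INDEX (mode)];   // e.g. SImode -> index 2
     }
*/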
84 /* Processor costs (relative to an add) */
85 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
86 #define COSTS_N_BYTES(N) ((N) * 2)
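/* Added reading aid (an inference from the comment above, not original
   text): with COSTS_N_INSNS (N) defined as (N) * 4 and an addition encoding
   in roughly 2 bytes, the two macros agree on the add:

     COSTS_N_INSNS (1) == 4 == COSTS_N_BYTES (2)

   so ix86_size_cost below charges instructions by encoding size, on the same
   scale the per-CPU tables that follow use for latency.  */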
88 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
90 static stringop_algs ix86_size_memcpy[2] = {
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
92 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
93 static stringop_algs ix86_size_memset[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
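/* Shape of the stringop tables above and below (a reading aid; the field
   names are assumed from struct stringop_algs in i386.h rather than quoted
   from this file):

     {alg_for_unknown_size, {{max_bytes, alg, noalign}, ...}}

   Each {max_bytes, alg, noalign} triple names the algorithm to use for known
   block sizes up to max_bytes, tried in order, with max_bytes == -1 acting
   as a catch-all.  The two-element arrays hold the 32-bit variant first and
   the 64-bit variant second; DUMMY_STRINGOP_ALGS fills a variant that the
   target will never consult.  */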
97 const
98 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
99 COSTS_N_BYTES (2), /* cost of an add instruction */
100 COSTS_N_BYTES (3), /* cost of a lea instruction */
101 COSTS_N_BYTES (2), /* variable shift costs */
102 COSTS_N_BYTES (3), /* constant shift costs */
103 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
104 COSTS_N_BYTES (3), /* HI */
105 COSTS_N_BYTES (3), /* SI */
106 COSTS_N_BYTES (3), /* DI */
107 COSTS_N_BYTES (5)}, /* other */
108 0, /* cost of multiply per each bit set */
109 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 COSTS_N_BYTES (3), /* cost of movsx */
115 COSTS_N_BYTES (3), /* cost of movzx */
116 0, /* "large" insn */
117 2, /* MOVE_RATIO */
118 2, /* cost for loading QImode using movzbl */
119 {2, 2, 2}, /* cost of loading integer registers
120 in QImode, HImode and SImode.
121 Relative to reg-reg move (2). */
122 {2, 2, 2}, /* cost of storing integer registers */
123 2, /* cost of reg,reg fld/fst */
124 {2, 2, 2}, /* cost of loading fp registers
125 in SFmode, DFmode and XFmode */
126 {2, 2, 2}, /* cost of storing fp registers
127 in SFmode, DFmode and XFmode */
128 3, /* cost of moving MMX register */
129 {3, 3}, /* cost of loading MMX registers
130 in SImode and DImode */
131 {3, 3}, /* cost of storing MMX registers
132 in SImode and DImode */
133 3, /* cost of moving SSE register */
134 {3, 3, 3}, /* cost of loading SSE registers
135 in SImode, DImode and TImode */
136 {3, 3, 3}, /* cost of storing SSE registers
137 in SImode, DImode and TImode */
138 3, /* MMX or SSE register to integer */
139 0, /* size of l1 cache */
140 0, /* size of l2 cache */
141 0, /* size of prefetch block */
142 0, /* number of parallel prefetches */
143 2, /* Branch cost */
144 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
145 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
146 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
147 COSTS_N_BYTES (2), /* cost of FABS instruction. */
148 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
149 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
150 ix86_size_memcpy,
151 ix86_size_memset,
152 1, /* scalar_stmt_cost. */
153 1, /* scalar load_cost. */
154 1, /* scalar_store_cost. */
155 1, /* vec_stmt_cost. */
156 1, /* vec_to_scalar_cost. */
157 1, /* scalar_to_vec_cost. */
158 1, /* vec_align_load_cost. */
159 1, /* vec_unalign_load_cost. */
160 1, /* vec_store_cost. */
161 1, /* cond_taken_branch_cost. */
162 1, /* cond_not_taken_branch_cost. */
};
165 /* Processor costs (relative to an add) */
166 static stringop_algs i386_memcpy[2] = {
167 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
168 DUMMY_STRINGOP_ALGS};
169 static stringop_algs i386_memset[2] = {
170 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
171 DUMMY_STRINGOP_ALGS};
173 static const
174 struct processor_costs i386_cost = { /* 386 specific costs */
175 COSTS_N_INSNS (1), /* cost of an add instruction */
176 COSTS_N_INSNS (1), /* cost of a lea instruction */
177 COSTS_N_INSNS (3), /* variable shift costs */
178 COSTS_N_INSNS (2), /* constant shift costs */
179 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
180 COSTS_N_INSNS (6), /* HI */
181 COSTS_N_INSNS (6), /* SI */
182 COSTS_N_INSNS (6), /* DI */
183 COSTS_N_INSNS (6)}, /* other */
184 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
185 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
186 COSTS_N_INSNS (23), /* HI */
187 COSTS_N_INSNS (23), /* SI */
188 COSTS_N_INSNS (23), /* DI */
189 COSTS_N_INSNS (23)}, /* other */
190 COSTS_N_INSNS (3), /* cost of movsx */
191 COSTS_N_INSNS (2), /* cost of movzx */
192 15, /* "large" insn */
193 3, /* MOVE_RATIO */
194 4, /* cost for loading QImode using movzbl */
195 {2, 4, 2}, /* cost of loading integer registers
196 in QImode, HImode and SImode.
197 Relative to reg-reg move (2). */
198 {2, 4, 2}, /* cost of storing integer registers */
199 2, /* cost of reg,reg fld/fst */
200 {8, 8, 8}, /* cost of loading fp registers
201 in SFmode, DFmode and XFmode */
202 {8, 8, 8}, /* cost of storing fp registers
203 in SFmode, DFmode and XFmode */
204 2, /* cost of moving MMX register */
205 {4, 8}, /* cost of loading MMX registers
206 in SImode and DImode */
207 {4, 8}, /* cost of storing MMX registers
208 in SImode and DImode */
209 2, /* cost of moving SSE register */
210 {4, 8, 16}, /* cost of loading SSE registers
211 in SImode, DImode and TImode */
212 {4, 8, 16}, /* cost of storing SSE registers
213 in SImode, DImode and TImode */
214 3, /* MMX or SSE register to integer */
215 0, /* size of l1 cache */
216 0, /* size of l2 cache */
217 0, /* size of prefetch block */
218 0, /* number of parallel prefetches */
219 1, /* Branch cost */
220 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
221 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
222 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
223 COSTS_N_INSNS (22), /* cost of FABS instruction. */
224 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
225 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
226 i386_memcpy,
227 i386_memset,
228 1, /* scalar_stmt_cost. */
229 1, /* scalar load_cost. */
230 1, /* scalar_store_cost. */
231 1, /* vec_stmt_cost. */
232 1, /* vec_to_scalar_cost. */
233 1, /* scalar_to_vec_cost. */
234 1, /* vec_align_load_cost. */
235 2, /* vec_unalign_load_cost. */
236 1, /* vec_store_cost. */
237 3, /* cond_taken_branch_cost. */
238 1, /* cond_not_taken_branch_cost. */
};
241 static stringop_algs i486_memcpy[2] = {
242 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
243 DUMMY_STRINGOP_ALGS};
244 static stringop_algs i486_memset[2] = {
245 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
246 DUMMY_STRINGOP_ALGS};
248 static const
249 struct processor_costs i486_cost = { /* 486 specific costs */
250 COSTS_N_INSNS (1), /* cost of an add instruction */
251 COSTS_N_INSNS (1), /* cost of a lea instruction */
252 COSTS_N_INSNS (3), /* variable shift costs */
253 COSTS_N_INSNS (2), /* constant shift costs */
254 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
255 COSTS_N_INSNS (12), /* HI */
256 COSTS_N_INSNS (12), /* SI */
257 COSTS_N_INSNS (12), /* DI */
258 COSTS_N_INSNS (12)}, /* other */
259 1, /* cost of multiply per each bit set */
260 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
261 COSTS_N_INSNS (40), /* HI */
262 COSTS_N_INSNS (40), /* SI */
263 COSTS_N_INSNS (40), /* DI */
264 COSTS_N_INSNS (40)}, /* other */
265 COSTS_N_INSNS (3), /* cost of movsx */
266 COSTS_N_INSNS (2), /* cost of movzx */
267 15, /* "large" insn */
268 3, /* MOVE_RATIO */
269 4, /* cost for loading QImode using movzbl */
270 {2, 4, 2}, /* cost of loading integer registers
271 in QImode, HImode and SImode.
272 Relative to reg-reg move (2). */
273 {2, 4, 2}, /* cost of storing integer registers */
274 2, /* cost of reg,reg fld/fst */
275 {8, 8, 8}, /* cost of loading fp registers
276 in SFmode, DFmode and XFmode */
277 {8, 8, 8}, /* cost of storing fp registers
278 in SFmode, DFmode and XFmode */
279 2, /* cost of moving MMX register */
280 {4, 8}, /* cost of loading MMX registers
281 in SImode and DImode */
282 {4, 8}, /* cost of storing MMX registers
283 in SImode and DImode */
284 2, /* cost of moving SSE register */
285 {4, 8, 16}, /* cost of loading SSE registers
286 in SImode, DImode and TImode */
287 {4, 8, 16}, /* cost of storing SSE registers
288 in SImode, DImode and TImode */
289 3, /* MMX or SSE register to integer */
290 4, /* size of l1 cache. 486 has 8kB cache
291 shared for code and data, so 4kB is
292 not really precise. */
293 4, /* size of l2 cache */
294 0, /* size of prefetch block */
295 0, /* number of parallel prefetches */
296 1, /* Branch cost */
297 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
298 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
299 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
300 COSTS_N_INSNS (3), /* cost of FABS instruction. */
301 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
302 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
303 i486_memcpy,
304 i486_memset,
305 1, /* scalar_stmt_cost. */
306 1, /* scalar load_cost. */
307 1, /* scalar_store_cost. */
308 1, /* vec_stmt_cost. */
309 1, /* vec_to_scalar_cost. */
310 1, /* scalar_to_vec_cost. */
311 1, /* vec_align_load_cost. */
312 2, /* vec_unalign_load_cost. */
313 1, /* vec_store_cost. */
314 3, /* cond_taken_branch_cost. */
315 1, /* cond_not_taken_branch_cost. */
};
318 static stringop_algs pentium_memcpy[2] = {
319 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
320 DUMMY_STRINGOP_ALGS};
321 static stringop_algs pentium_memset[2] = {
322 {libcall, {{-1, rep_prefix_4_byte, false}}},
323 DUMMY_STRINGOP_ALGS};
325 static const
326 struct processor_costs pentium_cost = {
327 COSTS_N_INSNS (1), /* cost of an add instruction */
328 COSTS_N_INSNS (1), /* cost of a lea instruction */
329 COSTS_N_INSNS (4), /* variable shift costs */
330 COSTS_N_INSNS (1), /* constant shift costs */
331 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
332 COSTS_N_INSNS (11), /* HI */
333 COSTS_N_INSNS (11), /* SI */
334 COSTS_N_INSNS (11), /* DI */
335 COSTS_N_INSNS (11)}, /* other */
336 0, /* cost of multiply per each bit set */
337 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
338 COSTS_N_INSNS (25), /* HI */
339 COSTS_N_INSNS (25), /* SI */
340 COSTS_N_INSNS (25), /* DI */
341 COSTS_N_INSNS (25)}, /* other */
342 COSTS_N_INSNS (3), /* cost of movsx */
343 COSTS_N_INSNS (2), /* cost of movzx */
344 8, /* "large" insn */
345 6, /* MOVE_RATIO */
346 6, /* cost for loading QImode using movzbl */
347 {2, 4, 2}, /* cost of loading integer registers
348 in QImode, HImode and SImode.
349 Relative to reg-reg move (2). */
350 {2, 4, 2}, /* cost of storing integer registers */
351 2, /* cost of reg,reg fld/fst */
352 {2, 2, 6}, /* cost of loading fp registers
353 in SFmode, DFmode and XFmode */
354 {4, 4, 6}, /* cost of storing fp registers
355 in SFmode, DFmode and XFmode */
356 8, /* cost of moving MMX register */
357 {8, 8}, /* cost of loading MMX registers
358 in SImode and DImode */
359 {8, 8}, /* cost of storing MMX registers
360 in SImode and DImode */
361 2, /* cost of moving SSE register */
362 {4, 8, 16}, /* cost of loading SSE registers
363 in SImode, DImode and TImode */
364 {4, 8, 16}, /* cost of storing SSE registers
365 in SImode, DImode and TImode */
366 3, /* MMX or SSE register to integer */
367 8, /* size of l1 cache. */
368 8, /* size of l2 cache */
369 0, /* size of prefetch block */
370 0, /* number of parallel prefetches */
371 2, /* Branch cost */
372 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
373 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
374 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
375 COSTS_N_INSNS (1), /* cost of FABS instruction. */
376 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
377 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
378 pentium_memcpy,
379 pentium_memset,
380 1, /* scalar_stmt_cost. */
381 1, /* scalar load_cost. */
382 1, /* scalar_store_cost. */
383 1, /* vec_stmt_cost. */
384 1, /* vec_to_scalar_cost. */
385 1, /* scalar_to_vec_cost. */
386 1, /* vec_align_load_cost. */
387 2, /* vec_unalign_load_cost. */
388 1, /* vec_store_cost. */
389 3, /* cond_taken_branch_cost. */
390 1, /* cond_not_taken_branch_cost. */
};
393 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
394 (we ensure the alignment). For small blocks inline loop is still a
395 noticeable win, for bigger blocks either rep movsl or rep movsb is the
396 way to go. Rep movsb has apparently more expensive startup time in CPU,
397 but after 4K the difference is down in the noise. */
398 static stringop_algs pentiumpro_memcpy[2] = {
399 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
400 {8192, rep_prefix_4_byte, false},
401 {-1, rep_prefix_1_byte, false}}},
402 DUMMY_STRINGOP_ALGS};
403 static stringop_algs pentiumpro_memset[2] = {
404 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
405 {8192, rep_prefix_4_byte, false},
406 {-1, libcall, false}}},
407 DUMMY_STRINGOP_ALGS};
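/* A self-contained sketch (illustration only; this is not GCC's decide_alg,
   the helper name is made up, and it assumes the stringop_algs field names
   and MAX_STRINGOP_ALGS from i386.h) of how a threshold table such as
   pentiumpro_memcpy is read when the block size is known:

     static enum stringop_alg
     example_pick_alg (const stringop_algs *table, HOST_WIDE_INT size)
     {
       for (unsigned i = 0; i < MAX_STRINGOP_ALGS; i++)
         if (table->size[i].max == -1 || size <= table->size[i].max)
           return table->size[i].alg;        // first bucket big enough
       return table->unknown_size;           // nothing matched
     }

   Applied to pentiumpro_memcpy[0] this yields loop up to 128 bytes,
   unrolled_loop up to 1024, rep_prefix_4_byte up to 8192 and
   rep_prefix_1_byte above that, matching the policy described in the comment
   before the table.  */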
408 static const
409 struct processor_costs pentiumpro_cost = {
410 COSTS_N_INSNS (1), /* cost of an add instruction */
411 COSTS_N_INSNS (1), /* cost of a lea instruction */
412 COSTS_N_INSNS (1), /* variable shift costs */
413 COSTS_N_INSNS (1), /* constant shift costs */
414 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
415 COSTS_N_INSNS (4), /* HI */
416 COSTS_N_INSNS (4), /* SI */
417 COSTS_N_INSNS (4), /* DI */
418 COSTS_N_INSNS (4)}, /* other */
419 0, /* cost of multiply per each bit set */
420 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
421 COSTS_N_INSNS (17), /* HI */
422 COSTS_N_INSNS (17), /* SI */
423 COSTS_N_INSNS (17), /* DI */
424 COSTS_N_INSNS (17)}, /* other */
425 COSTS_N_INSNS (1), /* cost of movsx */
426 COSTS_N_INSNS (1), /* cost of movzx */
427 8, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 4, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 2, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of storing fp registers
438 in SFmode, DFmode and XFmode */
439 2, /* cost of moving MMX register */
440 {2, 2}, /* cost of loading MMX registers
441 in SImode and DImode */
442 {2, 2}, /* cost of storing MMX registers
443 in SImode and DImode */
444 2, /* cost of moving SSE register */
445 {2, 2, 8}, /* cost of loading SSE registers
446 in SImode, DImode and TImode */
447 {2, 2, 8}, /* cost of storing SSE registers
448 in SImode, DImode and TImode */
449 3, /* MMX or SSE register to integer */
450 8, /* size of l1 cache. */
451 256, /* size of l2 cache */
452 32, /* size of prefetch block */
453 6, /* number of parallel prefetches */
454 2, /* Branch cost */
455 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
456 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
457 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
458 COSTS_N_INSNS (2), /* cost of FABS instruction. */
459 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
460 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
461 pentiumpro_memcpy,
462 pentiumpro_memset,
463 1, /* scalar_stmt_cost. */
464 1, /* scalar load_cost. */
465 1, /* scalar_store_cost. */
466 1, /* vec_stmt_cost. */
467 1, /* vec_to_scalar_cost. */
468 1, /* scalar_to_vec_cost. */
469 1, /* vec_align_load_cost. */
470 2, /* vec_unalign_load_cost. */
471 1, /* vec_store_cost. */
472 3, /* cond_taken_branch_cost. */
473 1, /* cond_not_taken_branch_cost. */
};
476 static stringop_algs geode_memcpy[2] = {
477 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
478 DUMMY_STRINGOP_ALGS};
479 static stringop_algs geode_memset[2] = {
480 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static const
483 struct processor_costs geode_cost = {
484 COSTS_N_INSNS (1), /* cost of an add instruction */
485 COSTS_N_INSNS (1), /* cost of a lea instruction */
486 COSTS_N_INSNS (2), /* variable shift costs */
487 COSTS_N_INSNS (1), /* constant shift costs */
488 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
489 COSTS_N_INSNS (4), /* HI */
490 COSTS_N_INSNS (7), /* SI */
491 COSTS_N_INSNS (7), /* DI */
492 COSTS_N_INSNS (7)}, /* other */
493 0, /* cost of multiply per each bit set */
494 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
495 COSTS_N_INSNS (23), /* HI */
496 COSTS_N_INSNS (39), /* SI */
497 COSTS_N_INSNS (39), /* DI */
498 COSTS_N_INSNS (39)}, /* other */
499 COSTS_N_INSNS (1), /* cost of movsx */
500 COSTS_N_INSNS (1), /* cost of movzx */
501 8, /* "large" insn */
502 4, /* MOVE_RATIO */
503 1, /* cost for loading QImode using movzbl */
504 {1, 1, 1}, /* cost of loading integer registers
505 in QImode, HImode and SImode.
506 Relative to reg-reg move (2). */
507 {1, 1, 1}, /* cost of storing integer registers */
508 1, /* cost of reg,reg fld/fst */
509 {1, 1, 1}, /* cost of loading fp registers
510 in SFmode, DFmode and XFmode */
511 {4, 6, 6}, /* cost of storing fp registers
512 in SFmode, DFmode and XFmode */
514 1, /* cost of moving MMX register */
515 {1, 1}, /* cost of loading MMX registers
516 in SImode and DImode */
517 {1, 1}, /* cost of storing MMX registers
518 in SImode and DImode */
519 1, /* cost of moving SSE register */
520 {1, 1, 1}, /* cost of loading SSE registers
521 in SImode, DImode and TImode */
522 {1, 1, 1}, /* cost of storing SSE registers
523 in SImode, DImode and TImode */
524 1, /* MMX or SSE register to integer */
525 64, /* size of l1 cache. */
526 128, /* size of l2 cache. */
527 32, /* size of prefetch block */
528 1, /* number of parallel prefetches */
529 1, /* Branch cost */
530 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
531 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
532 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
533 COSTS_N_INSNS (1), /* cost of FABS instruction. */
534 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
535 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
536 geode_memcpy,
537 geode_memset,
538 1, /* scalar_stmt_cost. */
539 1, /* scalar load_cost. */
540 1, /* scalar_store_cost. */
541 1, /* vec_stmt_cost. */
542 1, /* vec_to_scalar_cost. */
543 1, /* scalar_to_vec_cost. */
544 1, /* vec_align_load_cost. */
545 2, /* vec_unalign_load_cost. */
546 1, /* vec_store_cost. */
547 3, /* cond_taken_branch_cost. */
548 1, /* cond_not_taken_branch_cost. */
};
551 static stringop_algs k6_memcpy[2] = {
552 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
553 DUMMY_STRINGOP_ALGS};
554 static stringop_algs k6_memset[2] = {
555 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
556 DUMMY_STRINGOP_ALGS};
557 static const
558 struct processor_costs k6_cost = {
559 COSTS_N_INSNS (1), /* cost of an add instruction */
560 COSTS_N_INSNS (2), /* cost of a lea instruction */
561 COSTS_N_INSNS (1), /* variable shift costs */
562 COSTS_N_INSNS (1), /* constant shift costs */
563 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
564 COSTS_N_INSNS (3), /* HI */
565 COSTS_N_INSNS (3), /* SI */
566 COSTS_N_INSNS (3), /* DI */
567 COSTS_N_INSNS (3)}, /* other */
568 0, /* cost of multiply per each bit set */
569 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
570 COSTS_N_INSNS (18), /* HI */
571 COSTS_N_INSNS (18), /* SI */
572 COSTS_N_INSNS (18), /* DI */
573 COSTS_N_INSNS (18)}, /* other */
574 COSTS_N_INSNS (2), /* cost of movsx */
575 COSTS_N_INSNS (2), /* cost of movzx */
576 8, /* "large" insn */
577 4, /* MOVE_RATIO */
578 3, /* cost for loading QImode using movzbl */
579 {4, 5, 4}, /* cost of loading integer registers
580 in QImode, HImode and SImode.
581 Relative to reg-reg move (2). */
582 {2, 3, 2}, /* cost of storing integer registers */
583 4, /* cost of reg,reg fld/fst */
584 {6, 6, 6}, /* cost of loading fp registers
585 in SFmode, DFmode and XFmode */
586 {4, 4, 4}, /* cost of storing fp registers
587 in SFmode, DFmode and XFmode */
588 2, /* cost of moving MMX register */
589 {2, 2}, /* cost of loading MMX registers
590 in SImode and DImode */
591 {2, 2}, /* cost of storing MMX registers
592 in SImode and DImode */
593 2, /* cost of moving SSE register */
594 {2, 2, 8}, /* cost of loading SSE registers
595 in SImode, DImode and TImode */
596 {2, 2, 8}, /* cost of storing SSE registers
597 in SImode, DImode and TImode */
598 6, /* MMX or SSE register to integer */
599 32, /* size of l1 cache. */
600 32, /* size of l2 cache. Some models
601 have integrated l2 cache, but
602 optimizing for k6 is not important
603 enough to worry about that. */
604 32, /* size of prefetch block */
605 1, /* number of parallel prefetches */
606 1, /* Branch cost */
607 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
608 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
609 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
612 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
613 k6_memcpy,
614 k6_memset,
615 1, /* scalar_stmt_cost. */
616 1, /* scalar load_cost. */
617 1, /* scalar_store_cost. */
618 1, /* vec_stmt_cost. */
619 1, /* vec_to_scalar_cost. */
620 1, /* scalar_to_vec_cost. */
621 1, /* vec_align_load_cost. */
622 2, /* vec_unalign_load_cost. */
623 1, /* vec_store_cost. */
624 3, /* cond_taken_branch_cost. */
625 1, /* cond_not_taken_branch_cost. */
};
628 /* For some reason, Athlon deals better with REP prefix (relative to loops)
629 compared to K8. Alignment becomes important after 8 bytes for memcpy and
630 128 bytes for memset. */
631 static stringop_algs athlon_memcpy[2] = {
632 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
633 DUMMY_STRINGOP_ALGS};
634 static stringop_algs athlon_memset[2] = {
635 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
636 DUMMY_STRINGOP_ALGS};
637 static const
638 struct processor_costs athlon_cost = {
639 COSTS_N_INSNS (1), /* cost of an add instruction */
640 COSTS_N_INSNS (2), /* cost of a lea instruction */
641 COSTS_N_INSNS (1), /* variable shift costs */
642 COSTS_N_INSNS (1), /* constant shift costs */
643 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
644 COSTS_N_INSNS (5), /* HI */
645 COSTS_N_INSNS (5), /* SI */
646 COSTS_N_INSNS (5), /* DI */
647 COSTS_N_INSNS (5)}, /* other */
648 0, /* cost of multiply per each bit set */
649 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
650 COSTS_N_INSNS (26), /* HI */
651 COSTS_N_INSNS (42), /* SI */
652 COSTS_N_INSNS (74), /* DI */
653 COSTS_N_INSNS (74)}, /* other */
654 COSTS_N_INSNS (1), /* cost of movsx */
655 COSTS_N_INSNS (1), /* cost of movzx */
656 8, /* "large" insn */
657 9, /* MOVE_RATIO */
658 4, /* cost for loading QImode using movzbl */
659 {3, 4, 3}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {3, 4, 3}, /* cost of storing integer registers */
663 4, /* cost of reg,reg fld/fst */
664 {4, 4, 12}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {6, 6, 8}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {4, 4}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {4, 4}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, /* cost of moving SSE register */
674 {4, 4, 6}, /* cost of loading SSE registers
675 in SImode, DImode and TImode */
676 {4, 4, 5}, /* cost of storing SSE registers
677 in SImode, DImode and TImode */
678 5, /* MMX or SSE register to integer */
679 64, /* size of l1 cache. */
680 256, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 6, /* number of parallel prefetches */
683 5, /* Branch cost */
684 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
685 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
686 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
687 COSTS_N_INSNS (2), /* cost of FABS instruction. */
688 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
689 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
690 athlon_memcpy,
691 athlon_memset,
692 1, /* scalar_stmt_cost. */
693 1, /* scalar load_cost. */
694 1, /* scalar_store_cost. */
695 1, /* vec_stmt_cost. */
696 1, /* vec_to_scalar_cost. */
697 1, /* scalar_to_vec_cost. */
698 1, /* vec_align_load_cost. */
699 2, /* vec_unalign_load_cost. */
700 1, /* vec_store_cost. */
701 3, /* cond_taken_branch_cost. */
702 1, /* cond_not_taken_branch_cost. */
};
705 /* K8 has optimized REP instruction for medium sized blocks, but for very
706 small blocks it is better to use loop. For large blocks, libcall can
707 do nontemporal accesses and beat inline considerably. */
708 static stringop_algs k8_memcpy[2] = {
709 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
710 {-1, rep_prefix_4_byte, false}}},
711 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
712 {-1, libcall, false}}}};
713 static stringop_algs k8_memset[2] = {
714 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
715 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
716 {libcall, {{48, unrolled_loop, false},
717 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
718 static const
719 struct processor_costs k8_cost = {
720 COSTS_N_INSNS (1), /* cost of an add instruction */
721 COSTS_N_INSNS (2), /* cost of a lea instruction */
722 COSTS_N_INSNS (1), /* variable shift costs */
723 COSTS_N_INSNS (1), /* constant shift costs */
724 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
725 COSTS_N_INSNS (4), /* HI */
726 COSTS_N_INSNS (3), /* SI */
727 COSTS_N_INSNS (4), /* DI */
728 COSTS_N_INSNS (5)}, /* other */
729 0, /* cost of multiply per each bit set */
730 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
731 COSTS_N_INSNS (26), /* HI */
732 COSTS_N_INSNS (42), /* SI */
733 COSTS_N_INSNS (74), /* DI */
734 COSTS_N_INSNS (74)}, /* other */
735 COSTS_N_INSNS (1), /* cost of movsx */
736 COSTS_N_INSNS (1), /* cost of movzx */
737 8, /* "large" insn */
738 9, /* MOVE_RATIO */
739 4, /* cost for loading QImode using movzbl */
740 {3, 4, 3}, /* cost of loading integer registers
741 in QImode, HImode and SImode.
742 Relative to reg-reg move (2). */
743 {3, 4, 3}, /* cost of storing integer registers */
744 4, /* cost of reg,reg fld/fst */
745 {4, 4, 12}, /* cost of loading fp registers
746 in SFmode, DFmode and XFmode */
747 {6, 6, 8}, /* cost of storing fp registers
748 in SFmode, DFmode and XFmode */
749 2, /* cost of moving MMX register */
750 {3, 3}, /* cost of loading MMX registers
751 in SImode and DImode */
752 {4, 4}, /* cost of storing MMX registers
753 in SImode and DImode */
754 2, /* cost of moving SSE register */
755 {4, 3, 6}, /* cost of loading SSE registers
756 in SImode, DImode and TImode */
757 {4, 4, 5}, /* cost of storing SSE registers
758 in SImode, DImode and TImode */
759 5, /* MMX or SSE register to integer */
760 64, /* size of l1 cache. */
761 512, /* size of l2 cache. */
762 64, /* size of prefetch block */
763 /* New AMD processors never drop prefetches; if they cannot be performed
764 immediately, they are queued. We set number of simultaneous prefetches
765 to a large constant to reflect this (it probably is not a good idea not
766 to limit number of prefetches at all, as their execution also takes some
767 time). */
768 100, /* number of parallel prefetches */
769 3, /* Branch cost */
770 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
771 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
772 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
773 COSTS_N_INSNS (2), /* cost of FABS instruction. */
774 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
775 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
777 k8_memcpy,
778 k8_memset,
779 4, /* scalar_stmt_cost. */
780 2, /* scalar load_cost. */
781 2, /* scalar_store_cost. */
782 5, /* vec_stmt_cost. */
783 0, /* vec_to_scalar_cost. */
784 2, /* scalar_to_vec_cost. */
785 2, /* vec_align_load_cost. */
786 3, /* vec_unalign_load_cost. */
787 3, /* vec_store_cost. */
788 3, /* cond_taken_branch_cost. */
789 2, /* cond_not_taken_branch_cost. */
};
792 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
793 very small blocks it is better to use loop. For large blocks, libcall can
794 do nontemporal accesses and beat inline considerably. */
795 static stringop_algs amdfam10_memcpy[2] = {
796 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
797 {-1, rep_prefix_4_byte, false}}},
798 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
800 static stringop_algs amdfam10_memset[2] = {
801 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
802 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
803 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 struct processor_costs amdfam10_cost = {
806 COSTS_N_INSNS (1), /* cost of an add instruction */
807 COSTS_N_INSNS (2), /* cost of a lea instruction */
808 COSTS_N_INSNS (1), /* variable shift costs */
809 COSTS_N_INSNS (1), /* constant shift costs */
810 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
811 COSTS_N_INSNS (4), /* HI */
812 COSTS_N_INSNS (3), /* SI */
813 COSTS_N_INSNS (4), /* DI */
814 COSTS_N_INSNS (5)}, /* other */
815 0, /* cost of multiply per each bit set */
816 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
817 COSTS_N_INSNS (35), /* HI */
818 COSTS_N_INSNS (51), /* SI */
819 COSTS_N_INSNS (83), /* DI */
820 COSTS_N_INSNS (83)}, /* other */
821 COSTS_N_INSNS (1), /* cost of movsx */
822 COSTS_N_INSNS (1), /* cost of movzx */
823 8, /* "large" insn */
824 9, /* MOVE_RATIO */
825 4, /* cost for loading QImode using movzbl */
826 {3, 4, 3}, /* cost of loading integer registers
827 in QImode, HImode and SImode.
828 Relative to reg-reg move (2). */
829 {3, 4, 3}, /* cost of storing integer registers */
830 4, /* cost of reg,reg fld/fst */
831 {4, 4, 12}, /* cost of loading fp registers
832 in SFmode, DFmode and XFmode */
833 {6, 6, 8}, /* cost of storing fp registers
834 in SFmode, DFmode and XFmode */
835 2, /* cost of moving MMX register */
836 {3, 3}, /* cost of loading MMX registers
837 in SImode and DImode */
838 {4, 4}, /* cost of storing MMX registers
839 in SImode and DImode */
840 2, /* cost of moving SSE register */
841 {4, 4, 3}, /* cost of loading SSE registers
842 in SImode, DImode and TImode */
843 {4, 4, 5}, /* cost of storing SSE registers
844 in SImode, DImode and TImode */
845 3, /* MMX or SSE register to integer */
846 /* On K8:
847 MOVD reg64, xmmreg Double FSTORE 4
848 MOVD reg32, xmmreg Double FSTORE 4
849 On AMDFAM10:
850 MOVD reg64, xmmreg Double FADD 3
851 1/1 1/1
852 MOVD reg32, xmmreg Double FADD 3
853 1/1 1/1 */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set number of simultaneous prefetches
859 to a large constant to reflect this (it probably is not a good idea not
860 to limit number of prefetches at all, as their execution also takes some
861 time). */
862 100, /* number of parallel prefetches */
863 2, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
871 amdfam10_memcpy,
872 amdfam10_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 6, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 2, /* vec_unalign_load_cost. */
881 2, /* vec_store_cost. */
882 2, /* cond_taken_branch_cost. */
883 1, /* cond_not_taken_branch_cost. */
};
886 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
887 very small blocks it is better to use loop. For large blocks, libcall
888 can do nontemporal accesses and beat inline considerably. */
889 static stringop_algs bdver1_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs bdver1_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
900 const struct processor_costs bdver1_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (1), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (4), /* SI */
908 COSTS_N_INSNS (6), /* DI */
909 COSTS_N_INSNS (6)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {5, 5, 4}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {4, 4, 4}, /* cost of storing integer registers */
925 2, /* cost of reg,reg fld/fst */
926 {5, 5, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {4, 4, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {4, 4}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 4}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 4}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 2, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 16, /* size of l1 cache. */
950 2048, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set number of simultaneous prefetches
954 to a large constant to reflect this (it probably is not a good idea not
955 to limit number of prefetches at all, as their execution also takes some
956 time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
966 bdver1_memcpy,
967 bdver1_memset,
968 6, /* scalar_stmt_cost. */
969 4, /* scalar load_cost. */
970 4, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 4, /* vec_align_load_cost. */
975 4, /* vec_unalign_load_cost. */
976 4, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
};
981 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
982 very small blocks it is better to use loop. For large blocks, libcall
983 can do nontemporal accesses and beat inline considerably. */
985 static stringop_algs bdver2_memcpy[2] = {
986 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
987 {-1, rep_prefix_4_byte, false}}},
988 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990 static stringop_algs bdver2_memset[2] = {
991 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
992 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
993 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
994 {-1, libcall, false}}}};
996 const struct processor_costs bdver2_cost = {
997 COSTS_N_INSNS (1), /* cost of an add instruction */
998 COSTS_N_INSNS (1), /* cost of a lea instruction */
999 COSTS_N_INSNS (1), /* variable shift costs */
1000 COSTS_N_INSNS (1), /* constant shift costs */
1001 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1002 COSTS_N_INSNS (4), /* HI */
1003 COSTS_N_INSNS (4), /* SI */
1004 COSTS_N_INSNS (6), /* DI */
1005 COSTS_N_INSNS (6)}, /* other */
1006 0, /* cost of multiply per each bit set */
1007 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1008 COSTS_N_INSNS (35), /* HI */
1009 COSTS_N_INSNS (51), /* SI */
1010 COSTS_N_INSNS (83), /* DI */
1011 COSTS_N_INSNS (83)}, /* other */
1012 COSTS_N_INSNS (1), /* cost of movsx */
1013 COSTS_N_INSNS (1), /* cost of movzx */
1014 8, /* "large" insn */
1015 9, /* MOVE_RATIO */
1016 4, /* cost for loading QImode using movzbl */
1017 {5, 5, 4}, /* cost of loading integer registers
1018 in QImode, HImode and SImode.
1019 Relative to reg-reg move (2). */
1020 {4, 4, 4}, /* cost of storing integer registers */
1021 2, /* cost of reg,reg fld/fst */
1022 {5, 5, 12}, /* cost of loading fp registers
1023 in SFmode, DFmode and XFmode */
1024 {4, 4, 8}, /* cost of storing fp registers
1025 in SFmode, DFmode and XFmode */
1026 2, /* cost of moving MMX register */
1027 {4, 4}, /* cost of loading MMX registers
1028 in SImode and DImode */
1029 {4, 4}, /* cost of storing MMX registers
1030 in SImode and DImode */
1031 2, /* cost of moving SSE register */
1032 {4, 4, 4}, /* cost of loading SSE registers
1033 in SImode, DImode and TImode */
1034 {4, 4, 4}, /* cost of storing SSE registers
1035 in SImode, DImode and TImode */
1036 2, /* MMX or SSE register to integer */
1037 /* On K8:
1038 MOVD reg64, xmmreg Double FSTORE 4
1039 MOVD reg32, xmmreg Double FSTORE 4
1040 On AMDFAM10:
1041 MOVD reg64, xmmreg Double FADD 3
1042 1/1 1/1
1043 MOVD reg32, xmmreg Double FADD 3
1044 1/1 1/1 */
1045 16, /* size of l1 cache. */
1046 2048, /* size of l2 cache. */
1047 64, /* size of prefetch block */
1048 /* New AMD processors never drop prefetches; if they cannot be performed
1049 immediately, they are queued. We set number of simultaneous prefetches
1050 to a large constant to reflect this (it probably is not a good idea not
1051 to limit number of prefetches at all, as their execution also takes some
1052 time). */
1053 100, /* number of parallel prefetches */
1054 2, /* Branch cost */
1055 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1056 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1057 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1058 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1059 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1060 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1062 bdver2_memcpy,
1063 bdver2_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 2, /* cond_taken_branch_cost. */
1074 1, /* cond_not_taken_branch_cost. */
};
1078 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1079 very small blocks it is better to use loop. For large blocks, libcall
1080 can do nontemporal accesses and beat inline considerably. */
1081 static stringop_algs bdver3_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086 static stringop_algs bdver3_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091 struct processor_costs bdver3_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 16, /* size of l1 cache. */
1133 2048, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 2, /* Branch cost */
1142 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1149 bdver3_memcpy,
1150 bdver3_memset,
1151 6, /* scalar_stmt_cost. */
1152 4, /* scalar load_cost. */
1153 4, /* scalar_store_cost. */
1154 6, /* vec_stmt_cost. */
1155 0, /* vec_to_scalar_cost. */
1156 2, /* scalar_to_vec_cost. */
1157 4, /* vec_align_load_cost. */
1158 4, /* vec_unalign_load_cost. */
1159 4, /* vec_store_cost. */
1160 2, /* cond_taken_branch_cost. */
1161 1, /* cond_not_taken_branch_cost. */
};
1164 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1165 very small blocks it is better to use loop. For large blocks, libcall
1166 can do nontemporal accesses and beat inline considerably. */
1167 static stringop_algs bdver4_memcpy[2] = {
1168 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1169 {-1, rep_prefix_4_byte, false}}},
1170 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
1172 static stringop_algs bdver4_memset[2] = {
1173 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1174 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1175 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 struct processor_costs bdver4_cost = {
1178 COSTS_N_INSNS (1), /* cost of an add instruction */
1179 COSTS_N_INSNS (1), /* cost of a lea instruction */
1180 COSTS_N_INSNS (1), /* variable shift costs */
1181 COSTS_N_INSNS (1), /* constant shift costs */
1182 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1183 COSTS_N_INSNS (4), /* HI */
1184 COSTS_N_INSNS (4), /* SI */
1185 COSTS_N_INSNS (6), /* DI */
1186 COSTS_N_INSNS (6)}, /* other */
1187 0, /* cost of multiply per each bit set */
1188 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1189 COSTS_N_INSNS (35), /* HI */
1190 COSTS_N_INSNS (51), /* SI */
1191 COSTS_N_INSNS (83), /* DI */
1192 COSTS_N_INSNS (83)}, /* other */
1193 COSTS_N_INSNS (1), /* cost of movsx */
1194 COSTS_N_INSNS (1), /* cost of movzx */
1195 8, /* "large" insn */
1196 9, /* MOVE_RATIO */
1197 4, /* cost for loading QImode using movzbl */
1198 {5, 5, 4}, /* cost of loading integer registers
1199 in QImode, HImode and SImode.
1200 Relative to reg-reg move (2). */
1201 {4, 4, 4}, /* cost of storing integer registers */
1202 2, /* cost of reg,reg fld/fst */
1203 {5, 5, 12}, /* cost of loading fp registers
1204 in SFmode, DFmode and XFmode */
1205 {4, 4, 8}, /* cost of storing fp registers
1206 in SFmode, DFmode and XFmode */
1207 2, /* cost of moving MMX register */
1208 {4, 4}, /* cost of loading MMX registers
1209 in SImode and DImode */
1210 {4, 4}, /* cost of storing MMX registers
1211 in SImode and DImode */
1212 2, /* cost of moving SSE register */
1213 {4, 4, 4}, /* cost of loading SSE registers
1214 in SImode, DImode and TImode */
1215 {4, 4, 4}, /* cost of storing SSE registers
1216 in SImode, DImode and TImode */
1217 2, /* MMX or SSE register to integer */
1218 16, /* size of l1 cache. */
1219 2048, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221 /* New AMD processors never drop prefetches; if they cannot be performed
1222 immediately, they are queued. We set number of simultaneous prefetches
1223 to a large constant to reflect this (it probably is not a good idea not
1224 to limit number of prefetches at all, as their execution also takes some
1225 time). */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1235 bdver4_memcpy,
1236 bdver4_memset,
1237 6, /* scalar_stmt_cost. */
1238 4, /* scalar load_cost. */
1239 4, /* scalar_store_cost. */
1240 6, /* vec_stmt_cost. */
1241 0, /* vec_to_scalar_cost. */
1242 2, /* scalar_to_vec_cost. */
1243 4, /* vec_align_load_cost. */
1244 4, /* vec_unalign_load_cost. */
1245 4, /* vec_store_cost. */
1246 2, /* cond_taken_branch_cost. */
1247 1, /* cond_not_taken_branch_cost. */
};
1250 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1251 very small blocks it is better to use loop. For large blocks, libcall can
1252 do nontemporal accesses and beat inline considerably. */
1253 static stringop_algs btver1_memcpy[2] = {
1254 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1255 {-1, rep_prefix_4_byte, false}}},
1256 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1257 {-1, libcall, false}}}};
1258 static stringop_algs btver1_memset[2] = {
1259 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1260 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1261 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 const struct processor_costs btver1_cost = {
1264 COSTS_N_INSNS (1), /* cost of an add instruction */
1265 COSTS_N_INSNS (2), /* cost of a lea instruction */
1266 COSTS_N_INSNS (1), /* variable shift costs */
1267 COSTS_N_INSNS (1), /* constant shift costs */
1268 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1269 COSTS_N_INSNS (4), /* HI */
1270 COSTS_N_INSNS (3), /* SI */
1271 COSTS_N_INSNS (4), /* DI */
1272 COSTS_N_INSNS (5)}, /* other */
1273 0, /* cost of multiply per each bit set */
1274 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1275 COSTS_N_INSNS (35), /* HI */
1276 COSTS_N_INSNS (51), /* SI */
1277 COSTS_N_INSNS (83), /* DI */
1278 COSTS_N_INSNS (83)}, /* other */
1279 COSTS_N_INSNS (1), /* cost of movsx */
1280 COSTS_N_INSNS (1), /* cost of movzx */
1281 8, /* "large" insn */
1282 9, /* MOVE_RATIO */
1283 4, /* cost for loading QImode using movzbl */
1284 {3, 4, 3}, /* cost of loading integer registers
1285 in QImode, HImode and SImode.
1286 Relative to reg-reg move (2). */
1287 {3, 4, 3}, /* cost of storing integer registers */
1288 4, /* cost of reg,reg fld/fst */
1289 {4, 4, 12}, /* cost of loading fp registers
1290 in SFmode, DFmode and XFmode */
1291 {6, 6, 8}, /* cost of storing fp registers
1292 in SFmode, DFmode and XFmode */
1293 2, /* cost of moving MMX register */
1294 {3, 3}, /* cost of loading MMX registers
1295 in SImode and DImode */
1296 {4, 4}, /* cost of storing MMX registers
1297 in SImode and DImode */
1298 2, /* cost of moving SSE register */
1299 {4, 4, 3}, /* cost of loading SSE registers
1300 in SImode, DImode and TImode */
1301 {4, 4, 5}, /* cost of storing SSE registers
1302 in SImode, DImode and TImode */
1303 3, /* MMX or SSE register to integer */
1304 /* On K8:
1305 MOVD reg64, xmmreg Double FSTORE 4
1306 MOVD reg32, xmmreg Double FSTORE 4
1307 On AMDFAM10:
1308 MOVD reg64, xmmreg Double FADD 3
1309 1/1 1/1
1310 MOVD reg32, xmmreg Double FADD 3
1311 1/1 1/1 */
1312 32, /* size of l1 cache. */
1313 512, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 100, /* number of parallel prefetches */
1316 2, /* Branch cost */
1317 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1318 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1319 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1320 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1321 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1322 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1324 btver1_memcpy,
1325 btver1_memset,
1326 4, /* scalar_stmt_cost. */
1327 2, /* scalar load_cost. */
1328 2, /* scalar_store_cost. */
1329 6, /* vec_stmt_cost. */
1330 0, /* vec_to_scalar_cost. */
1331 2, /* scalar_to_vec_cost. */
1332 2, /* vec_align_load_cost. */
1333 2, /* vec_unalign_load_cost. */
1334 2, /* vec_store_cost. */
1335 2, /* cond_taken_branch_cost. */
1336 1, /* cond_not_taken_branch_cost. */
};
1339 static stringop_algs btver2_memcpy[2] = {
1340 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1341 {-1, rep_prefix_4_byte, false}}},
1342 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1343 {-1, libcall, false}}}};
1344 static stringop_algs btver2_memset[2] = {
1345 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1346 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1347 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1348 {-1, libcall, false}}}};
1349 const struct processor_costs btver2_cost = {
1350 COSTS_N_INSNS (1), /* cost of an add instruction */
1351 COSTS_N_INSNS (2), /* cost of a lea instruction */
1352 COSTS_N_INSNS (1), /* variable shift costs */
1353 COSTS_N_INSNS (1), /* constant shift costs */
1354 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1355 COSTS_N_INSNS (4), /* HI */
1356 COSTS_N_INSNS (3), /* SI */
1357 COSTS_N_INSNS (4), /* DI */
1358 COSTS_N_INSNS (5)}, /* other */
1359 0, /* cost of multiply per each bit set */
1360 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1361 COSTS_N_INSNS (35), /* HI */
1362 COSTS_N_INSNS (51), /* SI */
1363 COSTS_N_INSNS (83), /* DI */
1364 COSTS_N_INSNS (83)}, /* other */
1365 COSTS_N_INSNS (1), /* cost of movsx */
1366 COSTS_N_INSNS (1), /* cost of movzx */
1367 8, /* "large" insn */
1368 9, /* MOVE_RATIO */
1369 4, /* cost for loading QImode using movzbl */
1370 {3, 4, 3}, /* cost of loading integer registers
1371 in QImode, HImode and SImode.
1372 Relative to reg-reg move (2). */
1373 {3, 4, 3}, /* cost of storing integer registers */
1374 4, /* cost of reg,reg fld/fst */
1375 {4, 4, 12}, /* cost of loading fp registers
1376 in SFmode, DFmode and XFmode */
1377 {6, 6, 8}, /* cost of storing fp registers
1378 in SFmode, DFmode and XFmode */
1379 2, /* cost of moving MMX register */
1380 {3, 3}, /* cost of loading MMX registers
1381 in SImode and DImode */
1382 {4, 4}, /* cost of storing MMX registers
1383 in SImode and DImode */
1384 2, /* cost of moving SSE register */
1385 {4, 4, 3}, /* cost of loading SSE registers
1386 in SImode, DImode and TImode */
1387 {4, 4, 5}, /* cost of storing SSE registers
1388 in SImode, DImode and TImode */
1389 3, /* MMX or SSE register to integer */
1390 /* On K8:
1391 MOVD reg64, xmmreg Double FSTORE 4
1392 MOVD reg32, xmmreg Double FSTORE 4
1393 On AMDFAM10:
1394 MOVD reg64, xmmreg Double FADD 3
1395 1/1 1/1
1396 MOVD reg32, xmmreg Double FADD 3
1397 1/1 1/1 */
1398 32, /* size of l1 cache. */
1399 2048, /* size of l2 cache. */
1400 64, /* size of prefetch block */
1401 100, /* number of parallel prefetches */
1402 2, /* Branch cost */
1403 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1404 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1405 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1406 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1407 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1408 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1409 btver2_memcpy,
1410 btver2_memset,
1411 4, /* scalar_stmt_cost. */
1412 2, /* scalar load_cost. */
1413 2, /* scalar_store_cost. */
1414 6, /* vec_stmt_cost. */
1415 0, /* vec_to_scalar_cost. */
1416 2, /* scalar_to_vec_cost. */
1417 2, /* vec_align_load_cost. */
1418 2, /* vec_unalign_load_cost. */
1419 2, /* vec_store_cost. */
1420 2, /* cond_taken_branch_cost. */
1421 1, /* cond_not_taken_branch_cost. */
};
1424 static stringop_algs pentium4_memcpy[2] = {
1425 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1426 DUMMY_STRINGOP_ALGS};
1427 static stringop_algs pentium4_memset[2] = {
1428 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1429 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1430 DUMMY_STRINGOP_ALGS};
1432 static const
1433 struct processor_costs pentium4_cost = {
1434 COSTS_N_INSNS (1), /* cost of an add instruction */
1435 COSTS_N_INSNS (3), /* cost of a lea instruction */
1436 COSTS_N_INSNS (4), /* variable shift costs */
1437 COSTS_N_INSNS (4), /* constant shift costs */
1438 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1439 COSTS_N_INSNS (15), /* HI */
1440 COSTS_N_INSNS (15), /* SI */
1441 COSTS_N_INSNS (15), /* DI */
1442 COSTS_N_INSNS (15)}, /* other */
1443 0, /* cost of multiply per each bit set */
1444 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1445 COSTS_N_INSNS (56), /* HI */
1446 COSTS_N_INSNS (56), /* SI */
1447 COSTS_N_INSNS (56), /* DI */
1448 COSTS_N_INSNS (56)}, /* other */
1449 COSTS_N_INSNS (1), /* cost of movsx */
1450 COSTS_N_INSNS (1), /* cost of movzx */
1451 16, /* "large" insn */
1452 6, /* MOVE_RATIO */
1453 2, /* cost for loading QImode using movzbl */
1454 {4, 5, 4}, /* cost of loading integer registers
1455 in QImode, HImode and SImode.
1456 Relative to reg-reg move (2). */
1457 {2, 3, 2}, /* cost of storing integer registers */
1458 2, /* cost of reg,reg fld/fst */
1459 {2, 2, 6}, /* cost of loading fp registers
1460 in SFmode, DFmode and XFmode */
1461 {4, 4, 6}, /* cost of storing fp registers
1462 in SFmode, DFmode and XFmode */
1463 2, /* cost of moving MMX register */
1464 {2, 2}, /* cost of loading MMX registers
1465 in SImode and DImode */
1466 {2, 2}, /* cost of storing MMX registers
1467 in SImode and DImode */
1468 12, /* cost of moving SSE register */
1469 {12, 12, 12}, /* cost of loading SSE registers
1470 in SImode, DImode and TImode */
1471 {2, 2, 8}, /* cost of storing SSE registers
1472 in SImode, DImode and TImode */
1473 10, /* MMX or SSE register to integer */
1474 8, /* size of l1 cache. */
1475 256, /* size of l2 cache. */
1476 64, /* size of prefetch block */
1477 6, /* number of parallel prefetches */
1478 2, /* Branch cost */
1479 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1480 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1481 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1482 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1483 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1484 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1485 pentium4_memcpy,
1486 pentium4_memset,
1487 1, /* scalar_stmt_cost. */
1488 1, /* scalar load_cost. */
1489 1, /* scalar_store_cost. */
1490 1, /* vec_stmt_cost. */
1491 1, /* vec_to_scalar_cost. */
1492 1, /* scalar_to_vec_cost. */
1493 1, /* vec_align_load_cost. */
1494 2, /* vec_unalign_load_cost. */
1495 1, /* vec_store_cost. */
1496 3, /* cond_taken_branch_cost. */
1497 1, /* cond_not_taken_branch_cost. */
1500 static stringop_algs nocona_memcpy[2] = {
1501 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1502 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1503 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1505 static stringop_algs nocona_memset[2] = {
1506 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1507 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1508 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1509 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1511 static const
1512 struct processor_costs nocona_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (1), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (10), /* HI */
1519 COSTS_N_INSNS (10), /* SI */
1520 COSTS_N_INSNS (10), /* DI */
1521 COSTS_N_INSNS (10)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (66), /* HI */
1525 COSTS_N_INSNS (66), /* SI */
1526 COSTS_N_INSNS (66), /* DI */
1527 COSTS_N_INSNS (66)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 17, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {4, 4, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {4, 4, 4}, /* cost of storing integer registers */
1537 3, /* cost of reg,reg fld/fst */
1538 {12, 12, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 4}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 6, /* cost of moving MMX register */
1543 {12, 12}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {12, 12}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 6, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {12, 12, 12}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 8, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 1024, /* size of l2 cache. */
1555 128, /* size of prefetch block */
1556 8, /* number of parallel prefetches */
1557 1, /* Branch cost */
1558 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1564 nocona_memcpy,
1565 nocona_memset,
1566 1, /* scalar_stmt_cost. */
1567 1, /* scalar load_cost. */
1568 1, /* scalar_store_cost. */
1569 1, /* vec_stmt_cost. */
1570 1, /* vec_to_scalar_cost. */
1571 1, /* scalar_to_vec_cost. */
1572 1, /* vec_align_load_cost. */
1573 2, /* vec_unalign_load_cost. */
1574 1, /* vec_store_cost. */
1575 3, /* cond_taken_branch_cost. */
1576 1, /* cond_not_taken_branch_cost. */
1579 static stringop_algs atom_memcpy[2] = {
1580 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1581 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1582 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1583 static stringop_algs atom_memset[2] = {
1584 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1585 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1586 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1587 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1588 static const
1589 struct processor_costs atom_cost = {
1590 COSTS_N_INSNS (1), /* cost of an add instruction */
1591 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1592 COSTS_N_INSNS (1), /* variable shift costs */
1593 COSTS_N_INSNS (1), /* constant shift costs */
1594 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1595 COSTS_N_INSNS (4), /* HI */
1596 COSTS_N_INSNS (3), /* SI */
1597 COSTS_N_INSNS (4), /* DI */
1598 COSTS_N_INSNS (2)}, /* other */
1599 0, /* cost of multiply per each bit set */
1600 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1601 COSTS_N_INSNS (26), /* HI */
1602 COSTS_N_INSNS (42), /* SI */
1603 COSTS_N_INSNS (74), /* DI */
1604 COSTS_N_INSNS (74)}, /* other */
1605 COSTS_N_INSNS (1), /* cost of movsx */
1606 COSTS_N_INSNS (1), /* cost of movzx */
1607 8, /* "large" insn */
1608 17, /* MOVE_RATIO */
1609 4, /* cost for loading QImode using movzbl */
1610 {4, 4, 4}, /* cost of loading integer registers
1611 in QImode, HImode and SImode.
1612 Relative to reg-reg move (2). */
1613 {4, 4, 4}, /* cost of storing integer registers */
1614 4, /* cost of reg,reg fld/fst */
1615 {12, 12, 12}, /* cost of loading fp registers
1616 in SFmode, DFmode and XFmode */
1617 {6, 6, 8}, /* cost of storing fp registers
1618 in SFmode, DFmode and XFmode */
1619 2, /* cost of moving MMX register */
1620 {8, 8}, /* cost of loading MMX registers
1621 in SImode and DImode */
1622 {8, 8}, /* cost of storing MMX registers
1623 in SImode and DImode */
1624 2, /* cost of moving SSE register */
1625 {8, 8, 8}, /* cost of loading SSE registers
1626 in SImode, DImode and TImode */
1627 {8, 8, 8}, /* cost of storing SSE registers
1628 in SImode, DImode and TImode */
1629 5, /* MMX or SSE register to integer */
1630 32, /* size of l1 cache. */
1631 256, /* size of l2 cache. */
1632 64, /* size of prefetch block */
1633 6, /* number of parallel prefetches */
1634 3, /* Branch cost */
1635 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1636 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1637 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1638 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1639 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1640 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1641 atom_memcpy,
1642 atom_memset,
1643 1, /* scalar_stmt_cost. */
1644 1, /* scalar load_cost. */
1645 1, /* scalar_store_cost. */
1646 1, /* vec_stmt_cost. */
1647 1, /* vec_to_scalar_cost. */
1648 1, /* scalar_to_vec_cost. */
1649 1, /* vec_align_load_cost. */
1650 2, /* vec_unalign_load_cost. */
1651 1, /* vec_store_cost. */
1652 3, /* cond_taken_branch_cost. */
1653 1, /* cond_not_taken_branch_cost. */
1656 static stringop_algs slm_memcpy[2] = {
1657 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1658 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1659 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1660 static stringop_algs slm_memset[2] = {
1661 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1662 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1663 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1664 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1665 static const
1666 struct processor_costs slm_cost = {
1667 COSTS_N_INSNS (1), /* cost of an add instruction */
1668 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1669 COSTS_N_INSNS (1), /* variable shift costs */
1670 COSTS_N_INSNS (1), /* constant shift costs */
1671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1672 COSTS_N_INSNS (4), /* HI */
1673 COSTS_N_INSNS (3), /* SI */
1674 COSTS_N_INSNS (4), /* DI */
1675 COSTS_N_INSNS (2)}, /* other */
1676 0, /* cost of multiply per each bit set */
1677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1678 COSTS_N_INSNS (26), /* HI */
1679 COSTS_N_INSNS (42), /* SI */
1680 COSTS_N_INSNS (74), /* DI */
1681 COSTS_N_INSNS (74)}, /* other */
1682 COSTS_N_INSNS (1), /* cost of movsx */
1683 COSTS_N_INSNS (1), /* cost of movzx */
1684 8, /* "large" insn */
1685 17, /* MOVE_RATIO */
1686 4, /* cost for loading QImode using movzbl */
1687 {4, 4, 4}, /* cost of loading integer registers
1688 in QImode, HImode and SImode.
1689 Relative to reg-reg move (2). */
1690 {4, 4, 4}, /* cost of storing integer registers */
1691 4, /* cost of reg,reg fld/fst */
1692 {12, 12, 12}, /* cost of loading fp registers
1693 in SFmode, DFmode and XFmode */
1694 {6, 6, 8}, /* cost of storing fp registers
1695 in SFmode, DFmode and XFmode */
1696 2, /* cost of moving MMX register */
1697 {8, 8}, /* cost of loading MMX registers
1698 in SImode and DImode */
1699 {8, 8}, /* cost of storing MMX registers
1700 in SImode and DImode */
1701 2, /* cost of moving SSE register */
1702 {8, 8, 8}, /* cost of loading SSE registers
1703 in SImode, DImode and TImode */
1704 {8, 8, 8}, /* cost of storing SSE registers
1705 in SImode, DImode and TImode */
1706 5, /* MMX or SSE register to integer */
1707 32, /* size of l1 cache. */
1708 256, /* size of l2 cache. */
1709 64, /* size of prefetch block */
1710 6, /* number of parallel prefetches */
1711 3, /* Branch cost */
1712 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1713 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1714 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1715 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1716 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1717 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1718 slm_memcpy,
1719 slm_memset,
1720 1, /* scalar_stmt_cost. */
1721 1, /* scalar load_cost. */
1722 1, /* scalar_store_cost. */
1723 1, /* vec_stmt_cost. */
1724 1, /* vec_to_scalar_cost. */
1725 1, /* scalar_to_vec_cost. */
1726 1, /* vec_align_load_cost. */
1727 2, /* vec_unalign_load_cost. */
1728 1, /* vec_store_cost. */
1729 3, /* cond_taken_branch_cost. */
1730 1, /* cond_not_taken_branch_cost. */
1733 /* Generic should produce code tuned for Core-i7 (and newer chips)
1734 and btver1 (and newer chips). */
1736 static stringop_algs generic_memcpy[2] = {
1737 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1738 {-1, libcall, false}}},
1739 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1740 {-1, libcall, false}}}};
1741 static stringop_algs generic_memset[2] = {
1742 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1743 {-1, libcall, false}}},
1744 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1745 {-1, libcall, false}}}};
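/* A reading aid for the stringop_algs tables above (illustration only):
   each {max_size, alg, noalign} entry means "use ALG for blocks up to
   MAX_SIZE bytes"; a max of -1 terminates the list and covers all larger
   sizes.  So generic_memcpy[1] (the 64-bit variant) uses a loop up to 32
   bytes, rep_prefix_8_byte up to 8192 bytes, and a libcall beyond that.  */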
1746 static const
1747 struct processor_costs generic_cost = {
1748 COSTS_N_INSNS (1), /* cost of an add instruction */
1749 /* On all chips taken into consideration lea is 2 cycles or more. With
1750 this cost, however, our current implementation of synth_mult results in
1751 the use of unnecessary temporary registers, causing regressions on several
1752 SPECfp benchmarks. */
1753 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1754 COSTS_N_INSNS (1), /* variable shift costs */
1755 COSTS_N_INSNS (1), /* constant shift costs */
1756 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1757 COSTS_N_INSNS (4), /* HI */
1758 COSTS_N_INSNS (3), /* SI */
1759 COSTS_N_INSNS (4), /* DI */
1760 COSTS_N_INSNS (2)}, /* other */
1761 0, /* cost of multiply per each bit set */
1762 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1763 COSTS_N_INSNS (26), /* HI */
1764 COSTS_N_INSNS (42), /* SI */
1765 COSTS_N_INSNS (74), /* DI */
1766 COSTS_N_INSNS (74)}, /* other */
1767 COSTS_N_INSNS (1), /* cost of movsx */
1768 COSTS_N_INSNS (1), /* cost of movzx */
1769 8, /* "large" insn */
1770 17, /* MOVE_RATIO */
1771 4, /* cost for loading QImode using movzbl */
1772 {4, 4, 4}, /* cost of loading integer registers
1773 in QImode, HImode and SImode.
1774 Relative to reg-reg move (2). */
1775 {4, 4, 4}, /* cost of storing integer registers */
1776 4, /* cost of reg,reg fld/fst */
1777 {12, 12, 12}, /* cost of loading fp registers
1778 in SFmode, DFmode and XFmode */
1779 {6, 6, 8}, /* cost of storing fp registers
1780 in SFmode, DFmode and XFmode */
1781 2, /* cost of moving MMX register */
1782 {8, 8}, /* cost of loading MMX registers
1783 in SImode and DImode */
1784 {8, 8}, /* cost of storing MMX registers
1785 in SImode and DImode */
1786 2, /* cost of moving SSE register */
1787 {8, 8, 8}, /* cost of loading SSE registers
1788 in SImode, DImode and TImode */
1789 {8, 8, 8}, /* cost of storing SSE registers
1790 in SImode, DImode and TImode */
1791 5, /* MMX or SSE register to integer */
1792 32, /* size of l1 cache. */
1793 512, /* size of l2 cache. */
1794 64, /* size of prefetch block */
1795 6, /* number of parallel prefetches */
1796 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1797 value is increased to the perhaps more appropriate value of 5. */
1798 3, /* Branch cost */
1799 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1800 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1801 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1802 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1803 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1804 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1805 generic_memcpy,
1806 generic_memset,
1807 1, /* scalar_stmt_cost. */
1808 1, /* scalar load_cost. */
1809 1, /* scalar_store_cost. */
1810 1, /* vec_stmt_cost. */
1811 1, /* vec_to_scalar_cost. */
1812 1, /* scalar_to_vec_cost. */
1813 1, /* vec_align_load_cost. */
1814 2, /* vec_unalign_load_cost. */
1815 1, /* vec_store_cost. */
1816 3, /* cond_taken_branch_cost. */
1817 1, /* cond_not_taken_branch_cost. */
1820 /* core_cost should produce code tuned for the Core family of CPUs. */
1821 static stringop_algs core_memcpy[2] = {
1822 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1823 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1824 {-1, libcall, false}}}};
1825 static stringop_algs core_memset[2] = {
1826 {libcall, {{6, loop_1_byte, true},
1827 {24, loop, true},
1828 {8192, rep_prefix_4_byte, true},
1829 {-1, libcall, false}}},
1830 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1831 {-1, libcall, false}}}};
1833 static const
1834 struct processor_costs core_cost = {
1835 COSTS_N_INSNS (1), /* cost of an add instruction */
1836 /* On all chips taken into consideration lea is 2 cycles or more. With
1837 this cost, however, our current implementation of synth_mult results in
1838 the use of unnecessary temporary registers, causing regressions on several
1839 SPECfp benchmarks. */
1840 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1841 COSTS_N_INSNS (1), /* variable shift costs */
1842 COSTS_N_INSNS (1), /* constant shift costs */
1843 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1844 COSTS_N_INSNS (4), /* HI */
1845 COSTS_N_INSNS (3), /* SI */
1846 COSTS_N_INSNS (4), /* DI */
1847 COSTS_N_INSNS (2)}, /* other */
1848 0, /* cost of multiply per each bit set */
1849 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1850 COSTS_N_INSNS (26), /* HI */
1851 COSTS_N_INSNS (42), /* SI */
1852 COSTS_N_INSNS (74), /* DI */
1853 COSTS_N_INSNS (74)}, /* other */
1854 COSTS_N_INSNS (1), /* cost of movsx */
1855 COSTS_N_INSNS (1), /* cost of movzx */
1856 8, /* "large" insn */
1857 17, /* MOVE_RATIO */
1858 4, /* cost for loading QImode using movzbl */
1859 {4, 4, 4}, /* cost of loading integer registers
1860 in QImode, HImode and SImode.
1861 Relative to reg-reg move (2). */
1862 {4, 4, 4}, /* cost of storing integer registers */
1863 4, /* cost of reg,reg fld/fst */
1864 {12, 12, 12}, /* cost of loading fp registers
1865 in SFmode, DFmode and XFmode */
1866 {6, 6, 8}, /* cost of storing fp registers
1867 in SFmode, DFmode and XFmode */
1868 2, /* cost of moving MMX register */
1869 {8, 8}, /* cost of loading MMX registers
1870 in SImode and DImode */
1871 {8, 8}, /* cost of storing MMX registers
1872 in SImode and DImode */
1873 2, /* cost of moving SSE register */
1874 {8, 8, 8}, /* cost of loading SSE registers
1875 in SImode, DImode and TImode */
1876 {8, 8, 8}, /* cost of storing SSE registers
1877 in SImode, DImode and TImode */
1878 5, /* MMX or SSE register to integer */
1879 64, /* size of l1 cache. */
1880 512, /* size of l2 cache. */
1881 64, /* size of prefetch block */
1882 6, /* number of parallel prefetches */
1883 /* FIXME perhaps more appropriate value is 5. */
1884 3, /* Branch cost */
1885 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1886 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1887 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1888 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1889 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1890 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1891 core_memcpy,
1892 core_memset,
1893 1, /* scalar_stmt_cost. */
1894 1, /* scalar load_cost. */
1895 1, /* scalar_store_cost. */
1896 1, /* vec_stmt_cost. */
1897 1, /* vec_to_scalar_cost. */
1898 1, /* scalar_to_vec_cost. */
1899 1, /* vec_align_load_cost. */
1900 2, /* vec_unalign_load_cost. */
1901 1, /* vec_store_cost. */
1902 3, /* cond_taken_branch_cost. */
1903 1, /* cond_not_taken_branch_cost. */
1907 /* Set by -mtune. */
1908 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1910 /* Set by -mtune or -Os. */
1911 const struct processor_costs *ix86_cost = &pentium_cost;
1913 /* Processor feature/optimization bitmasks. */
1914 #define m_386 (1<<PROCESSOR_I386)
1915 #define m_486 (1<<PROCESSOR_I486)
1916 #define m_PENT (1<<PROCESSOR_PENTIUM)
1917 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1918 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1919 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1920 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1921 #define m_CORE2 (1<<PROCESSOR_CORE2)
1922 #define m_COREI7 (1<<PROCESSOR_COREI7)
1923 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1924 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1925 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1926 #define m_ATOM (1<<PROCESSOR_ATOM)
1927 #define m_SLM (1<<PROCESSOR_SLM)
1929 #define m_GEODE (1<<PROCESSOR_GEODE)
1930 #define m_K6 (1<<PROCESSOR_K6)
1931 #define m_K6_GEODE (m_K6 | m_GEODE)
1932 #define m_K8 (1<<PROCESSOR_K8)
1933 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1934 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1935 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1936 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1937 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1938 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1939 #define m_BDVER4 (1<<PROCESSOR_BDVER4)
1940 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1941 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1942 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
1943 #define m_BTVER (m_BTVER1 | m_BTVER2)
1944 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1946 #define m_GENERIC (1<<PROCESSOR_GENERIC)
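/* Illustration (hypothetical selector, not taken from x86-tune.def): a
   tuning selector is simply a combination of the m_* masks above, e.g.
   m_CORE_ALL | m_GENERIC.  A tuning defined with such a selector is enabled
   when the bit (1u << ix86_tune) of the current -mtune target is set in it;
   see set_ix86_tune_features below.  */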
1948 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1949 #undef DEF_TUNE
1950 #define DEF_TUNE(tune, name, selector) name,
1951 #include "x86-tune.def"
1952 #undef DEF_TUNE
1955 /* Feature tests against the various tunings. */
1956 unsigned char ix86_tune_features[X86_TUNE_LAST];
1958 /* Feature tests against the various tunings used to create ix86_tune_features
1959 based on the processor mask. */
1960 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1961 #undef DEF_TUNE
1962 #define DEF_TUNE(tune, name, selector) selector,
1963 #include "x86-tune.def"
1964 #undef DEF_TUNE
1967 /* Feature tests against the various architecture variations. */
1968 unsigned char ix86_arch_features[X86_ARCH_LAST];
1970 /* Feature tests against the various architecture variations, used to create
1971 ix86_arch_features based on the processor mask. */
1972 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1973 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1974 ~(m_386 | m_486 | m_PENT | m_K6),
1976 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1977 ~m_386,
1979 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1980 ~(m_386 | m_486),
1982 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1983 ~m_386,
1985 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1986 ~m_386,
1989 /* If the average insn count for a single function invocation is
1990 lower than this constant, emit fast (but longer) prologue and
1991 epilogue code. */
1992 #define FAST_PROLOGUE_INSN_COUNT 20
1994 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
1995 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1996 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1997 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1999 /* Array of the smallest class containing reg number REGNO, indexed by
2000 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2002 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2004 /* ax, dx, cx, bx */
2005 AREG, DREG, CREG, BREG,
2006 /* si, di, bp, sp */
2007 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2008 /* FP registers */
2009 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2010 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2011 /* arg pointer */
2012 NON_Q_REGS,
2013 /* flags, fpsr, fpcr, frame */
2014 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2015 /* SSE registers */
2016 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2017 SSE_REGS, SSE_REGS,
2018 /* MMX registers */
2019 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2020 MMX_REGS, MMX_REGS,
2021 /* REX registers */
2022 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2023 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2024 /* SSE REX registers */
2025 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2026 SSE_REGS, SSE_REGS,
2027 /* AVX-512 SSE registers */
2028 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2029 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2030 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2031 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2032 /* Mask registers. */
2033 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2034 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2035 /* MPX bound registers */
2036 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2039 /* The "default" register map used in 32bit mode. */
2041 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2043 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2044 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2045 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2046 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2047 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2048 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2049 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2050 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2051 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2052 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2053 101, 102, 103, 104, /* bound registers */
2056 /* The "default" register map used in 64bit mode. */
2058 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2060 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2061 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2062 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2063 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2064 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2065 8,9,10,11,12,13,14,15, /* extended integer registers */
2066 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2067 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2068 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2069 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2070 126, 127, 128, 129, /* bound registers */
2073 /* Define the register numbers to be used in Dwarf debugging information.
2074 The SVR4 reference port C compiler uses the following register numbers
2075 in its Dwarf output code:
2076 0 for %eax (gcc regno = 0)
2077 1 for %ecx (gcc regno = 2)
2078 2 for %edx (gcc regno = 1)
2079 3 for %ebx (gcc regno = 3)
2080 4 for %esp (gcc regno = 7)
2081 5 for %ebp (gcc regno = 6)
2082 6 for %esi (gcc regno = 4)
2083 7 for %edi (gcc regno = 5)
2084 The following three DWARF register numbers are never generated by
2085 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2086 believes these numbers have these meanings.
2087 8 for %eip (no gcc equivalent)
2088 9 for %eflags (gcc regno = 17)
2089 10 for %trapno (no gcc equivalent)
2090 It is not at all clear how we should number the FP stack registers
2091 for the x86 architecture. If the version of SDB on x86/svr4 were
2092 a bit less brain dead with respect to floating-point then we would
2093 have a precedent to follow with respect to DWARF register numbers
2094 for x86 FP registers, but the SDB on x86/svr4 is so completely
2095 broken with respect to FP registers that it is hardly worth thinking
2096 of it as something to strive for compatibility with.
2097 The version of x86/svr4 SDB I have at the moment does (partially)
2098 seem to believe that DWARF register number 11 is associated with
2099 the x86 register %st(0), but that's about all. Higher DWARF
2100 register numbers don't seem to be associated with anything in
2101 particular, and even for DWARF regno 11, SDB only seems to under-
2102 stand that it should say that a variable lives in %st(0) (when
2103 asked via an `=' command) if we said it was in DWARF regno 11,
2104 but SDB still prints garbage when asked for the value of the
2105 variable in question (via a `/' command).
2106 (Also note that the labels SDB prints for various FP stack regs
2107 when doing an `x' command are all wrong.)
2108 Note that these problems generally don't affect the native SVR4
2109 C compiler because it doesn't allow the use of -O with -g and
2110 because when it is *not* optimizing, it allocates a memory
2111 location for each floating-point variable, and the memory
2112 location is what gets described in the DWARF AT_location
2113 attribute for the variable in question.
2114 Regardless of the severe mental illness of the x86/svr4 SDB, we
2115 do something sensible here and we use the following DWARF
2116 register numbers. Note that these are all stack-top-relative
2117 numbers.
2118 11 for %st(0) (gcc regno = 8)
2119 12 for %st(1) (gcc regno = 9)
2120 13 for %st(2) (gcc regno = 10)
2121 14 for %st(3) (gcc regno = 11)
2122 15 for %st(4) (gcc regno = 12)
2123 16 for %st(5) (gcc regno = 13)
2124 17 for %st(6) (gcc regno = 14)
2125 18 for %st(7) (gcc regno = 15)
2127 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2129 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2130 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2131 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2132 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2133 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2134 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2135 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2136 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2137 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2138 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2139 -1, -1, -1, -1, /* bound registers */
2142 /* Define parameter passing and return registers. */
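/* For reference: the SysV x86-64 ABI passes the first six integer arguments
   in rdi, rsi, rdx, rcx, r8 and r9 (the order of
   x86_64_int_parameter_registers below), while the MS ABI passes the first
   four in rcx, rdx, r8 and r9.  */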
2144 static int const x86_64_int_parameter_registers[6] =
2146 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2149 static int const x86_64_ms_abi_int_parameter_registers[4] =
2151 CX_REG, DX_REG, R8_REG, R9_REG
2154 static int const x86_64_int_return_registers[4] =
2156 AX_REG, DX_REG, DI_REG, SI_REG
2159 /* Additional registers that are clobbered by SYSV calls. */
2161 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2163 SI_REG, DI_REG,
2164 XMM6_REG, XMM7_REG,
2165 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2166 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2169 /* Define the structure for the machine field in struct function. */
2171 struct GTY(()) stack_local_entry {
2172 unsigned short mode;
2173 unsigned short n;
2174 rtx rtl;
2175 struct stack_local_entry *next;
2178 /* Structure describing stack frame layout.
2179 Stack grows downward:
2181 [arguments]
2182 <- ARG_POINTER
2183 saved pc
2185 saved static chain if ix86_static_chain_on_stack
2187 saved frame pointer if frame_pointer_needed
2188 <- HARD_FRAME_POINTER
2189 [saved regs]
2190 <- regs_save_offset
2191 [padding0]
2193 [saved SSE regs]
2194 <- sse_regs_save_offset
2195 [padding1] |
2196 | <- FRAME_POINTER
2197 [va_arg registers] |
2199 [frame] |
2201 [padding2] | = to_allocate
2202 <- STACK_POINTER
2204 struct ix86_frame
2206 int nsseregs;
2207 int nregs;
2208 int va_arg_size;
2209 int red_zone_size;
2210 int outgoing_arguments_size;
2212 /* The offsets relative to ARG_POINTER. */
2213 HOST_WIDE_INT frame_pointer_offset;
2214 HOST_WIDE_INT hard_frame_pointer_offset;
2215 HOST_WIDE_INT stack_pointer_offset;
2216 HOST_WIDE_INT hfp_save_offset;
2217 HOST_WIDE_INT reg_save_offset;
2218 HOST_WIDE_INT sse_reg_save_offset;
2220 /* When save_regs_using_mov is set, emit prologue using
2221 move instead of push instructions. */
2222 bool save_regs_using_mov;
2225 /* Which cpu are we scheduling for. */
2226 enum attr_cpu ix86_schedule;
2228 /* Which cpu are we optimizing for. */
2229 enum processor_type ix86_tune;
2231 /* Which instruction set architecture to use. */
2232 enum processor_type ix86_arch;
2234 /* True if processor has SSE prefetch instruction. */
2235 unsigned char x86_prefetch_sse;
2237 /* -mstackrealign option */
2238 static const char ix86_force_align_arg_pointer_string[]
2239 = "force_align_arg_pointer";
2241 static rtx (*ix86_gen_leave) (void);
2242 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2243 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2244 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2245 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2246 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2247 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2248 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2249 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2250 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2251 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2252 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2254 /* Preferred alignment for stack boundary in bits. */
2255 unsigned int ix86_preferred_stack_boundary;
2257 /* Alignment for incoming stack boundary in bits specified on the
2258 command line. */
2259 static unsigned int ix86_user_incoming_stack_boundary;
2261 /* Default alignment for incoming stack boundary in bits. */
2262 static unsigned int ix86_default_incoming_stack_boundary;
2264 /* Alignment for incoming stack boundary in bits. */
2265 unsigned int ix86_incoming_stack_boundary;
2267 /* Calling abi specific va_list type nodes. */
2268 static GTY(()) tree sysv_va_list_type_node;
2269 static GTY(()) tree ms_va_list_type_node;
2271 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2272 char internal_label_prefix[16];
2273 int internal_label_prefix_len;
2275 /* Fence to use after loop using movnt. */
2276 tree x86_mfence;
2278 /* Register class used for passing a given 64-bit part of the argument.
2279 These represent classes as documented by the psABI, with the exception
2280 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2281 uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2283 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2284 whenever possible (the upper half contains padding). */
2285 enum x86_64_reg_class
2287 X86_64_NO_CLASS,
2288 X86_64_INTEGER_CLASS,
2289 X86_64_INTEGERSI_CLASS,
2290 X86_64_SSE_CLASS,
2291 X86_64_SSESF_CLASS,
2292 X86_64_SSEDF_CLASS,
2293 X86_64_SSEUP_CLASS,
2294 X86_64_X87_CLASS,
2295 X86_64_X87UP_CLASS,
2296 X86_64_COMPLEX_X87_CLASS,
2297 X86_64_MEMORY_CLASS
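/* For example, per the comment above, a DFmode (double) argument is
   classified as X86_64_SSEDF_CLASS: it is passed in an SSE register like any
   X86_64_SSE_CLASS value, but moved in DFmode rather than DImode.  */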
2300 #define MAX_CLASSES 4
2302 /* Table of constants used by fldpi, fldln2, etc.... */
2303 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2304 static bool ext_80387_constants_init = 0;
2307 static struct machine_function * ix86_init_machine_status (void);
2308 static rtx ix86_function_value (const_tree, const_tree, bool);
2309 static bool ix86_function_value_regno_p (const unsigned int);
2310 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2311 const_tree);
2312 static rtx ix86_static_chain (const_tree, bool);
2313 static int ix86_function_regparm (const_tree, const_tree);
2314 static void ix86_compute_frame_layout (struct ix86_frame *);
2315 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2316 rtx, rtx, int);
2317 static void ix86_add_new_builtins (HOST_WIDE_INT);
2318 static tree ix86_canonical_va_list_type (tree);
2319 static void predict_jump (int);
2320 static unsigned int split_stack_prologue_scratch_regno (void);
2321 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2323 enum ix86_function_specific_strings
2325 IX86_FUNCTION_SPECIFIC_ARCH,
2326 IX86_FUNCTION_SPECIFIC_TUNE,
2327 IX86_FUNCTION_SPECIFIC_MAX
2330 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2331 const char *, enum fpmath_unit, bool);
2332 static void ix86_function_specific_save (struct cl_target_option *,
2333 struct gcc_options *opts);
2334 static void ix86_function_specific_restore (struct gcc_options *opts,
2335 struct cl_target_option *);
2336 static void ix86_function_specific_print (FILE *, int,
2337 struct cl_target_option *);
2338 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2339 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2340 struct gcc_options *,
2341 struct gcc_options *,
2342 struct gcc_options *);
2343 static bool ix86_can_inline_p (tree, tree);
2344 static void ix86_set_current_function (tree);
2345 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2347 static enum calling_abi ix86_function_abi (const_tree);
2350 #ifndef SUBTARGET32_DEFAULT_CPU
2351 #define SUBTARGET32_DEFAULT_CPU "i386"
2352 #endif
2354 /* Whether -mtune= or -march= were specified */
2355 static int ix86_tune_defaulted;
2356 static int ix86_arch_specified;
2358 /* Vectorization library interface and handlers. */
2359 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2361 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2362 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2364 /* Processor target table, indexed by processor number */
2365 struct ptt
2367 const struct processor_costs *cost; /* Processor costs */
2368 const int align_loop; /* Default alignments. */
2369 const int align_loop_max_skip;
2370 const int align_jump;
2371 const int align_jump_max_skip;
2372 const int align_func;
2375 static const struct ptt processor_target_table[PROCESSOR_max] =
2377 {&i386_cost, 4, 3, 4, 3, 4},
2378 {&i486_cost, 16, 15, 16, 15, 16},
2379 {&pentium_cost, 16, 7, 16, 7, 16},
2380 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2381 {&geode_cost, 0, 0, 0, 0, 0},
2382 {&k6_cost, 32, 7, 32, 7, 32},
2383 {&athlon_cost, 16, 7, 16, 7, 16},
2384 {&pentium4_cost, 0, 0, 0, 0, 0},
2385 {&k8_cost, 16, 7, 16, 7, 16},
2386 {&nocona_cost, 0, 0, 0, 0, 0},
2387 /* Core 2 */
2388 {&core_cost, 16, 10, 16, 10, 16},
2389 /* Core i7 */
2390 {&core_cost, 16, 10, 16, 10, 16},
2391 /* Core i7 avx */
2392 {&core_cost, 16, 10, 16, 10, 16},
2393 /* Core avx2 */
2394 {&core_cost, 16, 10, 16, 10, 16},
2395 {&generic_cost, 16, 10, 16, 10, 16},
2396 {&amdfam10_cost, 32, 24, 32, 7, 32},
2397 {&bdver1_cost, 16, 10, 16, 7, 11},
2398 {&bdver2_cost, 16, 10, 16, 7, 11},
2399 {&bdver3_cost, 16, 10, 16, 7, 11},
2400 {&bdver4_cost, 16, 10, 16, 7, 11},
2401 {&btver1_cost, 16, 10, 16, 7, 11},
2402 {&btver2_cost, 16, 10, 16, 7, 11},
2403 {&atom_cost, 16, 15, 16, 7, 16},
2404 {&slm_cost, 16, 15, 16, 7, 16}
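/* A reading aid (not part of the table): each row follows struct ptt above,
   so e.g. {&core_cost, 16, 10, 16, 10, 16} selects core_cost with loop
   alignment 16 (max skip 10), jump alignment 16 (max skip 10) and function
   alignment 16.  */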
2407 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2409 "generic",
2410 "i386",
2411 "i486",
2412 "pentium",
2413 "pentium-mmx",
2414 "pentiumpro",
2415 "pentium2",
2416 "pentium3",
2417 "pentium4",
2418 "pentium-m",
2419 "prescott",
2420 "nocona",
2421 "core2",
2422 "corei7",
2423 "corei7-avx",
2424 "core-avx2",
2425 "atom",
2426 "slm",
2427 "geode",
2428 "k6",
2429 "k6-2",
2430 "k6-3",
2431 "athlon",
2432 "athlon-4",
2433 "k8",
2434 "amdfam10",
2435 "bdver1",
2436 "bdver2",
2437 "bdver3",
2438 "bdver4",
2439 "btver1",
2440 "btver2"
2443 static bool
2444 gate_insert_vzeroupper (void)
2446 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2449 static unsigned int
2450 rest_of_handle_insert_vzeroupper (void)
2452 int i;
2454 /* vzeroupper instructions are inserted immediately after reload to
2455 account for possible spills from 256-bit registers. The pass
2456 reuses the mode switching infrastructure by re-running the mode insertion
2457 pass, so disable entities that have already been processed. */
2458 for (i = 0; i < MAX_386_ENTITIES; i++)
2459 ix86_optimize_mode_switching[i] = 0;
2461 ix86_optimize_mode_switching[AVX_U128] = 1;
2463 /* Call optimize_mode_switching. */
2464 g->get_passes ()->execute_pass_mode_switching ();
2465 return 0;
2468 namespace {
2470 const pass_data pass_data_insert_vzeroupper =
2472 RTL_PASS, /* type */
2473 "vzeroupper", /* name */
2474 OPTGROUP_NONE, /* optinfo_flags */
2475 true, /* has_gate */
2476 true, /* has_execute */
2477 TV_NONE, /* tv_id */
2478 0, /* properties_required */
2479 0, /* properties_provided */
2480 0, /* properties_destroyed */
2481 0, /* todo_flags_start */
2482 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2485 class pass_insert_vzeroupper : public rtl_opt_pass
2487 public:
2488 pass_insert_vzeroupper(gcc::context *ctxt)
2489 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2492 /* opt_pass methods: */
2493 bool gate () { return gate_insert_vzeroupper (); }
2494 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2496 }; // class pass_insert_vzeroupper
2498 } // anon namespace
2500 rtl_opt_pass *
2501 make_pass_insert_vzeroupper (gcc::context *ctxt)
2503 return new pass_insert_vzeroupper (ctxt);
2506 /* Return true if a red-zone is in use. */
2508 static inline bool
2509 ix86_using_red_zone (void)
2511 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2514 /* Return a string that documents the current -m options. The caller is
2515 responsible for freeing the string. */
2517 static char *
2518 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2519 const char *tune, enum fpmath_unit fpmath,
2520 bool add_nl_p)
2522 struct ix86_target_opts
2524 const char *option; /* option string */
2525 HOST_WIDE_INT mask; /* isa mask options */
2528 /* This table is ordered so that options like -msse4.2 that imply
2529 preceding options will match those first. */
2530 static struct ix86_target_opts isa_opts[] =
2532 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2533 { "-mfma", OPTION_MASK_ISA_FMA },
2534 { "-mxop", OPTION_MASK_ISA_XOP },
2535 { "-mlwp", OPTION_MASK_ISA_LWP },
2536 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2537 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2538 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2539 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2540 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2541 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2542 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2543 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2544 { "-msse3", OPTION_MASK_ISA_SSE3 },
2545 { "-msse2", OPTION_MASK_ISA_SSE2 },
2546 { "-msse", OPTION_MASK_ISA_SSE },
2547 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2548 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2549 { "-mmmx", OPTION_MASK_ISA_MMX },
2550 { "-mabm", OPTION_MASK_ISA_ABM },
2551 { "-mbmi", OPTION_MASK_ISA_BMI },
2552 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2553 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2554 { "-mhle", OPTION_MASK_ISA_HLE },
2555 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2556 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2557 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2558 { "-madx", OPTION_MASK_ISA_ADX },
2559 { "-mtbm", OPTION_MASK_ISA_TBM },
2560 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2561 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2562 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2563 { "-maes", OPTION_MASK_ISA_AES },
2564 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2565 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2566 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2567 { "-mf16c", OPTION_MASK_ISA_F16C },
2568 { "-mrtm", OPTION_MASK_ISA_RTM },
2569 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2570 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2571 { "-mmpx", OPTION_MASK_ISA_MPX },
2574 /* Flag options. */
2575 static struct ix86_target_opts flag_opts[] =
2577 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2578 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2579 { "-m80387", MASK_80387 },
2580 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2581 { "-malign-double", MASK_ALIGN_DOUBLE },
2582 { "-mcld", MASK_CLD },
2583 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2584 { "-mieee-fp", MASK_IEEE_FP },
2585 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2586 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2587 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2588 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2589 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2590 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2591 { "-mno-red-zone", MASK_NO_RED_ZONE },
2592 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2593 { "-mrecip", MASK_RECIP },
2594 { "-mrtd", MASK_RTD },
2595 { "-msseregparm", MASK_SSEREGPARM },
2596 { "-mstack-arg-probe", MASK_STACK_PROBE },
2597 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2598 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2599 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2600 { "-mvzeroupper", MASK_VZEROUPPER },
2601 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2602 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2603 { "-mprefer-avx128", MASK_PREFER_AVX128},
2606 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2608 char isa_other[40];
2609 char target_other[40];
2610 unsigned num = 0;
2611 unsigned i, j;
2612 char *ret;
2613 char *ptr;
2614 size_t len;
2615 size_t line_len;
2616 size_t sep_len;
2617 const char *abi;
2619 memset (opts, '\0', sizeof (opts));
2621 /* Add -march= option. */
2622 if (arch)
2624 opts[num][0] = "-march=";
2625 opts[num++][1] = arch;
2628 /* Add -mtune= option. */
2629 if (tune)
2631 opts[num][0] = "-mtune=";
2632 opts[num++][1] = tune;
2635 /* Add -m32/-m64/-mx32. */
2636 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2638 if ((isa & OPTION_MASK_ABI_64) != 0)
2639 abi = "-m64";
2640 else
2641 abi = "-mx32";
2642 isa &= ~ (OPTION_MASK_ISA_64BIT
2643 | OPTION_MASK_ABI_64
2644 | OPTION_MASK_ABI_X32);
2646 else
2647 abi = "-m32";
2648 opts[num++][0] = abi;
2650 /* Pick out the options in isa options. */
2651 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2653 if ((isa & isa_opts[i].mask) != 0)
2655 opts[num++][0] = isa_opts[i].option;
2656 isa &= ~ isa_opts[i].mask;
2660 if (isa && add_nl_p)
2662 opts[num++][0] = isa_other;
2663 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2664 isa);
2667 /* Add flag options. */
2668 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2670 if ((flags & flag_opts[i].mask) != 0)
2672 opts[num++][0] = flag_opts[i].option;
2673 flags &= ~ flag_opts[i].mask;
2677 if (flags && add_nl_p)
2679 opts[num++][0] = target_other;
2680 sprintf (target_other, "(other flags: %#x)", flags);
2683 /* Add -fpmath= option. */
2684 if (fpmath)
2686 opts[num][0] = "-mfpmath=";
2687 switch ((int) fpmath)
2689 case FPMATH_387:
2690 opts[num++][1] = "387";
2691 break;
2693 case FPMATH_SSE:
2694 opts[num++][1] = "sse";
2695 break;
2697 case FPMATH_387 | FPMATH_SSE:
2698 opts[num++][1] = "sse+387";
2699 break;
2701 default:
2702 gcc_unreachable ();
2706 /* Any options? */
2707 if (num == 0)
2708 return NULL;
2710 gcc_assert (num < ARRAY_SIZE (opts));
2712 /* Size the string. */
2713 len = 0;
2714 sep_len = (add_nl_p) ? 3 : 1;
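/* The separator per option is a single space, or up to three characters
   (space, backslash, newline) when line wrapping is requested.  */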
2715 for (i = 0; i < num; i++)
2717 len += sep_len;
2718 for (j = 0; j < 2; j++)
2719 if (opts[i][j])
2720 len += strlen (opts[i][j]);
2723 /* Build the string. */
2724 ret = ptr = (char *) xmalloc (len);
2725 line_len = 0;
2727 for (i = 0; i < num; i++)
2729 size_t len2[2];
2731 for (j = 0; j < 2; j++)
2732 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2734 if (i != 0)
2736 *ptr++ = ' ';
2737 line_len++;
2739 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2741 *ptr++ = '\\';
2742 *ptr++ = '\n';
2743 line_len = 0;
2747 for (j = 0; j < 2; j++)
2748 if (opts[i][j])
2750 memcpy (ptr, opts[i][j], len2[j]);
2751 ptr += len2[j];
2752 line_len += len2[j];
2756 *ptr = '\0';
2757 gcc_assert (ret + len >= ptr);
2759 return ret;
2762 /* Return true if profiling code should be emitted before the
2763 prologue, and false otherwise.
2764 Note: for x86 the "hotfix" (hot-patching) case is not handled here. */
2765 static bool
2766 ix86_profile_before_prologue (void)
2768 return flag_fentry != 0;
2771 /* Function that is callable from the debugger to print the current
2772 options. */
2773 void ATTRIBUTE_UNUSED
2774 ix86_debug_options (void)
2776 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2777 ix86_arch_string, ix86_tune_string,
2778 ix86_fpmath, true);
2780 if (opts)
2782 fprintf (stderr, "%s\n\n", opts);
2783 free (opts);
2785 else
2786 fputs ("<no options>\n\n", stderr);
2788 return;
2791 static const char *stringop_alg_names[] = {
2792 #define DEF_ENUM
2793 #define DEF_ALG(alg, name) #name,
2794 #include "stringop.def"
2795 #undef DEF_ENUM
2796 #undef DEF_ALG
2799 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2800 The string is of the following form (or a comma-separated list of such entries):
2802 strategy_alg:max_size:[align|noalign]
2804 where the full size range for the strategy is either [0, max_size] or
2805 [min_size, max_size], in which min_size is the max_size + 1 of the
2806 preceding range. The last size range must have max_size == -1.
2808 Examples:
2811 -mmemcpy-strategy=libcall:-1:noalign
2813 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2817 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2819 This tells the compiler to use the following strategy for memset:
2820 1) when the expected size is between [1, 16], use rep_8byte strategy;
2821 2) when the size is between [17, 2048], use vector_loop;
2822 3) when the size is > 2048, use libcall. */
2824 struct stringop_size_range
2826 int max;
2827 stringop_alg alg;
2828 bool noalign;
2831 static void
2832 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2834 const struct stringop_algs *default_algs;
2835 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2836 char *curr_range_str, *next_range_str;
2837 int i = 0, n = 0;
2839 if (is_memset)
2840 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2841 else
2842 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2844 curr_range_str = strategy_str;
2848 int maxs;
2849 stringop_alg alg;
2850 char alg_name[128];
2851 char align[16];
2852 next_range_str = strchr (curr_range_str, ',');
2853 if (next_range_str)
2854 *next_range_str++ = '\0';
2856 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2857 alg_name, &maxs, align))
2859 error ("wrong arg %s to option %s", curr_range_str,
2860 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2861 return;
2864 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2866 error ("size ranges of option %s should be increasing",
2867 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2868 return;
2871 for (i = 0; i < last_alg; i++)
2873 if (!strcmp (alg_name, stringop_alg_names[i]))
2875 alg = (stringop_alg) i;
2876 break;
2880 if (i == last_alg)
2882 error ("wrong stringop strategy name %s specified for option %s",
2883 alg_name,
2884 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2885 return;
2888 input_ranges[n].max = maxs;
2889 input_ranges[n].alg = alg;
2890 if (!strcmp (align, "align"))
2891 input_ranges[n].noalign = false;
2892 else if (!strcmp (align, "noalign"))
2893 input_ranges[n].noalign = true;
2894 else
2896 error ("unknown alignment %s specified for option %s",
2897 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2898 return;
2900 n++;
2901 curr_range_str = next_range_str;
2903 while (curr_range_str);
2905 if (input_ranges[n - 1].max != -1)
2907 error ("the max value for the last size range should be -1"
2908 " for option %s",
2909 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2910 return;
2913 if (n > MAX_STRINGOP_ALGS)
2915 error ("too many size ranges specified in option %s",
2916 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2917 return;
2920 /* Now override the default algs array. */
2921 for (i = 0; i < n; i++)
2923 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2924 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2925 = input_ranges[i].alg;
2926 *const_cast<int *>(&default_algs->size[i].noalign)
2927 = input_ranges[i].noalign;
2932 /* Parse the -mtune-ctrl= option. When DUMP is true,
2933 print the features that are explicitly set. */
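/* Example (hypothetical feature names, for illustration only):
   -mtune-ctrl=feature_a,^feature_b explicitly sets feature_a and clears
   feature_b; a leading '^' negates the named feature.  */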
2935 static void
2936 parse_mtune_ctrl_str (bool dump)
2938 if (!ix86_tune_ctrl_string)
2939 return;
2941 char *next_feature_string = NULL;
2942 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2943 char *orig = curr_feature_string;
2944 int i;
2947 bool clear = false;
2949 next_feature_string = strchr (curr_feature_string, ',');
2950 if (next_feature_string)
2951 *next_feature_string++ = '\0';
2952 if (*curr_feature_string == '^')
2954 curr_feature_string++;
2955 clear = true;
2957 for (i = 0; i < X86_TUNE_LAST; i++)
2959 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2961 ix86_tune_features[i] = !clear;
2962 if (dump)
2963 fprintf (stderr, "Explicitly %s feature %s\n",
2964 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2965 break;
2968 if (i == X86_TUNE_LAST)
2969 error ("unknown parameter to option -mtune-ctrl: %s",
2970 clear ? curr_feature_string - 1 : curr_feature_string);
2971 curr_feature_string = next_feature_string;
2973 while (curr_feature_string);
2974 free (orig);
2977 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2978 processor type. */
2980 static void
2981 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2983 unsigned int ix86_tune_mask = 1u << ix86_tune;
2984 int i;
2986 for (i = 0; i < X86_TUNE_LAST; ++i)
2988 if (ix86_tune_no_default)
2989 ix86_tune_features[i] = 0;
2990 else
2991 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2994 if (dump)
2996 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2997 for (i = 0; i < X86_TUNE_LAST; i++)
2998 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2999 ix86_tune_features[i] ? "on" : "off");
3002 parse_mtune_ctrl_str (dump);
3006 /* Override various settings based on options. If MAIN_ARGS_P, the
3007 options are from the command line, otherwise they are from
3008 attributes. */
3010 static void
3011 ix86_option_override_internal (bool main_args_p,
3012 struct gcc_options *opts,
3013 struct gcc_options *opts_set)
3015 int i;
3016 unsigned int ix86_arch_mask;
3017 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3018 const char *prefix;
3019 const char *suffix;
3020 const char *sw;
3022 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3023 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3024 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3025 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3026 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3027 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3028 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3029 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3030 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3031 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3032 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3033 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3034 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3035 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3036 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3037 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3038 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3039 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3040 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3041 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3042 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3043 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3044 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3045 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3046 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3047 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3048 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3049 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3050 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3051 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3052 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3053 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3054 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3055 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3056 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3057 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3058 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3059 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3060 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3061 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3062 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3063 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3064 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3065 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3066 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3068 /* If this reaches 64, we need to widen the struct pta flags below. */
3070 static struct pta
3072 const char *const name; /* processor name or nickname. */
3073 const enum processor_type processor;
3074 const enum attr_cpu schedule;
3075 const unsigned HOST_WIDE_INT flags;
3077 const processor_alias_table[] =
3079 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3080 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3081 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3082 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3083 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3084 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3085 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3086 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3087 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3088 PTA_MMX | PTA_SSE | PTA_FXSR},
3089 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3090 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3091 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3092 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3093 PTA_MMX | PTA_SSE | PTA_FXSR},
3094 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3095 PTA_MMX | PTA_SSE | PTA_FXSR},
3096 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3097 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3098 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3099 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3100 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3101 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3102 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3103 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3104 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3105 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3106 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3107 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3108 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3109 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3110 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3111 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3112 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3113 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3116 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3117 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3118 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3119 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3120 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3121 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3122 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3123 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3124 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3125 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3126 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3127 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3128 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3129 | PTA_XSAVEOPT},
3130 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3131 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3132 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3133 {"slm", PROCESSOR_SLM, CPU_SLM,
3134 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3135 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3136 | PTA_FXSR},
3137 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3138 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3139 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3140 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3141 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3142 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3143 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3144 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3145 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3146 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3147 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3148 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3149 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3150 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3151 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3152 {"x86-64", PROCESSOR_K8, CPU_K8,
3153 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3154 {"k8", PROCESSOR_K8, CPU_K8,
3155 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3156 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3157 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3158 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3159 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3160 {"opteron", PROCESSOR_K8, CPU_K8,
3161 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3162 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3163 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3164 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3165 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3166 {"athlon64", PROCESSOR_K8, CPU_K8,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3169 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3172 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3173 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3174 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3175 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3176 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3177 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3178 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3179 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3180 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3181 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3182 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3183 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3184 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3185 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3186 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3187 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3188 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3189 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3190 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3191 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3192 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3193 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3194 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3195 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3196 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3197 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3198 | PTA_XSAVEOPT | PTA_FSGSBASE},
3199 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3200 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3201 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3202 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3203 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3204 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3205 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE},
3206 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3207 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3208 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3209 | PTA_FXSR | PTA_XSAVE},
3210 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3211 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3212 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3213 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3214 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3215 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3217 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3218 PTA_64BIT
3219 | PTA_HLE /* flags are only used for -march switch. */ },
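/* Each entry above gives a CPU name (or alias) accepted by -march=/-mtune=,
   the processor_type used for tuning decisions, the scheduling model, and the
   PTA_* bits describing the ISA features that -march=NAME enables by default.  */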
3222 /* -mrecip options. */
3223 static struct
3225 const char *string; /* option name */
3226 unsigned int mask; /* mask bits to set */
3228 const recip_options[] =
3230 { "all", RECIP_MASK_ALL },
3231 { "none", RECIP_MASK_NONE },
3232 { "div", RECIP_MASK_DIV },
3233 { "sqrt", RECIP_MASK_SQRT },
3234 { "vec-div", RECIP_MASK_VEC_DIV },
3235 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3238 int const pta_size = ARRAY_SIZE (processor_alias_table);
3240 /* Set up prefix/suffix so the error messages refer to either the command
3241 line argument, or the attribute(target). */
3242 if (main_args_p)
3244 prefix = "-m";
3245 suffix = "";
3246 sw = "switch";
3248 else
3250 prefix = "option(\"";
3251 suffix = "\")";
3252 sw = "attribute";
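/* Illustrative examples of how these strings compose into the diagnostics
   emitted below: for a command-line option an error reads
   "bad value (foo) for -mtune= switch", while for a target attribute it reads
   "bad value (foo) for option("tune=") attribute".  */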
3255 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3256 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3257 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3258 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3259 #ifdef TARGET_BI_ARCH
3260 else
3262 #if TARGET_BI_ARCH == 1
3263 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3264 is on and OPTION_MASK_ABI_X32 is off. We turn off
3265 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3266 -mx32. */
3267 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3268 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3269 #else
3270 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3271 on and OPTION_MASK_ABI_64 is off. We turn off
3272 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3273 -m64. */
3274 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3275 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3276 #endif
3278 #endif
3280 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3282 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3283 OPTION_MASK_ABI_64 for TARGET_X32. */
3284 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3285 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3287 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3289 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3290 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3291 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3292 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
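/* Summary of the resulting ISA/ABI flag combinations:
   -m32:  OPTION_MASK_ISA_64BIT clear, both ABI masks clear;
   -m64:  OPTION_MASK_ISA_64BIT set, OPTION_MASK_ABI_X32 clear (LP64);
   -mx32: OPTION_MASK_ISA_64BIT set, OPTION_MASK_ABI_64 clear.  */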
3295 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3296 SUBTARGET_OVERRIDE_OPTIONS;
3297 #endif
3299 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3300 SUBSUBTARGET_OVERRIDE_OPTIONS;
3301 #endif
3303 /* -fPIC is the default for x86_64. */
3304 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3305 opts->x_flag_pic = 2;
3307 /* Need to check -mtune=generic first. */
3308 if (opts->x_ix86_tune_string)
3310 if (!strcmp (opts->x_ix86_tune_string, "generic")
3311 || !strcmp (opts->x_ix86_tune_string, "i686")
3312 /* As special support for cross compilers we read -mtune=native
3313 as -mtune=generic. With native compilers we won't see
3314 -mtune=native, as it is rewritten by the driver. */
3315 || !strcmp (opts->x_ix86_tune_string, "native"))
3317 opts->x_ix86_tune_string = "generic";
3319 /* If this call is for setting the option attribute, allow the
3320 generic that was previously set. */
3321 else if (!main_args_p
3322 && !strcmp (opts->x_ix86_tune_string, "generic"))
3324 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3325 error ("bad value (%s) for %stune=%s %s",
3326 opts->x_ix86_tune_string, prefix, suffix, sw);
3327 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3328 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3329 "%stune=k8%s or %stune=generic%s instead as appropriate",
3330 prefix, suffix, prefix, suffix, prefix, suffix);
3332 else
3334 if (opts->x_ix86_arch_string)
3335 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3336 if (!opts->x_ix86_tune_string)
3338 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3339 ix86_tune_defaulted = 1;
3342 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3343 or defaulted. We need to use a sensible tune option. */
3344 if (!strcmp (opts->x_ix86_tune_string, "generic")
3345 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3346 || !strcmp (opts->x_ix86_tune_string, "i686"))
3348 opts->x_ix86_tune_string = "generic";
3352 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3353 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3355 /* rep; movq isn't available in 32-bit code. */
3356 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3357 opts->x_ix86_stringop_alg = no_stringop;
3360 if (!opts->x_ix86_arch_string)
3361 opts->x_ix86_arch_string
3362 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3363 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3364 else
3365 ix86_arch_specified = 1;
3367 if (opts_set->x_ix86_pmode)
3369 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3370 && opts->x_ix86_pmode == PMODE_SI)
3371 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3372 && opts->x_ix86_pmode == PMODE_DI))
3373 error ("address mode %qs not supported in the %s bit mode",
3374 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3375 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3377 else
3378 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3379 ? PMODE_DI : PMODE_SI;
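/* In other words, -maddress-mode=long (DImode pointers) is rejected for
   32-bit code and -maddress-mode=short (SImode pointers) is rejected for
   LP64; when the option is absent, LP64 defaults to DImode while x32 and
   32-bit code default to SImode.  */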
3381 if (!opts_set->x_ix86_abi)
3382 opts->x_ix86_abi = DEFAULT_ABI;
3384 /* For targets using the MS ABI, enable ms-extensions if not
3385 explicitly turned off. For non-MS ABI targets we turn this
3386 option off. */
3387 if (!opts_set->x_flag_ms_extensions)
3388 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3390 if (opts_set->x_ix86_cmodel)
3392 switch (opts->x_ix86_cmodel)
3394 case CM_SMALL:
3395 case CM_SMALL_PIC:
3396 if (opts->x_flag_pic)
3397 opts->x_ix86_cmodel = CM_SMALL_PIC;
3398 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3399 error ("code model %qs not supported in the %s bit mode",
3400 "small", "32");
3401 break;
3403 case CM_MEDIUM:
3404 case CM_MEDIUM_PIC:
3405 if (opts->x_flag_pic)
3406 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3407 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3408 error ("code model %qs not supported in the %s bit mode",
3409 "medium", "32");
3410 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3411 error ("code model %qs not supported in x32 mode",
3412 "medium");
3413 break;
3415 case CM_LARGE:
3416 case CM_LARGE_PIC:
3417 if (opts->x_flag_pic)
3418 opts->x_ix86_cmodel = CM_LARGE_PIC;
3419 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3420 error ("code model %qs not supported in the %s bit mode",
3421 "large", "32");
3422 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3423 error ("code model %qs not supported in x32 mode",
3424 "large");
3425 break;
3427 case CM_32:
3428 if (opts->x_flag_pic)
3429 error ("code model %s does not support PIC mode", "32");
3430 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3431 error ("code model %qs not supported in the %s bit mode",
3432 "32", "64");
3433 break;
3435 case CM_KERNEL:
3436 if (opts->x_flag_pic)
3438 error ("code model %s does not support PIC mode", "kernel");
3439 opts->x_ix86_cmodel = CM_32;
3441 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3442 error ("code model %qs not supported in the %s bit mode",
3443 "kernel", "32");
3444 break;
3446 default:
3447 gcc_unreachable ();
3450 else
3452 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3453 use of rip-relative addressing. This eliminates fixups that
3454 would otherwise be needed if this object is to be placed in a
3455 DLL, and is essentially just as efficient as direct addressing. */
3456 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3457 && (TARGET_RDOS || TARGET_PECOFF))
3458 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3459 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3460 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3461 else
3462 opts->x_ix86_cmodel = CM_32;
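/* In short, when -mcmodel= is not given: 64-bit RDOS/PE-COFF targets get the
   medium PIC model (so rip-relative addressing is used throughout), other
   64-bit targets get the small model (its PIC variant under -fpic), and
   32-bit targets always use CM_32.  */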
3464 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3466 error ("-masm=intel not supported in this configuration");
3467 opts->x_ix86_asm_dialect = ASM_ATT;
3469 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3470 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3471 sorry ("%i-bit mode not compiled in",
3472 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3474 for (i = 0; i < pta_size; i++)
3475 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3477 ix86_schedule = processor_alias_table[i].schedule;
3478 ix86_arch = processor_alias_table[i].processor;
3479 /* Default cpu tuning to the architecture. */
3480 ix86_tune = ix86_arch;
3482 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3483 && !(processor_alias_table[i].flags & PTA_64BIT))
3484 error ("CPU you selected does not support x86-64 "
3485 "instruction set");
3487 if (processor_alias_table[i].flags & PTA_MMX
3488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3490 if (processor_alias_table[i].flags & PTA_3DNOW
3491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3493 if (processor_alias_table[i].flags & PTA_3DNOW_A
3494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3496 if (processor_alias_table[i].flags & PTA_SSE
3497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3499 if (processor_alias_table[i].flags & PTA_SSE2
3500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3502 if (processor_alias_table[i].flags & PTA_SSE3
3503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3505 if (processor_alias_table[i].flags & PTA_SSSE3
3506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3508 if (processor_alias_table[i].flags & PTA_SSE4_1
3509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3511 if (processor_alias_table[i].flags & PTA_SSE4_2
3512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3514 if (processor_alias_table[i].flags & PTA_AVX
3515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3517 if (processor_alias_table[i].flags & PTA_AVX2
3518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3520 if (processor_alias_table[i].flags & PTA_FMA
3521 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3522 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3523 if (processor_alias_table[i].flags & PTA_SSE4A
3524 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3525 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3526 if (processor_alias_table[i].flags & PTA_FMA4
3527 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3528 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3529 if (processor_alias_table[i].flags & PTA_XOP
3530 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3531 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3532 if (processor_alias_table[i].flags & PTA_LWP
3533 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3534 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3535 if (processor_alias_table[i].flags & PTA_ABM
3536 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3537 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3538 if (processor_alias_table[i].flags & PTA_BMI
3539 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3540 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3541 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3542 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3543 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3544 if (processor_alias_table[i].flags & PTA_TBM
3545 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3546 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3547 if (processor_alias_table[i].flags & PTA_BMI2
3548 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3549 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3550 if (processor_alias_table[i].flags & PTA_CX16
3551 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3552 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3553 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3554 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3555 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3556 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3557 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3558 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3559 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3560 if (processor_alias_table[i].flags & PTA_MOVBE
3561 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3562 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3563 if (processor_alias_table[i].flags & PTA_AES
3564 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3565 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3566 if (processor_alias_table[i].flags & PTA_PCLMUL
3567 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3568 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3569 if (processor_alias_table[i].flags & PTA_FSGSBASE
3570 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3571 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3572 if (processor_alias_table[i].flags & PTA_RDRND
3573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3575 if (processor_alias_table[i].flags & PTA_F16C
3576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3578 if (processor_alias_table[i].flags & PTA_RTM
3579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3581 if (processor_alias_table[i].flags & PTA_HLE
3582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3584 if (processor_alias_table[i].flags & PTA_PRFCHW
3585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3587 if (processor_alias_table[i].flags & PTA_RDSEED
3588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3590 if (processor_alias_table[i].flags & PTA_ADX
3591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3593 if (processor_alias_table[i].flags & PTA_FXSR
3594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3596 if (processor_alias_table[i].flags & PTA_XSAVE
3597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3599 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3602 if (processor_alias_table[i].flags & PTA_AVX512F
3603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3605 if (processor_alias_table[i].flags & PTA_AVX512ER
3606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3608 if (processor_alias_table[i].flags & PTA_AVX512PF
3609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3611 if (processor_alias_table[i].flags & PTA_AVX512CD
3612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3614 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3615 x86_prefetch_sse = true;
3617 break;
3620 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3621 error ("generic CPU can be used only for %stune=%s %s",
3622 prefix, suffix, sw);
3623 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3624 error ("bad value (%s) for %sarch=%s %s",
3625 opts->x_ix86_arch_string, prefix, suffix, sw);
3627 ix86_arch_mask = 1u << ix86_arch;
3628 for (i = 0; i < X86_ARCH_LAST; ++i)
3629 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3631 for (i = 0; i < pta_size; i++)
3632 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3634 ix86_schedule = processor_alias_table[i].schedule;
3635 ix86_tune = processor_alias_table[i].processor;
3636 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3638 if (!(processor_alias_table[i].flags & PTA_64BIT))
3640 if (ix86_tune_defaulted)
3642 opts->x_ix86_tune_string = "x86-64";
3643 for (i = 0; i < pta_size; i++)
3644 if (! strcmp (opts->x_ix86_tune_string,
3645 processor_alias_table[i].name))
3646 break;
3647 ix86_schedule = processor_alias_table[i].schedule;
3648 ix86_tune = processor_alias_table[i].processor;
3650 else
3651 error ("CPU you selected does not support x86-64 "
3652 "instruction set");
3655 /* Intel CPUs have always interpreted SSE prefetch instructions as
3656 NOPs; so, we can enable SSE prefetch instructions even when
3657 -mtune (rather than -march) points us to a processor that has them.
3658 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3659 higher processors. */
3660 if (TARGET_CMOV
3661 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3662 x86_prefetch_sse = true;
3663 break;
3666 if (ix86_tune_specified && i == pta_size)
3667 error ("bad value (%s) for %stune=%s %s",
3668 opts->x_ix86_tune_string, prefix, suffix, sw);
3670 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3672 #ifndef USE_IX86_FRAME_POINTER
3673 #define USE_IX86_FRAME_POINTER 0
3674 #endif
3676 #ifndef USE_X86_64_FRAME_POINTER
3677 #define USE_X86_64_FRAME_POINTER 0
3678 #endif
3680 /* Set the default values for switches whose default depends on TARGET_64BIT
3681 in case they weren't overwritten by command line options. */
3682 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3684 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3685 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3686 if (opts->x_flag_asynchronous_unwind_tables == 2)
3687 opts->x_flag_unwind_tables
3688 = opts->x_flag_asynchronous_unwind_tables = 1;
3689 if (opts->x_flag_pcc_struct_return == 2)
3690 opts->x_flag_pcc_struct_return = 0;
3692 else
3694 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3695 opts->x_flag_omit_frame_pointer
3696 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3697 if (opts->x_flag_asynchronous_unwind_tables == 2)
3698 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3699 if (opts->x_flag_pcc_struct_return == 2)
3700 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3703 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3704 if (opts->x_optimize_size)
3705 ix86_cost = &ix86_size_cost;
3706 else
3707 ix86_cost = ix86_tune_cost;
3709 /* Arrange to set up i386_stack_locals for all functions. */
3710 init_machine_status = ix86_init_machine_status;
3712 /* Validate -mregparm= value. */
3713 if (opts_set->x_ix86_regparm)
3715 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3716 warning (0, "-mregparm is ignored in 64-bit mode");
3717 if (opts->x_ix86_regparm > REGPARM_MAX)
3719 error ("-mregparm=%d is not between 0 and %d",
3720 opts->x_ix86_regparm, REGPARM_MAX);
3721 opts->x_ix86_regparm = 0;
3724 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3725 opts->x_ix86_regparm = REGPARM_MAX;
3727 /* Default align_* from the processor table. */
3728 if (opts->x_align_loops == 0)
3730 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3731 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3733 if (opts->x_align_jumps == 0)
3735 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3736 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3738 if (opts->x_align_functions == 0)
3740 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3743 /* Provide default for -mbranch-cost= value. */
3744 if (!opts_set->x_ix86_branch_cost)
3745 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3747 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3749 opts->x_target_flags
3750 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3752 /* Enable by default the SSE and MMX builtins. Do allow the user to
3753 explicitly disable any of these. In particular, disabling SSE and
3754 MMX for kernel code is extremely useful. */
3755 if (!ix86_arch_specified)
3756 opts->x_ix86_isa_flags
3757 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3758 | TARGET_SUBTARGET64_ISA_DEFAULT)
3759 & ~opts->x_ix86_isa_flags_explicit);
3761 if (TARGET_RTD_P (opts->x_target_flags))
3762 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3764 else
3766 opts->x_target_flags
3767 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3769 if (!ix86_arch_specified)
3770 opts->x_ix86_isa_flags
3771 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3773 /* The i386 ABI does not specify a red zone. It still makes sense to use one
3774 when the programmer takes care to keep the stack from being clobbered. */
3775 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3776 opts->x_target_flags |= MASK_NO_RED_ZONE;
3779 /* Keep nonleaf frame pointers. */
3780 if (opts->x_flag_omit_frame_pointer)
3781 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3782 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3783 opts->x_flag_omit_frame_pointer = 1;
3785 /* If we're doing fast math, we don't care about comparison order
3786 wrt NaNs. This lets us use a shorter comparison sequence. */
3787 if (opts->x_flag_finite_math_only)
3788 opts->x_target_flags &= ~MASK_IEEE_FP;
3790 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3791 since the insns won't need emulation. */
3792 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3793 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3795 /* Likewise, if the target doesn't have a 387, or we've specified
3796 software floating point, don't use 387 inline intrinsics. */
3797 if (!TARGET_80387_P (opts->x_target_flags))
3798 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3800 /* Turn on MMX builtins for -msse. */
3801 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3802 opts->x_ix86_isa_flags
3803 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3805 /* Enable SSE prefetch. */
3806 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3807 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3808 x86_prefetch_sse = true;
3810 /* Enable prefetch{,w} instructions for -m3dnow. */
3811 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3812 opts->x_ix86_isa_flags
3813 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3815 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3816 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3817 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3818 opts->x_ix86_isa_flags
3819 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3821 /* Enable lzcnt instruction for -mabm. */
3822 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
3823 opts->x_ix86_isa_flags
3824 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3826 /* Validate -mpreferred-stack-boundary= value or default it to
3827 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3828 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3829 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3831 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3832 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3833 int max = (TARGET_SEH ? 4 : 12);
3835 if (opts->x_ix86_preferred_stack_boundary_arg < min
3836 || opts->x_ix86_preferred_stack_boundary_arg > max)
3838 if (min == max)
3839 error ("-mpreferred-stack-boundary is not supported "
3840 "for this target");
3841 else
3842 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3843 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3845 else
3846 ix86_preferred_stack_boundary
3847 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
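/* The argument is the log2 of the requested alignment in bytes; e.g.
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 16 bytes = 128 bits of stack alignment.  */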
3850 /* Set the default value for -mstackrealign. */
3851 if (opts->x_ix86_force_align_arg_pointer == -1)
3852 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3854 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3856 /* Validate -mincoming-stack-boundary= value or default it to
3857 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3858 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3859 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3861 if (ix86_incoming_stack_boundary_arg
3862 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3863 || ix86_incoming_stack_boundary_arg > 12)
3864 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3865 ix86_incoming_stack_boundary_arg,
3866 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3867 else
3869 ix86_user_incoming_stack_boundary
3870 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3871 ix86_incoming_stack_boundary
3872 = ix86_user_incoming_stack_boundary;
3876 /* Accept -msseregparm only if at least SSE support is enabled. */
3877 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3878 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3879 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3881 if (opts_set->x_ix86_fpmath)
3883 if (opts->x_ix86_fpmath & FPMATH_SSE)
3885 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3887 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3888 opts->x_ix86_fpmath = FPMATH_387;
3890 else if ((opts->x_ix86_fpmath & FPMATH_387)
3891 && !TARGET_80387_P (opts->x_target_flags))
3893 warning (0, "387 instruction set disabled, using SSE arithmetics");
3894 opts->x_ix86_fpmath = FPMATH_SSE;
3898 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3899 -mfpmath=387. The latter is nevertheless the default on many targets,
3900 since the extra 80-bit precision of temporaries is considered part of the ABI.
3901 Override the default at least for -ffast-math.
3902 TODO: -mfpmath=both seems to produce equally performing code with slightly
3903 smaller binaries. It is however not clear whether register allocation is
3904 ready for this setting.
3905 Also, -mfpmath=387 is overall noticeably more compact (about 4-5%) than SSE
3906 codegen. We may switch to 387 with -ffast-math for size-optimized
3907 functions. */
3908 else if (fast_math_flags_set_p (&global_options)
3909 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3910 opts->x_ix86_fpmath = FPMATH_SSE;
3911 else
3912 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3914 /* If the i387 is disabled, then do not return values in it. */
3915 if (!TARGET_80387_P (opts->x_target_flags))
3916 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3918 /* Use an external vectorized library when vectorizing intrinsics. */
3919 if (opts_set->x_ix86_veclibabi_type)
3920 switch (opts->x_ix86_veclibabi_type)
3922 case ix86_veclibabi_type_svml:
3923 ix86_veclib_handler = ix86_veclibabi_svml;
3924 break;
3926 case ix86_veclibabi_type_acml:
3927 ix86_veclib_handler = ix86_veclibabi_acml;
3928 break;
3930 default:
3931 gcc_unreachable ();
3934 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3935 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3936 && !opts->x_optimize_size)
3937 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3939 /* If stack probes are required, the space used for large function
3940 arguments on the stack must also be probed, so enable
3941 -maccumulate-outgoing-args so this happens in the prologue. */
3942 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3943 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3945 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3946 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3947 "for correctness", prefix, suffix);
3948 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3951 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3953 char *p;
3954 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3955 p = strchr (internal_label_prefix, 'X');
3956 internal_label_prefix_len = p - internal_label_prefix;
3957 *p = '\0';
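/* For example, if ASM_GENERATE_INTERNAL_LABEL produced "*.LX0" here (the
   exact spelling is target dependent), internal_label_prefix would end up
   as "*.L" with internal_label_prefix_len == 3.  */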
3960 /* When the scheduling description is not available, disable the scheduler pass
3961 so it won't slow down compilation and make x87 code slower. */
3962 if (!TARGET_SCHEDULE)
3963 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3965 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3966 ix86_tune_cost->simultaneous_prefetches,
3967 opts->x_param_values,
3968 opts_set->x_param_values);
3969 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3970 ix86_tune_cost->prefetch_block,
3971 opts->x_param_values,
3972 opts_set->x_param_values);
3973 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3974 ix86_tune_cost->l1_cache_size,
3975 opts->x_param_values,
3976 opts_set->x_param_values);
3977 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3978 ix86_tune_cost->l2_cache_size,
3979 opts->x_param_values,
3980 opts_set->x_param_values);
3982 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3983 if (opts->x_flag_prefetch_loop_arrays < 0
3984 && HAVE_prefetch
3985 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3986 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3987 opts->x_flag_prefetch_loop_arrays = 1;
3989 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3990 can be optimized to ap = __builtin_next_arg (0). */
3991 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3992 targetm.expand_builtin_va_start = NULL;
3994 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3996 ix86_gen_leave = gen_leave_rex64;
3997 if (Pmode == DImode)
3999 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4000 ix86_gen_tls_local_dynamic_base_64
4001 = gen_tls_local_dynamic_base_64_di;
4003 else
4005 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4006 ix86_gen_tls_local_dynamic_base_64
4007 = gen_tls_local_dynamic_base_64_si;
4010 else
4011 ix86_gen_leave = gen_leave;
4013 if (Pmode == DImode)
4015 ix86_gen_add3 = gen_adddi3;
4016 ix86_gen_sub3 = gen_subdi3;
4017 ix86_gen_sub3_carry = gen_subdi3_carry;
4018 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4019 ix86_gen_andsp = gen_anddi3;
4020 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4021 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4022 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4023 ix86_gen_monitor = gen_sse3_monitor_di;
4025 else
4027 ix86_gen_add3 = gen_addsi3;
4028 ix86_gen_sub3 = gen_subsi3;
4029 ix86_gen_sub3_carry = gen_subsi3_carry;
4030 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4031 ix86_gen_andsp = gen_andsi3;
4032 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4033 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4034 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4035 ix86_gen_monitor = gen_sse3_monitor_si;
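/* These ix86_gen_* hooks let later expanders emit the Pmode-appropriate
   RTL patterns (DImode vs. SImode variants) without re-testing the word
   size at every call site.  */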
4038 #ifdef USE_IX86_CLD
4039 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4040 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4041 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4042 #endif
4044 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
4046 if (opts->x_flag_fentry > 0)
4047 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4048 "with -fpic");
4049 opts->x_flag_fentry = 0;
4051 else if (TARGET_SEH)
4053 if (opts->x_flag_fentry == 0)
4054 sorry ("-mno-fentry isn%'t compatible with SEH");
4055 opts->x_flag_fentry = 1;
4057 else if (opts->x_flag_fentry < 0)
4059 #if defined(PROFILE_BEFORE_PROLOGUE)
4060 opts->x_flag_fentry = 1;
4061 #else
4062 opts->x_flag_fentry = 0;
4063 #endif
4066 /* When not optimizing for size, enable the vzeroupper optimization for
4067 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
4068 AVX unaligned loads/stores. */
4069 if (!opts->x_optimize_size)
4071 if (flag_expensive_optimizations
4072 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
4073 opts->x_target_flags |= MASK_VZEROUPPER;
4074 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4075 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4076 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4077 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4078 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4079 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4080 /* Enable 128-bit AVX instruction generation
4081 for the auto-vectorizer. */
4082 if (TARGET_AVX128_OPTIMAL
4083 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
4084 opts->x_target_flags |= MASK_PREFER_AVX128;
4087 if (opts->x_ix86_recip_name)
4089 char *p = ASTRDUP (opts->x_ix86_recip_name);
4090 char *q;
4091 unsigned int mask, i;
4092 bool invert;
4094 while ((q = strtok (p, ",")) != NULL)
4096 p = NULL;
4097 if (*q == '!')
4099 invert = true;
4100 q++;
4102 else
4103 invert = false;
4105 if (!strcmp (q, "default"))
4106 mask = RECIP_MASK_ALL;
4107 else
4109 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4110 if (!strcmp (q, recip_options[i].string))
4112 mask = recip_options[i].mask;
4113 break;
4116 if (i == ARRAY_SIZE (recip_options))
4118 error ("unknown option for -mrecip=%s", q);
4119 invert = false;
4120 mask = RECIP_MASK_NONE;
4124 opts->x_recip_mask_explicit |= mask;
4125 if (invert)
4126 opts->x_recip_mask &= ~mask;
4127 else
4128 opts->x_recip_mask |= mask;
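/* For instance, -mrecip=all,!sqrt first sets every RECIP_MASK_* bit and then
   clears RECIP_MASK_SQRT again, since a leading '!' inverts the named mask.  */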
4132 if (TARGET_RECIP_P (opts->x_target_flags))
4133 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4134 else if (opts_set->x_target_flags & MASK_RECIP)
4135 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4137 /* Default long double to 64-bit for Bionic. */
4138 if (TARGET_HAS_BIONIC
4139 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4140 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4142 /* Save the initial options in case the user specifies function-specific
4143 options. */
4144 if (main_args_p)
4145 target_option_default_node = target_option_current_node
4146 = build_target_option_node (opts);
4148 /* Handle stack protector */
4149 if (!opts_set->x_ix86_stack_protector_guard)
4150 opts->x_ix86_stack_protector_guard
4151 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4153 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4154 if (opts->x_ix86_tune_memcpy_strategy)
4156 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4157 ix86_parse_stringop_strategy_string (str, false);
4158 free (str);
4161 if (opts->x_ix86_tune_memset_strategy)
4163 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4164 ix86_parse_stringop_strategy_string (str, true);
4165 free (str);
4169 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4171 static void
4172 ix86_option_override (void)
4174 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4175 static struct register_pass_info insert_vzeroupper_info
4176 = { pass_insert_vzeroupper, "reload",
4177 1, PASS_POS_INSERT_AFTER
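/* I.e. run the vzeroupper insertion pass once, immediately after the first
   instance of the pass named "reload".  */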
4180 ix86_option_override_internal (true, &global_options, &global_options_set);
4183 /* This needs to be done at start up. It's convenient to do it here. */
4184 register_pass (&insert_vzeroupper_info);
4187 /* Update register usage after having seen the compiler flags. */
4189 static void
4190 ix86_conditional_register_usage (void)
4192 int i, c_mask;
4193 unsigned int j;
4195 /* The PIC register, if it exists, is fixed. */
4196 j = PIC_OFFSET_TABLE_REGNUM;
4197 if (j != INVALID_REGNUM)
4198 fixed_regs[j] = call_used_regs[j] = 1;
4200 /* For 32-bit targets, squash the REX registers. */
4201 if (! TARGET_64BIT)
4203 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4204 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4205 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4206 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4207 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4208 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4211 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4212 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4213 : TARGET_64BIT ? (1 << 2)
4214 : (1 << 1));
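/* Entries in the CALL_USED_REGISTERS initializer that are greater than 1
   encode per-ABI behavior: bit 1 is tested for 32-bit targets, bit 2 for
   64-bit SysV and bit 3 for the 64-bit MS ABI, matching the c_mask chosen
   above.  */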
4216 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4218 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4220 /* Set/reset conditionally defined registers from
4221 CALL_USED_REGISTERS initializer. */
4222 if (call_used_regs[i] > 1)
4223 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4225 /* Calculate registers of CLOBBERED_REGS register set
4226 as call used registers from GENERAL_REGS register set. */
4227 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4228 && call_used_regs[i])
4229 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4232 /* If MMX is disabled, squash the registers. */
4233 if (! TARGET_MMX)
4234 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4235 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4236 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4238 /* If SSE is disabled, squash the registers. */
4239 if (! TARGET_SSE)
4240 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4241 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4242 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4244 /* If the FPU is disabled, squash the registers. */
4245 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4246 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4247 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4248 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4250 /* If AVX512F is disabled, squash the registers. */
4251 if (! TARGET_AVX512F)
4253 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4254 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4256 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4257 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4260 /* If MPX is disabled, squash the registers. */
4261 if (! TARGET_MPX)
4262 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4263 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4267 /* Save the current options */
4269 static void
4270 ix86_function_specific_save (struct cl_target_option *ptr,
4271 struct gcc_options *opts)
4273 ptr->arch = ix86_arch;
4274 ptr->schedule = ix86_schedule;
4275 ptr->tune = ix86_tune;
4276 ptr->branch_cost = ix86_branch_cost;
4277 ptr->tune_defaulted = ix86_tune_defaulted;
4278 ptr->arch_specified = ix86_arch_specified;
4279 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4280 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4281 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4283 /* The fields are char but the variables are not; make sure the
4284 values fit in the fields. */
4285 gcc_assert (ptr->arch == ix86_arch);
4286 gcc_assert (ptr->schedule == ix86_schedule);
4287 gcc_assert (ptr->tune == ix86_tune);
4288 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4291 /* Restore the current options */
4293 static void
4294 ix86_function_specific_restore (struct gcc_options *opts,
4295 struct cl_target_option *ptr)
4297 enum processor_type old_tune = ix86_tune;
4298 enum processor_type old_arch = ix86_arch;
4299 unsigned int ix86_arch_mask;
4300 int i;
4302 ix86_arch = (enum processor_type) ptr->arch;
4303 ix86_schedule = (enum attr_cpu) ptr->schedule;
4304 ix86_tune = (enum processor_type) ptr->tune;
4305 opts->x_ix86_branch_cost = ptr->branch_cost;
4306 ix86_tune_defaulted = ptr->tune_defaulted;
4307 ix86_arch_specified = ptr->arch_specified;
4308 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4309 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4310 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4312 /* Recreate the arch feature tests if the arch changed */
4313 if (old_arch != ix86_arch)
4315 ix86_arch_mask = 1u << ix86_arch;
4316 for (i = 0; i < X86_ARCH_LAST; ++i)
4317 ix86_arch_features[i]
4318 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4321 /* Recreate the tune optimization tests */
4322 if (old_tune != ix86_tune)
4323 set_ix86_tune_features (ix86_tune, false);
4326 /* Print the current options */
4328 static void
4329 ix86_function_specific_print (FILE *file, int indent,
4330 struct cl_target_option *ptr)
4332 char *target_string
4333 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4334 NULL, NULL, ptr->x_ix86_fpmath, false);
4336 fprintf (file, "%*sarch = %d (%s)\n",
4337 indent, "",
4338 ptr->arch,
4339 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4340 ? cpu_names[ptr->arch]
4341 : "<unknown>"));
4343 fprintf (file, "%*stune = %d (%s)\n",
4344 indent, "",
4345 ptr->tune,
4346 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4347 ? cpu_names[ptr->tune]
4348 : "<unknown>"));
4350 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4352 if (target_string)
4354 fprintf (file, "%*s%s\n", indent, "", target_string);
4355 free (target_string);
4360 /* Inner function to process the attribute((target(...))), take an argument and
4361 set the current options from the argument. If we have a list, recursively go
4362 over the list. */
4364 static bool
4365 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4366 struct gcc_options *opts,
4367 struct gcc_options *opts_set,
4368 struct gcc_options *enum_opts_set)
4370 char *next_optstr;
4371 bool ret = true;
4373 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4374 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4375 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4376 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4377 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4379 enum ix86_opt_type
4381 ix86_opt_unknown,
4382 ix86_opt_yes,
4383 ix86_opt_no,
4384 ix86_opt_str,
4385 ix86_opt_enum,
4386 ix86_opt_isa
4389 static const struct
4391 const char *string;
4392 size_t len;
4393 enum ix86_opt_type type;
4394 int opt;
4395 int mask;
4396 } attrs[] = {
4397 /* isa options */
4398 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4399 IX86_ATTR_ISA ("abm", OPT_mabm),
4400 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4401 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4402 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4403 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4404 IX86_ATTR_ISA ("aes", OPT_maes),
4405 IX86_ATTR_ISA ("avx", OPT_mavx),
4406 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4407 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4408 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4409 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4410 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4411 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4412 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4413 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4414 IX86_ATTR_ISA ("sse", OPT_msse),
4415 IX86_ATTR_ISA ("sse2", OPT_msse2),
4416 IX86_ATTR_ISA ("sse3", OPT_msse3),
4417 IX86_ATTR_ISA ("sse4", OPT_msse4),
4418 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4419 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4420 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4421 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4422 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4423 IX86_ATTR_ISA ("fma", OPT_mfma),
4424 IX86_ATTR_ISA ("xop", OPT_mxop),
4425 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4426 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4427 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4428 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4429 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4430 IX86_ATTR_ISA ("hle", OPT_mhle),
4431 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4432 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4433 IX86_ATTR_ISA ("adx", OPT_madx),
4434 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4435 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4436 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4438 /* enum options */
4439 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4441 /* string options */
4442 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4443 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4445 /* flag options */
4446 IX86_ATTR_YES ("cld",
4447 OPT_mcld,
4448 MASK_CLD),
4450 IX86_ATTR_NO ("fancy-math-387",
4451 OPT_mfancy_math_387,
4452 MASK_NO_FANCY_MATH_387),
4454 IX86_ATTR_YES ("ieee-fp",
4455 OPT_mieee_fp,
4456 MASK_IEEE_FP),
4458 IX86_ATTR_YES ("inline-all-stringops",
4459 OPT_minline_all_stringops,
4460 MASK_INLINE_ALL_STRINGOPS),
4462 IX86_ATTR_YES ("inline-stringops-dynamically",
4463 OPT_minline_stringops_dynamically,
4464 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4466 IX86_ATTR_NO ("align-stringops",
4467 OPT_mno_align_stringops,
4468 MASK_NO_ALIGN_STRINGOPS),
4470 IX86_ATTR_YES ("recip",
4471 OPT_mrecip,
4472 MASK_RECIP),
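/* An attribute string such as
     __attribute__ ((target ("avx2,no-fancy-math-387,fpmath=sse,arch=core2")))
   is split on commas below; "avx2" is an ISA option, the "no-" prefix flips
   a flag option, and "fpmath="/"arch=" take their values from the text that
   follows the '='.  */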
4476 /* If this is a list, recurse to get the options. */
4477 if (TREE_CODE (args) == TREE_LIST)
4479 bool ret = true;
4481 for (; args; args = TREE_CHAIN (args))
4482 if (TREE_VALUE (args)
4483 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4484 p_strings, opts, opts_set,
4485 enum_opts_set))
4486 ret = false;
4488 return ret;
4491 else if (TREE_CODE (args) != STRING_CST)
4493 error ("attribute %<target%> argument not a string");
4494 return false;
4497 /* Handle multiple arguments separated by commas. */
4498 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4500 while (next_optstr && *next_optstr != '\0')
4502 char *p = next_optstr;
4503 char *orig_p = p;
4504 char *comma = strchr (next_optstr, ',');
4505 const char *opt_string;
4506 size_t len, opt_len;
4507 int opt;
4508 bool opt_set_p;
4509 char ch;
4510 unsigned i;
4511 enum ix86_opt_type type = ix86_opt_unknown;
4512 int mask = 0;
4514 if (comma)
4516 *comma = '\0';
4517 len = comma - next_optstr;
4518 next_optstr = comma + 1;
4520 else
4522 len = strlen (p);
4523 next_optstr = NULL;
4526 /* Recognize no-xxx. */
4527 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4529 opt_set_p = false;
4530 p += 3;
4531 len -= 3;
4533 else
4534 opt_set_p = true;
4536 /* Find the option. */
4537 ch = *p;
4538 opt = N_OPTS;
4539 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4541 type = attrs[i].type;
4542 opt_len = attrs[i].len;
4543 if (ch == attrs[i].string[0]
4544 && ((type != ix86_opt_str && type != ix86_opt_enum)
4545 ? len == opt_len
4546 : len > opt_len)
4547 && memcmp (p, attrs[i].string, opt_len) == 0)
4549 opt = attrs[i].opt;
4550 mask = attrs[i].mask;
4551 opt_string = attrs[i].string;
4552 break;
4556 /* Process the option. */
4557 if (opt == N_OPTS)
4559 error ("attribute(target(\"%s\")) is unknown", orig_p);
4560 ret = false;
4563 else if (type == ix86_opt_isa)
4565 struct cl_decoded_option decoded;
4567 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4568 ix86_handle_option (opts, opts_set,
4569 &decoded, input_location);
4572 else if (type == ix86_opt_yes || type == ix86_opt_no)
4574 if (type == ix86_opt_no)
4575 opt_set_p = !opt_set_p;
4577 if (opt_set_p)
4578 opts->x_target_flags |= mask;
4579 else
4580 opts->x_target_flags &= ~mask;
4583 else if (type == ix86_opt_str)
4585 if (p_strings[opt])
4587 error ("option(\"%s\") was already specified", opt_string);
4588 ret = false;
4590 else
4591 p_strings[opt] = xstrdup (p + opt_len);
4594 else if (type == ix86_opt_enum)
4596 bool arg_ok;
4597 int value;
4599 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4600 if (arg_ok)
4601 set_option (opts, enum_opts_set, opt, value,
4602 p + opt_len, DK_UNSPECIFIED, input_location,
4603 global_dc);
4604 else
4606 error ("attribute(target(\"%s\")) is unknown", orig_p);
4607 ret = false;
4611 else
4612 gcc_unreachable ();
4615 return ret;
4618 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4620 tree
4621 ix86_valid_target_attribute_tree (tree args,
4622 struct gcc_options *opts,
4623 struct gcc_options *opts_set)
4625 const char *orig_arch_string = ix86_arch_string;
4626 const char *orig_tune_string = ix86_tune_string;
4627 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4628 int orig_tune_defaulted = ix86_tune_defaulted;
4629 int orig_arch_specified = ix86_arch_specified;
4630 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4631 tree t = NULL_TREE;
4632 int i;
4633 struct cl_target_option *def
4634 = TREE_TARGET_OPTION (target_option_default_node);
4635 struct gcc_options enum_opts_set;
4637 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4639 /* Process each of the options on the chain. */
4640 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4641 opts_set, &enum_opts_set))
4642 return error_mark_node;
4644 /* If the changed options are different from the default, rerun
4645 ix86_option_override_internal, and then save the options away.
4646 The string options are attribute options, and will be undone
4647 when we copy the save structure. */
4648 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4649 || opts->x_target_flags != def->x_target_flags
4650 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4651 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4652 || enum_opts_set.x_ix86_fpmath)
4654 /* If we are using the default tune= or arch=, undo the string assigned,
4655 and use the default. */
4656 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4657 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4658 else if (!orig_arch_specified)
4659 opts->x_ix86_arch_string = NULL;
4661 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4662 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4663 else if (orig_tune_defaulted)
4664 opts->x_ix86_tune_string = NULL;
4666 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4667 if (enum_opts_set.x_ix86_fpmath)
4668 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4669 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4670 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4672 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4673 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4676 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4677 ix86_option_override_internal (false, opts, opts_set);
4679 /* Add any builtin functions with the new isa if any. */
4680 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4682 /* Save the current options unless we are validating options for
4683 #pragma. */
4684 t = build_target_option_node (opts);
4686 opts->x_ix86_arch_string = orig_arch_string;
4687 opts->x_ix86_tune_string = orig_tune_string;
4688 opts_set->x_ix86_fpmath = orig_fpmath_set;
4690 /* Free up memory allocated to hold the strings. */
4691 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4692 free (option_strings[i]);
4695 return t;
4698 /* Hook to validate attribute((target("string"))). */
4700 static bool
4701 ix86_valid_target_attribute_p (tree fndecl,
4702 tree ARG_UNUSED (name),
4703 tree args,
4704 int ARG_UNUSED (flags))
4706 struct gcc_options func_options;
4707 tree new_target, new_optimize;
4708 bool ret = true;
4710 /* attribute((target("default"))) does nothing, beyond
4711 affecting multi-versioning. */
4712 if (TREE_VALUE (args)
4713 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4714 && TREE_CHAIN (args) == NULL_TREE
4715 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4716 return true;
4718 tree old_optimize = build_optimization_node (&global_options);
4720 /* Get the optimization options of the current function. */
4721 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4723 if (!func_optimize)
4724 func_optimize = old_optimize;
4726 /* Init func_options. */
4727 memset (&func_options, 0, sizeof (func_options));
4728 init_options_struct (&func_options, NULL);
4729 lang_hooks.init_options_struct (&func_options);
4731 cl_optimization_restore (&func_options,
4732 TREE_OPTIMIZATION (func_optimize));
4734 /* Initialize func_options to the default before its target options can
4735 be set. */
4736 cl_target_option_restore (&func_options,
4737 TREE_TARGET_OPTION (target_option_default_node));
4739 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4740 &global_options_set);
4742 new_optimize = build_optimization_node (&func_options);
4744 if (new_target == error_mark_node)
4745 ret = false;
4747 else if (fndecl && new_target)
4749 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4751 if (old_optimize != new_optimize)
4752 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4755 return ret;
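/* For reference, the "default" form short-circuited above is what
   function multi-versioning uses, e.g. roughly (hypothetical user
   code):

     __attribute__ ((target ("default"))) int dispatch (void);
     __attribute__ ((target ("avx2")))    int dispatch (void);

   Only the non-"default" versions are run through
   ix86_valid_target_attribute_tree here.  */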
4759 /* Hook to determine if one function can safely inline another. */
4761 static bool
4762 ix86_can_inline_p (tree caller, tree callee)
4764 bool ret = false;
4765 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4766 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4768 /* If callee has no option attributes, then it is ok to inline. */
4769 if (!callee_tree)
4770 ret = true;
4772 /* If caller has no option attributes, but callee does then it is not ok to
4773 inline. */
4774 else if (!caller_tree)
4775 ret = false;
4777 else
4779 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4780 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4782 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4783 function can inline an SSE2 function, but an SSE2 function can't inline
4784 an SSE4 function. */
4785 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4786 != callee_opts->x_ix86_isa_flags)
4787 ret = false;
4789 /* See if we have the same non-isa options. */
4790 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4791 ret = false;
4793 /* See if arch, tune, etc. are the same. */
4794 else if (caller_opts->arch != callee_opts->arch)
4795 ret = false;
4797 else if (caller_opts->tune != callee_opts->tune)
4798 ret = false;
4800 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4801 ret = false;
4803 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4804 ret = false;
4806 else
4807 ret = true;
4810 return ret;
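/* Concretely: with the checks above, a caller declared with, say,
   __attribute__ ((target ("avx"))) may inline a callee declared with
   __attribute__ ((target ("sse2"))), because the callee's ISA flag bits
   are a subset of the caller's, while the reverse direction fails the
   subset test.  Differences in the non-ISA target flags, arch=, tune=,
   fpmath= or branch cost likewise block inlining.  */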
4814 /* Remember the last target of ix86_set_current_function. */
4815 static GTY(()) tree ix86_previous_fndecl;
4817 /* Invalidate ix86_previous_fndecl cache. */
4818 void
4819 ix86_reset_previous_fndecl (void)
4821 ix86_previous_fndecl = NULL_TREE;
4824 /* Establish appropriate back-end context for processing the function
4825 FNDECL. The argument might be NULL to indicate processing at top
4826 level, outside of any function scope. */
4827 static void
4828 ix86_set_current_function (tree fndecl)
4830 /* Only change the context if the function changes. This hook is called
4831 several times in the course of compiling a function, and we don't want to
4832 slow things down too much or call target_reinit when it isn't safe. */
4833 if (fndecl && fndecl != ix86_previous_fndecl)
4835 tree old_tree = (ix86_previous_fndecl
4836 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4837 : NULL_TREE);
4839 tree new_tree = (fndecl
4840 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4841 : NULL_TREE);
4843 ix86_previous_fndecl = fndecl;
4844 if (old_tree == new_tree)
4847 else if (new_tree)
4849 cl_target_option_restore (&global_options,
4850 TREE_TARGET_OPTION (new_tree));
4851 target_reinit ();
4854 else if (old_tree)
4856 struct cl_target_option *def
4857 = TREE_TARGET_OPTION (target_option_current_node);
4859 cl_target_option_restore (&global_options, def);
4860 target_reinit ();
4866 /* Return true if this goes in large data/bss. */
4868 static bool
4869 ix86_in_large_data_p (tree exp)
4871 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4872 return false;
4874 /* Functions are never large data. */
4875 if (TREE_CODE (exp) == FUNCTION_DECL)
4876 return false;
4878 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4880 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4881 if (strcmp (section, ".ldata") == 0
4882 || strcmp (section, ".lbss") == 0)
4883 return true;
4884 return false;
4886 else
4888 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4890 /* If this is an incomplete type with size 0, then we can't put it
4891 in data because it might be too big when completed. */
4892 if (!size || size > ix86_section_threshold)
4893 return true;
4896 return false;
4899 /* Switch to the appropriate section for output of DECL.
4900 DECL is either a `VAR_DECL' node or a constant of some sort.
4901 RELOC indicates whether forming the initial value of DECL requires
4902 link-time relocations. */
4904 ATTRIBUTE_UNUSED static section *
4905 x86_64_elf_select_section (tree decl, int reloc,
4906 unsigned HOST_WIDE_INT align)
4908 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4909 && ix86_in_large_data_p (decl))
4911 const char *sname = NULL;
4912 unsigned int flags = SECTION_WRITE;
4913 switch (categorize_decl_for_section (decl, reloc))
4915 case SECCAT_DATA:
4916 sname = ".ldata";
4917 break;
4918 case SECCAT_DATA_REL:
4919 sname = ".ldata.rel";
4920 break;
4921 case SECCAT_DATA_REL_LOCAL:
4922 sname = ".ldata.rel.local";
4923 break;
4924 case SECCAT_DATA_REL_RO:
4925 sname = ".ldata.rel.ro";
4926 break;
4927 case SECCAT_DATA_REL_RO_LOCAL:
4928 sname = ".ldata.rel.ro.local";
4929 break;
4930 case SECCAT_BSS:
4931 sname = ".lbss";
4932 flags |= SECTION_BSS;
4933 break;
4934 case SECCAT_RODATA:
4935 case SECCAT_RODATA_MERGE_STR:
4936 case SECCAT_RODATA_MERGE_STR_INIT:
4937 case SECCAT_RODATA_MERGE_CONST:
4938 sname = ".lrodata";
4939 flags = 0;
4940 break;
4941 case SECCAT_SRODATA:
4942 case SECCAT_SDATA:
4943 case SECCAT_SBSS:
4944 gcc_unreachable ();
4945 case SECCAT_TEXT:
4946 case SECCAT_TDATA:
4947 case SECCAT_TBSS:
4948 /* We don't split these for the medium model. Place them into
4949 default sections and hope for the best. */
4950 break;
4952 if (sname)
4954 /* We might get called with string constants, but get_named_section
4955 doesn't like them as they are not DECLs. Also, we need to set
4956 flags in that case. */
4957 if (!DECL_P (decl))
4958 return get_section (sname, flags, NULL);
4959 return get_named_section (decl, sname, reloc);
4962 return default_elf_select_section (decl, reloc, align);
4965 /* Select a set of attributes for section NAME based on the properties
4966 of DECL and whether or not RELOC indicates that DECL's initializer
4967 might contain runtime relocations. */
4969 static unsigned int ATTRIBUTE_UNUSED
4970 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4972 unsigned int flags = default_section_type_flags (decl, name, reloc);
4974 if (decl == NULL_TREE
4975 && (strcmp (name, ".ldata.rel.ro") == 0
4976 || strcmp (name, ".ldata.rel.ro.local") == 0))
4977 flags |= SECTION_RELRO;
4979 if (strcmp (name, ".lbss") == 0
4980 || strncmp (name, ".lbss.", 6) == 0
4981 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4982 flags |= SECTION_BSS;
4984 return flags;
4987 /* Build up a unique section name, expressed as a
4988 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4989 RELOC indicates whether the initial value of EXP requires
4990 link-time relocations. */
4992 static void ATTRIBUTE_UNUSED
4993 x86_64_elf_unique_section (tree decl, int reloc)
4995 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4996 && ix86_in_large_data_p (decl))
4998 const char *prefix = NULL;
4999 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
5000 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
5002 switch (categorize_decl_for_section (decl, reloc))
5004 case SECCAT_DATA:
5005 case SECCAT_DATA_REL:
5006 case SECCAT_DATA_REL_LOCAL:
5007 case SECCAT_DATA_REL_RO:
5008 case SECCAT_DATA_REL_RO_LOCAL:
5009 prefix = one_only ? ".ld" : ".ldata";
5010 break;
5011 case SECCAT_BSS:
5012 prefix = one_only ? ".lb" : ".lbss";
5013 break;
5014 case SECCAT_RODATA:
5015 case SECCAT_RODATA_MERGE_STR:
5016 case SECCAT_RODATA_MERGE_STR_INIT:
5017 case SECCAT_RODATA_MERGE_CONST:
5018 prefix = one_only ? ".lr" : ".lrodata";
5019 break;
5020 case SECCAT_SRODATA:
5021 case SECCAT_SDATA:
5022 case SECCAT_SBSS:
5023 gcc_unreachable ();
5024 case SECCAT_TEXT:
5025 case SECCAT_TDATA:
5026 case SECCAT_TBSS:
5027 /* We don't split these for the medium model. Place them into
5028 default sections and hope for the best. */
5029 break;
5031 if (prefix)
5033 const char *name, *linkonce;
5034 char *string;
5036 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
5037 name = targetm.strip_name_encoding (name);
5039 /* If we're using one_only, then there needs to be a .gnu.linkonce
5040 prefix to the section name. */
5041 linkonce = one_only ? ".gnu.linkonce" : "";
5043 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
5045 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
5046 return;
5049 default_unique_section (decl, reloc);
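/* Examples of the names built here (assuming a variable "foo" that
   ix86_in_large_data_p accepts): SECCAT_BSS yields ".lbss.foo",
   SECCAT_DATA yields ".ldata.foo", and with one_only set the
   .gnu.linkonce form is used instead, e.g. ".gnu.linkonce.lb.foo".  */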
5052 #ifdef COMMON_ASM_OP
5053 /* This says how to output assembler code to declare an
5054 uninitialized external linkage data object.
5056 For medium model x86-64 we need to use .largecomm opcode for
5057 large objects. */
5058 void
5059 x86_elf_aligned_common (FILE *file,
5060 const char *name, unsigned HOST_WIDE_INT size,
5061 int align)
5063 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5064 && size > (unsigned int)ix86_section_threshold)
5065 fputs (".largecomm\t", file);
5066 else
5067 fputs (COMMON_ASM_OP, file);
5068 assemble_name (file, name);
5069 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
5070 size, align / BITS_PER_UNIT);
5072 #endif
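/* The directive emitted above looks roughly like (assuming a 128 KiB
   object "buf" with 32-byte alignment and the default
   -mlarge-data-threshold):

     .largecomm	buf,131072,32

   for medium-model objects above the threshold, and the ordinary

     .comm	buf,131072,32

   form otherwise.  */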
5074 /* Utility function for targets to use in implementing
5075 ASM_OUTPUT_ALIGNED_BSS. */
5077 void
5078 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
5079 const char *name, unsigned HOST_WIDE_INT size,
5080 int align)
5082 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5083 && size > (unsigned int)ix86_section_threshold)
5084 switch_to_section (get_named_section (decl, ".lbss", 0));
5085 else
5086 switch_to_section (bss_section);
5087 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
5088 #ifdef ASM_DECLARE_OBJECT_NAME
5089 last_assemble_variable_decl = decl;
5090 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5091 #else
5092 /* Standard thing is just output label for the object. */
5093 ASM_OUTPUT_LABEL (file, name);
5094 #endif /* ASM_DECLARE_OBJECT_NAME */
5095 ASM_OUTPUT_SKIP (file, size ? size : 1);
5098 /* Decide whether we must probe the stack before any space allocation
5099 on this target. It's essentially TARGET_STACK_PROBE except when
5100 -fstack-check causes the stack to be already probed differently. */
5102 bool
5103 ix86_target_stack_probe (void)
5105 /* Do not probe the stack twice if static stack checking is enabled. */
5106 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5107 return false;
5109 return TARGET_STACK_PROBE;
5112 /* Decide whether we can make a sibling call to a function. DECL is the
5113 declaration of the function being targeted by the call and EXP is the
5114 CALL_EXPR representing the call. */
5116 static bool
5117 ix86_function_ok_for_sibcall (tree decl, tree exp)
5119 tree type, decl_or_type;
5120 rtx a, b;
5122 /* If we are generating position-independent code, we cannot sibcall
5123 optimize any indirect call, or a direct call to a global function,
5124 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5125 if (!TARGET_MACHO
5126 && !TARGET_64BIT
5127 && flag_pic
5128 && (!decl || !targetm.binds_local_p (decl)))
5129 return false;
5131 /* If we need to align the outgoing stack, then sibcalling would
5132 unalign the stack, which may break the called function. */
5133 if (ix86_minimum_incoming_stack_boundary (true)
5134 < PREFERRED_STACK_BOUNDARY)
5135 return false;
5137 if (decl)
5139 decl_or_type = decl;
5140 type = TREE_TYPE (decl);
5142 else
5144 /* We're looking at the CALL_EXPR, we need the type of the function. */
5145 type = CALL_EXPR_FN (exp); /* pointer expression */
5146 type = TREE_TYPE (type); /* pointer type */
5147 type = TREE_TYPE (type); /* function type */
5148 decl_or_type = type;
5151 /* Check that the return value locations are the same. Like
5152 if we are returning floats on the 80387 register stack, we cannot
5153 make a sibcall from a function that doesn't return a float to a
5154 function that does or, conversely, from a function that does return
5155 a float to a function that doesn't; the necessary stack adjustment
5156 would not be executed. This is also the place we notice
5157 differences in the return value ABI. Note that it is ok for one
5158 of the functions to have void return type as long as the return
5159 value of the other is passed in a register. */
5160 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5161 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5162 cfun->decl, false);
5163 if (STACK_REG_P (a) || STACK_REG_P (b))
5165 if (!rtx_equal_p (a, b))
5166 return false;
5168 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5170 else if (!rtx_equal_p (a, b))
5171 return false;
5173 if (TARGET_64BIT)
5175 /* The SYSV ABI has more call-clobbered registers;
5176 disallow sibcalls from MS to SYSV. */
5177 if (cfun->machine->call_abi == MS_ABI
5178 && ix86_function_type_abi (type) == SYSV_ABI)
5179 return false;
5181 else
5183 /* If this call is indirect, we'll need to be able to use a
5184 call-clobbered register for the address of the target function.
5185 Make sure that all such registers are not used for passing
5186 parameters. Note that DLLIMPORT functions are indirect. */
5187 if (!decl
5188 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5190 if (ix86_function_regparm (type, NULL) >= 3)
5192 /* ??? Need to count the actual number of registers to be used,
5193 not the possible number of registers. Fix later. */
5194 return false;
5199 /* Otherwise okay. That also includes certain types of indirect calls. */
5200 return true;
5203 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5204 and "sseregparm" calling convention attributes;
5205 arguments as in struct attribute_spec.handler. */
5207 static tree
5208 ix86_handle_cconv_attribute (tree *node, tree name,
5209 tree args,
5210 int flags ATTRIBUTE_UNUSED,
5211 bool *no_add_attrs)
5213 if (TREE_CODE (*node) != FUNCTION_TYPE
5214 && TREE_CODE (*node) != METHOD_TYPE
5215 && TREE_CODE (*node) != FIELD_DECL
5216 && TREE_CODE (*node) != TYPE_DECL)
5218 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5219 name);
5220 *no_add_attrs = true;
5221 return NULL_TREE;
5224 /* Can combine regparm with all attributes but fastcall and thiscall. */
5225 if (is_attribute_p ("regparm", name))
5227 tree cst;
5229 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5231 error ("fastcall and regparm attributes are not compatible");
5234 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5236 error ("regparm and thiscall attributes are not compatible");
5239 cst = TREE_VALUE (args);
5240 if (TREE_CODE (cst) != INTEGER_CST)
5242 warning (OPT_Wattributes,
5243 "%qE attribute requires an integer constant argument",
5244 name);
5245 *no_add_attrs = true;
5247 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5249 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5250 name, REGPARM_MAX);
5251 *no_add_attrs = true;
5254 return NULL_TREE;
5257 if (TARGET_64BIT)
5259 /* Do not warn when emulating the MS ABI. */
5260 if ((TREE_CODE (*node) != FUNCTION_TYPE
5261 && TREE_CODE (*node) != METHOD_TYPE)
5262 || ix86_function_type_abi (*node) != MS_ABI)
5263 warning (OPT_Wattributes, "%qE attribute ignored",
5264 name);
5265 *no_add_attrs = true;
5266 return NULL_TREE;
5269 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5270 if (is_attribute_p ("fastcall", name))
5272 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5274 error ("fastcall and cdecl attributes are not compatible");
5276 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5278 error ("fastcall and stdcall attributes are not compatible");
5280 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5282 error ("fastcall and regparm attributes are not compatible");
5284 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5286 error ("fastcall and thiscall attributes are not compatible");
5290 /* Can combine stdcall with fastcall (redundant), regparm and
5291 sseregparm. */
5292 else if (is_attribute_p ("stdcall", name))
5294 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5296 error ("stdcall and cdecl attributes are not compatible");
5298 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5300 error ("stdcall and fastcall attributes are not compatible");
5302 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5304 error ("stdcall and thiscall attributes are not compatible");
5308 /* Can combine cdecl with regparm and sseregparm. */
5309 else if (is_attribute_p ("cdecl", name))
5311 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5313 error ("stdcall and cdecl attributes are not compatible");
5315 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5317 error ("fastcall and cdecl attributes are not compatible");
5319 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5321 error ("cdecl and thiscall attributes are not compatible");
5324 else if (is_attribute_p ("thiscall", name))
5326 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5327 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5328 name);
5329 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5331 error ("stdcall and thiscall attributes are not compatible");
5333 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5335 error ("fastcall and thiscall attributes are not compatible");
5337 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5339 error ("cdecl and thiscall attributes are not compatible");
5343 /* Can combine sseregparm with all attributes. */
5345 return NULL_TREE;
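/* Typical user declarations handled by this hook (hypothetical code):

     int __attribute__ ((stdcall))     f1 (int, int);
     int __attribute__ ((fastcall))    f2 (int, int);
     int __attribute__ ((regparm (3))) f3 (int, int, int);

   Mixing incompatible conventions, e.g. fastcall together with regparm
   on the same declaration, is diagnosed by the error calls above.  */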
5348 /* The transactional memory builtins are implicitly regparm or fastcall
5349 depending on the ABI. Override the generic do-nothing attribute that
5350 these builtins were declared with, and replace it with one of the two
5351 attributes that we expect elsewhere. */
5353 static tree
5354 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5355 tree args ATTRIBUTE_UNUSED,
5356 int flags, bool *no_add_attrs)
5358 tree alt;
5360 /* In no case do we want to add the placeholder attribute. */
5361 *no_add_attrs = true;
5363 /* The 64-bit ABI is unchanged for transactional memory. */
5364 if (TARGET_64BIT)
5365 return NULL_TREE;
5367 /* ??? Is there a better way to validate 32-bit windows? We have
5368 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5369 if (CHECK_STACK_LIMIT > 0)
5370 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5371 else
5373 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5374 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5376 decl_attributes (node, alt, flags);
5378 return NULL_TREE;
5381 /* This function determines from TYPE the calling-convention. */
5383 unsigned int
5384 ix86_get_callcvt (const_tree type)
5386 unsigned int ret = 0;
5387 bool is_stdarg;
5388 tree attrs;
5390 if (TARGET_64BIT)
5391 return IX86_CALLCVT_CDECL;
5393 attrs = TYPE_ATTRIBUTES (type);
5394 if (attrs != NULL_TREE)
5396 if (lookup_attribute ("cdecl", attrs))
5397 ret |= IX86_CALLCVT_CDECL;
5398 else if (lookup_attribute ("stdcall", attrs))
5399 ret |= IX86_CALLCVT_STDCALL;
5400 else if (lookup_attribute ("fastcall", attrs))
5401 ret |= IX86_CALLCVT_FASTCALL;
5402 else if (lookup_attribute ("thiscall", attrs))
5403 ret |= IX86_CALLCVT_THISCALL;
5405 /* Regparm isn't allowed for thiscall and fastcall. */
5406 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5408 if (lookup_attribute ("regparm", attrs))
5409 ret |= IX86_CALLCVT_REGPARM;
5410 if (lookup_attribute ("sseregparm", attrs))
5411 ret |= IX86_CALLCVT_SSEREGPARM;
5414 if (IX86_BASE_CALLCVT(ret) != 0)
5415 return ret;
5418 is_stdarg = stdarg_p (type);
5419 if (TARGET_RTD && !is_stdarg)
5420 return IX86_CALLCVT_STDCALL | ret;
5422 if (ret != 0
5423 || is_stdarg
5424 || TREE_CODE (type) != METHOD_TYPE
5425 || ix86_function_type_abi (type) != MS_ABI)
5426 return IX86_CALLCVT_CDECL | ret;
5428 return IX86_CALLCVT_THISCALL;
5431 /* Return 0 if the attributes for two types are incompatible, 1 if they
5432 are compatible, and 2 if they are nearly compatible (which causes a
5433 warning to be generated). */
5435 static int
5436 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5438 unsigned int ccvt1, ccvt2;
5440 if (TREE_CODE (type1) != FUNCTION_TYPE
5441 && TREE_CODE (type1) != METHOD_TYPE)
5442 return 1;
5444 ccvt1 = ix86_get_callcvt (type1);
5445 ccvt2 = ix86_get_callcvt (type2);
5446 if (ccvt1 != ccvt2)
5447 return 0;
5448 if (ix86_function_regparm (type1, NULL)
5449 != ix86_function_regparm (type2, NULL))
5450 return 0;
5452 return 1;
5455 /* Return the regparm value for a function with the indicated TYPE and DECL.
5456 DECL may be NULL when calling function indirectly
5457 or considering a libcall. */
5459 static int
5460 ix86_function_regparm (const_tree type, const_tree decl)
5462 tree attr;
5463 int regparm;
5464 unsigned int ccvt;
5466 if (TARGET_64BIT)
5467 return (ix86_function_type_abi (type) == SYSV_ABI
5468 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5469 ccvt = ix86_get_callcvt (type);
5470 regparm = ix86_regparm;
5472 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5474 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5475 if (attr)
5477 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5478 return regparm;
5481 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5482 return 2;
5483 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5484 return 1;
5486 /* Use register calling convention for local functions when possible. */
5487 if (decl
5488 && TREE_CODE (decl) == FUNCTION_DECL
5489 && optimize
5490 && !(profile_flag && !flag_fentry))
5492 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5493 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5494 if (i && i->local && i->can_change_signature)
5496 int local_regparm, globals = 0, regno;
5498 /* Make sure no regparm register is taken by a
5499 fixed register variable. */
5500 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5501 if (fixed_regs[local_regparm])
5502 break;
5504 /* We don't want to use regparm(3) for nested functions as
5505 these use a static chain pointer in the third argument. */
5506 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5507 local_regparm = 2;
5509 /* In 32-bit mode save a register for the split stack. */
5510 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5511 local_regparm = 2;
5513 /* Each fixed register usage increases register pressure,
5514 so fewer registers should be used for argument passing.
5515 This functionality can be overridden by an explicit
5516 regparm value. */
5517 for (regno = AX_REG; regno <= DI_REG; regno++)
5518 if (fixed_regs[regno])
5519 globals++;
5521 local_regparm
5522 = globals < local_regparm ? local_regparm - globals : 0;
5524 if (local_regparm > regparm)
5525 regparm = local_regparm;
5529 return regparm;
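/* For reference (32-bit only, matching the values returned above): a
   regparm (3) function takes its first three integral arguments in
   %eax, %edx and %ecx; fastcall uses %ecx and %edx (hence the constant
   2); thiscall passes only the `this' pointer in %ecx (hence 1).  */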
5532 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5533 DFmode (2) arguments in SSE registers for a function with the
5534 indicated TYPE and DECL. DECL may be NULL when calling function
5535 indirectly or considering a libcall. Otherwise return 0. */
5537 static int
5538 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5540 gcc_assert (!TARGET_64BIT);
5542 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5543 by the sseregparm attribute. */
5544 if (TARGET_SSEREGPARM
5545 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5547 if (!TARGET_SSE)
5549 if (warn)
5551 if (decl)
5552 error ("calling %qD with attribute sseregparm without "
5553 "SSE/SSE2 enabled", decl);
5554 else
5555 error ("calling %qT with attribute sseregparm without "
5556 "SSE/SSE2 enabled", type);
5558 return 0;
5561 return 2;
5564 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5565 (and DFmode for SSE2) arguments in SSE registers. */
5566 if (decl && TARGET_SSE_MATH && optimize
5567 && !(profile_flag && !flag_fentry))
5569 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5570 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5571 if (i && i->local && i->can_change_signature)
5572 return TARGET_SSE2 ? 2 : 1;
5575 return 0;
5578 /* Return true if EAX is live at the start of the function. Used by
5579 ix86_expand_prologue to determine if we need special help before
5580 calling allocate_stack_worker. */
5582 static bool
5583 ix86_eax_live_at_start_p (void)
5585 /* Cheat. Don't bother working forward from ix86_function_regparm
5586 to the function type to whether an actual argument is located in
5587 eax. Instead just look at cfg info, which is still close enough
5588 to correct at this point. This gives false positives for broken
5589 functions that might use uninitialized data that happens to be
5590 allocated in eax, but who cares? */
5591 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5594 static bool
5595 ix86_keep_aggregate_return_pointer (tree fntype)
5597 tree attr;
5599 if (!TARGET_64BIT)
5601 attr = lookup_attribute ("callee_pop_aggregate_return",
5602 TYPE_ATTRIBUTES (fntype));
5603 if (attr)
5604 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5606 /* For 32-bit MS-ABI the default is to keep aggregate
5607 return pointer. */
5608 if (ix86_function_type_abi (fntype) == MS_ABI)
5609 return true;
5611 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5614 /* Value is the number of bytes of arguments automatically
5615 popped when returning from a subroutine call.
5616 FUNDECL is the declaration node of the function (as a tree),
5617 FUNTYPE is the data type of the function (as a tree),
5618 or for a library call it is an identifier node for the subroutine name.
5619 SIZE is the number of bytes of arguments passed on the stack.
5621 On the 80386, the RTD insn may be used to pop them if the number
5622 of args is fixed, but if the number is variable then the caller
5623 must pop them all. RTD can't be used for library calls now
5624 because the library is compiled with the Unix compiler.
5625 Use of RTD is a selectable option, since it is incompatible with
5626 standard Unix calling sequences. If the option is not selected,
5627 the caller must always pop the args.
5629 The attribute stdcall is equivalent to RTD on a per module basis. */
5631 static int
5632 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5634 unsigned int ccvt;
5636 /* None of the 64-bit ABIs pop arguments. */
5637 if (TARGET_64BIT)
5638 return 0;
5640 ccvt = ix86_get_callcvt (funtype);
5642 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5643 | IX86_CALLCVT_THISCALL)) != 0
5644 && ! stdarg_p (funtype))
5645 return size;
5647 /* Lose any fake structure return argument if it is passed on the stack. */
5648 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5649 && !ix86_keep_aggregate_return_pointer (funtype))
5651 int nregs = ix86_function_regparm (funtype, fundecl);
5652 if (nregs == 0)
5653 return GET_MODE_SIZE (Pmode);
5656 return 0;
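/* As a concrete example of the values returned here: a 32-bit function
   declared  void __attribute__ ((stdcall)) f (int, int);  has 8 bytes
   of stack arguments and therefore returns with "ret $8", while the
   cdecl equivalent returns 0 from this hook and leaves those 8 bytes
   for the caller to pop.  */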
5659 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5661 static bool
5662 ix86_legitimate_combined_insn (rtx insn)
5664 /* Check operand constraints in case hard registers were propagated
5665 into insn pattern. This check prevents combine pass from
5666 generating insn patterns with invalid hard register operands.
5667 These invalid insns can eventually confuse reload to error out
5668 with a spill failure. See also PRs 46829 and 46843. */
5669 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5671 int i;
5673 extract_insn (insn);
5674 preprocess_constraints ();
5676 for (i = 0; i < recog_data.n_operands; i++)
5678 rtx op = recog_data.operand[i];
5679 enum machine_mode mode = GET_MODE (op);
5680 struct operand_alternative *op_alt;
5681 int offset = 0;
5682 bool win;
5683 int j;
5685 /* A unary operator may be accepted by the predicate, but it
5686 is irrelevant for matching constraints. */
5687 if (UNARY_P (op))
5688 op = XEXP (op, 0);
5690 if (GET_CODE (op) == SUBREG)
5692 if (REG_P (SUBREG_REG (op))
5693 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5694 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5695 GET_MODE (SUBREG_REG (op)),
5696 SUBREG_BYTE (op),
5697 GET_MODE (op));
5698 op = SUBREG_REG (op);
5701 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5702 continue;
5704 op_alt = recog_op_alt[i];
5706 /* Operand has no constraints, anything is OK. */
5707 win = !recog_data.n_alternatives;
5709 for (j = 0; j < recog_data.n_alternatives; j++)
5711 if (op_alt[j].anything_ok
5712 || (op_alt[j].matches != -1
5713 && operands_match_p
5714 (recog_data.operand[i],
5715 recog_data.operand[op_alt[j].matches]))
5716 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5718 win = true;
5719 break;
5723 if (!win)
5724 return false;
5728 return true;
5731 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5733 static unsigned HOST_WIDE_INT
5734 ix86_asan_shadow_offset (void)
5736 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5737 : HOST_WIDE_INT_C (0x7fff8000))
5738 : (HOST_WIDE_INT_1 << 29);
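/* For context: AddressSanitizer computes the shadow address of a byte
   as (addr >> 3) + shadow_offset, so the constants above place the
   shadow at offset 0x7fff8000 for LP64 targets (1 << 44 on Mach-O) and
   at 1 << 29 for 32-bit targets.  */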
5741 /* Argument support functions. */
5743 /* Return true when register may be used to pass function parameters. */
5744 bool
5745 ix86_function_arg_regno_p (int regno)
5747 int i;
5748 const int *parm_regs;
5750 if (!TARGET_64BIT)
5752 if (TARGET_MACHO)
5753 return (regno < REGPARM_MAX
5754 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5755 else
5756 return (regno < REGPARM_MAX
5757 || (TARGET_MMX && MMX_REGNO_P (regno)
5758 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5759 || (TARGET_SSE && SSE_REGNO_P (regno)
5760 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5763 if (TARGET_SSE && SSE_REGNO_P (regno)
5764 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5765 return true;
5767 /* TODO: The function should depend on current function ABI but
5768 builtins.c would need updating then. Therefore we use the
5769 default ABI. */
5771 /* RAX is used as hidden argument to va_arg functions. */
5772 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5773 return true;
5775 if (ix86_abi == MS_ABI)
5776 parm_regs = x86_64_ms_abi_int_parameter_registers;
5777 else
5778 parm_regs = x86_64_int_parameter_registers;
5779 for (i = 0; i < (ix86_abi == MS_ABI
5780 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5781 if (regno == parm_regs[i])
5782 return true;
5783 return false;
5786 /* Return if we do not know how to pass TYPE solely in registers. */
5788 static bool
5789 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5791 if (must_pass_in_stack_var_size_or_pad (mode, type))
5792 return true;
5794 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5795 The layout_type routine is crafty and tries to trick us into passing
5796 currently unsupported vector types on the stack by using TImode. */
5797 return (!TARGET_64BIT && mode == TImode
5798 && type && TREE_CODE (type) != VECTOR_TYPE);
5801 /* It returns the size, in bytes, of the area reserved for arguments passed
5802 in registers for the function represented by fndecl, depending on the
5803 ABI format used. */
5805 ix86_reg_parm_stack_space (const_tree fndecl)
5807 enum calling_abi call_abi = SYSV_ABI;
5808 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5809 call_abi = ix86_function_abi (fndecl);
5810 else
5811 call_abi = ix86_function_type_abi (fndecl);
5812 if (TARGET_64BIT && call_abi == MS_ABI)
5813 return 32;
5814 return 0;
5817 /* Return SYSV_ABI or MS_ABI, depending on fntype, specifying the
5818 call ABI used. */
5819 enum calling_abi
5820 ix86_function_type_abi (const_tree fntype)
5822 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5824 enum calling_abi abi = ix86_abi;
5825 if (abi == SYSV_ABI)
5827 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5828 abi = MS_ABI;
5830 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5831 abi = SYSV_ABI;
5832 return abi;
5834 return ix86_abi;
5837 /* We add this as a workaround in order to use libc_has_function
5838 hook in i386.md. */
5839 bool
5840 ix86_libc_has_function (enum function_class fn_class)
5842 return targetm.libc_has_function (fn_class);
5845 static bool
5846 ix86_function_ms_hook_prologue (const_tree fn)
5848 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5850 if (decl_function_context (fn) != NULL_TREE)
5851 error_at (DECL_SOURCE_LOCATION (fn),
5852 "ms_hook_prologue is not compatible with nested function");
5853 else
5854 return true;
5856 return false;
5859 static enum calling_abi
5860 ix86_function_abi (const_tree fndecl)
5862 if (! fndecl)
5863 return ix86_abi;
5864 return ix86_function_type_abi (TREE_TYPE (fndecl));
5867 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5868 call ABI used. */
5869 enum calling_abi
5870 ix86_cfun_abi (void)
5872 if (! cfun)
5873 return ix86_abi;
5874 return cfun->machine->call_abi;
5877 /* Write the extra assembler code needed to declare a function properly. */
5879 void
5880 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5881 tree decl)
5883 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5885 if (is_ms_hook)
5887 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5888 unsigned int filler_cc = 0xcccccccc;
5890 for (i = 0; i < filler_count; i += 4)
5891 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5894 #ifdef SUBTARGET_ASM_UNWIND_INIT
5895 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5896 #endif
5898 ASM_OUTPUT_LABEL (asm_out_file, fname);
5900 /* Output magic byte marker, if hot-patch attribute is set. */
5901 if (is_ms_hook)
5903 if (TARGET_64BIT)
5905 /* leaq [%rsp + 0], %rsp */
5906 asm_fprintf (asm_out_file, ASM_BYTE
5907 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5909 else
5911 /* movl.s %edi, %edi
5912 push %ebp
5913 movl.s %esp, %ebp */
5914 asm_fprintf (asm_out_file, ASM_BYTE
5915 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5920 /* regclass.c */
5921 extern void init_regs (void);
5923 /* Implementation of the call ABI switching target hook. The call
5924 register sets specific to FNDECL are selected. See also
5925 ix86_conditional_register_usage for more details. */
5926 void
5927 ix86_call_abi_override (const_tree fndecl)
5929 if (fndecl == NULL_TREE)
5930 cfun->machine->call_abi = ix86_abi;
5931 else
5932 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5935 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5936 expensive re-initialization of init_regs each time we switch function context
5937 since this is needed only during RTL expansion. */
5938 static void
5939 ix86_maybe_switch_abi (void)
5941 if (TARGET_64BIT &&
5942 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5943 reinit_regs ();
5946 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5947 for a call to a function whose data type is FNTYPE.
5948 For a library call, FNTYPE is 0. */
5950 void
5951 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5952 tree fntype, /* tree ptr for function decl */
5953 rtx libname, /* SYMBOL_REF of library name or 0 */
5954 tree fndecl,
5955 int caller)
5957 struct cgraph_local_info *i;
5959 memset (cum, 0, sizeof (*cum));
5961 if (fndecl)
5963 i = cgraph_local_info (fndecl);
5964 cum->call_abi = ix86_function_abi (fndecl);
5966 else
5968 i = NULL;
5969 cum->call_abi = ix86_function_type_abi (fntype);
5972 cum->caller = caller;
5974 /* Set up the number of registers to use for passing arguments. */
5976 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5977 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5978 "or subtarget optimization implying it");
5979 cum->nregs = ix86_regparm;
5980 if (TARGET_64BIT)
5982 cum->nregs = (cum->call_abi == SYSV_ABI
5983 ? X86_64_REGPARM_MAX
5984 : X86_64_MS_REGPARM_MAX);
5986 if (TARGET_SSE)
5988 cum->sse_nregs = SSE_REGPARM_MAX;
5989 if (TARGET_64BIT)
5991 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5992 ? X86_64_SSE_REGPARM_MAX
5993 : X86_64_MS_SSE_REGPARM_MAX);
5996 if (TARGET_MMX)
5997 cum->mmx_nregs = MMX_REGPARM_MAX;
5998 cum->warn_avx = true;
5999 cum->warn_sse = true;
6000 cum->warn_mmx = true;
6002 /* Because the type might mismatch between caller and callee, we need to
6003 use the actual type of the function for local calls.
6004 FIXME: cgraph_analyze can be told to actually record if function uses
6005 va_start so for local functions maybe_vaarg can be made aggressive
6006 helping K&R code.
6007 FIXME: once the type system is fixed, we won't need this code anymore. */
6008 if (i && i->local && i->can_change_signature)
6009 fntype = TREE_TYPE (fndecl);
6010 cum->maybe_vaarg = (fntype
6011 ? (!prototype_p (fntype) || stdarg_p (fntype))
6012 : !libname);
6014 if (!TARGET_64BIT)
6016 /* If there are variable arguments, then we won't pass anything
6017 in registers in 32-bit mode. */
6018 if (stdarg_p (fntype))
6020 cum->nregs = 0;
6021 cum->sse_nregs = 0;
6022 cum->mmx_nregs = 0;
6023 cum->warn_avx = 0;
6024 cum->warn_sse = 0;
6025 cum->warn_mmx = 0;
6026 return;
6029 /* Use ecx and edx registers if function has fastcall attribute,
6030 else look for regparm information. */
6031 if (fntype)
6033 unsigned int ccvt = ix86_get_callcvt (fntype);
6034 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6036 cum->nregs = 1;
6037 cum->fastcall = 1; /* Same first register as in fastcall. */
6039 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6041 cum->nregs = 2;
6042 cum->fastcall = 1;
6044 else
6045 cum->nregs = ix86_function_regparm (fntype, fndecl);
6048 /* Set up the number of SSE registers used for passing SFmode
6049 and DFmode arguments. Warn for mismatching ABI. */
6050 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
6054 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
6055 But in the case of vector types, it is some vector mode.
6057 When we have only some of our vector isa extensions enabled, then there
6058 are some modes for which vector_mode_supported_p is false. For these
6059 modes, the generic vector support in gcc will choose some non-vector mode
6060 in order to implement the type. By computing the natural mode, we'll
6061 select the proper ABI location for the operand and not depend on whatever
6062 the middle-end decides to do with these vector types.
6064 The middle-end can't deal with vector types > 16 bytes. In this
6065 case, we return the original mode and warn ABI change if CUM isn't
6066 NULL. */
6068 static enum machine_mode
6069 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6071 enum machine_mode mode = TYPE_MODE (type);
6073 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6075 HOST_WIDE_INT size = int_size_in_bytes (type);
6076 if ((size == 8 || size == 16 || size == 32)
6077 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6078 && TYPE_VECTOR_SUBPARTS (type) > 1)
6080 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6082 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6083 mode = MIN_MODE_VECTOR_FLOAT;
6084 else
6085 mode = MIN_MODE_VECTOR_INT;
6087 /* Get the mode which has this inner mode and number of units. */
6088 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6089 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6090 && GET_MODE_INNER (mode) == innermode)
6092 if (size == 32 && !TARGET_AVX)
6094 static bool warnedavx;
6096 if (cum
6097 && !warnedavx
6098 && cum->warn_avx)
6100 warnedavx = true;
6101 warning (0, "AVX vector argument without AVX "
6102 "enabled changes the ABI");
6104 return TYPE_MODE (type);
6106 else if ((size == 8 || size == 16) && !TARGET_SSE)
6108 static bool warnedsse;
6110 if (cum
6111 && !warnedsse
6112 && cum->warn_sse)
6114 warnedsse = true;
6115 warning (0, "SSE vector argument without SSE "
6116 "enabled changes the ABI");
6118 return mode;
6120 else
6121 return mode;
6124 gcc_unreachable ();
6128 return mode;
6131 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6132 this may not agree with the mode that the type system has chosen for the
6133 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6134 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6136 static rtx
6137 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6138 unsigned int regno)
6140 rtx tmp;
6142 if (orig_mode != BLKmode)
6143 tmp = gen_rtx_REG (orig_mode, regno);
6144 else
6146 tmp = gen_rtx_REG (mode, regno);
6147 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6148 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6151 return tmp;
6154 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6155 The goal of this code is to classify each eightbyte of an incoming argument
6156 by register class and assign registers accordingly. */
6158 /* Return the union class of CLASS1 and CLASS2.
6159 See the x86-64 PS ABI for details. */
6161 static enum x86_64_reg_class
6162 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6164 /* Rule #1: If both classes are equal, this is the resulting class. */
6165 if (class1 == class2)
6166 return class1;
6168 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6169 the other class. */
6170 if (class1 == X86_64_NO_CLASS)
6171 return class2;
6172 if (class2 == X86_64_NO_CLASS)
6173 return class1;
6175 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6176 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6177 return X86_64_MEMORY_CLASS;
6179 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6180 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6181 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6182 return X86_64_INTEGERSI_CLASS;
6183 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6184 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6185 return X86_64_INTEGER_CLASS;
6187 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6188 MEMORY is used. */
6189 if (class1 == X86_64_X87_CLASS
6190 || class1 == X86_64_X87UP_CLASS
6191 || class1 == X86_64_COMPLEX_X87_CLASS
6192 || class2 == X86_64_X87_CLASS
6193 || class2 == X86_64_X87UP_CLASS
6194 || class2 == X86_64_COMPLEX_X87_CLASS)
6195 return X86_64_MEMORY_CLASS;
6197 /* Rule #6: Otherwise class SSE is used. */
6198 return X86_64_SSE_CLASS;
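/* A short worked illustration of the rules above: INTEGERSI merged
   with SSESF gives INTEGERSI (the first case of rule #4); INTEGER
   merged with SSE gives INTEGER (rule #4); X87 merged with SSE gives
   MEMORY (rule #5); SSE merged with SSEUP falls through to rule #6 and
   gives SSE.  */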
6201 /* Classify the argument of type TYPE and mode MODE.
6202 CLASSES will be filled by the register class used to pass each word
6203 of the operand. The number of words is returned. In case the parameter
6204 should be passed in memory, 0 is returned. As a special case for zero
6205 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6207 BIT_OFFSET is used internally for handling records and specifies the
6208 offset in bits modulo 256 to avoid overflow cases.
6210 See the x86-64 PS ABI for details.
6213 static int
6214 classify_argument (enum machine_mode mode, const_tree type,
6215 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6217 HOST_WIDE_INT bytes =
6218 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6219 int words
6220 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6222 /* Variable sized entities are always passed/returned in memory. */
6223 if (bytes < 0)
6224 return 0;
6226 if (mode != VOIDmode
6227 && targetm.calls.must_pass_in_stack (mode, type))
6228 return 0;
6230 if (type && AGGREGATE_TYPE_P (type))
6232 int i;
6233 tree field;
6234 enum x86_64_reg_class subclasses[MAX_CLASSES];
6236 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6237 if (bytes > 32)
6238 return 0;
6240 for (i = 0; i < words; i++)
6241 classes[i] = X86_64_NO_CLASS;
6243 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6244 signal the memory class, so handle it as a special case. */
6245 if (!words)
6247 classes[0] = X86_64_NO_CLASS;
6248 return 1;
6251 /* Classify each field of record and merge classes. */
6252 switch (TREE_CODE (type))
6254 case RECORD_TYPE:
6255 /* And now merge the fields of the structure. */
6256 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6258 if (TREE_CODE (field) == FIELD_DECL)
6260 int num;
6262 if (TREE_TYPE (field) == error_mark_node)
6263 continue;
6265 /* Bitfields are always classified as integer. Handle them
6266 early, since later code would consider them to be
6267 misaligned integers. */
6268 if (DECL_BIT_FIELD (field))
6270 for (i = (int_bit_position (field)
6271 + (bit_offset % 64)) / 8 / 8;
6272 i < ((int_bit_position (field) + (bit_offset % 64))
6273 + tree_low_cst (DECL_SIZE (field), 0)
6274 + 63) / 8 / 8; i++)
6275 classes[i] =
6276 merge_classes (X86_64_INTEGER_CLASS,
6277 classes[i]);
6279 else
6281 int pos;
6283 type = TREE_TYPE (field);
6285 /* Flexible array member is ignored. */
6286 if (TYPE_MODE (type) == BLKmode
6287 && TREE_CODE (type) == ARRAY_TYPE
6288 && TYPE_SIZE (type) == NULL_TREE
6289 && TYPE_DOMAIN (type) != NULL_TREE
6290 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6291 == NULL_TREE))
6293 static bool warned;
6295 if (!warned && warn_psabi)
6297 warned = true;
6298 inform (input_location,
6299 "the ABI of passing struct with"
6300 " a flexible array member has"
6301 " changed in GCC 4.4");
6303 continue;
6305 num = classify_argument (TYPE_MODE (type), type,
6306 subclasses,
6307 (int_bit_position (field)
6308 + bit_offset) % 256);
6309 if (!num)
6310 return 0;
6311 pos = (int_bit_position (field)
6312 + (bit_offset % 64)) / 8 / 8;
6313 for (i = 0; i < num && (i + pos) < words; i++)
6314 classes[i + pos] =
6315 merge_classes (subclasses[i], classes[i + pos]);
6319 break;
6321 case ARRAY_TYPE:
6322 /* Arrays are handled as small records. */
6324 int num;
6325 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6326 TREE_TYPE (type), subclasses, bit_offset);
6327 if (!num)
6328 return 0;
6330 /* The partial classes are now full classes. */
6331 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6332 subclasses[0] = X86_64_SSE_CLASS;
6333 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6334 && !((bit_offset % 64) == 0 && bytes == 4))
6335 subclasses[0] = X86_64_INTEGER_CLASS;
6337 for (i = 0; i < words; i++)
6338 classes[i] = subclasses[i % num];
6340 break;
6342 case UNION_TYPE:
6343 case QUAL_UNION_TYPE:
6344 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6346 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6348 if (TREE_CODE (field) == FIELD_DECL)
6350 int num;
6352 if (TREE_TYPE (field) == error_mark_node)
6353 continue;
6355 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6356 TREE_TYPE (field), subclasses,
6357 bit_offset);
6358 if (!num)
6359 return 0;
6360 for (i = 0; i < num; i++)
6361 classes[i] = merge_classes (subclasses[i], classes[i]);
6364 break;
6366 default:
6367 gcc_unreachable ();
6370 if (words > 2)
6372 /* When size > 16 bytes, if the first class isn't
6373 X86_64_SSE_CLASS or any of the others isn't
6374 X86_64_SSEUP_CLASS, everything should be passed in
6375 memory. */
6376 if (classes[0] != X86_64_SSE_CLASS)
6377 return 0;
6379 for (i = 1; i < words; i++)
6380 if (classes[i] != X86_64_SSEUP_CLASS)
6381 return 0;
6384 /* Final merger cleanup. */
6385 for (i = 0; i < words; i++)
6387 /* If one class is MEMORY, everything should be passed in
6388 memory. */
6389 if (classes[i] == X86_64_MEMORY_CLASS)
6390 return 0;
6392 /* X86_64_SSEUP_CLASS should always be preceded by
6393 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6394 if (classes[i] == X86_64_SSEUP_CLASS
6395 && classes[i - 1] != X86_64_SSE_CLASS
6396 && classes[i - 1] != X86_64_SSEUP_CLASS)
6398 /* The first one should never be X86_64_SSEUP_CLASS. */
6399 gcc_assert (i != 0);
6400 classes[i] = X86_64_SSE_CLASS;
6403 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6404 everything should be passed in memory. */
6405 if (classes[i] == X86_64_X87UP_CLASS
6406 && (classes[i - 1] != X86_64_X87_CLASS))
6408 static bool warned;
6410 /* The first one should never be X86_64_X87UP_CLASS. */
6411 gcc_assert (i != 0);
6412 if (!warned && warn_psabi)
6414 warned = true;
6415 inform (input_location,
6416 "the ABI of passing union with long double"
6417 " has changed in GCC 4.4");
6419 return 0;
6422 return words;
6425 /* Compute the alignment needed. We align all types to natural boundaries
6426 with the exception of XFmode, which is aligned to 64 bits. */
6427 if (mode != VOIDmode && mode != BLKmode)
6429 int mode_alignment = GET_MODE_BITSIZE (mode);
6431 if (mode == XFmode)
6432 mode_alignment = 128;
6433 else if (mode == XCmode)
6434 mode_alignment = 256;
6435 if (COMPLEX_MODE_P (mode))
6436 mode_alignment /= 2;
6437 /* Misaligned fields are always returned in memory. */
6438 if (bit_offset % mode_alignment)
6439 return 0;
6442 /* For V1xx modes, just use the base mode. */
6443 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6444 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6445 mode = GET_MODE_INNER (mode);
6447 /* Classification of atomic types. */
6448 switch (mode)
6450 case SDmode:
6451 case DDmode:
6452 classes[0] = X86_64_SSE_CLASS;
6453 return 1;
6454 case TDmode:
6455 classes[0] = X86_64_SSE_CLASS;
6456 classes[1] = X86_64_SSEUP_CLASS;
6457 return 2;
6458 case DImode:
6459 case SImode:
6460 case HImode:
6461 case QImode:
6462 case CSImode:
6463 case CHImode:
6464 case CQImode:
6466 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6468 if (size <= 32)
6470 classes[0] = X86_64_INTEGERSI_CLASS;
6471 return 1;
6473 else if (size <= 64)
6475 classes[0] = X86_64_INTEGER_CLASS;
6476 return 1;
6478 else if (size <= 64+32)
6480 classes[0] = X86_64_INTEGER_CLASS;
6481 classes[1] = X86_64_INTEGERSI_CLASS;
6482 return 2;
6484 else if (size <= 64+64)
6486 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6487 return 2;
6489 else
6490 gcc_unreachable ();
6492 case CDImode:
6493 case TImode:
6494 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6495 return 2;
6496 case COImode:
6497 case OImode:
6498 /* OImode shouldn't be used directly. */
6499 gcc_unreachable ();
6500 case CTImode:
6501 return 0;
6502 case SFmode:
6503 if (!(bit_offset % 64))
6504 classes[0] = X86_64_SSESF_CLASS;
6505 else
6506 classes[0] = X86_64_SSE_CLASS;
6507 return 1;
6508 case DFmode:
6509 classes[0] = X86_64_SSEDF_CLASS;
6510 return 1;
6511 case XFmode:
6512 classes[0] = X86_64_X87_CLASS;
6513 classes[1] = X86_64_X87UP_CLASS;
6514 return 2;
6515 case TFmode:
6516 classes[0] = X86_64_SSE_CLASS;
6517 classes[1] = X86_64_SSEUP_CLASS;
6518 return 2;
6519 case SCmode:
6520 classes[0] = X86_64_SSE_CLASS;
6521 if (!(bit_offset % 64))
6522 return 1;
6523 else
6525 static bool warned;
6527 if (!warned && warn_psabi)
6529 warned = true;
6530 inform (input_location,
6531 "the ABI of passing structure with complex float"
6532 " member has changed in GCC 4.4");
6534 classes[1] = X86_64_SSESF_CLASS;
6535 return 2;
6537 case DCmode:
6538 classes[0] = X86_64_SSEDF_CLASS;
6539 classes[1] = X86_64_SSEDF_CLASS;
6540 return 2;
6541 case XCmode:
6542 classes[0] = X86_64_COMPLEX_X87_CLASS;
6543 return 1;
6544 case TCmode:
6545 /* This mode is larger than 16 bytes. */
6546 return 0;
6547 case V8SFmode:
6548 case V8SImode:
6549 case V32QImode:
6550 case V16HImode:
6551 case V4DFmode:
6552 case V4DImode:
6553 classes[0] = X86_64_SSE_CLASS;
6554 classes[1] = X86_64_SSEUP_CLASS;
6555 classes[2] = X86_64_SSEUP_CLASS;
6556 classes[3] = X86_64_SSEUP_CLASS;
6557 return 4;
6558 case V4SFmode:
6559 case V4SImode:
6560 case V16QImode:
6561 case V8HImode:
6562 case V2DFmode:
6563 case V2DImode:
6564 classes[0] = X86_64_SSE_CLASS;
6565 classes[1] = X86_64_SSEUP_CLASS;
6566 return 2;
6567 case V1TImode:
6568 case V1DImode:
6569 case V2SFmode:
6570 case V2SImode:
6571 case V4HImode:
6572 case V8QImode:
6573 classes[0] = X86_64_SSE_CLASS;
6574 return 1;
6575 case BLKmode:
6576 case VOIDmode:
6577 return 0;
6578 default:
6579 gcc_assert (VECTOR_MODE_P (mode));
6581 if (bytes > 16)
6582 return 0;
6584 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6586 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6587 classes[0] = X86_64_INTEGERSI_CLASS;
6588 else
6589 classes[0] = X86_64_INTEGER_CLASS;
6590 classes[1] = X86_64_INTEGER_CLASS;
6591 return 1 + (bytes > 8);
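/* A worked example of the classification above: for a parameter of type

     struct s { double d; long l; };

   the first eightbyte is classified X86_64_SSEDF_CLASS (from the
   double) and the second X86_64_INTEGER_CLASS (from the long), so
   examine_argument below reports one SSE and one integer register and
   construct_container builds a PARALLEL spreading the value across an
   SSE register and a general-purpose register.  */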
6595 /* Examine the argument and set the number of registers required in each
6596 class. Return 0 iff the parameter should be passed in memory. */
6597 static int
6598 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6599 int *int_nregs, int *sse_nregs)
6601 enum x86_64_reg_class regclass[MAX_CLASSES];
6602 int n = classify_argument (mode, type, regclass, 0);
6604 *int_nregs = 0;
6605 *sse_nregs = 0;
6606 if (!n)
6607 return 0;
6608 for (n--; n >= 0; n--)
6609 switch (regclass[n])
6611 case X86_64_INTEGER_CLASS:
6612 case X86_64_INTEGERSI_CLASS:
6613 (*int_nregs)++;
6614 break;
6615 case X86_64_SSE_CLASS:
6616 case X86_64_SSESF_CLASS:
6617 case X86_64_SSEDF_CLASS:
6618 (*sse_nregs)++;
6619 break;
6620 case X86_64_NO_CLASS:
6621 case X86_64_SSEUP_CLASS:
6622 break;
6623 case X86_64_X87_CLASS:
6624 case X86_64_X87UP_CLASS:
6625 if (!in_return)
6626 return 0;
6627 break;
6628 case X86_64_COMPLEX_X87_CLASS:
6629 return in_return ? 2 : 0;
6630 case X86_64_MEMORY_CLASS:
6631 gcc_unreachable ();
6633 return 1;
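/* Editorial example (not part of the original source): a hypothetical,
   never-compiled use of examine_argument.  TImode classifies as two
   INTEGER eightbytes, so it needs two GPRs and no SSE registers.  */
#if 0
static void
example_examine_timode (void)
{
  int int_nregs, sse_nregs;
  if (examine_argument (TImode, NULL_TREE, 0, &int_nregs, &sse_nregs))
    gcc_assert (int_nregs == 2 && sse_nregs == 0);
}
#endif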
6636 /* Construct container for the argument used by GCC interface. See
6637 FUNCTION_ARG for the detailed description. */
6639 static rtx
6640 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6641 const_tree type, int in_return, int nintregs, int nsseregs,
6642 const int *intreg, int sse_regno)
6644 /* The following variables hold the static issued_error state. */
6645 static bool issued_sse_arg_error;
6646 static bool issued_sse_ret_error;
6647 static bool issued_x87_ret_error;
6649 enum machine_mode tmpmode;
6650 int bytes =
6651 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6652 enum x86_64_reg_class regclass[MAX_CLASSES];
6653 int n;
6654 int i;
6655 int nexps = 0;
6656 int needed_sseregs, needed_intregs;
6657 rtx exp[MAX_CLASSES];
6658 rtx ret;
6660 n = classify_argument (mode, type, regclass, 0);
6661 if (!n)
6662 return NULL;
6663 if (!examine_argument (mode, type, in_return, &needed_intregs,
6664 &needed_sseregs))
6665 return NULL;
6666 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6667 return NULL;
6669 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6670 some less clueful developer tries to use floating-point anyway. */
6671 if (needed_sseregs && !TARGET_SSE)
6673 if (in_return)
6675 if (!issued_sse_ret_error)
6677 error ("SSE register return with SSE disabled");
6678 issued_sse_ret_error = true;
6681 else if (!issued_sse_arg_error)
6683 error ("SSE register argument with SSE disabled");
6684 issued_sse_arg_error = true;
6686 return NULL;
6689 /* Likewise, error if the ABI requires us to return values in the
6690 x87 registers and the user specified -mno-80387. */
6691 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6692 for (i = 0; i < n; i++)
6693 if (regclass[i] == X86_64_X87_CLASS
6694 || regclass[i] == X86_64_X87UP_CLASS
6695 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6697 if (!issued_x87_ret_error)
6699 error ("x87 register return with x87 disabled");
6700 issued_x87_ret_error = true;
6702 return NULL;
6705 /* First construct simple cases.  Avoid SCmode, since we want to use a
6706 single register to pass this type. */
6707 if (n == 1 && mode != SCmode)
6708 switch (regclass[0])
6710 case X86_64_INTEGER_CLASS:
6711 case X86_64_INTEGERSI_CLASS:
6712 return gen_rtx_REG (mode, intreg[0]);
6713 case X86_64_SSE_CLASS:
6714 case X86_64_SSESF_CLASS:
6715 case X86_64_SSEDF_CLASS:
6716 if (mode != BLKmode)
6717 return gen_reg_or_parallel (mode, orig_mode,
6718 SSE_REGNO (sse_regno));
6719 break;
6720 case X86_64_X87_CLASS:
6721 case X86_64_COMPLEX_X87_CLASS:
6722 return gen_rtx_REG (mode, FIRST_STACK_REG);
6723 case X86_64_NO_CLASS:
6724 /* Zero sized array, struct or class. */
6725 return NULL;
6726 default:
6727 gcc_unreachable ();
6729 if (n == 2
6730 && regclass[0] == X86_64_SSE_CLASS
6731 && regclass[1] == X86_64_SSEUP_CLASS
6732 && mode != BLKmode)
6733 return gen_reg_or_parallel (mode, orig_mode,
6734 SSE_REGNO (sse_regno));
6735 if (n == 4
6736 && regclass[0] == X86_64_SSE_CLASS
6737 && regclass[1] == X86_64_SSEUP_CLASS
6738 && regclass[2] == X86_64_SSEUP_CLASS
6739 && regclass[3] == X86_64_SSEUP_CLASS
6740 && mode != BLKmode)
6741 return gen_reg_or_parallel (mode, orig_mode,
6742 SSE_REGNO (sse_regno));
6743 if (n == 2
6744 && regclass[0] == X86_64_X87_CLASS
6745 && regclass[1] == X86_64_X87UP_CLASS)
6746 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6748 if (n == 2
6749 && regclass[0] == X86_64_INTEGER_CLASS
6750 && regclass[1] == X86_64_INTEGER_CLASS
6751 && (mode == CDImode || mode == TImode || mode == TFmode)
6752 && intreg[0] + 1 == intreg[1])
6753 return gen_rtx_REG (mode, intreg[0]);
6755 /* Otherwise figure out the entries of the PARALLEL. */
6756 for (i = 0; i < n; i++)
6758 int pos;
6760 switch (regclass[i])
6762 case X86_64_NO_CLASS:
6763 break;
6764 case X86_64_INTEGER_CLASS:
6765 case X86_64_INTEGERSI_CLASS:
6766 /* Merge TImodes on aligned occasions here too. */
6767 if (i * 8 + 8 > bytes)
6768 tmpmode
6769 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6770 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6771 tmpmode = SImode;
6772 else
6773 tmpmode = DImode;
6774 /* We've requested a size for which no integer
6775 mode exists.  Use DImode. */
6776 if (tmpmode == BLKmode)
6777 tmpmode = DImode;
6778 exp [nexps++]
6779 = gen_rtx_EXPR_LIST (VOIDmode,
6780 gen_rtx_REG (tmpmode, *intreg),
6781 GEN_INT (i*8));
6782 intreg++;
6783 break;
6784 case X86_64_SSESF_CLASS:
6785 exp [nexps++]
6786 = gen_rtx_EXPR_LIST (VOIDmode,
6787 gen_rtx_REG (SFmode,
6788 SSE_REGNO (sse_regno)),
6789 GEN_INT (i*8));
6790 sse_regno++;
6791 break;
6792 case X86_64_SSEDF_CLASS:
6793 exp [nexps++]
6794 = gen_rtx_EXPR_LIST (VOIDmode,
6795 gen_rtx_REG (DFmode,
6796 SSE_REGNO (sse_regno)),
6797 GEN_INT (i*8));
6798 sse_regno++;
6799 break;
6800 case X86_64_SSE_CLASS:
6801 pos = i;
6802 switch (n)
6804 case 1:
6805 tmpmode = DImode;
6806 break;
6807 case 2:
6808 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6810 tmpmode = TImode;
6811 i++;
6813 else
6814 tmpmode = DImode;
6815 break;
6816 case 4:
6817 gcc_assert (i == 0
6818 && regclass[1] == X86_64_SSEUP_CLASS
6819 && regclass[2] == X86_64_SSEUP_CLASS
6820 && regclass[3] == X86_64_SSEUP_CLASS);
6821 tmpmode = OImode;
6822 i += 3;
6823 break;
6824 default:
6825 gcc_unreachable ();
6827 exp [nexps++]
6828 = gen_rtx_EXPR_LIST (VOIDmode,
6829 gen_rtx_REG (tmpmode,
6830 SSE_REGNO (sse_regno)),
6831 GEN_INT (pos*8));
6832 sse_regno++;
6833 break;
6834 default:
6835 gcc_unreachable ();
6839 /* Empty aligned struct, union or class. */
6840 if (nexps == 0)
6841 return NULL;
6843 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6844 for (i = 0; i < nexps; i++)
6845 XVECEXP (ret, 0, i) = exp [i];
6846 return ret;
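/* Editorial illustration (not part of the original source): for the first
   argument of a hypothetical type such as
     struct s { double d; long l; };
   the classifier yields SSEDF + INTEGER, and construct_container returns a
   PARALLEL along the lines of
     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)  (const_int 8))])
   assuming %xmm0 and %rdi are still free for argument passing.  */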
6849 /* Update the data in CUM to advance over an argument of mode MODE
6850 and data type TYPE. (TYPE is null for libcalls where that information
6851 may not be available.) */
6853 static void
6854 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6855 const_tree type, HOST_WIDE_INT bytes,
6856 HOST_WIDE_INT words)
6858 switch (mode)
6860 default:
6861 break;
6863 case BLKmode:
6864 if (bytes < 0)
6865 break;
6866 /* FALLTHRU */
6868 case DImode:
6869 case SImode:
6870 case HImode:
6871 case QImode:
6872 cum->words += words;
6873 cum->nregs -= words;
6874 cum->regno += words;
6876 if (cum->nregs <= 0)
6878 cum->nregs = 0;
6879 cum->regno = 0;
6881 break;
6883 case OImode:
6884 /* OImode shouldn't be used directly. */
6885 gcc_unreachable ();
6887 case DFmode:
6888 if (cum->float_in_sse < 2)
6889 break;
6890 case SFmode:
6891 if (cum->float_in_sse < 1)
6892 break;
6893 /* FALLTHRU */
6895 case V8SFmode:
6896 case V8SImode:
6897 case V32QImode:
6898 case V16HImode:
6899 case V4DFmode:
6900 case V4DImode:
6901 case TImode:
6902 case V16QImode:
6903 case V8HImode:
6904 case V4SImode:
6905 case V2DImode:
6906 case V4SFmode:
6907 case V2DFmode:
6908 if (!type || !AGGREGATE_TYPE_P (type))
6910 cum->sse_words += words;
6911 cum->sse_nregs -= 1;
6912 cum->sse_regno += 1;
6913 if (cum->sse_nregs <= 0)
6915 cum->sse_nregs = 0;
6916 cum->sse_regno = 0;
6919 break;
6921 case V8QImode:
6922 case V4HImode:
6923 case V2SImode:
6924 case V2SFmode:
6925 case V1TImode:
6926 case V1DImode:
6927 if (!type || !AGGREGATE_TYPE_P (type))
6929 cum->mmx_words += words;
6930 cum->mmx_nregs -= 1;
6931 cum->mmx_regno += 1;
6932 if (cum->mmx_nregs <= 0)
6934 cum->mmx_nregs = 0;
6935 cum->mmx_regno = 0;
6938 break;
6942 static void
6943 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6944 const_tree type, HOST_WIDE_INT words, bool named)
6946 int int_nregs, sse_nregs;
6948 /* Unnamed 256bit vector mode parameters are passed on stack. */
6949 if (!named && VALID_AVX256_REG_MODE (mode))
6950 return;
6952 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6953 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6955 cum->nregs -= int_nregs;
6956 cum->sse_nregs -= sse_nregs;
6957 cum->regno += int_nregs;
6958 cum->sse_regno += sse_nregs;
6960 else
6962 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6963 cum->words = (cum->words + align - 1) & ~(align - 1);
6964 cum->words += words;
6968 static void
6969 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6970 HOST_WIDE_INT words)
6972 /* Otherwise, this should be passed indirectly. */
6973 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6975 cum->words += words;
6976 if (cum->nregs > 0)
6978 cum->nregs -= 1;
6979 cum->regno += 1;
6983 /* Update the data in CUM to advance over an argument of mode MODE and
6984 data type TYPE. (TYPE is null for libcalls where that information
6985 may not be available.) */
6987 static void
6988 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6989 const_tree type, bool named)
6991 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6992 HOST_WIDE_INT bytes, words;
6994 if (mode == BLKmode)
6995 bytes = int_size_in_bytes (type);
6996 else
6997 bytes = GET_MODE_SIZE (mode);
6998 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7000 if (type)
7001 mode = type_natural_mode (type, NULL);
7003 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7004 function_arg_advance_ms_64 (cum, bytes, words);
7005 else if (TARGET_64BIT)
7006 function_arg_advance_64 (cum, mode, type, words, named);
7007 else
7008 function_arg_advance_32 (cum, mode, type, bytes, words);
7011 /* Define where to put the arguments to a function.
7012 Value is zero to push the argument on the stack,
7013 or a hard register in which to store the argument.
7015 MODE is the argument's machine mode.
7016 TYPE is the data type of the argument (as a tree).
7017 This is null for libcalls where that information may
7018 not be available.
7019 CUM is a variable of type CUMULATIVE_ARGS which gives info about
7020 the preceding args and about the function being called.
7021 NAMED is nonzero if this argument is a named parameter
7022 (otherwise it is an extra parameter matching an ellipsis). */
7024 static rtx
7025 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7026 enum machine_mode orig_mode, const_tree type,
7027 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
7029 static bool warnedsse, warnedmmx;
7031 /* Avoid the AL settings for the Unix64 ABI. */
7032 if (mode == VOIDmode)
7033 return constm1_rtx;
7035 switch (mode)
7037 default:
7038 break;
7040 case BLKmode:
7041 if (bytes < 0)
7042 break;
7043 /* FALLTHRU */
7044 case DImode:
7045 case SImode:
7046 case HImode:
7047 case QImode:
7048 if (words <= cum->nregs)
7050 int regno = cum->regno;
7052 /* Fastcall allocates the first two DWORD (SImode) or
7053 smaller arguments to ECX and EDX, unless the argument
7054 is an aggregate type. */
7055 if (cum->fastcall)
7057 if (mode == BLKmode
7058 || mode == DImode
7059 || (type && AGGREGATE_TYPE_P (type)))
7060 break;
7062 /* ECX, not EAX, is the first allocated register. */
7063 if (regno == AX_REG)
7064 regno = CX_REG;
7066 return gen_rtx_REG (mode, regno);
7068 break;
7070 case DFmode:
7071 if (cum->float_in_sse < 2)
7072 break;
7073 case SFmode:
7074 if (cum->float_in_sse < 1)
7075 break;
7076 /* FALLTHRU */
7077 case TImode:
7078 /* In 32bit, we pass TImode in xmm registers. */
7079 case V16QImode:
7080 case V8HImode:
7081 case V4SImode:
7082 case V2DImode:
7083 case V4SFmode:
7084 case V2DFmode:
7085 if (!type || !AGGREGATE_TYPE_P (type))
7087 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7089 warnedsse = true;
7090 warning (0, "SSE vector argument without SSE enabled "
7091 "changes the ABI");
7093 if (cum->sse_nregs)
7094 return gen_reg_or_parallel (mode, orig_mode,
7095 cum->sse_regno + FIRST_SSE_REG);
7097 break;
7099 case OImode:
7100 /* OImode shouldn't be used directly. */
7101 gcc_unreachable ();
7103 case V8SFmode:
7104 case V8SImode:
7105 case V32QImode:
7106 case V16HImode:
7107 case V4DFmode:
7108 case V4DImode:
7109 if (!type || !AGGREGATE_TYPE_P (type))
7111 if (cum->sse_nregs)
7112 return gen_reg_or_parallel (mode, orig_mode,
7113 cum->sse_regno + FIRST_SSE_REG);
7115 break;
7117 case V8QImode:
7118 case V4HImode:
7119 case V2SImode:
7120 case V2SFmode:
7121 case V1TImode:
7122 case V1DImode:
7123 if (!type || !AGGREGATE_TYPE_P (type))
7125 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7127 warnedmmx = true;
7128 warning (0, "MMX vector argument without MMX enabled "
7129 "changes the ABI");
7131 if (cum->mmx_nregs)
7132 return gen_reg_or_parallel (mode, orig_mode,
7133 cum->mmx_regno + FIRST_MMX_REG);
7135 break;
7138 return NULL_RTX;
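/* Editorial example (not part of the original source): with the ia32
   fastcall convention handled above, a hypothetical declaration
     int __attribute__((fastcall)) f (int a, int b, int c);
   receives A in %ecx (the code redirects the first slot from %eax to
   %ecx), B in %edx, and C on the stack.  */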
7141 static rtx
7142 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7143 enum machine_mode orig_mode, const_tree type, bool named)
7145 /* Handle a hidden AL argument containing the number of SSE registers
7146 used for varargs x86-64 functions. */
7147 if (mode == VOIDmode)
7148 return GEN_INT (cum->maybe_vaarg
7149 ? (cum->sse_nregs < 0
7150 ? X86_64_SSE_REGPARM_MAX
7151 : cum->sse_regno)
7152 : -1);
7154 switch (mode)
7156 default:
7157 break;
7159 case V8SFmode:
7160 case V8SImode:
7161 case V32QImode:
7162 case V16HImode:
7163 case V4DFmode:
7164 case V4DImode:
7165 /* Unnamed 256bit vector mode parameters are passed on stack. */
7166 if (!named)
7167 return NULL;
7168 break;
7171 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7172 cum->sse_nregs,
7173 &x86_64_int_parameter_registers [cum->regno],
7174 cum->sse_regno);
7177 static rtx
7178 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7179 enum machine_mode orig_mode, bool named,
7180 HOST_WIDE_INT bytes)
7182 unsigned int regno;
7184 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7185 We use the value -2 to specify that the current function call is MSABI. */
7186 if (mode == VOIDmode)
7187 return GEN_INT (-2);
7189 /* If we've run out of registers, it goes on the stack. */
7190 if (cum->nregs == 0)
7191 return NULL_RTX;
7193 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7195 /* Only floating point modes are passed in anything but integer regs. */
7196 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7198 if (named)
7199 regno = cum->regno + FIRST_SSE_REG;
7200 else
7202 rtx t1, t2;
7204 /* Unnamed floating parameters are passed in both the
7205 SSE and integer registers. */
7206 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7207 t2 = gen_rtx_REG (mode, regno);
7208 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7209 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7210 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7213 /* Handle aggregate types passed in registers. */
7214 if (orig_mode == BLKmode)
7216 if (bytes > 0 && bytes <= 8)
7217 mode = (bytes > 4 ? DImode : SImode);
7218 if (mode == BLKmode)
7219 mode = DImode;
7222 return gen_reg_or_parallel (mode, orig_mode, regno);
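/* Editorial example (not part of the original source): under the MS x64
   convention handled above, an unnamed double passed to a hypothetical
   variadic callee in the second argument slot is duplicated into both
   %xmm1 and %rdx by the two-element PARALLEL built here, so the callee
   may read it from either register.  */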
7225 /* Return where to put the arguments to a function.
7226 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7228 MODE is the argument's machine mode. TYPE is the data type of the
7229 argument. It is null for libcalls where that information may not be
7230 available. CUM gives information about the preceding args and about
7231 the function being called. NAMED is nonzero if this argument is a
7232 named parameter (otherwise it is an extra parameter matching an
7233 ellipsis). */
7235 static rtx
7236 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7237 const_tree type, bool named)
7239 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7240 enum machine_mode mode = omode;
7241 HOST_WIDE_INT bytes, words;
7242 rtx arg;
7244 if (mode == BLKmode)
7245 bytes = int_size_in_bytes (type);
7246 else
7247 bytes = GET_MODE_SIZE (mode);
7248 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7250 /* To simplify the code below, represent vector types with a vector mode
7251 even if MMX/SSE are not active. */
7252 if (type && TREE_CODE (type) == VECTOR_TYPE)
7253 mode = type_natural_mode (type, cum);
7255 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7256 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7257 else if (TARGET_64BIT)
7258 arg = function_arg_64 (cum, mode, omode, type, named);
7259 else
7260 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7262 return arg;
7265 /* A C expression that indicates when an argument must be passed by
7266 reference. If nonzero for an argument, a copy of that argument is
7267 made in memory and a pointer to the argument is passed instead of
7268 the argument itself. The pointer is passed in whatever way is
7269 appropriate for passing a pointer to that type. */
7271 static bool
7272 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7273 const_tree type, bool named ATTRIBUTE_UNUSED)
7275 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7277 /* See Windows x64 Software Convention. */
7278 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7280 int msize = (int) GET_MODE_SIZE (mode);
7281 if (type)
7283 /* Arrays are passed by reference. */
7284 if (TREE_CODE (type) == ARRAY_TYPE)
7285 return true;
7287 if (AGGREGATE_TYPE_P (type))
7289 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7290 are passed by reference. */
7291 msize = int_size_in_bytes (type);
7295 /* __m128 is passed by reference. */
7296 switch (msize) {
7297 case 1: case 2: case 4: case 8:
7298 break;
7299 default:
7300 return true;
7303 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7304 return 1;
7306 return 0;
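/* Editorial example (not part of the original source): under the MS x64
   rules above, a hypothetical 12-byte struct or a 16-byte __m128 argument
   is passed by reference (a pointer goes in the register), while an
   8-byte struct is passed by value in a single register.  */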
7309 /* Return true when TYPE should be 128bit aligned for 32bit argument
7310 passing ABI. XXX: This function is obsolete and is only used for
7311 checking psABI compatibility with previous versions of GCC. */
7313 static bool
7314 ix86_compat_aligned_value_p (const_tree type)
7316 enum machine_mode mode = TYPE_MODE (type);
7317 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7318 || mode == TDmode
7319 || mode == TFmode
7320 || mode == TCmode)
7321 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7322 return true;
7323 if (TYPE_ALIGN (type) < 128)
7324 return false;
7326 if (AGGREGATE_TYPE_P (type))
7328 /* Walk the aggregates recursively. */
7329 switch (TREE_CODE (type))
7331 case RECORD_TYPE:
7332 case UNION_TYPE:
7333 case QUAL_UNION_TYPE:
7335 tree field;
7337 /* Walk all the structure fields. */
7338 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7340 if (TREE_CODE (field) == FIELD_DECL
7341 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7342 return true;
7344 break;
7347 case ARRAY_TYPE:
7348 /* Just for use if some language passes arrays by value. */
7349 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7350 return true;
7351 break;
7353 default:
7354 gcc_unreachable ();
7357 return false;
7360 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7361 XXX: This function is obsolete and is only used for checking psABI
7362 compatibility with previous versions of GCC. */
7364 static unsigned int
7365 ix86_compat_function_arg_boundary (enum machine_mode mode,
7366 const_tree type, unsigned int align)
7368 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7369 natural boundaries. */
7370 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7372 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7373 make an exception for SSE modes since these require 128bit
7374 alignment.
7376 The handling here differs from field_alignment. ICC aligns MMX
7377 arguments to 4 byte boundaries, while structure fields are aligned
7378 to 8 byte boundaries. */
7379 if (!type)
7381 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7382 align = PARM_BOUNDARY;
7384 else
7386 if (!ix86_compat_aligned_value_p (type))
7387 align = PARM_BOUNDARY;
7390 if (align > BIGGEST_ALIGNMENT)
7391 align = BIGGEST_ALIGNMENT;
7392 return align;
7395 /* Return true when TYPE should be 128bit aligned for 32bit argument
7396 passing ABI. */
7398 static bool
7399 ix86_contains_aligned_value_p (const_tree type)
7401 enum machine_mode mode = TYPE_MODE (type);
7403 if (mode == XFmode || mode == XCmode)
7404 return false;
7406 if (TYPE_ALIGN (type) < 128)
7407 return false;
7409 if (AGGREGATE_TYPE_P (type))
7411 /* Walk the aggregates recursively. */
7412 switch (TREE_CODE (type))
7414 case RECORD_TYPE:
7415 case UNION_TYPE:
7416 case QUAL_UNION_TYPE:
7418 tree field;
7420 /* Walk all the structure fields. */
7421 for (field = TYPE_FIELDS (type);
7422 field;
7423 field = DECL_CHAIN (field))
7425 if (TREE_CODE (field) == FIELD_DECL
7426 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7427 return true;
7429 break;
7432 case ARRAY_TYPE:
7433 /* Just for use if some language passes arrays by value. */
7434 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7435 return true;
7436 break;
7438 default:
7439 gcc_unreachable ();
7442 else
7443 return TYPE_ALIGN (type) >= 128;
7445 return false;
7448 /* Gives the alignment boundary, in bits, of an argument with the
7449 specified mode and type. */
7451 static unsigned int
7452 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7454 unsigned int align;
7455 if (type)
7457 /* Since the main variant type is used for the call, convert
7458 TYPE to its main variant. */
7459 type = TYPE_MAIN_VARIANT (type);
7460 align = TYPE_ALIGN (type);
7462 else
7463 align = GET_MODE_ALIGNMENT (mode);
7464 if (align < PARM_BOUNDARY)
7465 align = PARM_BOUNDARY;
7466 else
7468 static bool warned;
7469 unsigned int saved_align = align;
7471 if (!TARGET_64BIT)
7473 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7474 if (!type)
7476 if (mode == XFmode || mode == XCmode)
7477 align = PARM_BOUNDARY;
7479 else if (!ix86_contains_aligned_value_p (type))
7480 align = PARM_BOUNDARY;
7482 if (align < 128)
7483 align = PARM_BOUNDARY;
7486 if (warn_psabi
7487 && !warned
7488 && align != ix86_compat_function_arg_boundary (mode, type,
7489 saved_align))
7491 warned = true;
7492 inform (input_location,
7493 "The ABI for passing parameters with %d-byte"
7494 " alignment has changed in GCC 4.6",
7495 align / BITS_PER_UNIT);
7499 return align;
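/* Editorial example (not part of the original source): on ia32 a plain
   double argument stays at PARM_BOUNDARY (32 bits), while a __m128
   argument, whose type is 128-bit aligned, gets a 128-bit boundary from
   the code above; the warn_psabi note flags cases where this differs
   from pre-GCC 4.6 behaviour.  */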
7502 /* Return true if N is a possible register number of function value. */
7504 static bool
7505 ix86_function_value_regno_p (const unsigned int regno)
7507 switch (regno)
7509 case AX_REG:
7510 case DX_REG:
7511 return true;
7512 case DI_REG:
7513 case SI_REG:
7514 return TARGET_64BIT && ix86_abi != MS_ABI;
7516 /* Complex values are returned in %st(0)/%st(1) pair. */
7517 case ST0_REG:
7518 case ST1_REG:
7519 /* TODO: The function should depend on current function ABI but
7520 builtins.c would need updating then. Therefore we use the
7521 default ABI. */
7522 if (TARGET_64BIT && ix86_abi == MS_ABI)
7523 return false;
7524 return TARGET_FLOAT_RETURNS_IN_80387;
7526 /* Complex values are returned in %xmm0/%xmm1 pair. */
7527 case XMM0_REG:
7528 case XMM1_REG:
7529 return TARGET_SSE;
7531 case MM0_REG:
7532 if (TARGET_MACHO || TARGET_64BIT)
7533 return false;
7534 return TARGET_MMX;
7537 return false;
7540 /* Define how to find the value returned by a function.
7541 VALTYPE is the data type of the value (as a tree).
7542 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7543 otherwise, FUNC is 0. */
7545 static rtx
7546 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7547 const_tree fntype, const_tree fn)
7549 unsigned int regno;
7551 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7552 we normally prevent this case when mmx is not available. However
7553 some ABIs may require the result to be returned like DImode. */
7554 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7555 regno = FIRST_MMX_REG;
7557 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7558 we prevent this case when sse is not available. However some ABIs
7559 may require the result to be returned like integer TImode. */
7560 else if (mode == TImode
7561 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7562 regno = FIRST_SSE_REG;
7564 /* 32-byte vector modes in %ymm0. */
7565 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7566 regno = FIRST_SSE_REG;
7568 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7569 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7570 regno = FIRST_FLOAT_REG;
7571 else
7572 /* Most things go in %eax. */
7573 regno = AX_REG;
7575 /* Override FP return register with %xmm0 for local functions when
7576 SSE math is enabled or for functions with sseregparm attribute. */
7577 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7579 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7580 if ((sse_level >= 1 && mode == SFmode)
7581 || (sse_level == 2 && mode == DFmode))
7582 regno = FIRST_SSE_REG;
7585 /* OImode shouldn't be used directly. */
7586 gcc_assert (mode != OImode);
7588 return gen_rtx_REG (orig_mode, regno);
7591 static rtx
7592 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7593 const_tree valtype)
7595 rtx ret;
7597 /* Handle libcalls, which don't provide a type node. */
7598 if (valtype == NULL)
7600 unsigned int regno;
7602 switch (mode)
7604 case SFmode:
7605 case SCmode:
7606 case DFmode:
7607 case DCmode:
7608 case TFmode:
7609 case SDmode:
7610 case DDmode:
7611 case TDmode:
7612 regno = FIRST_SSE_REG;
7613 break;
7614 case XFmode:
7615 case XCmode:
7616 regno = FIRST_FLOAT_REG;
7617 break;
7618 case TCmode:
7619 return NULL;
7620 default:
7621 regno = AX_REG;
7624 return gen_rtx_REG (mode, regno);
7626 else if (POINTER_TYPE_P (valtype))
7628 /* Pointers are always returned in word_mode. */
7629 mode = word_mode;
7632 ret = construct_container (mode, orig_mode, valtype, 1,
7633 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7634 x86_64_int_return_registers, 0);
7636 /* For zero sized structures, construct_container returns NULL, but we
7637 need to keep the rest of the compiler happy by returning a meaningful value. */
7638 if (!ret)
7639 ret = gen_rtx_REG (orig_mode, AX_REG);
7641 return ret;
7644 static rtx
7645 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7646 const_tree valtype)
7648 unsigned int regno = AX_REG;
7650 if (TARGET_SSE)
7652 switch (GET_MODE_SIZE (mode))
7654 case 16:
7655 if (valtype != NULL_TREE
7656 && !VECTOR_INTEGER_TYPE_P (valtype)
7658 && !INTEGRAL_TYPE_P (valtype)
7659 && !VECTOR_FLOAT_TYPE_P (valtype))
7660 break;
7661 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7662 && !COMPLEX_MODE_P (mode))
7663 regno = FIRST_SSE_REG;
7664 break;
7665 case 8:
7666 case 4:
7667 if (mode == SFmode || mode == DFmode)
7668 regno = FIRST_SSE_REG;
7669 break;
7670 default:
7671 break;
7674 return gen_rtx_REG (orig_mode, regno);
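/* Editorial example (not part of the original source): with the MS x64
   rules above, a double or a 16-byte vector such as __m128i comes back
   in %xmm0 (when SSE is enabled), while a long long or a pointer comes
   back in %rax.  */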
7677 static rtx
7678 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7679 enum machine_mode orig_mode, enum machine_mode mode)
7681 const_tree fn, fntype;
7683 fn = NULL_TREE;
7684 if (fntype_or_decl && DECL_P (fntype_or_decl))
7685 fn = fntype_or_decl;
7686 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7688 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7689 return function_value_ms_64 (orig_mode, mode, valtype);
7690 else if (TARGET_64BIT)
7691 return function_value_64 (orig_mode, mode, valtype);
7692 else
7693 return function_value_32 (orig_mode, mode, fntype, fn);
7696 static rtx
7697 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7698 bool outgoing ATTRIBUTE_UNUSED)
7700 enum machine_mode mode, orig_mode;
7702 orig_mode = TYPE_MODE (valtype);
7703 mode = type_natural_mode (valtype, NULL);
7704 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7707 /* Pointer function arguments and return values are promoted to
7708 word_mode. */
7710 static enum machine_mode
7711 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7712 int *punsignedp, const_tree fntype,
7713 int for_return)
7715 if (type != NULL_TREE && POINTER_TYPE_P (type))
7717 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7718 return word_mode;
7720 return default_promote_function_mode (type, mode, punsignedp, fntype,
7721 for_return);
7724 /* Return true if a structure, union or array with MODE containing FIELD
7725 should be accessed using BLKmode. */
7727 static bool
7728 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7730 /* Union with XFmode must be in BLKmode. */
7731 return (mode == XFmode
7732 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7733 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7736 static rtx
7737 ix86_libcall_value (enum machine_mode mode)
7739 return ix86_function_value_1 (NULL, NULL, mode, mode);
7742 /* Return true iff type is returned in memory. */
7744 static bool ATTRIBUTE_UNUSED
7745 return_in_memory_32 (const_tree type, enum machine_mode mode)
7747 HOST_WIDE_INT size;
7749 if (mode == BLKmode)
7750 return true;
7752 size = int_size_in_bytes (type);
7754 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7755 return false;
7757 if (VECTOR_MODE_P (mode) || mode == TImode)
7759 /* User-created vectors small enough to fit in EAX. */
7760 if (size < 8)
7761 return false;
7763 /* MMX/3dNow values are returned in MM0,
7764 except when it doesn't exist or the ABI prescribes otherwise. */
7765 if (size == 8)
7766 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7768 /* SSE values are returned in XMM0, except when it doesn't exist. */
7769 if (size == 16)
7770 return !TARGET_SSE;
7772 /* AVX values are returned in YMM0, except when it doesn't exist. */
7773 if (size == 32)
7774 return !TARGET_AVX;
7777 if (mode == XFmode)
7778 return false;
7780 if (size > 12)
7781 return true;
7783 /* OImode shouldn't be used directly. */
7784 gcc_assert (mode != OImode);
7786 return false;
7789 static bool ATTRIBUTE_UNUSED
7790 return_in_memory_64 (const_tree type, enum machine_mode mode)
7792 int needed_intregs, needed_sseregs;
7793 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7796 static bool ATTRIBUTE_UNUSED
7797 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7799 HOST_WIDE_INT size = int_size_in_bytes (type);
7801 /* __m128 is returned in xmm0. */
7802 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7803 || VECTOR_FLOAT_TYPE_P (type))
7804 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7805 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7806 return false;
7808 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7809 return size != 1 && size != 2 && size != 4 && size != 8;
7812 static bool
7813 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7815 #ifdef SUBTARGET_RETURN_IN_MEMORY
7816 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7817 #else
7818 const enum machine_mode mode = type_natural_mode (type, NULL);
7820 if (TARGET_64BIT)
7822 if (ix86_function_type_abi (fntype) == MS_ABI)
7823 return return_in_memory_ms_64 (type, mode);
7824 else
7825 return return_in_memory_64 (type, mode);
7827 else
7828 return return_in_memory_32 (type, mode);
7829 #endif
7832 /* When returning SSE vector types, we have a choice of either
7833 (1) being abi incompatible with a -march switch, or
7834 (2) generating an error.
7835 Given no good solution, I think the safest thing is one warning.
7836 The user won't be able to use -Werror, but....
7838 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7839 called in response to actually generating a caller or callee that
7840 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7841 via aggregate_value_p for general type probing from tree-ssa. */
7843 static rtx
7844 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7846 static bool warnedsse, warnedmmx;
7848 if (!TARGET_64BIT && type)
7850 /* Look at the return type of the function, not the function type. */
7851 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7853 if (!TARGET_SSE && !warnedsse)
7855 if (mode == TImode
7856 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7858 warnedsse = true;
7859 warning (0, "SSE vector return without SSE enabled "
7860 "changes the ABI");
7864 if (!TARGET_MMX && !warnedmmx)
7866 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7868 warnedmmx = true;
7869 warning (0, "MMX vector return without MMX enabled "
7870 "changes the ABI");
7875 return NULL;
7879 /* Create the va_list data type. */
7881 /* Returns the calling-convention-specific va_list data type.
7882 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7884 static tree
7885 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7887 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7889 /* For i386 we use plain pointer to argument area. */
7890 if (!TARGET_64BIT || abi == MS_ABI)
7891 return build_pointer_type (char_type_node);
7893 record = lang_hooks.types.make_type (RECORD_TYPE);
7894 type_decl = build_decl (BUILTINS_LOCATION,
7895 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7897 f_gpr = build_decl (BUILTINS_LOCATION,
7898 FIELD_DECL, get_identifier ("gp_offset"),
7899 unsigned_type_node);
7900 f_fpr = build_decl (BUILTINS_LOCATION,
7901 FIELD_DECL, get_identifier ("fp_offset"),
7902 unsigned_type_node);
7903 f_ovf = build_decl (BUILTINS_LOCATION,
7904 FIELD_DECL, get_identifier ("overflow_arg_area"),
7905 ptr_type_node);
7906 f_sav = build_decl (BUILTINS_LOCATION,
7907 FIELD_DECL, get_identifier ("reg_save_area"),
7908 ptr_type_node);
7910 va_list_gpr_counter_field = f_gpr;
7911 va_list_fpr_counter_field = f_fpr;
7913 DECL_FIELD_CONTEXT (f_gpr) = record;
7914 DECL_FIELD_CONTEXT (f_fpr) = record;
7915 DECL_FIELD_CONTEXT (f_ovf) = record;
7916 DECL_FIELD_CONTEXT (f_sav) = record;
7918 TYPE_STUB_DECL (record) = type_decl;
7919 TYPE_NAME (record) = type_decl;
7920 TYPE_FIELDS (record) = f_gpr;
7921 DECL_CHAIN (f_gpr) = f_fpr;
7922 DECL_CHAIN (f_fpr) = f_ovf;
7923 DECL_CHAIN (f_ovf) = f_sav;
7925 layout_type (record);
7927 /* The correct type is an array type of one element. */
7928 return build_array_type (record, build_index_type (size_zero_node));
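/* Editorial note (not part of the original source): the record built
   above corresponds to the SysV x86-64 va_list layout, roughly

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];
*/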
7931 /* Setup the builtin va_list data type and for 64-bit the additional
7932 calling convention specific va_list data types. */
7934 static tree
7935 ix86_build_builtin_va_list (void)
7937 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7939 /* Initialize abi specific va_list builtin types. */
7940 if (TARGET_64BIT)
7942 tree t;
7943 if (ix86_abi == MS_ABI)
7945 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7946 if (TREE_CODE (t) != RECORD_TYPE)
7947 t = build_variant_type_copy (t);
7948 sysv_va_list_type_node = t;
7950 else
7952 t = ret;
7953 if (TREE_CODE (t) != RECORD_TYPE)
7954 t = build_variant_type_copy (t);
7955 sysv_va_list_type_node = t;
7957 if (ix86_abi != MS_ABI)
7959 t = ix86_build_builtin_va_list_abi (MS_ABI);
7960 if (TREE_CODE (t) != RECORD_TYPE)
7961 t = build_variant_type_copy (t);
7962 ms_va_list_type_node = t;
7964 else
7966 t = ret;
7967 if (TREE_CODE (t) != RECORD_TYPE)
7968 t = build_variant_type_copy (t);
7969 ms_va_list_type_node = t;
7973 return ret;
7976 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7978 static void
7979 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7981 rtx save_area, mem;
7982 alias_set_type set;
7983 int i, max;
7985 /* GPR size of varargs save area. */
7986 if (cfun->va_list_gpr_size)
7987 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7988 else
7989 ix86_varargs_gpr_size = 0;
7991 /* FPR size of varargs save area. We don't need it if we don't pass
7992 anything in SSE registers. */
7993 if (TARGET_SSE && cfun->va_list_fpr_size)
7994 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7995 else
7996 ix86_varargs_fpr_size = 0;
7998 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7999 return;
8001 save_area = frame_pointer_rtx;
8002 set = get_varargs_alias_set ();
8004 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
8005 if (max > X86_64_REGPARM_MAX)
8006 max = X86_64_REGPARM_MAX;
8008 for (i = cum->regno; i < max; i++)
8010 mem = gen_rtx_MEM (word_mode,
8011 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
8012 MEM_NOTRAP_P (mem) = 1;
8013 set_mem_alias_set (mem, set);
8014 emit_move_insn (mem,
8015 gen_rtx_REG (word_mode,
8016 x86_64_int_parameter_registers[i]));
8019 if (ix86_varargs_fpr_size)
8021 enum machine_mode smode;
8022 rtx label, test;
8024 /* Now emit code to save SSE registers. The AX parameter contains number
8025 of SSE parameter registers used to call this function, though all we
8026 actually check here is the zero/non-zero status. */
8028 label = gen_label_rtx ();
8029 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
8030 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
8031 label));
8033 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
8034 we used movdqa (i.e. TImode) instead? Perhaps even better would
8035 be if we could determine the real mode of the data, via a hook
8036 into pass_stdarg. Ignore all that for now. */
8037 smode = V4SFmode;
8038 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
8039 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
8041 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
8042 if (max > X86_64_SSE_REGPARM_MAX)
8043 max = X86_64_SSE_REGPARM_MAX;
8045 for (i = cum->sse_regno; i < max; ++i)
8047 mem = plus_constant (Pmode, save_area,
8048 i * 16 + ix86_varargs_gpr_size);
8049 mem = gen_rtx_MEM (smode, mem);
8050 MEM_NOTRAP_P (mem) = 1;
8051 set_mem_alias_set (mem, set);
8052 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
8054 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
8057 emit_label (label);
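/* Editorial note (not part of the original source): when both areas are
   live, the save area laid out above holds the six integer argument
   registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9) in 8-byte slots at
   offsets 0..40, followed by %xmm0..%xmm7 in 16-byte slots starting at
   offsets 48, 64, ..., 160, all relative to the frame pointer.  */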
8061 static void
8062 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
8064 alias_set_type set = get_varargs_alias_set ();
8065 int i;
8067 /* Reset to zero, as a SYSV va_arg may have been used
8068 before. */
8069 ix86_varargs_gpr_size = 0;
8070 ix86_varargs_fpr_size = 0;
8072 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8074 rtx reg, mem;
8076 mem = gen_rtx_MEM (Pmode,
8077 plus_constant (Pmode, virtual_incoming_args_rtx,
8078 i * UNITS_PER_WORD));
8079 MEM_NOTRAP_P (mem) = 1;
8080 set_mem_alias_set (mem, set);
8082 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8083 emit_move_insn (mem, reg);
8087 static void
8088 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8089 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8090 int no_rtl)
8092 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8093 CUMULATIVE_ARGS next_cum;
8094 tree fntype;
8096 /* This argument doesn't appear to be used anymore. Which is good,
8097 because the old code here didn't suppress rtl generation. */
8098 gcc_assert (!no_rtl);
8100 if (!TARGET_64BIT)
8101 return;
8103 fntype = TREE_TYPE (current_function_decl);
8105 /* For varargs, we do not want to skip the dummy va_dcl argument.
8106 For stdargs, we do want to skip the last named argument. */
8107 next_cum = *cum;
8108 if (stdarg_p (fntype))
8109 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8110 true);
8112 if (cum->call_abi == MS_ABI)
8113 setup_incoming_varargs_ms_64 (&next_cum);
8114 else
8115 setup_incoming_varargs_64 (&next_cum);
8118 /* Check whether TYPE is a `char *' style va_list. */
8120 static bool
8121 is_va_list_char_pointer (tree type)
8123 tree canonic;
8125 /* For 32-bit it is always true. */
8126 if (!TARGET_64BIT)
8127 return true;
8128 canonic = ix86_canonical_va_list_type (type);
8129 return (canonic == ms_va_list_type_node
8130 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8133 /* Implement va_start. */
8135 static void
8136 ix86_va_start (tree valist, rtx nextarg)
8138 HOST_WIDE_INT words, n_gpr, n_fpr;
8139 tree f_gpr, f_fpr, f_ovf, f_sav;
8140 tree gpr, fpr, ovf, sav, t;
8141 tree type;
8142 rtx ovf_rtx;
8144 if (flag_split_stack
8145 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8147 unsigned int scratch_regno;
8149 /* When we are splitting the stack, we can't refer to the stack
8150 arguments using internal_arg_pointer, because they may be on
8151 the old stack. The split stack prologue will arrange to
8152 leave a pointer to the old stack arguments in a scratch
8153 register, which we here copy to a pseudo-register. The split
8154 stack prologue can't set the pseudo-register directly because
8155 it (the prologue) runs before any registers have been saved. */
8157 scratch_regno = split_stack_prologue_scratch_regno ();
8158 if (scratch_regno != INVALID_REGNUM)
8160 rtx reg, seq;
8162 reg = gen_reg_rtx (Pmode);
8163 cfun->machine->split_stack_varargs_pointer = reg;
8165 start_sequence ();
8166 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8167 seq = get_insns ();
8168 end_sequence ();
8170 push_topmost_sequence ();
8171 emit_insn_after (seq, entry_of_function ());
8172 pop_topmost_sequence ();
8176 /* Only 64bit target needs something special. */
8177 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8179 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8180 std_expand_builtin_va_start (valist, nextarg);
8181 else
8183 rtx va_r, next;
8185 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8186 next = expand_binop (ptr_mode, add_optab,
8187 cfun->machine->split_stack_varargs_pointer,
8188 crtl->args.arg_offset_rtx,
8189 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8190 convert_move (va_r, next, 0);
8192 return;
8195 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8196 f_fpr = DECL_CHAIN (f_gpr);
8197 f_ovf = DECL_CHAIN (f_fpr);
8198 f_sav = DECL_CHAIN (f_ovf);
8200 valist = build_simple_mem_ref (valist);
8201 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8202 /* The following should be folded into the MEM_REF offset. */
8203 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8204 f_gpr, NULL_TREE);
8205 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8206 f_fpr, NULL_TREE);
8207 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8208 f_ovf, NULL_TREE);
8209 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8210 f_sav, NULL_TREE);
8212 /* Count number of gp and fp argument registers used. */
8213 words = crtl->args.info.words;
8214 n_gpr = crtl->args.info.regno;
8215 n_fpr = crtl->args.info.sse_regno;
8217 if (cfun->va_list_gpr_size)
8219 type = TREE_TYPE (gpr);
8220 t = build2 (MODIFY_EXPR, type,
8221 gpr, build_int_cst (type, n_gpr * 8));
8222 TREE_SIDE_EFFECTS (t) = 1;
8223 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8226 if (TARGET_SSE && cfun->va_list_fpr_size)
8228 type = TREE_TYPE (fpr);
8229 t = build2 (MODIFY_EXPR, type, fpr,
8230 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8231 TREE_SIDE_EFFECTS (t) = 1;
8232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8235 /* Find the overflow area. */
8236 type = TREE_TYPE (ovf);
8237 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8238 ovf_rtx = crtl->args.internal_arg_pointer;
8239 else
8240 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8241 t = make_tree (type, ovf_rtx);
8242 if (words != 0)
8243 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8244 t = build2 (MODIFY_EXPR, type, ovf, t);
8245 TREE_SIDE_EFFECTS (t) = 1;
8246 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8248 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8250 /* Find the register save area.
8251 The function prologue saves it right above the stack frame. */
8252 type = TREE_TYPE (sav);
8253 t = make_tree (type, frame_pointer_rtx);
8254 if (!ix86_varargs_gpr_size)
8255 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8256 t = build2 (MODIFY_EXPR, type, sav, t);
8257 TREE_SIDE_EFFECTS (t) = 1;
8258 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
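/* Editorial example (not part of the original source): for a hypothetical
     void f (int a, double b, ...);
   one named GPR and one named SSE argument are consumed, so the code
   above initializes gp_offset = 8, fp_offset = 48 + 16 = 64, points
   overflow_arg_area at the first stack-passed argument, and points
   reg_save_area at the save area set up in the prologue.  */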
8262 /* Implement va_arg. */
8264 static tree
8265 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8266 gimple_seq *post_p)
8268 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8269 tree f_gpr, f_fpr, f_ovf, f_sav;
8270 tree gpr, fpr, ovf, sav, t;
8271 int size, rsize;
8272 tree lab_false, lab_over = NULL_TREE;
8273 tree addr, t2;
8274 rtx container;
8275 int indirect_p = 0;
8276 tree ptrtype;
8277 enum machine_mode nat_mode;
8278 unsigned int arg_boundary;
8280 /* Only 64bit target needs something special. */
8281 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8282 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8284 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8285 f_fpr = DECL_CHAIN (f_gpr);
8286 f_ovf = DECL_CHAIN (f_fpr);
8287 f_sav = DECL_CHAIN (f_ovf);
8289 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8290 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8291 valist = build_va_arg_indirect_ref (valist);
8292 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8293 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8294 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8296 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8297 if (indirect_p)
8298 type = build_pointer_type (type);
8299 size = int_size_in_bytes (type);
8300 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8302 nat_mode = type_natural_mode (type, NULL);
8303 switch (nat_mode)
8305 case V8SFmode:
8306 case V8SImode:
8307 case V32QImode:
8308 case V16HImode:
8309 case V4DFmode:
8310 case V4DImode:
8311 /* Unnamed 256bit vector mode parameters are passed on stack. */
8312 if (!TARGET_64BIT_MS_ABI)
8314 container = NULL;
8315 break;
8318 default:
8319 container = construct_container (nat_mode, TYPE_MODE (type),
8320 type, 0, X86_64_REGPARM_MAX,
8321 X86_64_SSE_REGPARM_MAX, intreg,
8323 break;
8326 /* Pull the value out of the saved registers. */
8328 addr = create_tmp_var (ptr_type_node, "addr");
8330 if (container)
8332 int needed_intregs, needed_sseregs;
8333 bool need_temp;
8334 tree int_addr, sse_addr;
8336 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8337 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8339 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8341 need_temp = (!REG_P (container)
8342 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8343 || TYPE_ALIGN (type) > 128));
8345 /* If we are passing a structure, verify that it is a consecutive block
8346 in the register save area.  If not, we need to do moves. */
8347 if (!need_temp && !REG_P (container))
8349 /* Verify that all registers are strictly consecutive. */
8350 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8352 int i;
8354 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8356 rtx slot = XVECEXP (container, 0, i);
8357 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8358 || INTVAL (XEXP (slot, 1)) != i * 16)
8359 need_temp = 1;
8362 else
8364 int i;
8366 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8368 rtx slot = XVECEXP (container, 0, i);
8369 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8370 || INTVAL (XEXP (slot, 1)) != i * 8)
8371 need_temp = 1;
8375 if (!need_temp)
8377 int_addr = addr;
8378 sse_addr = addr;
8380 else
8382 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8383 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8386 /* First ensure that we fit completely in registers. */
8387 if (needed_intregs)
8389 t = build_int_cst (TREE_TYPE (gpr),
8390 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8391 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8392 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8393 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8394 gimplify_and_add (t, pre_p);
8396 if (needed_sseregs)
8398 t = build_int_cst (TREE_TYPE (fpr),
8399 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8400 + X86_64_REGPARM_MAX * 8);
8401 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8402 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8403 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8404 gimplify_and_add (t, pre_p);
8407 /* Compute index to start of area used for integer regs. */
8408 if (needed_intregs)
8410 /* int_addr = gpr + sav; */
8411 t = fold_build_pointer_plus (sav, gpr);
8412 gimplify_assign (int_addr, t, pre_p);
8414 if (needed_sseregs)
8416 /* sse_addr = fpr + sav; */
8417 t = fold_build_pointer_plus (sav, fpr);
8418 gimplify_assign (sse_addr, t, pre_p);
8420 if (need_temp)
8422 int i, prev_size = 0;
8423 tree temp = create_tmp_var (type, "va_arg_tmp");
8425 /* addr = &temp; */
8426 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8427 gimplify_assign (addr, t, pre_p);
8429 for (i = 0; i < XVECLEN (container, 0); i++)
8431 rtx slot = XVECEXP (container, 0, i);
8432 rtx reg = XEXP (slot, 0);
8433 enum machine_mode mode = GET_MODE (reg);
8434 tree piece_type;
8435 tree addr_type;
8436 tree daddr_type;
8437 tree src_addr, src;
8438 int src_offset;
8439 tree dest_addr, dest;
8440 int cur_size = GET_MODE_SIZE (mode);
8442 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8443 prev_size = INTVAL (XEXP (slot, 1));
8444 if (prev_size + cur_size > size)
8446 cur_size = size - prev_size;
8447 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8448 if (mode == BLKmode)
8449 mode = QImode;
8451 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8452 if (mode == GET_MODE (reg))
8453 addr_type = build_pointer_type (piece_type);
8454 else
8455 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8456 true);
8457 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8458 true);
8460 if (SSE_REGNO_P (REGNO (reg)))
8462 src_addr = sse_addr;
8463 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8465 else
8467 src_addr = int_addr;
8468 src_offset = REGNO (reg) * 8;
8470 src_addr = fold_convert (addr_type, src_addr);
8471 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8473 dest_addr = fold_convert (daddr_type, addr);
8474 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8475 if (cur_size == GET_MODE_SIZE (mode))
8477 src = build_va_arg_indirect_ref (src_addr);
8478 dest = build_va_arg_indirect_ref (dest_addr);
8480 gimplify_assign (dest, src, pre_p);
8482 else
8484 tree copy
8485 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8486 3, dest_addr, src_addr,
8487 size_int (cur_size));
8488 gimplify_and_add (copy, pre_p);
8490 prev_size += cur_size;
8494 if (needed_intregs)
8496 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8497 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8498 gimplify_assign (gpr, t, pre_p);
8501 if (needed_sseregs)
8503 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8504 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8505 gimplify_assign (fpr, t, pre_p);
8508 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8510 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8513 /* ... otherwise out of the overflow area. */
8515 /* When the caller aligns a parameter on the stack, an alignment
8516 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8517 MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee with the
8518 caller here. */
8519 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8520 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8521 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8523 /* Care for on-stack alignment if needed. */
8524 if (arg_boundary <= 64 || size == 0)
8525 t = ovf;
8526 else
8528 HOST_WIDE_INT align = arg_boundary / 8;
8529 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8530 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8531 build_int_cst (TREE_TYPE (t), -align));
8534 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8535 gimplify_assign (addr, t, pre_p);
8537 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8538 gimplify_assign (unshare_expr (ovf), t, pre_p);
8540 if (container)
8541 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8543 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8544 addr = fold_convert (ptrtype, addr);
8546 if (indirect_p)
8547 addr = build_va_arg_indirect_ref (addr);
8548 return build_va_arg_indirect_ref (addr);
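/* Editorial sketch (not part of the original source): for an int the
   GIMPLE emitted above behaves roughly like

     if (ap->gp_offset >= 6 * 8)
       goto on_stack;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   on_stack:
     addr = ap->overflow_arg_area;
     ap->overflow_arg_area += 8;
   done:
     result = *(int *) addr;

   with the field names taken from the __va_list_tag record above.  */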
8551 /* Return true if OPNUM's MEM should be matched
8552 in movabs* patterns. */
8554 bool
8555 ix86_check_movabs (rtx insn, int opnum)
8557 rtx set, mem;
8559 set = PATTERN (insn);
8560 if (GET_CODE (set) == PARALLEL)
8561 set = XVECEXP (set, 0, 0);
8562 gcc_assert (GET_CODE (set) == SET);
8563 mem = XEXP (set, opnum);
8564 while (GET_CODE (mem) == SUBREG)
8565 mem = SUBREG_REG (mem);
8566 gcc_assert (MEM_P (mem));
8567 return volatile_ok || !MEM_VOLATILE_P (mem);
8570 /* Initialize the table of extra 80387 mathematical constants. */
8572 static void
8573 init_ext_80387_constants (void)
8575 static const char * cst[5] =
8577 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8578 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8579 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8580 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8581 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8583 int i;
8585 for (i = 0; i < 5; i++)
8587 real_from_string (&ext_80387_constants_table[i], cst[i]);
8588 /* Ensure each constant is rounded to XFmode precision. */
8589 real_convert (&ext_80387_constants_table[i],
8590 XFmode, &ext_80387_constants_table[i]);
8593 ext_80387_constants_init = 1;
8596 /* Return non-zero if the constant is something that
8597 can be loaded with a special instruction. */
8599 int
8600 standard_80387_constant_p (rtx x)
8602 enum machine_mode mode = GET_MODE (x);
8604 REAL_VALUE_TYPE r;
8606 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8607 return -1;
8609 if (x == CONST0_RTX (mode))
8610 return 1;
8611 if (x == CONST1_RTX (mode))
8612 return 2;
8614 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8616 /* For XFmode constants, try to find a special 80387 instruction when
8617 optimizing for size or on those CPUs that benefit from them. */
8618 if (mode == XFmode
8619 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8621 int i;
8623 if (! ext_80387_constants_init)
8624 init_ext_80387_constants ();
8626 for (i = 0; i < 5; i++)
8627 if (real_identical (&r, &ext_80387_constants_table[i]))
8628 return i + 3;
8631 /* Load of the constant -0.0 or -1.0 will be split as
8632 fldz;fchs or fld1;fchs sequence. */
8633 if (real_isnegzero (&r))
8634 return 8;
8635 if (real_identical (&r, &dconstm1))
8636 return 9;
8638 return 0;
8641 /* Return the opcode of the special instruction to be used to load
8642 the constant X. */
8644 const char *
8645 standard_80387_constant_opcode (rtx x)
8647 switch (standard_80387_constant_p (x))
8649 case 1:
8650 return "fldz";
8651 case 2:
8652 return "fld1";
8653 case 3:
8654 return "fldlg2";
8655 case 4:
8656 return "fldln2";
8657 case 5:
8658 return "fldl2e";
8659 case 6:
8660 return "fldl2t";
8661 case 7:
8662 return "fldpi";
8663 case 8:
8664 case 9:
8665 return "#";
8666 default:
8667 gcc_unreachable ();
8671 /* Return the CONST_DOUBLE representing the 80387 constant that is
8672 loaded by the specified special instruction. The argument IDX
8673 matches the return value from standard_80387_constant_p. */
8675 rtx
8676 standard_80387_constant_rtx (int idx)
8678 int i;
8680 if (! ext_80387_constants_init)
8681 init_ext_80387_constants ();
8683 switch (idx)
8685 case 3:
8686 case 4:
8687 case 5:
8688 case 6:
8689 case 7:
8690 i = idx - 3;
8691 break;
8693 default:
8694 gcc_unreachable ();
8697 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8698 XFmode);
8701 /* Return 1 if X is all 0s and 2 if X is all 1s
8702 in a supported SSE/AVX vector mode. */
8704 int
8705 standard_sse_constant_p (rtx x)
8707 enum machine_mode mode = GET_MODE (x);
8709 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8710 return 1;
8711 if (vector_all_ones_operand (x, mode))
8712 switch (mode)
8714 case V16QImode:
8715 case V8HImode:
8716 case V4SImode:
8717 case V2DImode:
8718 if (TARGET_SSE2)
8719 return 2;
8720 case V32QImode:
8721 case V16HImode:
8722 case V8SImode:
8723 case V4DImode:
8724 if (TARGET_AVX2)
8725 return 2;
8726 default:
8727 break;
8730 return 0;
8733 /* Return the opcode of the special instruction to be used to load
8734 the constant X. */
8736 const char *
8737 standard_sse_constant_opcode (rtx insn, rtx x)
8739 switch (standard_sse_constant_p (x))
8741 case 1:
8742 switch (get_attr_mode (insn))
8744 case MODE_TI:
8745 return "%vpxor\t%0, %d0";
8746 case MODE_V2DF:
8747 return "%vxorpd\t%0, %d0";
8748 case MODE_V4SF:
8749 return "%vxorps\t%0, %d0";
8751 case MODE_OI:
8752 return "vpxor\t%x0, %x0, %x0";
8753 case MODE_V4DF:
8754 return "vxorpd\t%x0, %x0, %x0";
8755 case MODE_V8SF:
8756 return "vxorps\t%x0, %x0, %x0";
8758 default:
8759 break;
8762 case 2:
8763 if (get_attr_mode (insn) == MODE_XI
8764 || get_attr_mode (insn) == MODE_V8DF
8765 || get_attr_mode (insn) == MODE_V16SF)
8766 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8767 if (TARGET_AVX)
8768 return "vpcmpeqd\t%0, %0, %0";
8769 else
8770 return "pcmpeqd\t%0, %0";
8772 default:
8773 break;
8775 gcc_unreachable ();
8778 /* Returns true if OP contains a symbol reference */
8780 bool
8781 symbolic_reference_mentioned_p (rtx op)
8783 const char *fmt;
8784 int i;
8786 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8787 return true;
8789 fmt = GET_RTX_FORMAT (GET_CODE (op));
8790 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8792 if (fmt[i] == 'E')
8794 int j;
8796 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8797 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8798 return true;
8801 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8802 return true;
8805 return false;
8808 /* Return true if it is appropriate to emit `ret' instructions in the
8809 body of a function. Do this only if the epilogue is simple, needing a
8810 couple of insns. Prior to reloading, we can't tell how many registers
8811 must be saved, so return false then. Return false if there is no frame
8812 marker to de-allocate. */
8814 bool
8815 ix86_can_use_return_insn_p (void)
8817 struct ix86_frame frame;
8819 if (! reload_completed || frame_pointer_needed)
8820 return 0;
8822 /* Don't allow more than 32k pop, since that's all we can do
8823 with one instruction. */
8824 if (crtl->args.pops_args && crtl->args.size >= 32768)
8825 return 0;
8827 ix86_compute_frame_layout (&frame);
8828 return (frame.stack_pointer_offset == UNITS_PER_WORD
8829 && (frame.nregs + frame.nsseregs) == 0);
8832 /* Value should be nonzero if functions must have frame pointers.
8833 Zero means the frame pointer need not be set up (and parms may
8834 be accessed via the stack pointer) in functions that seem suitable. */
8836 static bool
8837 ix86_frame_pointer_required (void)
8839 /* If we accessed previous frames, then the generated code expects
8840 to be able to access the saved ebp value in our frame. */
8841 if (cfun->machine->accesses_prev_frame)
8842 return true;
8844 /* Several x86 os'es need a frame pointer for other reasons,
8845 usually pertaining to setjmp. */
8846 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8847 return true;
8849 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8850 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8851 return true;
8853 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
8854 allocation is 4GB. */
8855 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8856 return true;
8858 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8859 turns off the frame pointer by default. Turn it back on now if
8860 we've not got a leaf function. */
8861 if (TARGET_OMIT_LEAF_FRAME_POINTER
8862 && (!crtl->is_leaf
8863 || ix86_current_function_calls_tls_descriptor))
8864 return true;
8866 if (crtl->profile && !flag_fentry)
8867 return true;
8869 return false;
8872 /* Record that the current function accesses previous call frames. */
8874 void
8875 ix86_setup_frame_addresses (void)
8877 cfun->machine->accesses_prev_frame = 1;
8880 #ifndef USE_HIDDEN_LINKONCE
8881 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8882 # define USE_HIDDEN_LINKONCE 1
8883 # else
8884 # define USE_HIDDEN_LINKONCE 0
8885 # endif
8886 #endif
8888 static int pic_labels_used;
8890 /* Fills in the label name that should be used for a pc thunk for
8891 the given register. */
8893 static void
8894 get_pc_thunk_name (char name[32], unsigned int regno)
8896 gcc_assert (!TARGET_64BIT);
8898 if (USE_HIDDEN_LINKONCE)
8899 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8900 else
8901 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8905 /* This function generates code for -fpic that loads %ebx with
8906 the return address of the caller and then returns. */
8908 static void
8909 ix86_code_end (void)
8911 rtx xops[2];
8912 int regno;
8914 for (regno = AX_REG; regno <= SP_REG; regno++)
8916 char name[32];
8917 tree decl;
8919 if (!(pic_labels_used & (1 << regno)))
8920 continue;
8922 get_pc_thunk_name (name, regno);
8924 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8925 get_identifier (name),
8926 build_function_type_list (void_type_node, NULL_TREE));
8927 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8928 NULL_TREE, void_type_node);
8929 TREE_PUBLIC (decl) = 1;
8930 TREE_STATIC (decl) = 1;
8931 DECL_IGNORED_P (decl) = 1;
8933 #if TARGET_MACHO
8934 if (TARGET_MACHO)
8936 switch_to_section (darwin_sections[text_coal_section]);
8937 fputs ("\t.weak_definition\t", asm_out_file);
8938 assemble_name (asm_out_file, name);
8939 fputs ("\n\t.private_extern\t", asm_out_file);
8940 assemble_name (asm_out_file, name);
8941 putc ('\n', asm_out_file);
8942 ASM_OUTPUT_LABEL (asm_out_file, name);
8943 DECL_WEAK (decl) = 1;
8945 else
8946 #endif
8947 if (USE_HIDDEN_LINKONCE)
8949 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8951 targetm.asm_out.unique_section (decl, 0);
8952 switch_to_section (get_named_section (decl, NULL, 0));
8954 targetm.asm_out.globalize_label (asm_out_file, name);
8955 fputs ("\t.hidden\t", asm_out_file);
8956 assemble_name (asm_out_file, name);
8957 putc ('\n', asm_out_file);
8958 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8960 else
8962 switch_to_section (text_section);
8963 ASM_OUTPUT_LABEL (asm_out_file, name);
8966 DECL_INITIAL (decl) = make_node (BLOCK);
8967 current_function_decl = decl;
8968 init_function_start (decl);
8969 first_function_block_is_cold = false;
8970 /* Make sure unwind info is emitted for the thunk if needed. */
8971 final_start_function (emit_barrier (), asm_out_file, 1);
8973 /* Pad stack IP move with 4 instructions (two NOPs count
8974 as one instruction). */
8975 if (TARGET_PAD_SHORT_FUNCTION)
8977 int i = 8;
8979 while (i--)
8980 fputs ("\tnop\n", asm_out_file);
8983 xops[0] = gen_rtx_REG (Pmode, regno);
8984 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8985 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8986 output_asm_insn ("%!ret", NULL);
8987 final_end_function ();
8988 init_insn_lengths ();
8989 free_after_compilation (cfun);
8990 set_cfun (NULL);
8991 current_function_decl = NULL;
8994 if (flag_split_stack)
8995 file_end_indicate_split_stack ();
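/* Illustrative shape of one of the pc thunks emitted above, here for
   %ebx (label as produced by get_pc_thunk_name, AT&T syntax):

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   i.e. the thunk copies its own return address -- the address of the
   instruction following the call -- into the requested register.  */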
8998 /* Emit code for the SET_GOT patterns. */
9000 const char *
9001 output_set_got (rtx dest, rtx label)
9003 rtx xops[3];
9005 xops[0] = dest;
9007 if (TARGET_VXWORKS_RTP && flag_pic)
9009 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
9010 xops[2] = gen_rtx_MEM (Pmode,
9011 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
9012 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
9014 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
9015 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
9016 an unadorned address. */
9017 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
9018 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
9019 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
9020 return "";
9023 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
9025 if (!flag_pic)
9027 if (TARGET_MACHO)
9028 /* We don't need a pic base, we're not producing pic. */
9029 gcc_unreachable ();
9031 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
9032 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
9033 targetm.asm_out.internal_label (asm_out_file, "L",
9034 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
9036 else
9038 char name[32];
9039 get_pc_thunk_name (name, REGNO (dest));
9040 pic_labels_used |= 1 << REGNO (dest);
9042 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
9043 xops[2] = gen_rtx_MEM (QImode, xops[2]);
9044 output_asm_insn ("%!call\t%X2", xops);
9046 #if TARGET_MACHO
9047 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
9048 This is what will be referenced by the Mach-O PIC subsystem. */
9049 if (machopic_should_output_picbase_label () || !label)
9050 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
9052 /* When we are restoring the pic base at the site of a nonlocal label,
9053 and we decided to emit the pic base above, we will still output a
9054 local label used for calculating the correction offset (even though
9055 the offset will be 0 in that case). */
9056 if (label)
9057 targetm.asm_out.internal_label (asm_out_file, "L",
9058 CODE_LABEL_NUMBER (label));
9059 #endif
9062 if (!TARGET_MACHO)
9063 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
9065 return "";
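/* Illustrative output for the common 32-bit ELF PIC case (no VxWorks
   RTP, no Mach-O), with %ebx as DEST and assuming GOT_SYMBOL_NAME is
   "_GLOBAL_OFFSET_TABLE_":

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   The thunk supplies the pc and the add rebases it to the GOT.  */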
9068 /* Generate a "push" pattern for input ARG. */
9070 static rtx
9071 gen_push (rtx arg)
9073 struct machine_function *m = cfun->machine;
9075 if (m->fs.cfa_reg == stack_pointer_rtx)
9076 m->fs.cfa_offset += UNITS_PER_WORD;
9077 m->fs.sp_offset += UNITS_PER_WORD;
9079 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9080 arg = gen_rtx_REG (word_mode, REGNO (arg));
9082 return gen_rtx_SET (VOIDmode,
9083 gen_rtx_MEM (word_mode,
9084 gen_rtx_PRE_DEC (Pmode,
9085 stack_pointer_rtx)),
9086 arg);
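/* For illustration, on x86-64 gen_push (gen_rtx_REG (DImode, BX_REG))
   yields RTL of the form

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

   while the m->fs updates above keep the CFA and SP offset bookkeeping
   in sync with the push.  */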
9089 /* Generate a "pop" pattern for input ARG. */
9091 static rtx
9092 gen_pop (rtx arg)
9094 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9095 arg = gen_rtx_REG (word_mode, REGNO (arg));
9097 return gen_rtx_SET (VOIDmode,
9098 arg,
9099 gen_rtx_MEM (word_mode,
9100 gen_rtx_POST_INC (Pmode,
9101 stack_pointer_rtx)));
9104 /* Return >= 0 if there is an unused call-clobbered register available
9105 for the entire function. */
9107 static unsigned int
9108 ix86_select_alt_pic_regnum (void)
9110 if (crtl->is_leaf
9111 && !crtl->profile
9112 && !ix86_current_function_calls_tls_descriptor)
9114 int i, drap;
9115 /* Can't use the same register for both PIC and DRAP. */
9116 if (crtl->drap_reg)
9117 drap = REGNO (crtl->drap_reg);
9118 else
9119 drap = -1;
9120 for (i = 2; i >= 0; --i)
9121 if (i != drap && !df_regs_ever_live_p (i))
9122 return i;
9125 return INVALID_REGNUM;
9128 /* Return TRUE if we need to save REGNO. */
9130 static bool
9131 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9133 if (pic_offset_table_rtx
9134 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9135 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9136 || crtl->profile
9137 || crtl->calls_eh_return
9138 || crtl->uses_const_pool
9139 || cfun->has_nonlocal_label))
9140 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9142 if (crtl->calls_eh_return && maybe_eh_return)
9144 unsigned i;
9145 for (i = 0; ; i++)
9147 unsigned test = EH_RETURN_DATA_REGNO (i);
9148 if (test == INVALID_REGNUM)
9149 break;
9150 if (test == regno)
9151 return true;
9155 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9156 return true;
9158 return (df_regs_ever_live_p (regno)
9159 && !call_used_regs[regno]
9160 && !fixed_regs[regno]
9161 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9164 /* Return the number of saved general purpose registers. */
9166 static int
9167 ix86_nsaved_regs (void)
9169 int nregs = 0;
9170 int regno;
9172 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9173 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9174 nregs ++;
9175 return nregs;
9178 /* Return the number of saved SSE registers. */
9180 static int
9181 ix86_nsaved_sseregs (void)
9183 int nregs = 0;
9184 int regno;
9186 if (!TARGET_64BIT_MS_ABI)
9187 return 0;
9188 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9189 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9190 nregs ++;
9191 return nregs;
9194 /* Given FROM and TO register numbers, say whether this elimination is
9195 allowed. If stack alignment is needed, we can only replace argument
9196 pointer with hard frame pointer, or replace frame pointer with stack
9197 pointer. Otherwise, frame pointer elimination is automatically
9198 handled and all other eliminations are valid. */
9200 static bool
9201 ix86_can_eliminate (const int from, const int to)
9203 if (stack_realign_fp)
9204 return ((from == ARG_POINTER_REGNUM
9205 && to == HARD_FRAME_POINTER_REGNUM)
9206 || (from == FRAME_POINTER_REGNUM
9207 && to == STACK_POINTER_REGNUM));
9208 else
9209 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9212 /* Return the offset between two registers, one to be eliminated, and the other
9213 its replacement, at the start of a routine. */
9215 HOST_WIDE_INT
9216 ix86_initial_elimination_offset (int from, int to)
9218 struct ix86_frame frame;
9219 ix86_compute_frame_layout (&frame);
9221 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9222 return frame.hard_frame_pointer_offset;
9223 else if (from == FRAME_POINTER_REGNUM
9224 && to == HARD_FRAME_POINTER_REGNUM)
9225 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9226 else
9228 gcc_assert (to == STACK_POINTER_REGNUM);
9230 if (from == ARG_POINTER_REGNUM)
9231 return frame.stack_pointer_offset;
9233 gcc_assert (from == FRAME_POINTER_REGNUM);
9234 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9238 /* In a dynamically-aligned function, we can't know the offset from
9239 stack pointer to frame pointer, so we must ensure that setjmp
9240 eliminates fp against the hard fp (%ebp) rather than trying to
9241 index from %esp up to the top of the frame across a gap that is
9242 of unknown (at compile-time) size. */
9243 static rtx
9244 ix86_builtin_setjmp_frame_value (void)
9246 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9249 /* When using -fsplit-stack, the allocation routines set a field in
9250 the TCB to the bottom of the stack plus this much space, measured
9251 in bytes. */
9253 #define SPLIT_STACK_AVAILABLE 256
9255 /* Fill in the ix86_frame structure describing the frame of the current function. */
9257 static void
9258 ix86_compute_frame_layout (struct ix86_frame *frame)
9260 unsigned HOST_WIDE_INT stack_alignment_needed;
9261 HOST_WIDE_INT offset;
9262 unsigned HOST_WIDE_INT preferred_alignment;
9263 HOST_WIDE_INT size = get_frame_size ();
9264 HOST_WIDE_INT to_allocate;
9266 frame->nregs = ix86_nsaved_regs ();
9267 frame->nsseregs = ix86_nsaved_sseregs ();
9269 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9270 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9272 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9273 except for function prologues and leaf functions. */
9274 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9275 && (!crtl->is_leaf || cfun->calls_alloca != 0
9276 || ix86_current_function_calls_tls_descriptor))
9278 preferred_alignment = 16;
9279 stack_alignment_needed = 16;
9280 crtl->preferred_stack_boundary = 128;
9281 crtl->stack_alignment_needed = 128;
9284 gcc_assert (!size || stack_alignment_needed);
9285 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9286 gcc_assert (preferred_alignment <= stack_alignment_needed);
9288 /* For SEH we have to limit the amount of code movement into the prologue.
9289 At present we do this via a BLOCKAGE, at which point there's very little
9290 scheduling that can be done, which means that there's very little point
9291 in doing anything except PUSHs. */
9292 if (TARGET_SEH)
9293 cfun->machine->use_fast_prologue_epilogue = false;
9295 /* The number of registers saved can change during reload iteration.
9296 Recompute the value as needed. Do not recompute when the number of registers
9297 didn't change, as reload calls this function multiple times and does not
9298 expect the decision to change within a single iteration. */
9299 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9300 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9302 int count = frame->nregs;
9303 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9305 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9307 /* The fast prologue uses move instead of push to save registers. This
9308 is significantly longer, but also executes faster as modern hardware
9309 can execute the moves in parallel, but can't do that for push/pop.
9311 Be careful about choosing which prologue to emit: when the function takes
9312 many instructions to execute, we may as well use the slow version, likewise
9313 when the function is known to be outside a hot spot (this is known with
9314 feedback only). Weight the size of the function by the number of registers
9315 to save, as it is cheap to use one or two push instructions but very
9316 slow to use many of them. */
9317 if (count)
9318 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9319 if (node->frequency < NODE_FREQUENCY_NORMAL
9320 || (flag_branch_probabilities
9321 && node->frequency < NODE_FREQUENCY_HOT))
9322 cfun->machine->use_fast_prologue_epilogue = false;
9323 else
9324 cfun->machine->use_fast_prologue_epilogue
9325 = !expensive_function_p (count);
9328 frame->save_regs_using_mov
9329 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9330 /* If static stack checking is enabled and done with probes,
9331 the registers need to be saved before allocating the frame. */
9332 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9334 /* Skip return address. */
9335 offset = UNITS_PER_WORD;
9337 /* Skip pushed static chain. */
9338 if (ix86_static_chain_on_stack)
9339 offset += UNITS_PER_WORD;
9341 /* Skip saved base pointer. */
9342 if (frame_pointer_needed)
9343 offset += UNITS_PER_WORD;
9344 frame->hfp_save_offset = offset;
9346 /* The traditional frame pointer location is at the top of the frame. */
9347 frame->hard_frame_pointer_offset = offset;
9349 /* Register save area */
9350 offset += frame->nregs * UNITS_PER_WORD;
9351 frame->reg_save_offset = offset;
9353 /* On SEH target, registers are pushed just before the frame pointer
9354 location. */
9355 if (TARGET_SEH)
9356 frame->hard_frame_pointer_offset = offset;
9358 /* Align and set SSE register save area. */
9359 if (frame->nsseregs)
9361 /* The only ABI that has saved SSE registers (Win64) also has a
9362 16-byte aligned default stack, and thus we don't need to be
9363 within the re-aligned local stack frame to save them. */
9364 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9365 offset = (offset + 16 - 1) & -16;
9366 offset += frame->nsseregs * 16;
9368 frame->sse_reg_save_offset = offset;
9370 /* The re-aligned stack starts here. Values before this point are not
9371 directly comparable with values below this point. In order to make
9372 sure that no value happens to be the same before and after, force
9373 the alignment computation below to add a non-zero value. */
9374 if (stack_realign_fp)
9375 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9377 /* Va-arg area */
9378 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9379 offset += frame->va_arg_size;
9381 /* Align start of frame for local function. */
9382 if (stack_realign_fp
9383 || offset != frame->sse_reg_save_offset
9384 || size != 0
9385 || !crtl->is_leaf
9386 || cfun->calls_alloca
9387 || ix86_current_function_calls_tls_descriptor)
9388 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9390 /* Frame pointer points here. */
9391 frame->frame_pointer_offset = offset;
9393 offset += size;
9395 /* Add outgoing arguments area. Can be skipped if we eliminated
9396 all the function calls as dead code.
9397 Skipping is however impossible when the function calls alloca. The alloca
9398 expander assumes that the last crtl->outgoing_args_size bytes
9399 of the stack frame are unused. */
9400 if (ACCUMULATE_OUTGOING_ARGS
9401 && (!crtl->is_leaf || cfun->calls_alloca
9402 || ix86_current_function_calls_tls_descriptor))
9404 offset += crtl->outgoing_args_size;
9405 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9407 else
9408 frame->outgoing_arguments_size = 0;
9410 /* Align stack boundary. Only needed if we're calling another function
9411 or using alloca. */
9412 if (!crtl->is_leaf || cfun->calls_alloca
9413 || ix86_current_function_calls_tls_descriptor)
9414 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9416 /* We've reached end of stack frame. */
9417 frame->stack_pointer_offset = offset;
9419 /* Size prologue needs to allocate. */
9420 to_allocate = offset - frame->sse_reg_save_offset;
9422 if ((!to_allocate && frame->nregs <= 1)
9423 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9424 frame->save_regs_using_mov = false;
9426 if (ix86_using_red_zone ()
9427 && crtl->sp_is_unchanging
9428 && crtl->is_leaf
9429 && !ix86_current_function_calls_tls_descriptor)
9431 frame->red_zone_size = to_allocate;
9432 if (frame->save_regs_using_mov)
9433 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9434 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9435 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9437 else
9438 frame->red_zone_size = 0;
9439 frame->stack_pointer_offset -= frame->red_zone_size;
9441 /* The SEH frame pointer location is near the bottom of the frame.
9442 This is enforced by the fact that the difference between the
9443 stack pointer and the frame pointer is limited to 240 bytes in
9444 the unwind data structure. */
9445 if (TARGET_SEH)
9447 HOST_WIDE_INT diff;
9449 /* If we can leave the frame pointer where it is, do so; it also serves
9450 as the establisher frame for __builtin_frame_address (0). */
9451 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9452 if (diff <= SEH_MAX_FRAME_SIZE
9453 && (diff > 240 || (diff & 15) != 0)
9454 && !crtl->accesses_prior_frames)
9456 /* Ideally we'd determine what portion of the local stack frame
9457 (within the constraint of the lowest 240) is most heavily used.
9458 But without that complication, simply bias the frame pointer
9459 by 128 bytes so as to maximize the amount of the local stack
9460 frame that is addressable with 8-bit offsets. */
9461 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
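/* Rough picture of the layout computed above; each *_offset is the
   number of bytes from the CFA down to the boundary it names:

     return address
     [pushed static chain]
     [saved frame pointer]            <- hfp_save_offset
     GP register save area            <- reg_save_offset
     SSE save area (16-byte aligned)  <- sse_reg_save_offset
     va_arg area, realignment padding <- frame_pointer_offset
     local variables
     outgoing arguments               <- stack_pointer_offset

   stack_pointer_offset is then reduced by red_zone_size when the red
   zone can be used, and TARGET_SEH moves hard_frame_pointer_offset
   down towards the bottom of the frame.  */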
9466 /* This is semi-inlined memory_address_length, but simplified
9467 since we know that we're always dealing with reg+offset, and
9468 to avoid having to create and discard all that rtl. */
9470 static inline int
9471 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9473 int len = 4;
9475 if (offset == 0)
9477 /* EBP and R13 cannot be encoded without an offset. */
9478 len = (regno == BP_REG || regno == R13_REG);
9480 else if (IN_RANGE (offset, -128, 127))
9481 len = 1;
9483 /* ESP and R12 must be encoded with a SIB byte. */
9484 if (regno == SP_REG || regno == R12_REG)
9485 len++;
9487 return len;
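/* Illustrative values of the encoding-length estimate above:

     choose_baseaddr_len (SP_REG, 0)    -> 1   (SIB byte, no displacement)
     choose_baseaddr_len (BP_REG, 0)    -> 1   (a disp8 of 0 is mandatory)
     choose_baseaddr_len (BP_REG, 64)   -> 1   (disp8)
     choose_baseaddr_len (SP_REG, 200)  -> 5   (SIB byte + disp32)
     choose_baseaddr_len (AX_REG, 200)  -> 4   (disp32)

   choose_baseaddr below compares these lengths to pick the cheapest
   base register.  */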
9490 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9491 The valid base registers are taken from CFUN->MACHINE->FS. */
9493 static rtx
9494 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9496 const struct machine_function *m = cfun->machine;
9497 rtx base_reg = NULL;
9498 HOST_WIDE_INT base_offset = 0;
9500 if (m->use_fast_prologue_epilogue)
9502 /* Choose the base register most likely to allow the most scheduling
9503 opportunities. Generally FP is valid throughout the function,
9504 while DRAP must be reloaded within the epilogue. But choose either
9505 over the SP due to increased encoding size. */
9507 if (m->fs.fp_valid)
9509 base_reg = hard_frame_pointer_rtx;
9510 base_offset = m->fs.fp_offset - cfa_offset;
9512 else if (m->fs.drap_valid)
9514 base_reg = crtl->drap_reg;
9515 base_offset = 0 - cfa_offset;
9517 else if (m->fs.sp_valid)
9519 base_reg = stack_pointer_rtx;
9520 base_offset = m->fs.sp_offset - cfa_offset;
9523 else
9525 HOST_WIDE_INT toffset;
9526 int len = 16, tlen;
9528 /* Choose the base register with the smallest address encoding.
9529 With a tie, choose FP > DRAP > SP. */
9530 if (m->fs.sp_valid)
9532 base_reg = stack_pointer_rtx;
9533 base_offset = m->fs.sp_offset - cfa_offset;
9534 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9536 if (m->fs.drap_valid)
9538 toffset = 0 - cfa_offset;
9539 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9540 if (tlen <= len)
9542 base_reg = crtl->drap_reg;
9543 base_offset = toffset;
9544 len = tlen;
9547 if (m->fs.fp_valid)
9549 toffset = m->fs.fp_offset - cfa_offset;
9550 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9551 if (tlen <= len)
9553 base_reg = hard_frame_pointer_rtx;
9554 base_offset = toffset;
9555 len = tlen;
9559 gcc_assert (base_reg != NULL);
9561 return plus_constant (Pmode, base_reg, base_offset);
9564 /* Emit code to save registers in the prologue. */
9566 static void
9567 ix86_emit_save_regs (void)
9569 unsigned int regno;
9570 rtx insn;
9572 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9573 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9575 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9576 RTX_FRAME_RELATED_P (insn) = 1;
9580 /* Emit a single register save at CFA - CFA_OFFSET. */
9582 static void
9583 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9584 HOST_WIDE_INT cfa_offset)
9586 struct machine_function *m = cfun->machine;
9587 rtx reg = gen_rtx_REG (mode, regno);
9588 rtx mem, addr, base, insn;
9590 addr = choose_baseaddr (cfa_offset);
9591 mem = gen_frame_mem (mode, addr);
9593 /* For SSE saves, we need to indicate the 128-bit alignment. */
9594 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9596 insn = emit_move_insn (mem, reg);
9597 RTX_FRAME_RELATED_P (insn) = 1;
9599 base = addr;
9600 if (GET_CODE (base) == PLUS)
9601 base = XEXP (base, 0);
9602 gcc_checking_assert (REG_P (base));
9604 /* When saving registers into a re-aligned local stack frame, avoid
9605 any tricky guessing by dwarf2out. */
9606 if (m->fs.realigned)
9608 gcc_checking_assert (stack_realign_drap);
9610 if (regno == REGNO (crtl->drap_reg))
9612 /* A bit of a hack. We force the DRAP register to be saved in
9613 the re-aligned stack frame, which provides us with a copy
9614 of the CFA that will last past the prologue. Install it. */
9615 gcc_checking_assert (cfun->machine->fs.fp_valid);
9616 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9617 cfun->machine->fs.fp_offset - cfa_offset);
9618 mem = gen_rtx_MEM (mode, addr);
9619 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9621 else
9623 /* The frame pointer is a stable reference within the
9624 aligned frame. Use it. */
9625 gcc_checking_assert (cfun->machine->fs.fp_valid);
9626 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9627 cfun->machine->fs.fp_offset - cfa_offset);
9628 mem = gen_rtx_MEM (mode, addr);
9629 add_reg_note (insn, REG_CFA_EXPRESSION,
9630 gen_rtx_SET (VOIDmode, mem, reg));
9634 /* The memory may not be relative to the current CFA register,
9635 which means that we may need to generate a new pattern for
9636 use by the unwind info. */
9637 else if (base != m->fs.cfa_reg)
9639 addr = plus_constant (Pmode, m->fs.cfa_reg,
9640 m->fs.cfa_offset - cfa_offset);
9641 mem = gen_rtx_MEM (mode, addr);
9642 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9646 /* Emit code to save registers using MOV insns.
9647 First register is stored at CFA - CFA_OFFSET. */
9648 static void
9649 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9651 unsigned int regno;
9653 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9654 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9656 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9657 cfa_offset -= UNITS_PER_WORD;
9661 /* Emit code to save SSE registers using MOV insns.
9662 First register is stored at CFA - CFA_OFFSET. */
9663 static void
9664 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9666 unsigned int regno;
9668 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9669 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9671 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9672 cfa_offset -= 16;
9676 static GTY(()) rtx queued_cfa_restores;
9678 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9679 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9680 Don't add the note if the previously saved value will be left untouched
9681 within stack red-zone till return, as unwinders can find the same value
9682 in the register and on the stack. */
9684 static void
9685 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9687 if (!crtl->shrink_wrapped
9688 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9689 return;
9691 if (insn)
9693 add_reg_note (insn, REG_CFA_RESTORE, reg);
9694 RTX_FRAME_RELATED_P (insn) = 1;
9696 else
9697 queued_cfa_restores
9698 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9701 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9703 static void
9704 ix86_add_queued_cfa_restore_notes (rtx insn)
9706 rtx last;
9707 if (!queued_cfa_restores)
9708 return;
9709 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9711 XEXP (last, 1) = REG_NOTES (insn);
9712 REG_NOTES (insn) = queued_cfa_restores;
9713 queued_cfa_restores = NULL_RTX;
9714 RTX_FRAME_RELATED_P (insn) = 1;
9717 /* Expand prologue or epilogue stack adjustment.
9718 The pattern exist to put a dependency on all ebp-based memory accesses.
9719 STYLE should be negative if instructions should be marked as frame related,
9720 zero if %r11 register is live and cannot be freely used and positive
9721 otherwise. */
9723 static void
9724 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9725 int style, bool set_cfa)
9727 struct machine_function *m = cfun->machine;
9728 rtx insn;
9729 bool add_frame_related_expr = false;
9731 if (Pmode == SImode)
9732 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9733 else if (x86_64_immediate_operand (offset, DImode))
9734 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9735 else
9737 rtx tmp;
9738 /* r11 is used by indirect sibcall return as well, set before the
9739 epilogue and used after the epilogue. */
9740 if (style)
9741 tmp = gen_rtx_REG (DImode, R11_REG);
9742 else
9744 gcc_assert (src != hard_frame_pointer_rtx
9745 && dest != hard_frame_pointer_rtx);
9746 tmp = hard_frame_pointer_rtx;
9748 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9749 if (style < 0)
9750 add_frame_related_expr = true;
9752 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9755 insn = emit_insn (insn);
9756 if (style >= 0)
9757 ix86_add_queued_cfa_restore_notes (insn);
9759 if (set_cfa)
9761 rtx r;
9763 gcc_assert (m->fs.cfa_reg == src);
9764 m->fs.cfa_offset += INTVAL (offset);
9765 m->fs.cfa_reg = dest;
9767 r = gen_rtx_PLUS (Pmode, src, offset);
9768 r = gen_rtx_SET (VOIDmode, dest, r);
9769 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9770 RTX_FRAME_RELATED_P (insn) = 1;
9772 else if (style < 0)
9774 RTX_FRAME_RELATED_P (insn) = 1;
9775 if (add_frame_related_expr)
9777 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9778 r = gen_rtx_SET (VOIDmode, dest, r);
9779 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9783 if (dest == stack_pointer_rtx)
9785 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9786 bool valid = m->fs.sp_valid;
9788 if (src == hard_frame_pointer_rtx)
9790 valid = m->fs.fp_valid;
9791 ooffset = m->fs.fp_offset;
9793 else if (src == crtl->drap_reg)
9795 valid = m->fs.drap_valid;
9796 ooffset = 0;
9798 else
9800 /* Else there are two possibilities: SP itself, which we set
9801 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9802 taken care of by hand along the eh_return path. */
9803 gcc_checking_assert (src == stack_pointer_rtx
9804 || offset == const0_rtx);
9807 m->fs.sp_offset = ooffset - INTVAL (offset);
9808 m->fs.sp_valid = valid;
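/* A typical (illustrative) prologue use of the helper above is

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-allocate), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);

   i.e. allocate the frame, mark the insn frame related (STYLE < 0), and
   update the CFA only while the CFA is still tracked in the stack
   pointer.  (ALLOCATE here stands for the caller's local byte count.)  */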
9812 /* Find an available register to be used as the dynamic realign argument
9813 pointer register. Such a register will be written in the prologue and
9814 used at the beginning of the body, so it must not be
9815 1. a parameter passing register.
9816 2. the GOT pointer.
9817 We reuse the static-chain register if it is available. Otherwise, we
9818 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9819 shorter encoding.
9821 Return: the regno of the chosen register. */
9823 static unsigned int
9824 find_drap_reg (void)
9826 tree decl = cfun->decl;
9828 if (TARGET_64BIT)
9830 /* Use R13 for a nested function or a function that needs a static chain.
9831 Since a function with a tail call may use any caller-saved
9832 registers in the epilogue, DRAP must not use a caller-saved
9833 register in that case. */
9834 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9835 return R13_REG;
9837 return R10_REG;
9839 else
9841 /* Use DI for a nested function or a function that needs a static chain.
9842 Since a function with a tail call may use any caller-saved
9843 registers in the epilogue, DRAP must not use a caller-saved
9844 register in that case. */
9845 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9846 return DI_REG;
9848 /* Reuse static chain register if it isn't used for parameter
9849 passing. */
9850 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9852 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9853 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9854 return CX_REG;
9856 return DI_REG;
9860 /* Return minimum incoming stack alignment. */
9862 static unsigned int
9863 ix86_minimum_incoming_stack_boundary (bool sibcall)
9865 unsigned int incoming_stack_boundary;
9867 /* Prefer the one specified at command line. */
9868 if (ix86_user_incoming_stack_boundary)
9869 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9870 /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary
9871 if -mstackrealign is used, it isn't used for sibcall check and
9872 estimated stack alignment is 128bit. */
9873 else if (!sibcall
9874 && !TARGET_64BIT
9875 && ix86_force_align_arg_pointer
9876 && crtl->stack_alignment_estimated == 128)
9877 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9878 else
9879 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9881 /* Incoming stack alignment can be changed on individual functions
9882 via force_align_arg_pointer attribute. We use the smallest
9883 incoming stack boundary. */
9884 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9885 && lookup_attribute (ix86_force_align_arg_pointer_string,
9886 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9887 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9889 /* The incoming stack frame has to be aligned at least at
9890 parm_stack_boundary. */
9891 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9892 incoming_stack_boundary = crtl->parm_stack_boundary;
9894 /* Stack at entrance of main is aligned by runtime. We use the
9895 smallest incoming stack boundary. */
9896 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9897 && DECL_NAME (current_function_decl)
9898 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9899 && DECL_FILE_SCOPE_P (current_function_decl))
9900 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9902 return incoming_stack_boundary;
9905 /* Update incoming stack boundary and estimated stack alignment. */
9907 static void
9908 ix86_update_stack_boundary (void)
9910 ix86_incoming_stack_boundary
9911 = ix86_minimum_incoming_stack_boundary (false);
9913 /* x86_64 varargs needs 16-byte stack alignment for the register save
9914 area. */
9915 if (TARGET_64BIT
9916 && cfun->stdarg
9917 && crtl->stack_alignment_estimated < 128)
9918 crtl->stack_alignment_estimated = 128;
9921 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9922 needed or an rtx for DRAP otherwise. */
9924 static rtx
9925 ix86_get_drap_rtx (void)
9927 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9928 crtl->need_drap = true;
9930 if (stack_realign_drap)
9932 /* Assign DRAP to vDRAP and return vDRAP. */
9933 unsigned int regno = find_drap_reg ();
9934 rtx drap_vreg;
9935 rtx arg_ptr;
9936 rtx seq, insn;
9938 arg_ptr = gen_rtx_REG (Pmode, regno);
9939 crtl->drap_reg = arg_ptr;
9941 start_sequence ();
9942 drap_vreg = copy_to_reg (arg_ptr);
9943 seq = get_insns ();
9944 end_sequence ();
9946 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9947 if (!optimize)
9949 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9950 RTX_FRAME_RELATED_P (insn) = 1;
9952 return drap_vreg;
9954 else
9955 return NULL;
9958 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9960 static rtx
9961 ix86_internal_arg_pointer (void)
9963 return virtual_incoming_args_rtx;
9966 struct scratch_reg {
9967 rtx reg;
9968 bool saved;
9971 /* Return a short-lived scratch register for use on function entry.
9972 In 32-bit mode, it is valid only after the registers are saved
9973 in the prologue. This register must be released by means of
9974 release_scratch_register_on_entry once it is dead. */
9976 static void
9977 get_scratch_register_on_entry (struct scratch_reg *sr)
9979 int regno;
9981 sr->saved = false;
9983 if (TARGET_64BIT)
9985 /* We always use R11 in 64-bit mode. */
9986 regno = R11_REG;
9988 else
9990 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9991 bool fastcall_p
9992 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9993 bool thiscall_p
9994 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9995 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9996 int regparm = ix86_function_regparm (fntype, decl);
9997 int drap_regno
9998 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
10000 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
10001 for the static chain register. */
10002 if ((regparm < 1 || (fastcall_p && !static_chain_p))
10003 && drap_regno != AX_REG)
10004 regno = AX_REG;
10005 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
10006 for the static chain register. */
10007 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
10008 regno = AX_REG;
10009 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
10010 regno = DX_REG;
10011 /* ecx is the static chain register. */
10012 else if (regparm < 3 && !fastcall_p && !thiscall_p
10013 && !static_chain_p
10014 && drap_regno != CX_REG)
10015 regno = CX_REG;
10016 else if (ix86_save_reg (BX_REG, true))
10017 regno = BX_REG;
10018 /* esi is the static chain register. */
10019 else if (!(regparm == 3 && static_chain_p)
10020 && ix86_save_reg (SI_REG, true))
10021 regno = SI_REG;
10022 else if (ix86_save_reg (DI_REG, true))
10023 regno = DI_REG;
10024 else
10026 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
10027 sr->saved = true;
10031 sr->reg = gen_rtx_REG (Pmode, regno);
10032 if (sr->saved)
10034 rtx insn = emit_insn (gen_push (sr->reg));
10035 RTX_FRAME_RELATED_P (insn) = 1;
10039 /* Release a scratch register obtained from the preceding function. */
10041 static void
10042 release_scratch_register_on_entry (struct scratch_reg *sr)
10044 if (sr->saved)
10046 struct machine_function *m = cfun->machine;
10047 rtx x, insn = emit_insn (gen_pop (sr->reg));
10049 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
10050 RTX_FRAME_RELATED_P (insn) = 1;
10051 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
10052 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10053 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
10054 m->fs.sp_offset -= UNITS_PER_WORD;
10058 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
10060 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
10062 static void
10063 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
10065 /* We skip the probe for the first interval + a small dope of 4 words and
10066 probe that many bytes past the specified size to maintain a protection
10067 area at the bottom of the stack. */
10068 const int dope = 4 * UNITS_PER_WORD;
10069 rtx size_rtx = GEN_INT (size), last;
10071 /* See if we have a constant small number of probes to generate. If so,
10072 that's the easy case. The run-time loop is made up of 11 insns in the
10073 generic case while the compile-time loop is made up of 3+2*(n-1) insns
10074 for n # of intervals. */
10075 if (size <= 5 * PROBE_INTERVAL)
10077 HOST_WIDE_INT i, adjust;
10078 bool first_probe = true;
10080 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
10081 values of N from 1 until it exceeds SIZE. If only one probe is
10082 needed, this will not generate any code. Then adjust and probe
10083 to PROBE_INTERVAL + SIZE. */
10084 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10086 if (first_probe)
10088 adjust = 2 * PROBE_INTERVAL + dope;
10089 first_probe = false;
10091 else
10092 adjust = PROBE_INTERVAL;
10094 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10095 plus_constant (Pmode, stack_pointer_rtx,
10096 -adjust)));
10097 emit_stack_probe (stack_pointer_rtx);
10100 if (first_probe)
10101 adjust = size + PROBE_INTERVAL + dope;
10102 else
10103 adjust = size + PROBE_INTERVAL - i;
10105 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10106 plus_constant (Pmode, stack_pointer_rtx,
10107 -adjust)));
10108 emit_stack_probe (stack_pointer_rtx);
10110 /* Adjust back to account for the additional first interval. */
10111 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10112 plus_constant (Pmode, stack_pointer_rtx,
10113 PROBE_INTERVAL + dope)));
10116 /* Otherwise, do the same as above, but in a loop. Note that we must be
10117 extra careful with variables wrapping around because we might be at
10118 the very top (or the very bottom) of the address space and we have
10119 to be able to handle this case properly; in particular, we use an
10120 equality test for the loop condition. */
10121 else
10123 HOST_WIDE_INT rounded_size;
10124 struct scratch_reg sr;
10126 get_scratch_register_on_entry (&sr);
10129 /* Step 1: round SIZE to the previous multiple of the interval. */
10131 rounded_size = size & -PROBE_INTERVAL;
10134 /* Step 2: compute initial and final value of the loop counter. */
10136 /* SP = SP_0 + PROBE_INTERVAL. */
10137 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10138 plus_constant (Pmode, stack_pointer_rtx,
10139 - (PROBE_INTERVAL + dope))));
10141 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10142 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10143 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10144 gen_rtx_PLUS (Pmode, sr.reg,
10145 stack_pointer_rtx)));
10148 /* Step 3: the loop
10150 while (SP != LAST_ADDR)
10152 SP = SP + PROBE_INTERVAL
10153 probe at SP
10156 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10157 values of N from 1 until it is equal to ROUNDED_SIZE. */
10159 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10162 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10163 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10165 if (size != rounded_size)
10167 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10168 plus_constant (Pmode, stack_pointer_rtx,
10169 rounded_size - size)));
10170 emit_stack_probe (stack_pointer_rtx);
10173 /* Adjust back to account for the additional first interval. */
10174 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10175 plus_constant (Pmode, stack_pointer_rtx,
10176 PROBE_INTERVAL + dope)));
10178 release_scratch_register_on_entry (&sr);
10181 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10183 /* Even if the stack pointer isn't the CFA register, we need to correctly
10184 describe the adjustments made to it, in particular differentiate the
10185 frame-related ones from the frame-unrelated ones. */
10186 if (size > 0)
10188 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10189 XVECEXP (expr, 0, 0)
10190 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10191 plus_constant (Pmode, stack_pointer_rtx, -size));
10192 XVECEXP (expr, 0, 1)
10193 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10194 plus_constant (Pmode, stack_pointer_rtx,
10195 PROBE_INTERVAL + dope + size));
10196 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10197 RTX_FRAME_RELATED_P (last) = 1;
10199 cfun->machine->fs.sp_offset += size;
10202 /* Make sure nothing is scheduled before we are done. */
10203 emit_insn (gen_blockage ());
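/* Worked example for the function above, assuming PROBE_INTERVAL is
   4096 (STACK_CHECK_PROBE_INTERVAL_EXP == 12) and a 64-bit dope of
   4 * 8 = 32 bytes, with SIZE == 8192: the inline branch is taken
   (8192 <= 5 * 4096); SP is dropped by 2*4096 + 32 and probed, dropped
   by a further 4096 and probed, then raised by 4096 + 32 again, for a
   net adjustment of -8192 with both probes landing below the final SP,
   as the protection-area comment at the top describes.  */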
10206 /* Adjust the stack pointer up to REG while probing it. */
10208 const char *
10209 output_adjust_stack_and_probe (rtx reg)
10211 static int labelno = 0;
10212 char loop_lab[32], end_lab[32];
10213 rtx xops[2];
10215 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10216 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10218 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10220 /* Jump to END_LAB if SP == LAST_ADDR. */
10221 xops[0] = stack_pointer_rtx;
10222 xops[1] = reg;
10223 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10224 fputs ("\tje\t", asm_out_file);
10225 assemble_name_raw (asm_out_file, end_lab);
10226 fputc ('\n', asm_out_file);
10228 /* SP = SP + PROBE_INTERVAL. */
10229 xops[1] = GEN_INT (PROBE_INTERVAL);
10230 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10232 /* Probe at SP. */
10233 xops[1] = const0_rtx;
10234 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10236 fprintf (asm_out_file, "\tjmp\t");
10237 assemble_name_raw (asm_out_file, loop_lab);
10238 fputc ('\n', asm_out_file);
10240 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10242 return "";
10245 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10246 inclusive. These are offsets from the current stack pointer. */
10248 static void
10249 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10251 /* See if we have a constant small number of probes to generate. If so,
10252 that's the easy case. The run-time loop is made up of 7 insns in the
10253 generic case while the compile-time loop is made up of n insns for n #
10254 of intervals. */
10255 if (size <= 7 * PROBE_INTERVAL)
10257 HOST_WIDE_INT i;
10259 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10260 it exceeds SIZE. If only one probe is needed, this will not
10261 generate any code. Then probe at FIRST + SIZE. */
10262 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10263 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10264 -(first + i)));
10266 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10267 -(first + size)));
10270 /* Otherwise, do the same as above, but in a loop. Note that we must be
10271 extra careful with variables wrapping around because we might be at
10272 the very top (or the very bottom) of the address space and we have
10273 to be able to handle this case properly; in particular, we use an
10274 equality test for the loop condition. */
10275 else
10277 HOST_WIDE_INT rounded_size, last;
10278 struct scratch_reg sr;
10280 get_scratch_register_on_entry (&sr);
10283 /* Step 1: round SIZE to the previous multiple of the interval. */
10285 rounded_size = size & -PROBE_INTERVAL;
10288 /* Step 2: compute initial and final value of the loop counter. */
10290 /* TEST_OFFSET = FIRST. */
10291 emit_move_insn (sr.reg, GEN_INT (-first));
10293 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10294 last = first + rounded_size;
10297 /* Step 3: the loop
10299 while (TEST_ADDR != LAST_ADDR)
10301 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10302 probe at TEST_ADDR
10305 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10306 until it is equal to ROUNDED_SIZE. */
10308 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10311 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10312 that SIZE is equal to ROUNDED_SIZE. */
10314 if (size != rounded_size)
10315 emit_stack_probe (plus_constant (Pmode,
10316 gen_rtx_PLUS (Pmode,
10317 stack_pointer_rtx,
10318 sr.reg),
10319 rounded_size - size));
10321 release_scratch_register_on_entry (&sr);
10324 /* Make sure nothing is scheduled before we are done. */
10325 emit_insn (gen_blockage ());
10328 /* Probe a range of stack addresses from REG to END, inclusive. These are
10329 offsets from the current stack pointer. */
10331 const char *
10332 output_probe_stack_range (rtx reg, rtx end)
10334 static int labelno = 0;
10335 char loop_lab[32], end_lab[32];
10336 rtx xops[3];
10338 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10339 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10341 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10343 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10344 xops[0] = reg;
10345 xops[1] = end;
10346 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10347 fputs ("\tje\t", asm_out_file);
10348 assemble_name_raw (asm_out_file, end_lab);
10349 fputc ('\n', asm_out_file);
10351 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10352 xops[1] = GEN_INT (PROBE_INTERVAL);
10353 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10355 /* Probe at TEST_ADDR. */
10356 xops[0] = stack_pointer_rtx;
10357 xops[1] = reg;
10358 xops[2] = const0_rtx;
10359 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10361 fprintf (asm_out_file, "\tjmp\t");
10362 assemble_name_raw (asm_out_file, loop_lab);
10363 fputc ('\n', asm_out_file);
10365 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10367 return "";
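/* The loop emitted by the function above looks roughly like this
   (AT&T syntax, 32-bit, REG in %eax and END in %edx -- the register
   choice and the 4096-byte PROBE_INTERVAL are illustrative):

       .LPSRL0:
               cmpl    %edx, %eax
               je      .LPSRE0
               subl    $4096, %eax            # advance TEST_ADDR
               orl     $0, (%esp,%eax)        # probe at TEST_ADDR
               jmp     .LPSRL0
       .LPSRE0:                                                         */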
10370 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10371 to be generated in correct form. */
10372 static void
10373 ix86_finalize_stack_realign_flags (void)
10375 /* Check if stack realignment is really needed after reload, and
10376 store the result in cfun. */
10377 unsigned int incoming_stack_boundary
10378 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10379 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10380 unsigned int stack_realign = (incoming_stack_boundary
10381 < (crtl->is_leaf
10382 ? crtl->max_used_stack_slot_alignment
10383 : crtl->stack_alignment_needed));
10385 if (crtl->stack_realign_finalized)
10387 /* After stack_realign_needed is finalized, we can no longer
10388 change it. */
10389 gcc_assert (crtl->stack_realign_needed == stack_realign);
10390 return;
10393 /* If the only reason for frame_pointer_needed is that we conservatively
10394 assumed stack realignment might be needed, but in the end nothing that
10395 needed the stack alignment had been spilled, clear frame_pointer_needed
10396 and say we don't need stack realignment. */
10397 if (stack_realign
10398 && !crtl->need_drap
10399 && frame_pointer_needed
10400 && crtl->is_leaf
10401 && flag_omit_frame_pointer
10402 && crtl->sp_is_unchanging
10403 && !ix86_current_function_calls_tls_descriptor
10404 && !crtl->accesses_prior_frames
10405 && !cfun->calls_alloca
10406 && !crtl->calls_eh_return
10407 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10408 && !ix86_frame_pointer_required ()
10409 && get_frame_size () == 0
10410 && ix86_nsaved_sseregs () == 0
10411 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10413 HARD_REG_SET set_up_by_prologue, prologue_used;
10414 basic_block bb;
10416 CLEAR_HARD_REG_SET (prologue_used);
10417 CLEAR_HARD_REG_SET (set_up_by_prologue);
10418 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10419 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10420 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10421 HARD_FRAME_POINTER_REGNUM);
10422 FOR_EACH_BB (bb)
10424 rtx insn;
10425 FOR_BB_INSNS (bb, insn)
10426 if (NONDEBUG_INSN_P (insn)
10427 && requires_stack_frame_p (insn, prologue_used,
10428 set_up_by_prologue))
10430 crtl->stack_realign_needed = stack_realign;
10431 crtl->stack_realign_finalized = true;
10432 return;
10436 frame_pointer_needed = false;
10437 stack_realign = false;
10438 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10439 crtl->stack_alignment_needed = incoming_stack_boundary;
10440 crtl->stack_alignment_estimated = incoming_stack_boundary;
10441 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10442 crtl->preferred_stack_boundary = incoming_stack_boundary;
10443 df_finish_pass (true);
10444 df_scan_alloc (NULL);
10445 df_scan_blocks ();
10446 df_compute_regs_ever_live (true);
10447 df_analyze ();
10450 crtl->stack_realign_needed = stack_realign;
10451 crtl->stack_realign_finalized = true;
10454 /* Expand the prologue into a bunch of separate insns. */
10456 void
10457 ix86_expand_prologue (void)
10459 struct machine_function *m = cfun->machine;
10460 rtx insn, t;
10461 bool pic_reg_used;
10462 struct ix86_frame frame;
10463 HOST_WIDE_INT allocate;
10464 bool int_registers_saved;
10465 bool sse_registers_saved;
10467 ix86_finalize_stack_realign_flags ();
10469 /* DRAP should not coexist with stack_realign_fp */
10470 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10472 memset (&m->fs, 0, sizeof (m->fs));
10474 /* Initialize CFA state for before the prologue. */
10475 m->fs.cfa_reg = stack_pointer_rtx;
10476 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10478 /* Track SP offset to the CFA. We continue tracking this after we've
10479 swapped the CFA register away from SP. In the case of re-alignment
10480 this is fudged; we're interested in offsets within the local frame. */
10481 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10482 m->fs.sp_valid = true;
10484 ix86_compute_frame_layout (&frame);
10486 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10488 /* We should have already generated an error for any use of
10489 ms_hook on a nested function. */
10490 gcc_checking_assert (!ix86_static_chain_on_stack);
10492 /* Check if profiling is active and we shall use profiling before
10493 prologue variant. If so sorry. */
10494 if (crtl->profile && flag_fentry != 0)
10495 sorry ("ms_hook_prologue attribute isn%'t compatible "
10496 "with -mfentry for 32-bit");
10498 /* In ix86_asm_output_function_label we emitted:
10499 8b ff movl.s %edi,%edi
10500 55 push %ebp
10501 8b ec movl.s %esp,%ebp
10503 This matches the hookable function prologue in Win32 API
10504 functions in Microsoft Windows XP Service Pack 2 and newer.
10505 Wine uses this to enable Windows apps to hook the Win32 API
10506 functions provided by Wine.
10508 What that means is that we've already set up the frame pointer. */
10510 if (frame_pointer_needed
10511 && !(crtl->drap_reg && crtl->stack_realign_needed))
10513 rtx push, mov;
10515 /* We've decided to use the frame pointer already set up.
10516 Describe this to the unwinder by pretending that both
10517 push and mov insns happen right here.
10519 Putting the unwind info here at the end of the ms_hook
10520 is done so that we can make absolutely certain we get
10521 the required byte sequence at the start of the function,
10522 rather than relying on an assembler that can produce
10523 the exact encoding required.
10525 However it does mean (in the unpatched case) that we have
10526 a 1 insn window where the asynchronous unwind info is
10527 incorrect. However, if we placed the unwind info at
10528 its correct location we would have incorrect unwind info
10529 in the patched case. Which is probably all moot since
10530 I don't expect Wine generates dwarf2 unwind info for the
10531 system libraries that use this feature. */
10533 insn = emit_insn (gen_blockage ());
10535 push = gen_push (hard_frame_pointer_rtx);
10536 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10537 stack_pointer_rtx);
10538 RTX_FRAME_RELATED_P (push) = 1;
10539 RTX_FRAME_RELATED_P (mov) = 1;
10541 RTX_FRAME_RELATED_P (insn) = 1;
10542 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10543 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10545 /* Note that gen_push incremented m->fs.cfa_offset, even
10546 though we didn't emit the push insn here. */
10547 m->fs.cfa_reg = hard_frame_pointer_rtx;
10548 m->fs.fp_offset = m->fs.cfa_offset;
10549 m->fs.fp_valid = true;
10551 else
10553 /* The frame pointer is not needed so pop %ebp again.
10554 This leaves us with a pristine state. */
10555 emit_insn (gen_pop (hard_frame_pointer_rtx));
10559 /* The first insn of a function that accepts its static chain on the
10560 stack is to push the register that would be filled in by a direct
10561 call. This insn will be skipped by the trampoline. */
10562 else if (ix86_static_chain_on_stack)
10564 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10565 emit_insn (gen_blockage ());
10567 /* We don't want to interpret this push insn as a register save,
10568 only as a stack adjustment. The real copy of the register as
10569 a save will be done later, if needed. */
10570 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10571 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10572 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10573 RTX_FRAME_RELATED_P (insn) = 1;
10576 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
10577 DRAP is needed and stack realignment is really needed after reload. */
10578 if (stack_realign_drap)
10580 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10582 /* Only need to push parameter pointer reg if it is caller saved. */
10583 if (!call_used_regs[REGNO (crtl->drap_reg)])
10585 /* Push arg pointer reg */
10586 insn = emit_insn (gen_push (crtl->drap_reg));
10587 RTX_FRAME_RELATED_P (insn) = 1;
10590 /* Grab the argument pointer. */
10591 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10592 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10593 RTX_FRAME_RELATED_P (insn) = 1;
10594 m->fs.cfa_reg = crtl->drap_reg;
10595 m->fs.cfa_offset = 0;
10597 /* Align the stack. */
10598 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10599 stack_pointer_rtx,
10600 GEN_INT (-align_bytes)));
10601 RTX_FRAME_RELATED_P (insn) = 1;
10603 /* Replicate the return address on the stack so that return
10604 address can be reached via (argp - 1) slot. This is needed
10605 to implement macro RETURN_ADDR_RTX and intrinsic function
10606 expand_builtin_return_addr etc. */
10607 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10608 t = gen_frame_mem (word_mode, t);
10609 insn = emit_insn (gen_push (t));
10610 RTX_FRAME_RELATED_P (insn) = 1;
10612 /* For the purposes of frame and register save area addressing,
10613 we've started over with a new frame. */
10614 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10615 m->fs.realigned = true;
10618 int_registers_saved = (frame.nregs == 0);
10619 sse_registers_saved = (frame.nsseregs == 0);
10621 if (frame_pointer_needed && !m->fs.fp_valid)
10623 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10624 slower on all targets. Also sdb doesn't like it. */
10625 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10626 RTX_FRAME_RELATED_P (insn) = 1;
10628 /* Push registers now, before setting the frame pointer
10629 on SEH target. */
10630 if (!int_registers_saved
10631 && TARGET_SEH
10632 && !frame.save_regs_using_mov)
10634 ix86_emit_save_regs ();
10635 int_registers_saved = true;
10636 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10639 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10641 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10642 RTX_FRAME_RELATED_P (insn) = 1;
10644 if (m->fs.cfa_reg == stack_pointer_rtx)
10645 m->fs.cfa_reg = hard_frame_pointer_rtx;
10646 m->fs.fp_offset = m->fs.sp_offset;
10647 m->fs.fp_valid = true;
10651 if (!int_registers_saved)
10653 /* If saving registers via PUSH, do so now. */
10654 if (!frame.save_regs_using_mov)
10656 ix86_emit_save_regs ();
10657 int_registers_saved = true;
10658 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10661 /* When using the red zone we may start register saving before allocating
10662 the stack frame, saving one cycle of the prologue. However, avoid
10663 doing this if we have to probe the stack; at least on x86_64 the
10664 stack probe can turn into a call that clobbers a red zone location. */
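   /* (On x86_64 the red zone is the 128 bytes below the stack pointer
      that the SysV ABI lets leaf code use without adjusting %rsp; a
      probe that degenerates into a call would push its return address
      into that area.)  */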
10665 else if (ix86_using_red_zone ()
10666 && (! TARGET_STACK_PROBE
10667 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10669 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10670 int_registers_saved = true;
10674 if (stack_realign_fp)
10676 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10677 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10679 /* The computation of the size of the re-aligned stack frame means
10680 that we must allocate the size of the register save area before
10681 performing the actual alignment. Otherwise we cannot guarantee
10682 that there's enough storage above the realignment point. */
10683 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10684 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10685 GEN_INT (m->fs.sp_offset
10686 - frame.sse_reg_save_offset),
10687 -1, false);
10689 /* Align the stack. */
10690 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10691 stack_pointer_rtx,
10692 GEN_INT (-align_bytes)));
10694 /* For the purposes of register save area addressing, the stack
10695 pointer is no longer valid. As for the value of sp_offset,
10696 see ix86_compute_frame_layout, which we need to match in order
10697 to pass verification of stack_pointer_offset at the end. */
10698 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10699 m->fs.sp_valid = false;
10702 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10704 if (flag_stack_usage_info)
10706 /* We start to count from ARG_POINTER. */
10707 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10709 /* If it was realigned, take into account the fake frame. */
10710 if (stack_realign_drap)
10712 if (ix86_static_chain_on_stack)
10713 stack_size += UNITS_PER_WORD;
10715 if (!call_used_regs[REGNO (crtl->drap_reg)])
10716 stack_size += UNITS_PER_WORD;
10718 /* This over-estimates by 1 minimal-stack-alignment-unit but
10719 mitigates that by counting in the new return address slot. */
10720 current_function_dynamic_stack_size
10721 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10724 current_function_static_stack_size = stack_size;
10727 /* On SEH target with very large frame size, allocate an area to save
10728 SSE registers (as the very large allocation won't be described). */
10729 if (TARGET_SEH
10730 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10731 && !sse_registers_saved)
10733 HOST_WIDE_INT sse_size =
10734 frame.sse_reg_save_offset - frame.reg_save_offset;
10736 gcc_assert (int_registers_saved);
10738 /* No need to do stack checking as the area will be immediately
10739 written. */
10740 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10741 GEN_INT (-sse_size), -1,
10742 m->fs.cfa_reg == stack_pointer_rtx);
10743 allocate -= sse_size;
10744 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10745 sse_registers_saved = true;
10748 /* The stack has already been decremented by the instruction calling us
10749 so probe if the size is non-negative to preserve the protection area. */
10750 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10752 /* We expect the registers to be saved when probes are used. */
10753 gcc_assert (int_registers_saved);
10755 if (STACK_CHECK_MOVING_SP)
10757 if (!(crtl->is_leaf && !cfun->calls_alloca
10758 && allocate <= PROBE_INTERVAL))
10760 ix86_adjust_stack_and_probe (allocate);
10761 allocate = 0;
10764 else
10766 HOST_WIDE_INT size = allocate;
10768 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10769 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10771 if (TARGET_STACK_PROBE)
10773 if (crtl->is_leaf && !cfun->calls_alloca)
10775 if (size > PROBE_INTERVAL)
10776 ix86_emit_probe_stack_range (0, size);
10778 else
10779 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10781 else
10783 if (crtl->is_leaf && !cfun->calls_alloca)
10785 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
10786 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
10787 size - STACK_CHECK_PROTECT);
10789 else
10790 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10795 if (allocate == 0)
10797 else if (!ix86_target_stack_probe ()
10798 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10800 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10801 GEN_INT (-allocate), -1,
10802 m->fs.cfa_reg == stack_pointer_rtx);
10804 else
10806 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10807 rtx r10 = NULL;
10808 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10809 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10810 bool eax_live = false;
10811 bool r10_live = false;
10813 if (TARGET_64BIT)
10814 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10815 if (!TARGET_64BIT_MS_ABI)
10816 eax_live = ix86_eax_live_at_start_p ();
10818 /* Note that SEH directives need to continue tracking the stack
10819 pointer even after the frame pointer has been set up. */
10820 if (eax_live)
10822 insn = emit_insn (gen_push (eax));
10823 allocate -= UNITS_PER_WORD;
10824 if (sp_is_cfa_reg || TARGET_SEH)
10826 if (sp_is_cfa_reg)
10827 m->fs.cfa_offset += UNITS_PER_WORD;
10828 RTX_FRAME_RELATED_P (insn) = 1;
10832 if (r10_live)
10834 r10 = gen_rtx_REG (Pmode, R10_REG);
10835 insn = emit_insn (gen_push (r10));
10836 allocate -= UNITS_PER_WORD;
10837 if (sp_is_cfa_reg || TARGET_SEH)
10839 if (sp_is_cfa_reg)
10840 m->fs.cfa_offset += UNITS_PER_WORD;
10841 RTX_FRAME_RELATED_P (insn) = 1;
10845 emit_move_insn (eax, GEN_INT (allocate));
10846 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10848 /* Use the fact that AX still contains ALLOCATE. */
10849 adjust_stack_insn = (Pmode == DImode
10850 ? gen_pro_epilogue_adjust_stack_di_sub
10851 : gen_pro_epilogue_adjust_stack_si_sub);
10853 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10854 stack_pointer_rtx, eax));
10856 if (sp_is_cfa_reg || TARGET_SEH)
10858 if (sp_is_cfa_reg)
10859 m->fs.cfa_offset += allocate;
10860 RTX_FRAME_RELATED_P (insn) = 1;
10861 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10862 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10863 plus_constant (Pmode, stack_pointer_rtx,
10864 -allocate)));
10866 m->fs.sp_offset += allocate;
10868 if (r10_live && eax_live)
10870 t = choose_baseaddr (m->fs.sp_offset - allocate);
10871 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10872 gen_frame_mem (word_mode, t));
10873 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10874 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10875 gen_frame_mem (word_mode, t));
10877 else if (eax_live || r10_live)
10879 t = choose_baseaddr (m->fs.sp_offset - allocate);
10880 emit_move_insn (gen_rtx_REG (word_mode,
10881 (eax_live ? AX_REG : R10_REG)),
10882 gen_frame_mem (word_mode, t));
10885 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10887 /* If we haven't already set up the frame pointer, do so now. */
10888 if (frame_pointer_needed && !m->fs.fp_valid)
10890 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10891 GEN_INT (frame.stack_pointer_offset
10892 - frame.hard_frame_pointer_offset));
10893 insn = emit_insn (insn);
10894 RTX_FRAME_RELATED_P (insn) = 1;
10895 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10897 if (m->fs.cfa_reg == stack_pointer_rtx)
10898 m->fs.cfa_reg = hard_frame_pointer_rtx;
10899 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10900 m->fs.fp_valid = true;
10903 if (!int_registers_saved)
10904 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10905 if (!sse_registers_saved)
10906 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10908 pic_reg_used = false;
10909 /* We don't use pic-register for pe-coff target. */
10910 if (pic_offset_table_rtx
10911 && !TARGET_PECOFF
10912 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10913 || crtl->profile))
10915 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10917 if (alt_pic_reg_used != INVALID_REGNUM)
10918 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10920 pic_reg_used = true;
10923 if (pic_reg_used)
10925 if (TARGET_64BIT)
10927 if (ix86_cmodel == CM_LARGE_PIC)
10929 rtx label, tmp_reg;
10931 gcc_assert (Pmode == DImode);
10932 label = gen_label_rtx ();
10933 emit_label (label);
10934 LABEL_PRESERVE_P (label) = 1;
10935 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10936 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10937 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10938 label));
10939 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10940 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10941 pic_offset_table_rtx, tmp_reg));
10943 else
10944 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10946 else
10948 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10949 RTX_FRAME_RELATED_P (insn) = 1;
10950 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10954 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10955 when mcount needs it. A blockage to avoid call movement across the
10956 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10957 note. */
10958 if (crtl->profile && !flag_fentry && pic_reg_used)
10959 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10961 if (crtl->drap_reg && !crtl->stack_realign_needed)
10963 /* vDRAP is set up, but after reload it turns out stack realignment
10964 isn't necessary; here we emit the prologue to set up the DRAP
10965 without the stack realignment adjustment. */
10966 t = choose_baseaddr (0);
10967 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10970 /* Prevent instructions from being scheduled into the register save push
10971 sequence when access to the red-zone area is done through the frame
10972 pointer. The offset between the frame pointer and the stack pointer is
10973 calculated relative to the value of the stack pointer at the end of the
10974 function prologue, and moving instructions that access the red-zone area
10975 via the frame pointer inside the push sequence violates this assumption. */
10976 if (frame_pointer_needed && frame.red_zone_size)
10977 emit_insn (gen_memory_blockage ());
10979 /* Emit cld instruction if stringops are used in the function. */
10980 if (TARGET_CLD && ix86_current_function_needs_cld)
10981 emit_insn (gen_cld ());
10983 /* SEH requires that the prologue end within 256 bytes of the start of
10984 the function. Prevent instruction schedules that would extend that.
10985 Further, prevent alloca modifications to the stack pointer from being
10986 combined with prologue modifications. */
10987 if (TARGET_SEH)
10988 emit_insn (gen_prologue_use (stack_pointer_rtx));
10991 /* Emit code to restore REG using a POP insn. */
10993 static void
10994 ix86_emit_restore_reg_using_pop (rtx reg)
10996 struct machine_function *m = cfun->machine;
10997 rtx insn = emit_insn (gen_pop (reg));
10999 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
11000 m->fs.sp_offset -= UNITS_PER_WORD;
11002 if (m->fs.cfa_reg == crtl->drap_reg
11003 && REGNO (reg) == REGNO (crtl->drap_reg))
11005 /* Previously we'd represented the CFA as an expression
11006 like *(%ebp - 8). We've just popped that value from
11007 the stack, which means we need to reset the CFA to
11008 the drap register. This will remain until we restore
11009 the stack pointer. */
11010 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11011 RTX_FRAME_RELATED_P (insn) = 1;
11013 /* This means that the DRAP register is valid for addressing too. */
11014 m->fs.drap_valid = true;
11015 return;
11018 if (m->fs.cfa_reg == stack_pointer_rtx)
11020 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11021 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11022 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11023 RTX_FRAME_RELATED_P (insn) = 1;
11025 m->fs.cfa_offset -= UNITS_PER_WORD;
11028 /* When the frame pointer is the CFA, and we pop it, we are
11029 swapping back to the stack pointer as the CFA. This happens
11030 for stack frames that don't allocate other data, so we assume
11031 the stack pointer is now pointing at the return address, i.e.
11032 the function entry state, which makes the offset be 1 word. */
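   /* A concrete example, assuming 32-bit with UNITS_PER_WORD == 4: if
      the CFA was %ebp + 8 before this pop, then afterwards %esp points
      at the saved return address and the CFA becomes %esp + 4.  */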
11033 if (reg == hard_frame_pointer_rtx)
11035 m->fs.fp_valid = false;
11036 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11038 m->fs.cfa_reg = stack_pointer_rtx;
11039 m->fs.cfa_offset -= UNITS_PER_WORD;
11041 add_reg_note (insn, REG_CFA_DEF_CFA,
11042 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11043 GEN_INT (m->fs.cfa_offset)));
11044 RTX_FRAME_RELATED_P (insn) = 1;
11049 /* Emit code to restore saved registers using POP insns. */
11051 static void
11052 ix86_emit_restore_regs_using_pop (void)
11054 unsigned int regno;
11056 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11057 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
11058 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
11061 /* Emit code and notes for the LEAVE instruction. */
11063 static void
11064 ix86_emit_leave (void)
11066 struct machine_function *m = cfun->machine;
11067 rtx insn = emit_insn (ix86_gen_leave ());
11069 ix86_add_queued_cfa_restore_notes (insn);
11071 gcc_assert (m->fs.fp_valid);
11072 m->fs.sp_valid = true;
11073 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
11074 m->fs.fp_valid = false;
11076 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11078 m->fs.cfa_reg = stack_pointer_rtx;
11079 m->fs.cfa_offset = m->fs.sp_offset;
11081 add_reg_note (insn, REG_CFA_DEF_CFA,
11082 plus_constant (Pmode, stack_pointer_rtx,
11083 m->fs.sp_offset));
11084 RTX_FRAME_RELATED_P (insn) = 1;
11086 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
11087 m->fs.fp_offset);
11090 /* Emit code to restore saved registers using MOV insns.
11091 First register is restored from CFA - CFA_OFFSET. */
11092 static void
11093 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
11094 bool maybe_eh_return)
11096 struct machine_function *m = cfun->machine;
11097 unsigned int regno;
11099 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11100 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11102 rtx reg = gen_rtx_REG (word_mode, regno);
11103 rtx insn, mem;
11105 mem = choose_baseaddr (cfa_offset);
11106 mem = gen_frame_mem (word_mode, mem);
11107 insn = emit_move_insn (reg, mem);
11109 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11111 /* Previously we'd represented the CFA as an expression
11112 like *(%ebp - 8). We've just loaded that value from
11113 the stack, which means we need to reset the CFA to
11114 the drap register. This will remain until we restore
11115 the stack pointer. */
11116 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11117 RTX_FRAME_RELATED_P (insn) = 1;
11119 /* This means that the DRAP register is valid for addressing. */
11120 m->fs.drap_valid = true;
11122 else
11123 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11125 cfa_offset -= UNITS_PER_WORD;
11129 /* Emit code to restore saved registers using MOV insns.
11130 First register is restored from CFA - CFA_OFFSET. */
11131 static void
11132 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11133 bool maybe_eh_return)
11135 unsigned int regno;
11137 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11138 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11140 rtx reg = gen_rtx_REG (V4SFmode, regno);
11141 rtx mem;
11143 mem = choose_baseaddr (cfa_offset);
11144 mem = gen_rtx_MEM (V4SFmode, mem);
11145 set_mem_align (mem, 128);
11146 emit_move_insn (reg, mem);
11148 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11150 cfa_offset -= 16;
11154 /* Restore function stack, frame, and registers. */
11156 void
11157 ix86_expand_epilogue (int style)
11159 struct machine_function *m = cfun->machine;
11160 struct machine_frame_state frame_state_save = m->fs;
11161 struct ix86_frame frame;
11162 bool restore_regs_via_mov;
11163 bool using_drap;
11165 ix86_finalize_stack_realign_flags ();
11166 ix86_compute_frame_layout (&frame);
11168 m->fs.sp_valid = (!frame_pointer_needed
11169 || (crtl->sp_is_unchanging
11170 && !stack_realign_fp));
11171 gcc_assert (!m->fs.sp_valid
11172 || m->fs.sp_offset == frame.stack_pointer_offset);
11174 /* The FP must be valid if the frame pointer is present. */
11175 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11176 gcc_assert (!m->fs.fp_valid
11177 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11179 /* We must have *some* valid pointer to the stack frame. */
11180 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11182 /* The DRAP is never valid at this point. */
11183 gcc_assert (!m->fs.drap_valid);
11185 /* See the comment about red zone and frame
11186 pointer usage in ix86_expand_prologue. */
11187 if (frame_pointer_needed && frame.red_zone_size)
11188 emit_insn (gen_memory_blockage ());
11190 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11191 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11193 /* Determine the CFA offset of the end of the red-zone. */
11194 m->fs.red_zone_offset = 0;
11195 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11197 /* The red-zone begins below the return address. */
11198 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11200 /* When the register save area is in the aligned portion of
11201 the stack, determine the maximum runtime displacement that
11202 matches up with the aligned frame. */
11203 if (stack_realign_drap)
11204 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11205 + UNITS_PER_WORD);
11208 /* Special care must be taken for the normal return case of a function
11209 using eh_return: the eax and edx registers are marked as saved, but
11210 not restored along this path. Adjust the save location to match. */
11211 if (crtl->calls_eh_return && style != 2)
11212 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11214 /* EH_RETURN requires the use of moves to function properly. */
11215 if (crtl->calls_eh_return)
11216 restore_regs_via_mov = true;
11217 /* SEH requires the use of pops to identify the epilogue. */
11218 else if (TARGET_SEH)
11219 restore_regs_via_mov = false;
11220 /* If we're only restoring one register and sp is not valid then
11221 use a move instruction to restore the register, since it's
11222 less work than reloading sp and popping the register. */
11223 else if (!m->fs.sp_valid && frame.nregs <= 1)
11224 restore_regs_via_mov = true;
11225 else if (TARGET_EPILOGUE_USING_MOVE
11226 && cfun->machine->use_fast_prologue_epilogue
11227 && (frame.nregs > 1
11228 || m->fs.sp_offset != frame.reg_save_offset))
11229 restore_regs_via_mov = true;
11230 else if (frame_pointer_needed
11231 && !frame.nregs
11232 && m->fs.sp_offset != frame.reg_save_offset)
11233 restore_regs_via_mov = true;
11234 else if (frame_pointer_needed
11235 && TARGET_USE_LEAVE
11236 && cfun->machine->use_fast_prologue_epilogue
11237 && frame.nregs == 1)
11238 restore_regs_via_mov = true;
11239 else
11240 restore_regs_via_mov = false;
11242 if (restore_regs_via_mov || frame.nsseregs)
11244 /* Ensure that the entire register save area is addressable via
11245 the stack pointer, if we will restore via sp. */
11246 if (TARGET_64BIT
11247 && m->fs.sp_offset > 0x7fffffff
11248 && !(m->fs.fp_valid || m->fs.drap_valid)
11249 && (frame.nsseregs + frame.nregs) != 0)
11251 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11252 GEN_INT (m->fs.sp_offset
11253 - frame.sse_reg_save_offset),
11254 style,
11255 m->fs.cfa_reg == stack_pointer_rtx);
11259 /* If there are any SSE registers to restore, then we have to do it
11260 via moves, since there's obviously no pop for SSE regs. */
11261 if (frame.nsseregs)
11262 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11263 style == 2);
11265 if (restore_regs_via_mov)
11267 rtx t;
11269 if (frame.nregs)
11270 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11272 /* eh_return epilogues need %ecx added to the stack pointer. */
11273 if (style == 2)
11275 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11277 /* Stack align doesn't work with eh_return. */
11278 gcc_assert (!stack_realign_drap);
11279 /* Neither do regparm nested functions. */
11280 gcc_assert (!ix86_static_chain_on_stack);
11282 if (frame_pointer_needed)
11284 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11285 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11286 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11288 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11289 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11291 /* Note that we use SA as a temporary CFA, as the return
11292 address is at the proper place relative to it. We
11293 pretend this happens at the FP restore insn because
11294 prior to this insn the FP would be stored at the wrong
11295 offset relative to SA, and after this insn we have no
11296 other reasonable register to use for the CFA. We don't
11297 bother resetting the CFA to the SP for the duration of
11298 the return insn. */
11299 add_reg_note (insn, REG_CFA_DEF_CFA,
11300 plus_constant (Pmode, sa, UNITS_PER_WORD));
11301 ix86_add_queued_cfa_restore_notes (insn);
11302 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11303 RTX_FRAME_RELATED_P (insn) = 1;
11305 m->fs.cfa_reg = sa;
11306 m->fs.cfa_offset = UNITS_PER_WORD;
11307 m->fs.fp_valid = false;
11309 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11310 const0_rtx, style, false);
11312 else
11314 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11315 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11316 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11317 ix86_add_queued_cfa_restore_notes (insn);
11319 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11320 if (m->fs.cfa_offset != UNITS_PER_WORD)
11322 m->fs.cfa_offset = UNITS_PER_WORD;
11323 add_reg_note (insn, REG_CFA_DEF_CFA,
11324 plus_constant (Pmode, stack_pointer_rtx,
11325 UNITS_PER_WORD));
11326 RTX_FRAME_RELATED_P (insn) = 1;
11329 m->fs.sp_offset = UNITS_PER_WORD;
11330 m->fs.sp_valid = true;
11333 else
11335 /* SEH requires that the function end with (1) a stack adjustment
11336 if necessary, (2) a sequence of pops, and (3) a return or
11337 jump instruction. Prevent insns from the function body from
11338 being scheduled into this sequence. */
11339 if (TARGET_SEH)
11341 /* Prevent a catch region from being adjacent to the standard
11342 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11343 several other flags that would be interesting to test are
11344 set up yet. */
11345 if (flag_non_call_exceptions)
11346 emit_insn (gen_nops (const1_rtx));
11347 else
11348 emit_insn (gen_blockage ());
11351 /* First step is to deallocate the stack frame so that we can
11352 pop the registers. Also do it on SEH target for very large
11353 frame as the emitted instructions aren't allowed by the ABI in
11354 epilogues. */
11355 if (!m->fs.sp_valid
11356 || (TARGET_SEH
11357 && (m->fs.sp_offset - frame.reg_save_offset
11358 >= SEH_MAX_FRAME_SIZE)))
11360 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11361 GEN_INT (m->fs.fp_offset
11362 - frame.reg_save_offset),
11363 style, false);
11365 else if (m->fs.sp_offset != frame.reg_save_offset)
11367 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11368 GEN_INT (m->fs.sp_offset
11369 - frame.reg_save_offset),
11370 style,
11371 m->fs.cfa_reg == stack_pointer_rtx);
11374 ix86_emit_restore_regs_using_pop ();
11377 /* If we used a frame pointer and haven't already got rid of it,
11378 then do so now. */
11379 if (m->fs.fp_valid)
11381 /* If the stack pointer is valid and pointing at the frame
11382 pointer store address, then we only need a pop. */
11383 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11384 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11385 /* Leave results in shorter dependency chains on CPUs that are
11386 able to grok it fast. */
11387 else if (TARGET_USE_LEAVE
11388 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11389 || !cfun->machine->use_fast_prologue_epilogue)
11390 ix86_emit_leave ();
11391 else
11393 pro_epilogue_adjust_stack (stack_pointer_rtx,
11394 hard_frame_pointer_rtx,
11395 const0_rtx, style, !using_drap);
11396 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11400 if (using_drap)
11402 int param_ptr_offset = UNITS_PER_WORD;
11403 rtx insn;
11405 gcc_assert (stack_realign_drap);
11407 if (ix86_static_chain_on_stack)
11408 param_ptr_offset += UNITS_PER_WORD;
11409 if (!call_used_regs[REGNO (crtl->drap_reg)])
11410 param_ptr_offset += UNITS_PER_WORD;
11412 insn = emit_insn (gen_rtx_SET
11413 (VOIDmode, stack_pointer_rtx,
11414 gen_rtx_PLUS (Pmode,
11415 crtl->drap_reg,
11416 GEN_INT (-param_ptr_offset))));
11417 m->fs.cfa_reg = stack_pointer_rtx;
11418 m->fs.cfa_offset = param_ptr_offset;
11419 m->fs.sp_offset = param_ptr_offset;
11420 m->fs.realigned = false;
11422 add_reg_note (insn, REG_CFA_DEF_CFA,
11423 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11424 GEN_INT (param_ptr_offset)));
11425 RTX_FRAME_RELATED_P (insn) = 1;
11427 if (!call_used_regs[REGNO (crtl->drap_reg)])
11428 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11431 /* At this point the stack pointer must be valid, and we must have
11432 restored all of the registers. We may not have deallocated the
11433 entire stack frame. We've delayed this until now because it may
11434 be possible to merge the local stack deallocation with the
11435 deallocation forced by ix86_static_chain_on_stack. */
11436 gcc_assert (m->fs.sp_valid);
11437 gcc_assert (!m->fs.fp_valid);
11438 gcc_assert (!m->fs.realigned);
11439 if (m->fs.sp_offset != UNITS_PER_WORD)
11441 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11442 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11443 style, true);
11445 else
11446 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11448 /* Sibcall epilogues don't want a return instruction. */
11449 if (style == 0)
11451 m->fs = frame_state_save;
11452 return;
11455 if (crtl->args.pops_args && crtl->args.size)
11457 rtx popc = GEN_INT (crtl->args.pops_args);
11459 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11460 address, do explicit add, and jump indirectly to the caller. */
11462 if (crtl->args.pops_args >= 65536)
11464 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11465 rtx insn;
11467 /* There is no "pascal" calling convention in any 64bit ABI. */
11468 gcc_assert (!TARGET_64BIT);
11470 insn = emit_insn (gen_pop (ecx));
11471 m->fs.cfa_offset -= UNITS_PER_WORD;
11472 m->fs.sp_offset -= UNITS_PER_WORD;
11474 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11475 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11476 add_reg_note (insn, REG_CFA_REGISTER,
11477 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11478 RTX_FRAME_RELATED_P (insn) = 1;
11480 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11481 popc, -1, true);
11482 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11484 else
11485 emit_jump_insn (gen_simple_return_pop_internal (popc));
11487 else
11488 emit_jump_insn (gen_simple_return_internal ());
11490 /* Restore the state back to the state from the prologue,
11491 so that it's correct for the next epilogue. */
11492 m->fs = frame_state_save;
11495 /* Reset from the function's potential modifications. */
11497 static void
11498 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11499 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11501 if (pic_offset_table_rtx)
11502 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11503 #if TARGET_MACHO
11504 /* Mach-O doesn't support labels at the end of objects, so if
11505 it looks like we might want one, insert a NOP. */
11507 rtx insn = get_last_insn ();
11508 rtx deleted_debug_label = NULL_RTX;
11509 while (insn
11510 && NOTE_P (insn)
11511 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11513 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11514 notes; instead set their CODE_LABEL_NUMBER to -1,
11515 otherwise there would be code generation differences
11516 between -g and -g0. */
11517 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11518 deleted_debug_label = insn;
11519 insn = PREV_INSN (insn);
11521 if (insn
11522 && (LABEL_P (insn)
11523 || (NOTE_P (insn)
11524 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11525 fputs ("\tnop\n", file);
11526 else if (deleted_debug_label)
11527 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11528 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11529 CODE_LABEL_NUMBER (insn) = -1;
11531 #endif
11535 /* Return a scratch register to use in the split stack prologue. The
11536 split stack prologue is used for -fsplit-stack. It is the first
11537 instructions in the function, even before the regular prologue.
11538 The scratch register can be any caller-saved register which is not
11539 used for parameters or for the static chain. */
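/* Summarizing the selection below: %r11 in 64-bit mode.  For 32-bit:
   %eax for fastcall, %edx for thiscall (%eax if it has a static chain),
   %ecx for regparm < 3 without a static chain, %edx when a static chain
   occupies %ecx and regparm < 2; the remaining combinations are
   rejected with sorry ().  */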
11541 static unsigned int
11542 split_stack_prologue_scratch_regno (void)
11544 if (TARGET_64BIT)
11545 return R11_REG;
11546 else
11548 bool is_fastcall, is_thiscall;
11549 int regparm;
11551 is_fastcall = (lookup_attribute ("fastcall",
11552 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11553 != NULL);
11554 is_thiscall = (lookup_attribute ("thiscall",
11555 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11556 != NULL);
11557 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11559 if (is_fastcall)
11561 if (DECL_STATIC_CHAIN (cfun->decl))
11563 sorry ("-fsplit-stack does not support fastcall with "
11564 "nested function");
11565 return INVALID_REGNUM;
11567 return AX_REG;
11569 else if (is_thiscall)
11571 if (!DECL_STATIC_CHAIN (cfun->decl))
11572 return DX_REG;
11573 return AX_REG;
11575 else if (regparm < 3)
11577 if (!DECL_STATIC_CHAIN (cfun->decl))
11578 return CX_REG;
11579 else
11581 if (regparm >= 2)
11583 sorry ("-fsplit-stack does not support 2 register "
11584 "parameters for a nested function");
11585 return INVALID_REGNUM;
11587 return DX_REG;
11590 else
11592 /* FIXME: We could make this work by pushing a register
11593 around the addition and comparison. */
11594 sorry ("-fsplit-stack does not support 3 register parameters");
11595 return INVALID_REGNUM;
11600 /* A SYMBOL_REF for the function which allocates new stack space for
11601 -fsplit-stack. */
11603 static GTY(()) rtx split_stack_fn;
11605 /* A SYMBOL_REF for the more stack function when using the large
11606 model. */
11608 static GTY(()) rtx split_stack_fn_large;
11610 /* Handle -fsplit-stack. These are the first instructions in the
11611 function, even before the regular prologue. */
11613 void
11614 ix86_expand_split_stack_prologue (void)
11616 struct ix86_frame frame;
11617 HOST_WIDE_INT allocate;
11618 unsigned HOST_WIDE_INT args_size;
11619 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11620 rtx scratch_reg = NULL_RTX;
11621 rtx varargs_label = NULL_RTX;
11622 rtx fn;
11624 gcc_assert (flag_split_stack && reload_completed);
11626 ix86_finalize_stack_realign_flags ();
11627 ix86_compute_frame_layout (&frame);
11628 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11630 /* This is the label we will branch to if we have enough stack
11631 space. We expect the basic block reordering pass to reverse this
11632 branch if optimizing, so that we branch in the unlikely case. */
11633 label = gen_label_rtx ();
11635 /* We need to compare the stack pointer minus the frame size with
11636 the stack boundary in the TCB. The stack boundary always gives
11637 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11638 can compare directly. Otherwise we need to do an addition. */
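   /* The limit built below is a memory reference wrapped in
      UNSPEC_STACK_CHECK; when printed it becomes a load of the stack
      boundary field of the TCB through the thread-pointer segment
      register (the exact slot is defined by libgcc's __morestack
      support, not here).  */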
11640 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11641 UNSPEC_STACK_CHECK);
11642 limit = gen_rtx_CONST (Pmode, limit);
11643 limit = gen_rtx_MEM (Pmode, limit);
11644 if (allocate < SPLIT_STACK_AVAILABLE)
11645 current = stack_pointer_rtx;
11646 else
11648 unsigned int scratch_regno;
11649 rtx offset;
11651 /* We need a scratch register to hold the stack pointer minus
11652 the required frame size. Since this is the very start of the
11653 function, the scratch register can be any caller-saved
11654 register which is not used for parameters. */
11655 offset = GEN_INT (- allocate);
11656 scratch_regno = split_stack_prologue_scratch_regno ();
11657 if (scratch_regno == INVALID_REGNUM)
11658 return;
11659 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11660 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11662 /* We don't use ix86_gen_add3 in this case because it will
11663 want to split to lea, but when not optimizing the insn
11664 will not be split after this point. */
11665 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11666 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11667 offset)));
11669 else
11671 emit_move_insn (scratch_reg, offset);
11672 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11673 stack_pointer_rtx));
11675 current = scratch_reg;
11678 ix86_expand_branch (GEU, current, limit, label);
11679 jump_insn = get_last_insn ();
11680 JUMP_LABEL (jump_insn) = label;
11682 /* Mark the jump as very likely to be taken. */
11683 add_int_reg_note (jump_insn, REG_BR_PROB,
11684 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11686 if (split_stack_fn == NULL_RTX)
11687 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11688 fn = split_stack_fn;
11690 /* Get more stack space. We pass in the desired stack space and the
11691 size of the arguments to copy to the new stack. In 32-bit mode
11692 we push the parameters; __morestack will return on a new stack
11693 anyhow. In 64-bit mode we pass the parameters in r10 and
11694 r11. */
11695 allocate_rtx = GEN_INT (allocate);
11696 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11697 call_fusage = NULL_RTX;
11698 if (TARGET_64BIT)
11700 rtx reg10, reg11;
11702 reg10 = gen_rtx_REG (Pmode, R10_REG);
11703 reg11 = gen_rtx_REG (Pmode, R11_REG);
11705 /* If this function uses a static chain, it will be in %r10.
11706 Preserve it across the call to __morestack. */
11707 if (DECL_STATIC_CHAIN (cfun->decl))
11709 rtx rax;
11711 rax = gen_rtx_REG (word_mode, AX_REG);
11712 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11713 use_reg (&call_fusage, rax);
11716 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11717 && !TARGET_PECOFF)
11719 HOST_WIDE_INT argval;
11721 gcc_assert (Pmode == DImode);
11722 /* When using the large model we need to load the address
11723 into a register, and we've run out of registers. So we
11724 switch to a different calling convention, and we call a
11725 different function: __morestack_large. We pass the
11726 argument size in the upper 32 bits of r10 and pass the
11727 frame size in the lower 32 bits. */
11728 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11729 gcc_assert ((args_size & 0xffffffff) == args_size);
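	  /* For example, allocate == 0x2000 and args_size == 0x18 yield
	     argval == 0x0000001800002000: argument size in bits 63:32,
	     frame size in bits 31:0.  */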
11731 if (split_stack_fn_large == NULL_RTX)
11732 split_stack_fn_large =
11733 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11735 if (ix86_cmodel == CM_LARGE_PIC)
11737 rtx label, x;
11739 label = gen_label_rtx ();
11740 emit_label (label);
11741 LABEL_PRESERVE_P (label) = 1;
11742 emit_insn (gen_set_rip_rex64 (reg10, label));
11743 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11744 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11745 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11746 UNSPEC_GOT);
11747 x = gen_rtx_CONST (Pmode, x);
11748 emit_move_insn (reg11, x);
11749 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11750 x = gen_const_mem (Pmode, x);
11751 emit_move_insn (reg11, x);
11753 else
11754 emit_move_insn (reg11, split_stack_fn_large);
11756 fn = reg11;
11758 argval = ((args_size << 16) << 16) + allocate;
11759 emit_move_insn (reg10, GEN_INT (argval));
11761 else
11763 emit_move_insn (reg10, allocate_rtx);
11764 emit_move_insn (reg11, GEN_INT (args_size));
11765 use_reg (&call_fusage, reg11);
11768 use_reg (&call_fusage, reg10);
11770 else
11772 emit_insn (gen_push (GEN_INT (args_size)));
11773 emit_insn (gen_push (allocate_rtx));
11775 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11776 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11777 NULL_RTX, false);
11778 add_function_usage_to (call_insn, call_fusage);
11780 /* In order to make call/return prediction work right, we now need
11781 to execute a return instruction. See
11782 libgcc/config/i386/morestack.S for the details on how this works.
11784 For flow purposes gcc must not see this as a return
11785 instruction--we need control flow to continue at the subsequent
11786 label. Therefore, we use an unspec. */
11787 gcc_assert (crtl->args.pops_args < 65536);
11788 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11790 /* If we are in 64-bit mode and this function uses a static chain,
11791 we saved %r10 in %rax before calling __morestack. */
11792 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11793 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11794 gen_rtx_REG (word_mode, AX_REG));
11796 /* If this function calls va_start, we need to store a pointer to
11797 the arguments on the old stack, because they may not have been
11798 all copied to the new stack. At this point the old stack can be
11799 found at the frame pointer value used by __morestack, because
11800 __morestack has set that up before calling back to us. Here we
11801 store that pointer in a scratch register, and in
11802 ix86_expand_prologue we store the scratch register in a stack
11803 slot. */
11804 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11806 unsigned int scratch_regno;
11807 rtx frame_reg;
11808 int words;
11810 scratch_regno = split_stack_prologue_scratch_regno ();
11811 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11812 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11814 /* 64-bit:
11815 fp -> old fp value
11816 return address within this function
11817 return address of caller of this function
11818 stack arguments
11819 So we add three words to get to the stack arguments.
11821 32-bit:
11822 fp -> old fp value
11823 return address within this function
11824 first argument to __morestack
11825 second argument to __morestack
11826 return address of caller of this function
11827 stack arguments
11828 So we add five words to get to the stack arguments.
11830 words = TARGET_64BIT ? 3 : 5;
11831 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11832 gen_rtx_PLUS (Pmode, frame_reg,
11833 GEN_INT (words * UNITS_PER_WORD))));
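      /* Concretely, with the usual word sizes this sets scratch_reg to
	 %rbp + 24 (3 * 8) in 64-bit mode or %ebp + 20 (5 * 4) in 32-bit
	 mode, the address of the first stack argument on the old stack.  */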
11835 varargs_label = gen_label_rtx ();
11836 emit_jump_insn (gen_jump (varargs_label));
11837 JUMP_LABEL (get_last_insn ()) = varargs_label;
11839 emit_barrier ();
11842 emit_label (label);
11843 LABEL_NUSES (label) = 1;
11845 /* If this function calls va_start, we now have to set the scratch
11846 register for the case where we do not call __morestack. In this
11847 case we need to set it based on the stack pointer. */
11848 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11850 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11851 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11852 GEN_INT (UNITS_PER_WORD))));
11854 emit_label (varargs_label);
11855 LABEL_NUSES (varargs_label) = 1;
11859 /* We may have to tell the dataflow pass that the split stack prologue
11860 is initializing a scratch register. */
11862 static void
11863 ix86_live_on_entry (bitmap regs)
11865 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11867 gcc_assert (flag_split_stack);
11868 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11872 /* Determine if OP is a suitable SUBREG RTX for an address. */
11874 static bool
11875 ix86_address_subreg_operand (rtx op)
11877 enum machine_mode mode;
11879 if (!REG_P (op))
11880 return false;
11882 mode = GET_MODE (op);
11884 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11885 failures when the register is one word out of a two word structure. */
11886 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11887 return false;
11889 /* Allow only SUBREGs of non-eliminable hard registers. */
11890 return register_no_elim_operand (op, mode);
11893 /* Extract the parts of an RTL expression that is a valid memory address
11894 for an instruction. Return 0 if the structure of the address is
11895 grossly off. Return -1 if the address contains ASHIFT, so it is not
11896 strictly valid, but still used for computing length of lea instruction. */
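/* A rough worked example of the decomposition (register choice is
   arbitrary): the address

     (plus:SI (plus:SI (mult:SI (reg:SI %eax) (const_int 4))
		       (reg:SI %ebx))
	      (const_int 16))

   i.e. the operand 16(%ebx,%eax,4), yields base = %ebx, index = %eax,
   scale = 4, disp = 16 and seg = SEG_DEFAULT.  */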
11899 ix86_decompose_address (rtx addr, struct ix86_address *out)
11901 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11902 rtx base_reg, index_reg;
11903 HOST_WIDE_INT scale = 1;
11904 rtx scale_rtx = NULL_RTX;
11905 rtx tmp;
11906 int retval = 1;
11907 enum ix86_address_seg seg = SEG_DEFAULT;
11909 /* Allow zero-extended SImode addresses;
11910 they will be emitted with the addr32 prefix. */
11911 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11913 if (GET_CODE (addr) == ZERO_EXTEND
11914 && GET_MODE (XEXP (addr, 0)) == SImode)
11916 addr = XEXP (addr, 0);
11917 if (CONST_INT_P (addr))
11918 return 0;
11920 else if (GET_CODE (addr) == AND
11921 && const_32bit_mask (XEXP (addr, 1), DImode))
11923 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11924 if (addr == NULL_RTX)
11925 return 0;
11927 if (CONST_INT_P (addr))
11928 return 0;
11932 /* Allow SImode subregs of DImode addresses;
11933 they will be emitted with the addr32 prefix. */
11934 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11936 if (GET_CODE (addr) == SUBREG
11937 && GET_MODE (SUBREG_REG (addr)) == DImode)
11939 addr = SUBREG_REG (addr);
11940 if (CONST_INT_P (addr))
11941 return 0;
11945 if (REG_P (addr))
11946 base = addr;
11947 else if (GET_CODE (addr) == SUBREG)
11949 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11950 base = addr;
11951 else
11952 return 0;
11954 else if (GET_CODE (addr) == PLUS)
11956 rtx addends[4], op;
11957 int n = 0, i;
11959 op = addr;
11962 if (n >= 4)
11963 return 0;
11964 addends[n++] = XEXP (op, 1);
11965 op = XEXP (op, 0);
11967 while (GET_CODE (op) == PLUS);
11968 if (n >= 4)
11969 return 0;
11970 addends[n] = op;
11972 for (i = n; i >= 0; --i)
11974 op = addends[i];
11975 switch (GET_CODE (op))
11977 case MULT:
11978 if (index)
11979 return 0;
11980 index = XEXP (op, 0);
11981 scale_rtx = XEXP (op, 1);
11982 break;
11984 case ASHIFT:
11985 if (index)
11986 return 0;
11987 index = XEXP (op, 0);
11988 tmp = XEXP (op, 1);
11989 if (!CONST_INT_P (tmp))
11990 return 0;
11991 scale = INTVAL (tmp);
11992 if ((unsigned HOST_WIDE_INT) scale > 3)
11993 return 0;
11994 scale = 1 << scale;
11995 break;
11997 case ZERO_EXTEND:
11998 op = XEXP (op, 0);
11999 if (GET_CODE (op) != UNSPEC)
12000 return 0;
12001 /* FALLTHRU */
12003 case UNSPEC:
12004 if (XINT (op, 1) == UNSPEC_TP
12005 && TARGET_TLS_DIRECT_SEG_REFS
12006 && seg == SEG_DEFAULT)
12007 seg = DEFAULT_TLS_SEG_REG;
12008 else
12009 return 0;
12010 break;
12012 case SUBREG:
12013 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
12014 return 0;
12015 /* FALLTHRU */
12017 case REG:
12018 if (!base)
12019 base = op;
12020 else if (!index)
12021 index = op;
12022 else
12023 return 0;
12024 break;
12026 case CONST:
12027 case CONST_INT:
12028 case SYMBOL_REF:
12029 case LABEL_REF:
12030 if (disp)
12031 return 0;
12032 disp = op;
12033 break;
12035 default:
12036 return 0;
12040 else if (GET_CODE (addr) == MULT)
12042 index = XEXP (addr, 0); /* index*scale */
12043 scale_rtx = XEXP (addr, 1);
12045 else if (GET_CODE (addr) == ASHIFT)
12047 /* We're called for lea too, which implements ashift on occasion. */
12048 index = XEXP (addr, 0);
12049 tmp = XEXP (addr, 1);
12050 if (!CONST_INT_P (tmp))
12051 return 0;
12052 scale = INTVAL (tmp);
12053 if ((unsigned HOST_WIDE_INT) scale > 3)
12054 return 0;
12055 scale = 1 << scale;
12056 retval = -1;
12058 else
12059 disp = addr; /* displacement */
12061 if (index)
12063 if (REG_P (index))
12065 else if (GET_CODE (index) == SUBREG
12066 && ix86_address_subreg_operand (SUBREG_REG (index)))
12068 else
12069 return 0;
12072 /* Address override works only on the (%reg) part of %fs:(%reg). */
12073 if (seg != SEG_DEFAULT
12074 && ((base && GET_MODE (base) != word_mode)
12075 || (index && GET_MODE (index) != word_mode)))
12076 return 0;
12078 /* Extract the integral value of scale. */
12079 if (scale_rtx)
12081 if (!CONST_INT_P (scale_rtx))
12082 return 0;
12083 scale = INTVAL (scale_rtx);
12086 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12087 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12089 /* Avoid useless 0 displacement. */
12090 if (disp == const0_rtx && (base || index))
12091 disp = NULL_RTX;
12093 /* Allow arg pointer and stack pointer as index if there is no scaling. */
12094 if (base_reg && index_reg && scale == 1
12095 && (index_reg == arg_pointer_rtx
12096 || index_reg == frame_pointer_rtx
12097 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12099 rtx tmp;
12100 tmp = base, base = index, index = tmp;
12101 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12104 /* Special case: %ebp cannot be encoded as a base without a displacement.
12105 Similarly %r13. */
12106 if (!disp
12107 && base_reg
12108 && (base_reg == hard_frame_pointer_rtx
12109 || base_reg == frame_pointer_rtx
12110 || base_reg == arg_pointer_rtx
12111 || (REG_P (base_reg)
12112 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12113 || REGNO (base_reg) == R13_REG))))
12114 disp = const0_rtx;
12116 /* Special case: on K6, [%esi] makes the instruction vector decoded.
12117 Avoid this by transforming to [%esi+0].
12118 Reload calls address legitimization without cfun defined, so we need
12119 to test cfun for being non-NULL. */
12120 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12121 && base_reg && !index_reg && !disp
12122 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12123 disp = const0_rtx;
12125 /* Special case: encode reg+reg instead of reg*2. */
12126 if (!base && index && scale == 2)
12127 base = index, base_reg = index_reg, scale = 1;
12129 /* Special case: scaling cannot be encoded without base or displacement. */
12130 if (!base && !disp && index && scale != 1)
12131 disp = const0_rtx;
12133 out->base = base;
12134 out->index = index;
12135 out->disp = disp;
12136 out->scale = scale;
12137 out->seg = seg;
12139 return retval;
12142 /* Return cost of the memory address x.
12143 For i386, it is better to use a complex address than let gcc copy
12144 the address into a reg and make a new pseudo. But not if the address
12145 requires two regs - that would mean more pseudos with longer
12146 lifetimes. */
12147 static int
12148 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12149 addr_space_t as ATTRIBUTE_UNUSED,
12150 bool speed ATTRIBUTE_UNUSED)
12152 struct ix86_address parts;
12153 int cost = 1;
12154 int ok = ix86_decompose_address (x, &parts);
12156 gcc_assert (ok);
12158 if (parts.base && GET_CODE (parts.base) == SUBREG)
12159 parts.base = SUBREG_REG (parts.base);
12160 if (parts.index && GET_CODE (parts.index) == SUBREG)
12161 parts.index = SUBREG_REG (parts.index);
12163 /* Attempt to minimize number of registers in the address. */
12164 if ((parts.base
12165 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12166 || (parts.index
12167 && (!REG_P (parts.index)
12168 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12169 cost++;
12171 if (parts.base
12172 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12173 && parts.index
12174 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12175 && parts.base != parts.index)
12176 cost++;
12178 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12179 since its predecode logic can't detect the length of instructions
12180 and decoding degenerates to the vector decoder. Increase the cost of
12181 such addresses here. The penalty is at least 2 cycles. It may be worthwhile
12182 to split such addresses or even refuse such addresses at all.
12184 Following addressing modes are affected:
12185 [base+scale*index]
12186 [scale*index+disp]
12187 [base+index]
12189 The first and last cases may be avoidable by explicitly coding the zero
12190 in the memory address, but I don't have an AMD-K6 machine handy to check
12191 this theory. */
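  /* To sketch the "explicitly coding the zero" idea: (%eax,%ebx,2) is
     one of the penalized forms, while 0(%eax,%ebx,2) forces an 8-bit
     displacement and a different ModR/M byte, which presumably avoids
     the vector-decode path at the cost of one extra byte.  */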
12193 if (TARGET_K6
12194 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12195 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12196 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12197 cost += 10;
12199 return cost;
12202 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12203 this is used to form addresses to local data when -fPIC is in
12204 use. */
12206 static bool
12207 darwin_local_data_pic (rtx disp)
12209 return (GET_CODE (disp) == UNSPEC
12210 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12213 /* Determine if a given RTX is a valid constant. We already know this
12214 satisfies CONSTANT_P. */
12216 static bool
12217 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12219 switch (GET_CODE (x))
12221 case CONST:
12222 x = XEXP (x, 0);
12224 if (GET_CODE (x) == PLUS)
12226 if (!CONST_INT_P (XEXP (x, 1)))
12227 return false;
12228 x = XEXP (x, 0);
12231 if (TARGET_MACHO && darwin_local_data_pic (x))
12232 return true;
12234 /* Only some unspecs are valid as "constants". */
12235 if (GET_CODE (x) == UNSPEC)
12236 switch (XINT (x, 1))
12238 case UNSPEC_GOT:
12239 case UNSPEC_GOTOFF:
12240 case UNSPEC_PLTOFF:
12241 return TARGET_64BIT;
12242 case UNSPEC_TPOFF:
12243 case UNSPEC_NTPOFF:
12244 x = XVECEXP (x, 0, 0);
12245 return (GET_CODE (x) == SYMBOL_REF
12246 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12247 case UNSPEC_DTPOFF:
12248 x = XVECEXP (x, 0, 0);
12249 return (GET_CODE (x) == SYMBOL_REF
12250 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12251 default:
12252 return false;
12255 /* We must have drilled down to a symbol. */
12256 if (GET_CODE (x) == LABEL_REF)
12257 return true;
12258 if (GET_CODE (x) != SYMBOL_REF)
12259 return false;
12260 /* FALLTHRU */
12262 case SYMBOL_REF:
12263 /* TLS symbols are never valid. */
12264 if (SYMBOL_REF_TLS_MODEL (x))
12265 return false;
12267 /* DLLIMPORT symbols are never valid. */
12268 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12269 && SYMBOL_REF_DLLIMPORT_P (x))
12270 return false;
12272 #if TARGET_MACHO
12273 /* mdynamic-no-pic */
12274 if (MACHO_DYNAMIC_NO_PIC_P)
12275 return machopic_symbol_defined_p (x);
12276 #endif
12277 break;
12279 case CONST_DOUBLE:
12280 if (GET_MODE (x) == TImode
12281 && x != CONST0_RTX (TImode)
12282 && !TARGET_64BIT)
12283 return false;
12284 break;
12286 case CONST_VECTOR:
12287 if (!standard_sse_constant_p (x))
12288 return false;
12290 default:
12291 break;
12294 /* Otherwise we handle everything else in the move patterns. */
12295 return true;
12298 /* Determine if it's legal to put X into the constant pool. This
12299 is not possible for the address of thread-local symbols, which
12300 is checked above. */
12302 static bool
12303 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12305 /* We can always put integral constants and vectors in memory. */
12306 switch (GET_CODE (x))
12308 case CONST_INT:
12309 case CONST_DOUBLE:
12310 case CONST_VECTOR:
12311 return false;
12313 default:
12314 break;
12316 return !ix86_legitimate_constant_p (mode, x);
12319 /* Return true if the symbol is marked as dllimport, or as a
12320 stub-variable; otherwise false. */
12322 static bool
12323 is_imported_p (rtx x)
12325 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12326 || GET_CODE (x) != SYMBOL_REF)
12327 return false;
12329 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12333 /* Nonzero if the constant value X is a legitimate general operand
12334 when generating PIC code. It is given that flag_pic is on and
12335 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12337 bool
12338 legitimate_pic_operand_p (rtx x)
12340 rtx inner;
12342 switch (GET_CODE (x))
12344 case CONST:
12345 inner = XEXP (x, 0);
12346 if (GET_CODE (inner) == PLUS
12347 && CONST_INT_P (XEXP (inner, 1)))
12348 inner = XEXP (inner, 0);
12350 /* Only some unspecs are valid as "constants". */
12351 if (GET_CODE (inner) == UNSPEC)
12352 switch (XINT (inner, 1))
12354 case UNSPEC_GOT:
12355 case UNSPEC_GOTOFF:
12356 case UNSPEC_PLTOFF:
12357 return TARGET_64BIT;
12358 case UNSPEC_TPOFF:
12359 x = XVECEXP (inner, 0, 0);
12360 return (GET_CODE (x) == SYMBOL_REF
12361 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12362 case UNSPEC_MACHOPIC_OFFSET:
12363 return legitimate_pic_address_disp_p (x);
12364 default:
12365 return false;
12367 /* FALLTHRU */
12369 case SYMBOL_REF:
12370 case LABEL_REF:
12371 return legitimate_pic_address_disp_p (x);
12373 default:
12374 return true;
12378 /* Determine if a given CONST RTX is a valid memory displacement
12379 in PIC mode. */
12381 bool
12382 legitimate_pic_address_disp_p (rtx disp)
12384 bool saw_plus;
12386 /* In 64bit mode we can allow direct addresses of symbols and labels
12387 when they are not dynamic symbols. */
12388 if (TARGET_64BIT)
12390 rtx op0 = disp, op1;
12392 switch (GET_CODE (disp))
12394 case LABEL_REF:
12395 return true;
12397 case CONST:
12398 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12399 break;
12400 op0 = XEXP (XEXP (disp, 0), 0);
12401 op1 = XEXP (XEXP (disp, 0), 1);
12402 if (!CONST_INT_P (op1)
12403 || INTVAL (op1) >= 16*1024*1024
12404 || INTVAL (op1) < -16*1024*1024)
12405 break;
12406 if (GET_CODE (op0) == LABEL_REF)
12407 return true;
12408 if (GET_CODE (op0) == CONST
12409 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12410 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12411 return true;
12412 if (GET_CODE (op0) == UNSPEC
12413 && XINT (op0, 1) == UNSPEC_PCREL)
12414 return true;
12415 if (GET_CODE (op0) != SYMBOL_REF)
12416 break;
12417 /* FALLTHRU */
12419 case SYMBOL_REF:
12420 /* TLS references should always be enclosed in UNSPEC.
12421 The dllimported symbol always needs to be resolved. */
12422 if (SYMBOL_REF_TLS_MODEL (op0)
12423 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12424 return false;
12426 if (TARGET_PECOFF)
12428 if (is_imported_p (op0))
12429 return true;
12431 if (SYMBOL_REF_FAR_ADDR_P (op0)
12432 || !SYMBOL_REF_LOCAL_P (op0))
12433 break;
12435 /* Function symbols need to be resolved only for
12436 the large model.
12437 For the small model we don't need to resolve anything
12438 here. */
12439 if ((ix86_cmodel != CM_LARGE_PIC
12440 && SYMBOL_REF_FUNCTION_P (op0))
12441 || ix86_cmodel == CM_SMALL_PIC)
12442 return true;
12443 /* Non-external symbols don't need to be resolved for
12444 the large and medium models. */
12445 if ((ix86_cmodel == CM_LARGE_PIC
12446 || ix86_cmodel == CM_MEDIUM_PIC)
12447 && !SYMBOL_REF_EXTERNAL_P (op0))
12448 return true;
12450 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12451 && SYMBOL_REF_LOCAL_P (op0)
12452 && ix86_cmodel != CM_LARGE_PIC)
12453 return true;
12454 break;
12456 default:
12457 break;
12460 if (GET_CODE (disp) != CONST)
12461 return false;
12462 disp = XEXP (disp, 0);
12464 if (TARGET_64BIT)
12466 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
12467 of GOT tables. We should not need these anyway. */
12468 if (GET_CODE (disp) != UNSPEC
12469 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12470 && XINT (disp, 1) != UNSPEC_GOTOFF
12471 && XINT (disp, 1) != UNSPEC_PCREL
12472 && XINT (disp, 1) != UNSPEC_PLTOFF))
12473 return false;
12475 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12476 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12477 return false;
12478 return true;
12481 saw_plus = false;
12482 if (GET_CODE (disp) == PLUS)
12484 if (!CONST_INT_P (XEXP (disp, 1)))
12485 return false;
12486 disp = XEXP (disp, 0);
12487 saw_plus = true;
12490 if (TARGET_MACHO && darwin_local_data_pic (disp))
12491 return true;
12493 if (GET_CODE (disp) != UNSPEC)
12494 return false;
12496 switch (XINT (disp, 1))
12498 case UNSPEC_GOT:
12499 if (saw_plus)
12500 return false;
12501 /* We need to check for both symbols and labels because VxWorks loads
12502 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12503 details. */
12504 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12505 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12506 case UNSPEC_GOTOFF:
12507 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12508 While the ABI also specifies a 32bit relocation, we don't produce it in
12509 the small PIC model at all. */
12510 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12511 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12512 && !TARGET_64BIT)
12513 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12514 return false;
12515 case UNSPEC_GOTTPOFF:
12516 case UNSPEC_GOTNTPOFF:
12517 case UNSPEC_INDNTPOFF:
12518 if (saw_plus)
12519 return false;
12520 disp = XVECEXP (disp, 0, 0);
12521 return (GET_CODE (disp) == SYMBOL_REF
12522 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12523 case UNSPEC_NTPOFF:
12524 disp = XVECEXP (disp, 0, 0);
12525 return (GET_CODE (disp) == SYMBOL_REF
12526 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12527 case UNSPEC_DTPOFF:
12528 disp = XVECEXP (disp, 0, 0);
12529 return (GET_CODE (disp) == SYMBOL_REF
12530 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12533 return false;
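/* To make the 32-bit cases above concrete (an informal illustration, not an
   exhaustive list): a @GOTOFF displacement such as
     (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF))
   is accepted, a @GOT displacement such as
     (const (unspec [(symbol_ref "f")] UNSPEC_GOT))
   is accepted only when no constant offset has been added around it, and an
   initial-exec TLS displacement is accepted only when the wrapped symbol
   really has TLS_MODEL_INITIAL_EXEC.  */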
12536 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Reloads the invalid
12537 parts of the address X in place via push_reload, leaving X unchanged if
12538 no replacement is called for. Returns true if the calling macro should
12539 goto WIN, false if it should not. */
12541 bool
12542 ix86_legitimize_reload_address (rtx x,
12543 enum machine_mode mode ATTRIBUTE_UNUSED,
12544 int opnum, int type,
12545 int ind_levels ATTRIBUTE_UNUSED)
12547 /* Reload can generate:
12549 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12550 (reg:DI 97))
12551 (reg:DI 2 cx))
12553 This RTX is rejected by ix86_legitimate_address_p due to
12554 non-strictness of base register 97. Following this rejection,
12555 reload pushes all three components into separate registers,
12556 creating an invalid memory address RTX.
12558 The following code reloads only the invalid part of the
12559 memory address RTX. */
12561 if (GET_CODE (x) == PLUS
12562 && REG_P (XEXP (x, 1))
12563 && GET_CODE (XEXP (x, 0)) == PLUS
12564 && REG_P (XEXP (XEXP (x, 0), 1)))
12566 rtx base, index;
12567 bool something_reloaded = false;
12569 base = XEXP (XEXP (x, 0), 1);
12570 if (!REG_OK_FOR_BASE_STRICT_P (base))
12572 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12573 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12574 opnum, (enum reload_type) type);
12575 something_reloaded = true;
12578 index = XEXP (x, 1);
12579 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12581 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12582 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12583 opnum, (enum reload_type) type);
12584 something_reloaded = true;
12587 gcc_assert (something_reloaded);
12588 return true;
12591 return false;
12594 /* Recognizes RTL expressions that are valid memory addresses for an
12595 instruction. The MODE argument is the machine mode for the MEM
12596 expression that wants to use this address.
12598 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12599 convert common non-canonical forms to canonical form so that they will
12600 be recognized. */
12602 static bool
12603 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12604 rtx addr, bool strict)
12606 struct ix86_address parts;
12607 rtx base, index, disp;
12608 HOST_WIDE_INT scale;
12610 if (ix86_decompose_address (addr, &parts) <= 0)
12611 /* Decomposition failed. */
12612 return false;
12614 base = parts.base;
12615 index = parts.index;
12616 disp = parts.disp;
12617 scale = parts.scale;
12619 /* Validate base register. */
12620 if (base)
12622 rtx reg;
12624 if (REG_P (base))
12625 reg = base;
12626 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12627 reg = SUBREG_REG (base);
12628 else
12629 /* Base is not a register. */
12630 return false;
12632 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12633 return false;
12635 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12636 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12637 /* Base is not valid. */
12638 return false;
12641 /* Validate index register. */
12642 if (index)
12644 rtx reg;
12646 if (REG_P (index))
12647 reg = index;
12648 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12649 reg = SUBREG_REG (index);
12650 else
12651 /* Index is not a register. */
12652 return false;
12654 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12655 return false;
12657 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12658 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12659 /* Index is not valid. */
12660 return false;
12663 /* Index and base should have the same mode. */
12664 if (base && index
12665 && GET_MODE (base) != GET_MODE (index))
12666 return false;
12668 /* Validate scale factor. */
12669 if (scale != 1)
12671 if (!index)
12672 /* Scale without index. */
12673 return false;
12675 if (scale != 2 && scale != 4 && scale != 8)
12676 /* Scale is not a valid multiplier. */
12677 return false;
12680 /* Validate displacement. */
12681 if (disp)
12683 if (GET_CODE (disp) == CONST
12684 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12685 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12686 switch (XINT (XEXP (disp, 0), 1))
12688 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12689 used. While the ABI also specifies 32bit relocations, we don't produce
12690 them at all and use IP-relative addressing instead. */
12691 case UNSPEC_GOT:
12692 case UNSPEC_GOTOFF:
12693 gcc_assert (flag_pic);
12694 if (!TARGET_64BIT)
12695 goto is_legitimate_pic;
12697 /* 64bit address unspec. */
12698 return false;
12700 case UNSPEC_GOTPCREL:
12701 case UNSPEC_PCREL:
12702 gcc_assert (flag_pic);
12703 goto is_legitimate_pic;
12705 case UNSPEC_GOTTPOFF:
12706 case UNSPEC_GOTNTPOFF:
12707 case UNSPEC_INDNTPOFF:
12708 case UNSPEC_NTPOFF:
12709 case UNSPEC_DTPOFF:
12710 break;
12712 case UNSPEC_STACK_CHECK:
12713 gcc_assert (flag_split_stack);
12714 break;
12716 default:
12717 /* Invalid address unspec. */
12718 return false;
12721 else if (SYMBOLIC_CONST (disp)
12722 && (flag_pic
12723 || (TARGET_MACHO
12724 #if TARGET_MACHO
12725 && MACHOPIC_INDIRECT
12726 && !machopic_operand_p (disp)
12727 #endif
12731 is_legitimate_pic:
12732 if (TARGET_64BIT && (index || base))
12734 /* foo@dtpoff(%rX) is ok. */
12735 if (GET_CODE (disp) != CONST
12736 || GET_CODE (XEXP (disp, 0)) != PLUS
12737 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12738 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12739 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12740 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12741 /* Non-constant pic memory reference. */
12742 return false;
12744 else if ((!TARGET_MACHO || flag_pic)
12745 && ! legitimate_pic_address_disp_p (disp))
12746 /* Displacement is an invalid pic construct. */
12747 return false;
12748 #if TARGET_MACHO
12749 else if (MACHO_DYNAMIC_NO_PIC_P
12750 && !ix86_legitimate_constant_p (Pmode, disp))
12751 /* Displacement must be referenced via non_lazy_pointer. */
12752 return false;
12753 #endif
12755 /* This code used to verify that a symbolic pic displacement
12756 includes the pic_offset_table_rtx register.
12758 While this is a good idea, unfortunately these constructs may
12759 be created by the "adds using lea" optimization for incorrect
12760 code like:
12762 int a;
12763 int foo(int i)
12765 return *(&a+i);
12768 This code is nonsensical, but results in addressing the
12769 GOT table with a pic_offset_table_rtx base. We can't
12770 just refuse it easily, since it gets matched by the
12771 "addsi3" pattern, which later gets split to lea when
12772 the output register differs from the input. While this
12773 could be handled by a separate addsi pattern for this case
12774 that never results in lea, disabling this test seems to be
12775 the easier and correct fix for the crash. */
12777 else if (GET_CODE (disp) != LABEL_REF
12778 && !CONST_INT_P (disp)
12779 && (GET_CODE (disp) != CONST
12780 || !ix86_legitimate_constant_p (Pmode, disp))
12781 && (GET_CODE (disp) != SYMBOL_REF
12782 || !ix86_legitimate_constant_p (Pmode, disp)))
12783 /* Displacement is not constant. */
12784 return false;
12785 else if (TARGET_64BIT
12786 && !x86_64_immediate_operand (disp, VOIDmode))
12787 /* Displacement is out of range. */
12788 return false;
12789 /* In x32 mode, constant addresses are sign extended to 64bit, so
12790 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12791 else if (TARGET_X32 && !(index || base)
12792 && CONST_INT_P (disp)
12793 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12794 return false;
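/* For example (an illustrative value, not from a testcase): in x32 mode a
   bare displacement of (const_int 0x80000000) is rejected here, since the
   64-bit sign-extended address would be 0xffffffff80000000.  */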
12797 /* Everything looks valid. */
12798 return true;
12801 /* Determine if a given RTX is a valid constant address. */
12803 bool
12804 constant_address_p (rtx x)
12806 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12809 /* Return a unique alias set for the GOT. */
12811 static alias_set_type
12812 ix86_GOT_alias_set (void)
12814 static alias_set_type set = -1;
12815 if (set == -1)
12816 set = new_alias_set ();
12817 return set;
12820 /* Return a legitimate reference for ORIG (an address) using the
12821 register REG. If REG is 0, a new pseudo is generated.
12823 There are two types of references that must be handled:
12825 1. Global data references must load the address from the GOT, via
12826 the PIC reg. An insn is emitted to do this load, and the reg is
12827 returned.
12829 2. Static data references, constant pool addresses, and code labels
12830 compute the address as an offset from the GOT, whose base is in
12831 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12832 differentiate them from global data objects. The returned
12833 address is the PIC reg + an unspec constant.
12835 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12836 reg also appears in the address. */
12838 static rtx
12839 legitimize_pic_address (rtx orig, rtx reg)
12841 rtx addr = orig;
12842 rtx new_rtx = orig;
12844 #if TARGET_MACHO
12845 if (TARGET_MACHO && !TARGET_64BIT)
12847 if (reg == 0)
12848 reg = gen_reg_rtx (Pmode);
12849 /* Use the generic Mach-O PIC machinery. */
12850 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12852 #endif
12854 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12856 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12857 if (tmp)
12858 return tmp;
12861 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12862 new_rtx = addr;
12863 else if (TARGET_64BIT && !TARGET_PECOFF
12864 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12866 rtx tmpreg;
12867 /* This symbol may be referenced via a displacement from the PIC
12868 base address (@GOTOFF). */
12870 if (reload_in_progress)
12871 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12872 if (GET_CODE (addr) == CONST)
12873 addr = XEXP (addr, 0);
12874 if (GET_CODE (addr) == PLUS)
12876 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12877 UNSPEC_GOTOFF);
12878 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12880 else
12881 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12882 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12883 if (!reg)
12884 tmpreg = gen_reg_rtx (Pmode);
12885 else
12886 tmpreg = reg;
12887 emit_move_insn (tmpreg, new_rtx);
12889 if (reg != 0)
12891 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12892 tmpreg, 1, OPTAB_DIRECT);
12893 new_rtx = reg;
12895 else
12896 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12898 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12900 /* This symbol may be referenced via a displacement from the PIC
12901 base address (@GOTOFF). */
12903 if (reload_in_progress)
12904 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12905 if (GET_CODE (addr) == CONST)
12906 addr = XEXP (addr, 0);
12907 if (GET_CODE (addr) == PLUS)
12909 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12910 UNSPEC_GOTOFF);
12911 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12913 else
12914 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12915 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12916 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12918 if (reg != 0)
12920 emit_move_insn (reg, new_rtx);
12921 new_rtx = reg;
12924 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12925 /* We can't use @GOTOFF for text labels on VxWorks;
12926 see gotoff_operand. */
12927 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12929 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12930 if (tmp)
12931 return tmp;
12933 /* For x64 PE-COFF there is no GOT table, so we use the address
12934 directly. */
12935 if (TARGET_64BIT && TARGET_PECOFF)
12937 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12938 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12940 if (reg == 0)
12941 reg = gen_reg_rtx (Pmode);
12942 emit_move_insn (reg, new_rtx);
12943 new_rtx = reg;
12945 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12947 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12948 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12949 new_rtx = gen_const_mem (Pmode, new_rtx);
12950 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12952 if (reg == 0)
12953 reg = gen_reg_rtx (Pmode);
12954 /* Use gen_movsi directly; otherwise the address is loaded
12955 into a register for CSE. We don't want to CSE these addresses;
12956 instead we CSE addresses from the GOT table, so skip this. */
12957 emit_insn (gen_movsi (reg, new_rtx));
12958 new_rtx = reg;
12960 else
12962 /* This symbol must be referenced via a load from the
12963 Global Offset Table (@GOT). */
12965 if (reload_in_progress)
12966 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12967 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12968 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12969 if (TARGET_64BIT)
12970 new_rtx = force_reg (Pmode, new_rtx);
12971 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12972 new_rtx = gen_const_mem (Pmode, new_rtx);
12973 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12975 if (reg == 0)
12976 reg = gen_reg_rtx (Pmode);
12977 emit_move_insn (reg, new_rtx);
12978 new_rtx = reg;
12981 else
12983 if (CONST_INT_P (addr)
12984 && !x86_64_immediate_operand (addr, VOIDmode))
12986 if (reg)
12988 emit_move_insn (reg, addr);
12989 new_rtx = reg;
12991 else
12992 new_rtx = force_reg (Pmode, addr);
12994 else if (GET_CODE (addr) == CONST)
12996 addr = XEXP (addr, 0);
12998 /* We must match what we generated before. Assume the only
12999 unspecs that can get here are ours; not that we could do
13000 anything with them anyway.... */
13001 if (GET_CODE (addr) == UNSPEC
13002 || (GET_CODE (addr) == PLUS
13003 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
13004 return orig;
13005 gcc_assert (GET_CODE (addr) == PLUS);
13007 if (GET_CODE (addr) == PLUS)
13009 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
13011 /* Check first to see if this is a constant offset from a @GOTOFF
13012 symbol reference. */
13013 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
13014 && CONST_INT_P (op1))
13016 if (!TARGET_64BIT)
13018 if (reload_in_progress)
13019 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13020 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
13021 UNSPEC_GOTOFF);
13022 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
13023 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13024 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13026 if (reg != 0)
13028 emit_move_insn (reg, new_rtx);
13029 new_rtx = reg;
13032 else
13034 if (INTVAL (op1) < -16*1024*1024
13035 || INTVAL (op1) >= 16*1024*1024)
13037 if (!x86_64_immediate_operand (op1, Pmode))
13038 op1 = force_reg (Pmode, op1);
13039 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
13043 else
13045 rtx base = legitimize_pic_address (op0, reg);
13046 enum machine_mode mode = GET_MODE (base);
13047 new_rtx
13048 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
13050 if (CONST_INT_P (new_rtx))
13052 if (INTVAL (new_rtx) < -16*1024*1024
13053 || INTVAL (new_rtx) >= 16*1024*1024)
13055 if (!x86_64_immediate_operand (new_rtx, mode))
13056 new_rtx = force_reg (mode, new_rtx);
13057 new_rtx
13058 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
13060 else
13061 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
13063 else
13065 if (GET_CODE (new_rtx) == PLUS
13066 && CONSTANT_P (XEXP (new_rtx, 1)))
13068 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
13069 new_rtx = XEXP (new_rtx, 1);
13071 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
13076 return new_rtx;
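/* To make the cases above concrete, a rough sketch of the 32-bit ELF forms
   this function builds (illustrative; %ebx stands for the PIC register):

     global data:  (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT))))
                   i.e. a load of the form   movl  foo@GOT(%ebx), %reg
     local data:   (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))
                   i.e. an address of the form   leal  bar@GOTOFF(%ebx), %reg

   The 64-bit non-large-model path instead builds an @GOTPCREL memory
   reference, and 64-bit PE-COFF uses a plain UNSPEC_PCREL address.  */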
13079 /* Load the thread pointer. If TO_REG is true, force it into a register. */
13081 static rtx
13082 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
13084 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
13086 if (GET_MODE (tp) != tp_mode)
13088 gcc_assert (GET_MODE (tp) == SImode);
13089 gcc_assert (tp_mode == DImode);
13091 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13094 if (to_reg)
13095 tp = copy_to_mode_reg (tp_mode, tp);
13097 return tp;
13100 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13102 static GTY(()) rtx ix86_tls_symbol;
13104 static rtx
13105 ix86_tls_get_addr (void)
13107 if (!ix86_tls_symbol)
13109 const char *sym
13110 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13111 ? "___tls_get_addr" : "__tls_get_addr");
13113 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13116 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13118 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13119 UNSPEC_PLTOFF);
13120 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13121 gen_rtx_CONST (Pmode, unspec));
13124 return ix86_tls_symbol;
13127 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13129 static GTY(()) rtx ix86_tls_module_base_symbol;
13132 ix86_tls_module_base (void)
13134 if (!ix86_tls_module_base_symbol)
13136 ix86_tls_module_base_symbol
13137 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13139 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13140 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13143 return ix86_tls_module_base_symbol;
13146 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13147 false if we expect this to be used for a memory address and true if
13148 we expect to load the address into a register. */
13150 static rtx
13151 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13153 rtx dest, base, off;
13154 rtx pic = NULL_RTX, tp = NULL_RTX;
13155 enum machine_mode tp_mode = Pmode;
13156 int type;
13158 switch (model)
13160 case TLS_MODEL_GLOBAL_DYNAMIC:
13161 dest = gen_reg_rtx (Pmode);
13163 if (!TARGET_64BIT)
13165 if (flag_pic && !TARGET_PECOFF)
13166 pic = pic_offset_table_rtx;
13167 else
13169 pic = gen_reg_rtx (Pmode);
13170 emit_insn (gen_set_got (pic));
13174 if (TARGET_GNU2_TLS)
13176 if (TARGET_64BIT)
13177 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13178 else
13179 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13181 tp = get_thread_pointer (Pmode, true);
13182 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13184 if (GET_MODE (x) != Pmode)
13185 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13187 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13189 else
13191 rtx caddr = ix86_tls_get_addr ();
13193 if (TARGET_64BIT)
13195 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13196 rtx insns;
13198 start_sequence ();
13199 emit_call_insn
13200 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13201 insns = get_insns ();
13202 end_sequence ();
13204 if (GET_MODE (x) != Pmode)
13205 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13207 RTL_CONST_CALL_P (insns) = 1;
13208 emit_libcall_block (insns, dest, rax, x);
13210 else
13211 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13213 break;
13215 case TLS_MODEL_LOCAL_DYNAMIC:
13216 base = gen_reg_rtx (Pmode);
13218 if (!TARGET_64BIT)
13220 if (flag_pic)
13221 pic = pic_offset_table_rtx;
13222 else
13224 pic = gen_reg_rtx (Pmode);
13225 emit_insn (gen_set_got (pic));
13229 if (TARGET_GNU2_TLS)
13231 rtx tmp = ix86_tls_module_base ();
13233 if (TARGET_64BIT)
13234 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13235 else
13236 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13238 tp = get_thread_pointer (Pmode, true);
13239 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13240 gen_rtx_MINUS (Pmode, tmp, tp));
13242 else
13244 rtx caddr = ix86_tls_get_addr ();
13246 if (TARGET_64BIT)
13248 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13249 rtx insns, eqv;
13251 start_sequence ();
13252 emit_call_insn
13253 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13254 insns = get_insns ();
13255 end_sequence ();
13257 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13258 share the LD_BASE result with other LD model accesses. */
13259 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13260 UNSPEC_TLS_LD_BASE);
13262 RTL_CONST_CALL_P (insns) = 1;
13263 emit_libcall_block (insns, base, rax, eqv);
13265 else
13266 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13269 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13270 off = gen_rtx_CONST (Pmode, off);
13272 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13274 if (TARGET_GNU2_TLS)
13276 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13278 if (GET_MODE (x) != Pmode)
13279 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13281 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13283 break;
13285 case TLS_MODEL_INITIAL_EXEC:
13286 if (TARGET_64BIT)
13288 if (TARGET_SUN_TLS && !TARGET_X32)
13290 /* The Sun linker took the AMD64 TLS spec literally
13291 and can only handle %rax as the destination of the
13292 initial-exec code sequence. */
13294 dest = gen_reg_rtx (DImode);
13295 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13296 return dest;
13299 /* Generate DImode references to avoid %fs:(%reg32)
13300 problems and linker IE->LE relaxation bug. */
13301 tp_mode = DImode;
13302 pic = NULL;
13303 type = UNSPEC_GOTNTPOFF;
13305 else if (flag_pic)
13307 if (reload_in_progress)
13308 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13309 pic = pic_offset_table_rtx;
13310 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13312 else if (!TARGET_ANY_GNU_TLS)
13314 pic = gen_reg_rtx (Pmode);
13315 emit_insn (gen_set_got (pic));
13316 type = UNSPEC_GOTTPOFF;
13318 else
13320 pic = NULL;
13321 type = UNSPEC_INDNTPOFF;
13324 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13325 off = gen_rtx_CONST (tp_mode, off);
13326 if (pic)
13327 off = gen_rtx_PLUS (tp_mode, pic, off);
13328 off = gen_const_mem (tp_mode, off);
13329 set_mem_alias_set (off, ix86_GOT_alias_set ());
13331 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13333 base = get_thread_pointer (tp_mode,
13334 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13335 off = force_reg (tp_mode, off);
13336 return gen_rtx_PLUS (tp_mode, base, off);
13338 else
13340 base = get_thread_pointer (Pmode, true);
13341 dest = gen_reg_rtx (Pmode);
13342 emit_insn (ix86_gen_sub3 (dest, base, off));
13344 break;
13346 case TLS_MODEL_LOCAL_EXEC:
13347 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13348 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13349 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13350 off = gen_rtx_CONST (Pmode, off);
13352 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13354 base = get_thread_pointer (Pmode,
13355 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13356 return gen_rtx_PLUS (Pmode, base, off);
13358 else
13360 base = get_thread_pointer (Pmode, true);
13361 dest = gen_reg_rtx (Pmode);
13362 emit_insn (ix86_gen_sub3 (dest, base, off));
13364 break;
13366 default:
13367 gcc_unreachable ();
13370 return dest;
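/* A brief illustration of the simplest case above (a sketch for GNU TLS
   targets, not an exact dump): for TLS_MODEL_LOCAL_EXEC with
   TARGET_ANY_GNU_TLS the result has the form

     (plus (unspec [(const_int 0)] UNSPEC_TP)
           (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)))

   i.e. the thread pointer plus an @tpoff/@ntpoff displacement, which can
   later be emitted as a segment-relative memory reference when
   TARGET_TLS_DIRECT_SEG_REFS allows it.  */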
13373 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13374 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13375 unique refptr-DECL symbol corresponding to symbol DECL. */
13377 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13378 htab_t dllimport_map;
13380 static tree
13381 get_dllimport_decl (tree decl, bool beimport)
13383 struct tree_map *h, in;
13384 void **loc;
13385 const char *name;
13386 const char *prefix;
13387 size_t namelen, prefixlen;
13388 char *imp_name;
13389 tree to;
13390 rtx rtl;
13392 if (!dllimport_map)
13393 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13395 in.hash = htab_hash_pointer (decl);
13396 in.base.from = decl;
13397 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13398 h = (struct tree_map *) *loc;
13399 if (h)
13400 return h->to;
13402 *loc = h = ggc_alloc_tree_map ();
13403 h->hash = in.hash;
13404 h->base.from = decl;
13405 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13406 VAR_DECL, NULL, ptr_type_node);
13407 DECL_ARTIFICIAL (to) = 1;
13408 DECL_IGNORED_P (to) = 1;
13409 DECL_EXTERNAL (to) = 1;
13410 TREE_READONLY (to) = 1;
13412 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13413 name = targetm.strip_name_encoding (name);
13414 if (beimport)
13415 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13416 ? "*__imp_" : "*__imp__";
13417 else
13418 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13419 namelen = strlen (name);
13420 prefixlen = strlen (prefix);
13421 imp_name = (char *) alloca (namelen + prefixlen + 1);
13422 memcpy (imp_name, prefix, prefixlen);
13423 memcpy (imp_name + prefixlen, name, namelen + 1);
13425 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13426 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13427 SET_SYMBOL_REF_DECL (rtl, to);
13428 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13429 if (!beimport)
13431 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13432 #ifdef SUB_TARGET_RECORD_STUB
13433 SUB_TARGET_RECORD_STUB (name);
13434 #endif
13437 rtl = gen_const_mem (Pmode, rtl);
13438 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13440 SET_DECL_RTL (to, rtl);
13441 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13443 return to;
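/* For illustration (the exact spelling depends on the target's
   user_label_prefix): a dllimported symbol foo is referenced through a stub
   named "*__imp_foo" or "*__imp__foo", while an external refptr stub is
   named "*refptr.foo" or "*.refptr.foo"; in both cases the decl's RTL is a
   const mem of that symbol, sharing the GOT alias set.  */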
13446 /* Expand SYMBOL into its corresponding far-address symbol.
13447 WANT_REG is true if we require the result to be a register. */
13449 static rtx
13450 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13452 tree imp_decl;
13453 rtx x;
13455 gcc_assert (SYMBOL_REF_DECL (symbol));
13456 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13458 x = DECL_RTL (imp_decl);
13459 if (want_reg)
13460 x = force_reg (Pmode, x);
13461 return x;
13464 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13465 true if we require the result to be a register. */
13467 static rtx
13468 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13470 tree imp_decl;
13471 rtx x;
13473 gcc_assert (SYMBOL_REF_DECL (symbol));
13474 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13476 x = DECL_RTL (imp_decl);
13477 if (want_reg)
13478 x = force_reg (Pmode, x);
13479 return x;
13482 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
13483 is true if we require the result to be a register. */
13485 static rtx
13486 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13488 if (!TARGET_PECOFF)
13489 return NULL_RTX;
13491 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13493 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13494 return legitimize_dllimport_symbol (addr, inreg);
13495 if (GET_CODE (addr) == CONST
13496 && GET_CODE (XEXP (addr, 0)) == PLUS
13497 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13498 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13500 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13501 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13505 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13506 return NULL_RTX;
13507 if (GET_CODE (addr) == SYMBOL_REF
13508 && !is_imported_p (addr)
13509 && SYMBOL_REF_EXTERNAL_P (addr)
13510 && SYMBOL_REF_DECL (addr))
13511 return legitimize_pe_coff_extern_decl (addr, inreg);
13513 if (GET_CODE (addr) == CONST
13514 && GET_CODE (XEXP (addr, 0)) == PLUS
13515 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13516 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13517 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13518 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13520 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13521 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13523 return NULL_RTX;
13526 /* Try machine-dependent ways of modifying an illegitimate address
13527 to be legitimate. If we find one, return the new, valid address.
13528 This macro is used in only one place: `memory_address' in explow.c.
13530 OLDX is the address as it was before break_out_memory_refs was called.
13531 In some cases it is useful to look at this to decide what needs to be done.
13533 It is always safe for this macro to do nothing. It exists to recognize
13534 opportunities to optimize the output.
13536 For the 80386, we handle X+REG by loading X into a register R and
13537 using R+REG. R will go in a general reg and indexing will be used.
13538 However, if REG is a broken-out memory address or multiplication,
13539 nothing needs to be done because REG can certainly go in a general reg.
13541 When -fpic is used, special handling is needed for symbolic references.
13542 See comments by legitimize_pic_address in i386.c for details. */
13544 static rtx
13545 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13546 enum machine_mode mode)
13548 int changed = 0;
13549 unsigned log;
13551 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13552 if (log)
13553 return legitimize_tls_address (x, (enum tls_model) log, false);
13554 if (GET_CODE (x) == CONST
13555 && GET_CODE (XEXP (x, 0)) == PLUS
13556 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13557 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13559 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13560 (enum tls_model) log, false);
13561 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13564 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13566 rtx tmp = legitimize_pe_coff_symbol (x, true);
13567 if (tmp)
13568 return tmp;
13571 if (flag_pic && SYMBOLIC_CONST (x))
13572 return legitimize_pic_address (x, 0);
13574 #if TARGET_MACHO
13575 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13576 return machopic_indirect_data_reference (x, 0);
13577 #endif
13579 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13580 if (GET_CODE (x) == ASHIFT
13581 && CONST_INT_P (XEXP (x, 1))
13582 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13584 changed = 1;
13585 log = INTVAL (XEXP (x, 1));
13586 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13587 GEN_INT (1 << log));
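/* E.g. (an illustrative rewrite) (ashift (reg) (const_int 2)) becomes
   (mult (reg) (const_int 4)), matching the index*scale form that the
   address decomposition and validation code above accepts.  */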
13590 if (GET_CODE (x) == PLUS)
13592 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13594 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13595 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13596 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13598 changed = 1;
13599 log = INTVAL (XEXP (XEXP (x, 0), 1));
13600 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13601 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13602 GEN_INT (1 << log));
13605 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13606 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13607 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13609 changed = 1;
13610 log = INTVAL (XEXP (XEXP (x, 1), 1));
13611 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13612 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13613 GEN_INT (1 << log));
13616 /* Put multiply first if it isn't already. */
13617 if (GET_CODE (XEXP (x, 1)) == MULT)
13619 rtx tmp = XEXP (x, 0);
13620 XEXP (x, 0) = XEXP (x, 1);
13621 XEXP (x, 1) = tmp;
13622 changed = 1;
13625 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13626 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13627 created by virtual register instantiation, register elimination, and
13628 similar optimizations. */
13629 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13631 changed = 1;
13632 x = gen_rtx_PLUS (Pmode,
13633 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13634 XEXP (XEXP (x, 1), 0)),
13635 XEXP (XEXP (x, 1), 1));
13638 /* Canonicalize
13639 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13640 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13641 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13642 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13643 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13644 && CONSTANT_P (XEXP (x, 1)))
13646 rtx constant;
13647 rtx other = NULL_RTX;
13649 if (CONST_INT_P (XEXP (x, 1)))
13651 constant = XEXP (x, 1);
13652 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13654 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13656 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13657 other = XEXP (x, 1);
13659 else
13660 constant = 0;
13662 if (constant)
13664 changed = 1;
13665 x = gen_rtx_PLUS (Pmode,
13666 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13667 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13668 plus_constant (Pmode, other,
13669 INTVAL (constant)));
13673 if (changed && ix86_legitimate_address_p (mode, x, false))
13674 return x;
13676 if (GET_CODE (XEXP (x, 0)) == MULT)
13678 changed = 1;
13679 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13682 if (GET_CODE (XEXP (x, 1)) == MULT)
13684 changed = 1;
13685 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13688 if (changed
13689 && REG_P (XEXP (x, 1))
13690 && REG_P (XEXP (x, 0)))
13691 return x;
13693 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13695 changed = 1;
13696 x = legitimize_pic_address (x, 0);
13699 if (changed && ix86_legitimate_address_p (mode, x, false))
13700 return x;
13702 if (REG_P (XEXP (x, 0)))
13704 rtx temp = gen_reg_rtx (Pmode);
13705 rtx val = force_operand (XEXP (x, 1), temp);
13706 if (val != temp)
13708 val = convert_to_mode (Pmode, val, 1);
13709 emit_move_insn (temp, val);
13712 XEXP (x, 1) = temp;
13713 return x;
13716 else if (REG_P (XEXP (x, 1)))
13718 rtx temp = gen_reg_rtx (Pmode);
13719 rtx val = force_operand (XEXP (x, 0), temp);
13720 if (val != temp)
13722 val = convert_to_mode (Pmode, val, 1);
13723 emit_move_insn (temp, val);
13726 XEXP (x, 0) = temp;
13727 return x;
13731 return x;
13734 /* Print an integer constant expression in assembler syntax. Addition
13735 and subtraction are the only arithmetic that may appear in these
13736 expressions. FILE is the stdio stream to write to, X is the rtx, and
13737 CODE is the operand print code from the output string. */
13739 static void
13740 output_pic_addr_const (FILE *file, rtx x, int code)
13742 char buf[256];
13744 switch (GET_CODE (x))
13746 case PC:
13747 gcc_assert (flag_pic);
13748 putc ('.', file);
13749 break;
13751 case SYMBOL_REF:
13752 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13753 output_addr_const (file, x);
13754 else
13756 const char *name = XSTR (x, 0);
13758 /* Mark the decl as referenced so that cgraph will
13759 output the function. */
13760 if (SYMBOL_REF_DECL (x))
13761 mark_decl_referenced (SYMBOL_REF_DECL (x));
13763 #if TARGET_MACHO
13764 if (MACHOPIC_INDIRECT
13765 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13766 name = machopic_indirection_name (x, /*stub_p=*/true);
13767 #endif
13768 assemble_name (file, name);
13770 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13771 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13772 fputs ("@PLT", file);
13773 break;
13775 case LABEL_REF:
13776 x = XEXP (x, 0);
13777 /* FALLTHRU */
13778 case CODE_LABEL:
13779 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13780 assemble_name (asm_out_file, buf);
13781 break;
13783 case CONST_INT:
13784 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13785 break;
13787 case CONST:
13788 /* This used to output parentheses around the expression,
13789 but that does not work on the 386 (either ATT or BSD assembler). */
13790 output_pic_addr_const (file, XEXP (x, 0), code);
13791 break;
13793 case CONST_DOUBLE:
13794 if (GET_MODE (x) == VOIDmode)
13796 /* We can use %d if the number is <32 bits and positive. */
13797 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13798 fprintf (file, "0x%lx%08lx",
13799 (unsigned long) CONST_DOUBLE_HIGH (x),
13800 (unsigned long) CONST_DOUBLE_LOW (x));
13801 else
13802 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13804 else
13805 /* We can't handle floating point constants;
13806 TARGET_PRINT_OPERAND must handle them. */
13807 output_operand_lossage ("floating constant misused");
13808 break;
13810 case PLUS:
13811 /* Some assemblers need integer constants to appear first. */
13812 if (CONST_INT_P (XEXP (x, 0)))
13814 output_pic_addr_const (file, XEXP (x, 0), code);
13815 putc ('+', file);
13816 output_pic_addr_const (file, XEXP (x, 1), code);
13818 else
13820 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13821 output_pic_addr_const (file, XEXP (x, 1), code);
13822 putc ('+', file);
13823 output_pic_addr_const (file, XEXP (x, 0), code);
13825 break;
13827 case MINUS:
13828 if (!TARGET_MACHO)
13829 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13830 output_pic_addr_const (file, XEXP (x, 0), code);
13831 putc ('-', file);
13832 output_pic_addr_const (file, XEXP (x, 1), code);
13833 if (!TARGET_MACHO)
13834 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13835 break;
13837 case UNSPEC:
13838 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13840 bool f = i386_asm_output_addr_const_extra (file, x);
13841 gcc_assert (f);
13842 break;
13845 gcc_assert (XVECLEN (x, 0) == 1);
13846 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13847 switch (XINT (x, 1))
13849 case UNSPEC_GOT:
13850 fputs ("@GOT", file);
13851 break;
13852 case UNSPEC_GOTOFF:
13853 fputs ("@GOTOFF", file);
13854 break;
13855 case UNSPEC_PLTOFF:
13856 fputs ("@PLTOFF", file);
13857 break;
13858 case UNSPEC_PCREL:
13859 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13860 "(%rip)" : "[rip]", file);
13861 break;
13862 case UNSPEC_GOTPCREL:
13863 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13864 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13865 break;
13866 case UNSPEC_GOTTPOFF:
13867 /* FIXME: This might be @TPOFF in Sun ld too. */
13868 fputs ("@gottpoff", file);
13869 break;
13870 case UNSPEC_TPOFF:
13871 fputs ("@tpoff", file);
13872 break;
13873 case UNSPEC_NTPOFF:
13874 if (TARGET_64BIT)
13875 fputs ("@tpoff", file);
13876 else
13877 fputs ("@ntpoff", file);
13878 break;
13879 case UNSPEC_DTPOFF:
13880 fputs ("@dtpoff", file);
13881 break;
13882 case UNSPEC_GOTNTPOFF:
13883 if (TARGET_64BIT)
13884 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13885 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13886 else
13887 fputs ("@gotntpoff", file);
13888 break;
13889 case UNSPEC_INDNTPOFF:
13890 fputs ("@indntpoff", file);
13891 break;
13892 #if TARGET_MACHO
13893 case UNSPEC_MACHOPIC_OFFSET:
13894 putc ('-', file);
13895 machopic_output_function_base_name (file);
13896 break;
13897 #endif
13898 default:
13899 output_operand_lossage ("invalid UNSPEC as operand");
13900 break;
13902 break;
13904 default:
13905 output_operand_lossage ("invalid expression as operand");
13909 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13910 We need to emit DTP-relative relocations. */
13912 static void ATTRIBUTE_UNUSED
13913 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13915 fputs (ASM_LONG, file);
13916 output_addr_const (file, x);
13917 fputs ("@dtpoff", file);
13918 switch (size)
13920 case 4:
13921 break;
13922 case 8:
13923 fputs (", 0", file);
13924 break;
13925 default:
13926 gcc_unreachable ();
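/* So a 4-byte request emits something like "<ASM_LONG> x@dtpoff" and an
   8-byte request "<ASM_LONG> x@dtpoff, 0" (sketched with ASM_LONG left
   symbolic, since its exact spelling is target-dependent).  */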
13930 /* Return true if X is a representation of the PIC register. This copes
13931 with calls from ix86_find_base_term, where the register might have
13932 been replaced by a cselib value. */
13934 static bool
13935 ix86_pic_register_p (rtx x)
13937 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13938 return (pic_offset_table_rtx
13939 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13940 else
13941 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13944 /* Helper function for ix86_delegitimize_address.
13945 Attempt to delegitimize TLS local-exec accesses. */
13947 static rtx
13948 ix86_delegitimize_tls_address (rtx orig_x)
13950 rtx x = orig_x, unspec;
13951 struct ix86_address addr;
13953 if (!TARGET_TLS_DIRECT_SEG_REFS)
13954 return orig_x;
13955 if (MEM_P (x))
13956 x = XEXP (x, 0);
13957 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13958 return orig_x;
13959 if (ix86_decompose_address (x, &addr) == 0
13960 || addr.seg != DEFAULT_TLS_SEG_REG
13961 || addr.disp == NULL_RTX
13962 || GET_CODE (addr.disp) != CONST)
13963 return orig_x;
13964 unspec = XEXP (addr.disp, 0);
13965 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13966 unspec = XEXP (unspec, 0);
13967 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13968 return orig_x;
13969 x = XVECEXP (unspec, 0, 0);
13970 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13971 if (unspec != XEXP (addr.disp, 0))
13972 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13973 if (addr.index)
13975 rtx idx = addr.index;
13976 if (addr.scale != 1)
13977 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13978 x = gen_rtx_PLUS (Pmode, idx, x);
13980 if (addr.base)
13981 x = gen_rtx_PLUS (Pmode, addr.base, x);
13982 if (MEM_P (orig_x))
13983 x = replace_equiv_address_nv (orig_x, x);
13984 return x;
13987 /* In the name of slightly smaller debug output, and to cater to
13988 general assembler lossage, recognize PIC+GOTOFF and turn it back
13989 into a direct symbol reference.
13991 On Darwin, this is necessary to avoid a crash, because Darwin
13992 has a different PIC label for each routine but the DWARF debugging
13993 information is not associated with any particular routine, so it's
13994 necessary to remove references to the PIC label from RTL stored by
13995 the DWARF output code. */
13997 static rtx
13998 ix86_delegitimize_address (rtx x)
14000 rtx orig_x = delegitimize_mem_from_attrs (x);
14001 /* addend is NULL or some rtx if x is something+GOTOFF where
14002 something doesn't include the PIC register. */
14003 rtx addend = NULL_RTX;
14004 /* reg_addend is NULL or a multiple of some register. */
14005 rtx reg_addend = NULL_RTX;
14006 /* const_addend is NULL or a const_int. */
14007 rtx const_addend = NULL_RTX;
14008 /* This is the result, or NULL. */
14009 rtx result = NULL_RTX;
14011 x = orig_x;
14013 if (MEM_P (x))
14014 x = XEXP (x, 0);
14016 if (TARGET_64BIT)
14018 if (GET_CODE (x) == CONST
14019 && GET_CODE (XEXP (x, 0)) == PLUS
14020 && GET_MODE (XEXP (x, 0)) == Pmode
14021 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14022 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
14023 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
14025 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
14026 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
14027 if (MEM_P (orig_x))
14028 x = replace_equiv_address_nv (orig_x, x);
14029 return x;
14032 if (GET_CODE (x) == CONST
14033 && GET_CODE (XEXP (x, 0)) == UNSPEC
14034 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
14035 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
14036 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
14038 x = XVECEXP (XEXP (x, 0), 0, 0);
14039 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
14041 x = simplify_gen_subreg (GET_MODE (orig_x), x,
14042 GET_MODE (x), 0);
14043 if (x == NULL_RTX)
14044 return orig_x;
14046 return x;
14049 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
14050 return ix86_delegitimize_tls_address (orig_x);
14052 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
14053 and -mcmodel=medium -fpic. */
14056 if (GET_CODE (x) != PLUS
14057 || GET_CODE (XEXP (x, 1)) != CONST)
14058 return ix86_delegitimize_tls_address (orig_x);
14060 if (ix86_pic_register_p (XEXP (x, 0)))
14061 /* %ebx + GOT/GOTOFF */
14063 else if (GET_CODE (XEXP (x, 0)) == PLUS)
14065 /* %ebx + %reg * scale + GOT/GOTOFF */
14066 reg_addend = XEXP (x, 0);
14067 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
14068 reg_addend = XEXP (reg_addend, 1);
14069 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
14070 reg_addend = XEXP (reg_addend, 0);
14071 else
14073 reg_addend = NULL_RTX;
14074 addend = XEXP (x, 0);
14077 else
14078 addend = XEXP (x, 0);
14080 x = XEXP (XEXP (x, 1), 0);
14081 if (GET_CODE (x) == PLUS
14082 && CONST_INT_P (XEXP (x, 1)))
14084 const_addend = XEXP (x, 1);
14085 x = XEXP (x, 0);
14088 if (GET_CODE (x) == UNSPEC
14089 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14090 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
14091 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
14092 && !MEM_P (orig_x) && !addend)))
14093 result = XVECEXP (x, 0, 0);
14095 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
14096 && !MEM_P (orig_x))
14097 result = XVECEXP (x, 0, 0);
14099 if (! result)
14100 return ix86_delegitimize_tls_address (orig_x);
14102 if (const_addend)
14103 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14104 if (reg_addend)
14105 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14106 if (addend)
14108 /* If the rest of original X doesn't involve the PIC register, add
14109 addend and subtract pic_offset_table_rtx. This can happen e.g.
14110 for code like:
14111 leal (%ebx, %ecx, 4), %ecx
14113 movl foo@GOTOFF(%ecx), %edx
14114 in which case we return (%ecx - %ebx) + foo. */
14115 if (pic_offset_table_rtx)
14116 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14117 pic_offset_table_rtx),
14118 result);
14119 else
14120 return orig_x;
14122 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14124 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14125 if (result == NULL_RTX)
14126 return orig_x;
14128 return result;
14131 /* If X is a machine specific address (i.e. a symbol or label being
14132 referenced as a displacement from the GOT implemented using an
14133 UNSPEC), then return the base term. Otherwise return X. */
14136 ix86_find_base_term (rtx x)
14138 rtx term;
14140 if (TARGET_64BIT)
14142 if (GET_CODE (x) != CONST)
14143 return x;
14144 term = XEXP (x, 0);
14145 if (GET_CODE (term) == PLUS
14146 && (CONST_INT_P (XEXP (term, 1))
14147 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14148 term = XEXP (term, 0);
14149 if (GET_CODE (term) != UNSPEC
14150 || (XINT (term, 1) != UNSPEC_GOTPCREL
14151 && XINT (term, 1) != UNSPEC_PCREL))
14152 return x;
14154 return XVECEXP (term, 0, 0);
14157 return ix86_delegitimize_address (x);
14160 static void
14161 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14162 bool fp, FILE *file)
14164 const char *suffix;
14166 if (mode == CCFPmode || mode == CCFPUmode)
14168 code = ix86_fp_compare_code_to_integer (code);
14169 mode = CCmode;
14171 if (reverse)
14172 code = reverse_condition (code);
14174 switch (code)
14176 case EQ:
14177 switch (mode)
14179 case CCAmode:
14180 suffix = "a";
14181 break;
14183 case CCCmode:
14184 suffix = "c";
14185 break;
14187 case CCOmode:
14188 suffix = "o";
14189 break;
14191 case CCSmode:
14192 suffix = "s";
14193 break;
14195 default:
14196 suffix = "e";
14198 break;
14199 case NE:
14200 switch (mode)
14202 case CCAmode:
14203 suffix = "na";
14204 break;
14206 case CCCmode:
14207 suffix = "nc";
14208 break;
14210 case CCOmode:
14211 suffix = "no";
14212 break;
14214 case CCSmode:
14215 suffix = "ns";
14216 break;
14218 default:
14219 suffix = "ne";
14221 break;
14222 case GT:
14223 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14224 suffix = "g";
14225 break;
14226 case GTU:
14227 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14228 Those same assemblers have the same but opposite lossage on cmov. */
14229 if (mode == CCmode)
14230 suffix = fp ? "nbe" : "a";
14231 else
14232 gcc_unreachable ();
14233 break;
14234 case LT:
14235 switch (mode)
14237 case CCNOmode:
14238 case CCGOCmode:
14239 suffix = "s";
14240 break;
14242 case CCmode:
14243 case CCGCmode:
14244 suffix = "l";
14245 break;
14247 default:
14248 gcc_unreachable ();
14250 break;
14251 case LTU:
14252 if (mode == CCmode)
14253 suffix = "b";
14254 else if (mode == CCCmode)
14255 suffix = "c";
14256 else
14257 gcc_unreachable ();
14258 break;
14259 case GE:
14260 switch (mode)
14262 case CCNOmode:
14263 case CCGOCmode:
14264 suffix = "ns";
14265 break;
14267 case CCmode:
14268 case CCGCmode:
14269 suffix = "ge";
14270 break;
14272 default:
14273 gcc_unreachable ();
14275 break;
14276 case GEU:
14277 if (mode == CCmode)
14278 suffix = fp ? "nb" : "ae";
14279 else if (mode == CCCmode)
14280 suffix = "nc";
14281 else
14282 gcc_unreachable ();
14283 break;
14284 case LE:
14285 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14286 suffix = "le";
14287 break;
14288 case LEU:
14289 if (mode == CCmode)
14290 suffix = "be";
14291 else
14292 gcc_unreachable ();
14293 break;
14294 case UNORDERED:
14295 suffix = fp ? "u" : "p";
14296 break;
14297 case ORDERED:
14298 suffix = fp ? "nu" : "np";
14299 break;
14300 default:
14301 gcc_unreachable ();
14303 fputs (suffix, file);
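/* A few illustrative mappings implied by the switch above: (GT, CCGCmode)
   yields "g", (LTU, CCmode) yields "b", and (GEU, CCmode) yields "ae" for
   integer compares or "nb" when FP is set; with REVERSE the condition is
   first run through reverse_condition, so e.g. GT becomes LE and prints
   "le".  */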
14306 /* Print the name of register X to FILE based on its machine mode and number.
14307 If CODE is 'w', pretend the mode is HImode.
14308 If CODE is 'b', pretend the mode is QImode.
14309 If CODE is 'k', pretend the mode is SImode.
14310 If CODE is 'q', pretend the mode is DImode.
14311 If CODE is 'x', pretend the mode is V4SFmode.
14312 If CODE is 't', pretend the mode is V8SFmode.
14313 If CODE is 'g', pretend the mode is V16SFmode.
14314 If CODE is 'h', pretend the reg is the 'high' byte register.
14315 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
14316 If CODE is 'd', duplicate the operand for an AVX instruction.
14319 void
14320 print_reg (rtx x, int code, FILE *file)
14322 const char *reg;
14323 unsigned int regno;
14324 bool duplicated = code == 'd' && TARGET_AVX;
14326 if (ASSEMBLER_DIALECT == ASM_ATT)
14327 putc ('%', file);
14329 if (x == pc_rtx)
14331 gcc_assert (TARGET_64BIT);
14332 fputs ("rip", file);
14333 return;
14336 regno = true_regnum (x);
14337 gcc_assert (regno != ARG_POINTER_REGNUM
14338 && regno != FRAME_POINTER_REGNUM
14339 && regno != FLAGS_REG
14340 && regno != FPSR_REG
14341 && regno != FPCR_REG);
14343 if (code == 'w' || MMX_REG_P (x))
14344 code = 2;
14345 else if (code == 'b')
14346 code = 1;
14347 else if (code == 'k')
14348 code = 4;
14349 else if (code == 'q')
14350 code = 8;
14351 else if (code == 'y')
14352 code = 3;
14353 else if (code == 'h')
14354 code = 0;
14355 else if (code == 'x')
14356 code = 16;
14357 else if (code == 't')
14358 code = 32;
14359 else if (code == 'g')
14360 code = 64;
14361 else
14362 code = GET_MODE_SIZE (GET_MODE (x));
14364 /* Irritatingly, AMD extended registers use a different naming convention
14365 from the normal registers: "r%d[bwd]" */
14366 if (REX_INT_REGNO_P (regno))
14368 gcc_assert (TARGET_64BIT);
14369 putc ('r', file);
14370 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14371 switch (code)
14373 case 0:
14374 error ("extended registers have no high halves");
14375 break;
14376 case 1:
14377 putc ('b', file);
14378 break;
14379 case 2:
14380 putc ('w', file);
14381 break;
14382 case 4:
14383 putc ('d', file);
14384 break;
14385 case 8:
14386 /* no suffix */
14387 break;
14388 default:
14389 error ("unsupported operand size for extended register");
14390 break;
14392 return;
14395 reg = NULL;
14396 switch (code)
14398 case 3:
14399 if (STACK_TOP_P (x))
14401 reg = "st(0)";
14402 break;
14404 /* FALLTHRU */
14405 case 8:
14406 case 4:
14407 case 12:
14408 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14409 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14410 /* FALLTHRU */
14411 case 16:
14412 case 2:
14413 normal:
14414 reg = hi_reg_name[regno];
14415 break;
14416 case 1:
14417 if (regno >= ARRAY_SIZE (qi_reg_name))
14418 goto normal;
14419 reg = qi_reg_name[regno];
14420 break;
14421 case 0:
14422 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14423 goto normal;
14424 reg = qi_high_reg_name[regno];
14425 break;
14426 case 32:
14427 if (SSE_REG_P (x))
14429 gcc_assert (!duplicated);
14430 putc ('y', file);
14431 fputs (hi_reg_name[regno] + 1, file);
14432 return;
14434 case 64:
14435 if (SSE_REG_P (x))
14437 gcc_assert (!duplicated);
14438 putc ('z', file);
14439 fputs (hi_reg_name[REGNO (x)] + 1, file);
14440 return;
14442 break;
14443 default:
14444 gcc_unreachable ();
14447 fputs (reg, file);
14448 if (duplicated)
14450 if (ASSEMBLER_DIALECT == ASM_ATT)
14451 fprintf (file, ", %%%s", reg);
14452 else
14453 fprintf (file, ", %s", reg);
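/* Some illustrative outputs of the mapping above (AT&T dialect): code 'k'
   with (reg:DI ax) prints "%eax", code 'q' with the same register prints
   "%rax", and an extended register such as (reg:DI r8) with code 'k'
   prints "%r8d".  */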
14457 /* Locate some local-dynamic symbol still in use by this function
14458 so that we can print its name in some tls_local_dynamic_base
14459 pattern. */
14461 static int
14462 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14464 rtx x = *px;
14466 if (GET_CODE (x) == SYMBOL_REF
14467 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14469 cfun->machine->some_ld_name = XSTR (x, 0);
14470 return 1;
14473 return 0;
14476 static const char *
14477 get_some_local_dynamic_name (void)
14479 rtx insn;
14481 if (cfun->machine->some_ld_name)
14482 return cfun->machine->some_ld_name;
14484 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14485 if (NONDEBUG_INSN_P (insn)
14486 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14487 return cfun->machine->some_ld_name;
14489 return NULL;
14492 /* Meaning of CODE:
14493 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14494 C -- print opcode suffix for set/cmov insn.
14495 c -- like C, but print reversed condition
14496 F,f -- likewise, but for floating-point.
14497 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14498 otherwise nothing
14499 R -- print the prefix for register names.
14500 z -- print the opcode suffix for the size of the current operand.
14501 Z -- likewise, with special suffixes for x87 instructions.
14502 * -- print a star (in certain assembler syntax)
14503 A -- print an absolute memory reference.
14504 E -- print address with DImode register names if TARGET_64BIT.
14505 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14506 s -- print a shift double count, followed by the assembler's argument
14507 delimiter.
14508 b -- print the QImode name of the register for the indicated operand.
14509 %b0 would print %al if operands[0] is reg 0.
14510 w -- likewise, print the HImode name of the register.
14511 k -- likewise, print the SImode name of the register.
14512 q -- likewise, print the DImode name of the register.
14513 x -- likewise, print the V4SFmode name of the register.
14514 t -- likewise, print the V8SFmode name of the register.
14515 g -- likewise, print the V16SFmode name of the register.
14516 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14517 y -- print "st(0)" instead of "st" as a register.
14518 d -- print duplicated register operand for AVX instruction.
14519 D -- print condition for SSE cmp instruction.
14520 P -- if PIC, print an @PLT suffix.
14521 p -- print raw symbol name.
14522 X -- don't print any sort of PIC '@' suffix for a symbol.
14523 & -- print some in-use local-dynamic symbol name.
14524 H -- print a memory address offset by 8; used for sse high-parts
14525 Y -- print condition for XOP pcom* instruction.
14526 + -- print a branch hint as 'cs' or 'ds' prefix
14527 ; -- print a semicolon (after prefixes due to a bug in older gas).
14528 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14529 @ -- print a segment register of thread base pointer load
14530 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14531 ! -- print MPX prefix for jxx/call/ret instructions if required.
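/* Illustrative example: for an SImode register operand in %ecx, "%z0"
   emits the 'l' suffix, while "%k0", "%w0" and "%b0" print "%ecx", "%cx"
   and "%cl" under AT&T syntax.  */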
14534 void
14535 ix86_print_operand (FILE *file, rtx x, int code)
14537 if (code)
14539 switch (code)
14541 case 'A':
14542 switch (ASSEMBLER_DIALECT)
14544 case ASM_ATT:
14545 putc ('*', file);
14546 break;
14548 case ASM_INTEL:
14549 /* Intel syntax. For absolute addresses, registers should not
14550 be surrounded by brackets. */
14551 if (!REG_P (x))
14553 putc ('[', file);
14554 ix86_print_operand (file, x, 0);
14555 putc (']', file);
14556 return;
14558 break;
14560 default:
14561 gcc_unreachable ();
14564 ix86_print_operand (file, x, 0);
14565 return;
14567 case 'E':
14568 /* Wrap address in an UNSPEC to declare special handling. */
14569 if (TARGET_64BIT)
14570 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14572 output_address (x);
14573 return;
14575 case 'L':
14576 if (ASSEMBLER_DIALECT == ASM_ATT)
14577 putc ('l', file);
14578 return;
14580 case 'W':
14581 if (ASSEMBLER_DIALECT == ASM_ATT)
14582 putc ('w', file);
14583 return;
14585 case 'B':
14586 if (ASSEMBLER_DIALECT == ASM_ATT)
14587 putc ('b', file);
14588 return;
14590 case 'Q':
14591 if (ASSEMBLER_DIALECT == ASM_ATT)
14592 putc ('l', file);
14593 return;
14595 case 'S':
14596 if (ASSEMBLER_DIALECT == ASM_ATT)
14597 putc ('s', file);
14598 return;
14600 case 'T':
14601 if (ASSEMBLER_DIALECT == ASM_ATT)
14602 putc ('t', file);
14603 return;
14605 case 'O':
14606 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14607 if (ASSEMBLER_DIALECT != ASM_ATT)
14608 return;
14610 switch (GET_MODE_SIZE (GET_MODE (x)))
14612 case 2:
14613 putc ('w', file);
14614 break;
14616 case 4:
14617 putc ('l', file);
14618 break;
14620 case 8:
14621 putc ('q', file);
14622 break;
14624 default:
14625 output_operand_lossage
14626 ("invalid operand size for operand code 'O'");
14627 return;
14630 putc ('.', file);
14631 #endif
14632 return;
14634 case 'z':
14635 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14637 /* Opcodes don't get size suffixes if using Intel syntax. */
14638 if (ASSEMBLER_DIALECT == ASM_INTEL)
14639 return;
14641 switch (GET_MODE_SIZE (GET_MODE (x)))
14643 case 1:
14644 putc ('b', file);
14645 return;
14647 case 2:
14648 putc ('w', file);
14649 return;
14651 case 4:
14652 putc ('l', file);
14653 return;
14655 case 8:
14656 putc ('q', file);
14657 return;
14659 default:
14660 output_operand_lossage
14661 ("invalid operand size for operand code 'z'");
14662 return;
14666 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14667 warning
14668 (0, "non-integer operand used with operand code 'z'");
14669 /* FALLTHRU */
14671 case 'Z':
14672 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14673 if (ASSEMBLER_DIALECT == ASM_INTEL)
14674 return;
14676 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14678 switch (GET_MODE_SIZE (GET_MODE (x)))
14680 case 2:
14681 #ifdef HAVE_AS_IX86_FILDS
14682 putc ('s', file);
14683 #endif
14684 return;
14686 case 4:
14687 putc ('l', file);
14688 return;
14690 case 8:
14691 #ifdef HAVE_AS_IX86_FILDQ
14692 putc ('q', file);
14693 #else
14694 fputs ("ll", file);
14695 #endif
14696 return;
14698 default:
14699 break;
14702 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14704 /* 387 opcodes don't get size suffixes
14705 if the operands are registers. */
14706 if (STACK_REG_P (x))
14707 return;
14709 switch (GET_MODE_SIZE (GET_MODE (x)))
14711 case 4:
14712 putc ('s', file);
14713 return;
14715 case 8:
14716 putc ('l', file);
14717 return;
14719 case 12:
14720 case 16:
14721 putc ('t', file);
14722 return;
14724 default:
14725 break;
14728 else
14730 output_operand_lossage
14731 ("invalid operand type used with operand code 'Z'");
14732 return;
14735 output_operand_lossage
14736 ("invalid operand size for operand code 'Z'");
14737 return;
14739 case 'd':
14740 case 'b':
14741 case 'w':
14742 case 'k':
14743 case 'q':
14744 case 'h':
14745 case 't':
14746 case 'g':
14747 case 'y':
14748 case 'x':
14749 case 'X':
14750 case 'P':
14751 case 'p':
14752 break;
14754 case 's':
14755 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14757 ix86_print_operand (file, x, 0);
14758 fputs (", ", file);
14760 return;
14762 case 'Y':
14763 switch (GET_CODE (x))
14765 case NE:
14766 fputs ("neq", file);
14767 break;
14768 case EQ:
14769 fputs ("eq", file);
14770 break;
14771 case GE:
14772 case GEU:
14773 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14774 break;
14775 case GT:
14776 case GTU:
14777 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14778 break;
14779 case LE:
14780 case LEU:
14781 fputs ("le", file);
14782 break;
14783 case LT:
14784 case LTU:
14785 fputs ("lt", file);
14786 break;
14787 case UNORDERED:
14788 fputs ("unord", file);
14789 break;
14790 case ORDERED:
14791 fputs ("ord", file);
14792 break;
14793 case UNEQ:
14794 fputs ("ueq", file);
14795 break;
14796 case UNGE:
14797 fputs ("nlt", file);
14798 break;
14799 case UNGT:
14800 fputs ("nle", file);
14801 break;
14802 case UNLE:
14803 fputs ("ule", file);
14804 break;
14805 case UNLT:
14806 fputs ("ult", file);
14807 break;
14808 case LTGT:
14809 fputs ("une", file);
14810 break;
14811 default:
14812 output_operand_lossage ("operand is not a condition code, "
14813 "invalid operand code 'Y'");
14814 return;
14816 return;
14818 case 'D':
14819 /* A little bit of brain damage here. The SSE compare instructions
14820 use completely different names for the comparisons than the
14821 fp conditional moves do. */
14822 switch (GET_CODE (x))
14824 case UNEQ:
14825 if (TARGET_AVX)
14827 fputs ("eq_us", file);
14828 break;
14830 case EQ:
14831 fputs ("eq", file);
14832 break;
14833 case UNLT:
14834 if (TARGET_AVX)
14836 fputs ("nge", file);
14837 break;
14839 case LT:
14840 fputs ("lt", file);
14841 break;
14842 case UNLE:
14843 if (TARGET_AVX)
14845 fputs ("ngt", file);
14846 break;
14848 case LE:
14849 fputs ("le", file);
14850 break;
14851 case UNORDERED:
14852 fputs ("unord", file);
14853 break;
14854 case LTGT:
14855 if (TARGET_AVX)
14857 fputs ("neq_oq", file);
14858 break;
14860 case NE:
14861 fputs ("neq", file);
14862 break;
14863 case GE:
14864 if (TARGET_AVX)
14866 fputs ("ge", file);
14867 break;
14869 case UNGE:
14870 fputs ("nlt", file);
14871 break;
14872 case GT:
14873 if (TARGET_AVX)
14875 fputs ("gt", file);
14876 break;
14878 case UNGT:
14879 fputs ("nle", file);
14880 break;
14881 case ORDERED:
14882 fputs ("ord", file);
14883 break;
14884 default:
14885 output_operand_lossage ("operand is not a condition code, "
14886 "invalid operand code 'D'");
14887 return;
14889 return;
14891 case 'F':
14892 case 'f':
14893 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14894 if (ASSEMBLER_DIALECT == ASM_ATT)
14895 putc ('.', file);
14896 #endif
14898 case 'C':
14899 case 'c':
14900 if (!COMPARISON_P (x))
14902 output_operand_lossage ("operand is not a condition code, "
14903 "invalid operand code '%c'", code);
14904 return;
14906 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14907 code == 'c' || code == 'f',
14908 code == 'F' || code == 'f',
14909 file);
14910 return;
14912 case 'H':
14913 if (!offsettable_memref_p (x))
14915 output_operand_lossage ("operand is not an offsettable memory "
14916 "reference, invalid operand code 'H'");
14917 return;
14919 /* It doesn't actually matter what mode we use here, as we're
14920 only going to use this for printing. */
14921 x = adjust_address_nv (x, DImode, 8);
14922 /* Output 'qword ptr' for intel assembler dialect. */
14923 if (ASSEMBLER_DIALECT == ASM_INTEL)
14924 code = 'q';
14925 break;
14927 case 'K':
14928 gcc_assert (CONST_INT_P (x));
14930 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14931 #ifdef HAVE_AS_IX86_HLE
14932 fputs ("xacquire ", file);
14933 #else
14934 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14935 #endif
14936 else if (INTVAL (x) & IX86_HLE_RELEASE)
14937 #ifdef HAVE_AS_IX86_HLE
14938 fputs ("xrelease ", file);
14939 #else
14940 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14941 #endif
14942 /* We do not want to print the value of the operand. */
14943 return;
14945 case 'N':
14946 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
14947 fputs ("{z}", file);
14948 return;
14950 case '*':
14951 if (ASSEMBLER_DIALECT == ASM_ATT)
14952 putc ('*', file);
14953 return;
14955 case '&':
14957 const char *name = get_some_local_dynamic_name ();
14958 if (name == NULL)
14959 output_operand_lossage ("'%%&' used without any "
14960 "local dynamic TLS references");
14961 else
14962 assemble_name (file, name);
14963 return;
14966 case '+':
14968 rtx x;
14970 if (!optimize
14971 || optimize_function_for_size_p (cfun)
14972 || !TARGET_BRANCH_PREDICTION_HINTS)
14973 return;
14975 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14976 if (x)
14978 int pred_val = XINT (x, 0);
14980 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14981 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14983 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14984 bool cputaken
14985 = final_forward_branch_p (current_output_insn) == 0;
14987 /* Emit hints only in case the default branch prediction
14988 heuristics would fail. */
14989 if (taken != cputaken)
14991 /* We use 3e (DS) prefix for taken branches and
14992 2e (CS) prefix for not taken branches. */
14993 if (taken)
14994 fputs ("ds ; ", file);
14995 else
14996 fputs ("cs ; ", file);
15000 return;
15003 case ';':
15004 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
15005 putc (';', file);
15006 #endif
15007 return;
15009 case '@':
15010 if (ASSEMBLER_DIALECT == ASM_ATT)
15011 putc ('%', file);
15013 /* The kernel uses a different segment register for performance
15014 reasons; a system call would not have to trash the userspace
15015 segment register, which would be expensive. */
15016 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
15017 fputs ("fs", file);
15018 else
15019 fputs ("gs", file);
15020 return;
15022 case '~':
15023 putc (TARGET_AVX2 ? 'i' : 'f', file);
15024 return;
15026 case '^':
15027 if (TARGET_64BIT && Pmode != word_mode)
15028 fputs ("addr32 ", file);
15029 return;
15031 case '!':
15032 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
15033 fputs ("bnd ", file);
15034 return;
15036 default:
15037 output_operand_lossage ("invalid operand code '%c'", code);
15041 if (REG_P (x))
15042 print_reg (x, code, file);
15044 else if (MEM_P (x))
15046 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
15047 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
15048 && GET_MODE (x) != BLKmode)
15050 const char * size;
15051 switch (GET_MODE_SIZE (GET_MODE (x)))
15053 case 1: size = "BYTE"; break;
15054 case 2: size = "WORD"; break;
15055 case 4: size = "DWORD"; break;
15056 case 8: size = "QWORD"; break;
15057 case 12: size = "TBYTE"; break;
15058 case 16:
15059 if (GET_MODE (x) == XFmode)
15060 size = "TBYTE";
15061 else
15062 size = "XMMWORD";
15063 break;
15064 case 32: size = "YMMWORD"; break;
15065 case 64: size = "ZMMWORD"; break;
15066 default:
15067 gcc_unreachable ();
15070 /* Check for explicit size override (codes 'b', 'w', 'k',
15071 'q' and 'x') */
15072 if (code == 'b')
15073 size = "BYTE";
15074 else if (code == 'w')
15075 size = "WORD";
15076 else if (code == 'k')
15077 size = "DWORD";
15078 else if (code == 'q')
15079 size = "QWORD";
15080 else if (code == 'x')
15081 size = "XMMWORD";
15083 fputs (size, file);
15084 fputs (" PTR ", file);
15087 x = XEXP (x, 0);
15088 /* Avoid (%rip) for call operands. */
15089 if (CONSTANT_ADDRESS_P (x) && code == 'P'
15090 && !CONST_INT_P (x))
15091 output_addr_const (file, x);
15092 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15093 output_operand_lossage ("invalid constraints for operand");
15094 else
15095 output_address (x);
15098 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15100 REAL_VALUE_TYPE r;
15101 long l;
15103 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15104 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15106 if (ASSEMBLER_DIALECT == ASM_ATT)
15107 putc ('$', file);
15108 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15109 if (code == 'q')
15110 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15111 (unsigned long long) (int) l);
15112 else
15113 fprintf (file, "0x%08x", (unsigned int) l);
15116 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15118 REAL_VALUE_TYPE r;
15119 long l[2];
15121 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15122 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15124 if (ASSEMBLER_DIALECT == ASM_ATT)
15125 putc ('$', file);
15126 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15129 /* These float cases don't actually occur as immediate operands. */
15130 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15132 char dstr[30];
15134 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15135 fputs (dstr, file);
15138 else
15140 /* We have patterns that allow zero sets of memory, for instance.
15141 In 64-bit mode, we should probably support all 8-byte vectors,
15142 since we can in fact encode that into an immediate. */
15143 if (GET_CODE (x) == CONST_VECTOR)
15145 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15146 x = const0_rtx;
15149 if (code != 'P' && code != 'p')
15151 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15153 if (ASSEMBLER_DIALECT == ASM_ATT)
15154 putc ('$', file);
15156 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15157 || GET_CODE (x) == LABEL_REF)
15159 if (ASSEMBLER_DIALECT == ASM_ATT)
15160 putc ('$', file);
15161 else
15162 fputs ("OFFSET FLAT:", file);
15165 if (CONST_INT_P (x))
15166 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15167 else if (flag_pic || MACHOPIC_INDIRECT)
15168 output_pic_addr_const (file, x, code);
15169 else
15170 output_addr_const (file, x);
15174 static bool
15175 ix86_print_operand_punct_valid_p (unsigned char code)
15177 return (code == '@' || code == '*' || code == '+' || code == '&'
15178 || code == ';' || code == '~' || code == '^' || code == '!');
15181 /* Print a memory operand whose address is ADDR. */
15183 static void
15184 ix86_print_operand_address (FILE *file, rtx addr)
15186 struct ix86_address parts;
15187 rtx base, index, disp;
15188 int scale;
15189 int ok;
15190 bool vsib = false;
15191 int code = 0;
15193 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15195 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15196 gcc_assert (parts.index == NULL_RTX);
15197 parts.index = XVECEXP (addr, 0, 1);
15198 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15199 addr = XVECEXP (addr, 0, 0);
15200 vsib = true;
15202 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15204 gcc_assert (TARGET_64BIT);
15205 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15206 code = 'q';
15208 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15210 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15211 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15212 if (parts.base != NULL_RTX)
15214 parts.index = parts.base;
15215 parts.scale = 1;
15217 parts.base = XVECEXP (addr, 0, 0);
15218 addr = XVECEXP (addr, 0, 0);
15220 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15222 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15223 gcc_assert (parts.index == NULL_RTX);
15224 parts.index = XVECEXP (addr, 0, 1);
15225 addr = XVECEXP (addr, 0, 0);
15227 else
15228 ok = ix86_decompose_address (addr, &parts);
15230 gcc_assert (ok);
15232 base = parts.base;
15233 index = parts.index;
15234 disp = parts.disp;
15235 scale = parts.scale;
15237 switch (parts.seg)
15239 case SEG_DEFAULT:
15240 break;
15241 case SEG_FS:
15242 case SEG_GS:
15243 if (ASSEMBLER_DIALECT == ASM_ATT)
15244 putc ('%', file);
15245 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15246 break;
15247 default:
15248 gcc_unreachable ();
15251 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15252 if (TARGET_64BIT && !base && !index)
15254 rtx symbol = disp;
15256 if (GET_CODE (disp) == CONST
15257 && GET_CODE (XEXP (disp, 0)) == PLUS
15258 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15259 symbol = XEXP (XEXP (disp, 0), 0);
15261 if (GET_CODE (symbol) == LABEL_REF
15262 || (GET_CODE (symbol) == SYMBOL_REF
15263 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15264 base = pc_rtx;
15266 if (!base && !index)
15268 /* A displacement-only address requires special attention. */
15270 if (CONST_INT_P (disp))
15272 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15273 fputs ("ds:", file);
15274 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15276 else if (flag_pic)
15277 output_pic_addr_const (file, disp, 0);
15278 else
15279 output_addr_const (file, disp);
15281 else
15283 /* Print SImode register names to force addr32 prefix. */
15284 if (SImode_address_operand (addr, VOIDmode))
15286 #ifdef ENABLE_CHECKING
15287 gcc_assert (TARGET_64BIT);
15288 switch (GET_CODE (addr))
15290 case SUBREG:
15291 gcc_assert (GET_MODE (addr) == SImode);
15292 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15293 break;
15294 case ZERO_EXTEND:
15295 case AND:
15296 gcc_assert (GET_MODE (addr) == DImode);
15297 break;
15298 default:
15299 gcc_unreachable ();
15301 #endif
15302 gcc_assert (!code);
15303 code = 'k';
15305 else if (code == 0
15306 && TARGET_X32
15307 && disp
15308 && CONST_INT_P (disp)
15309 && INTVAL (disp) < -16*1024*1024)
15311 /* X32 runs in 64-bit mode, where displacement, DISP, in
15312 address DISP(%r64), is encoded as 32-bit immediate sign-
15313 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15314 address is %r64 + 0xffffffffbffffd00. When %r64 <
15315 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15316 which is invalid for x32. The correct address is %r64
15317 - 0x40000300 == 0xf7ffdd64. To properly encode
15318 -0x40000300(%r64) for x32, we zero-extend negative
15319 displacement by forcing addr32 prefix which truncates
15320 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15321 zero-extend all negative displacements, including -1(%rsp).
15322 However, for small negative displacements, sign-extension
15323 won't cause overflow. We only zero-extend negative
15324 displacements if they are < -16*1024*1024, which is also used
15325 to check legitimate address displacements for PIC. */
15326 code = 'k';
15329 if (ASSEMBLER_DIALECT == ASM_ATT)
15331 if (disp)
15333 if (flag_pic)
15334 output_pic_addr_const (file, disp, 0);
15335 else if (GET_CODE (disp) == LABEL_REF)
15336 output_asm_label (disp);
15337 else
15338 output_addr_const (file, disp);
15341 putc ('(', file);
15342 if (base)
15343 print_reg (base, code, file);
15344 if (index)
15346 putc (',', file);
15347 print_reg (index, vsib ? 0 : code, file);
15348 if (scale != 1 || vsib)
15349 fprintf (file, ",%d", scale);
15351 putc (')', file);
15353 else
15355 rtx offset = NULL_RTX;
15357 if (disp)
15359 /* Pull out the offset of a symbol; print any symbol itself. */
15360 if (GET_CODE (disp) == CONST
15361 && GET_CODE (XEXP (disp, 0)) == PLUS
15362 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15364 offset = XEXP (XEXP (disp, 0), 1);
15365 disp = gen_rtx_CONST (VOIDmode,
15366 XEXP (XEXP (disp, 0), 0));
15369 if (flag_pic)
15370 output_pic_addr_const (file, disp, 0);
15371 else if (GET_CODE (disp) == LABEL_REF)
15372 output_asm_label (disp);
15373 else if (CONST_INT_P (disp))
15374 offset = disp;
15375 else
15376 output_addr_const (file, disp);
15379 putc ('[', file);
15380 if (base)
15382 print_reg (base, code, file);
15383 if (offset)
15385 if (INTVAL (offset) >= 0)
15386 putc ('+', file);
15387 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15390 else if (offset)
15391 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15392 else
15393 putc ('0', file);
15395 if (index)
15397 putc ('+', file);
15398 print_reg (index, vsib ? 0 : code, file);
15399 if (scale != 1 || vsib)
15400 fprintf (file, "*%d", scale);
15402 putc (']', file);
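/* Illustrative example: a base of %rax, an index of %rbx with scale 4 and
   a displacement of 8 is printed as "8(%rax,%rbx,4)" in AT&T syntax and
   as "[rax+8+rbx*4]" in Intel syntax.  */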
15407 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15409 static bool
15410 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15412 rtx op;
15414 if (GET_CODE (x) != UNSPEC)
15415 return false;
15417 op = XVECEXP (x, 0, 0);
15418 switch (XINT (x, 1))
15420 case UNSPEC_GOTTPOFF:
15421 output_addr_const (file, op);
15422 /* FIXME: This might be @TPOFF in Sun ld. */
15423 fputs ("@gottpoff", file);
15424 break;
15425 case UNSPEC_TPOFF:
15426 output_addr_const (file, op);
15427 fputs ("@tpoff", file);
15428 break;
15429 case UNSPEC_NTPOFF:
15430 output_addr_const (file, op);
15431 if (TARGET_64BIT)
15432 fputs ("@tpoff", file);
15433 else
15434 fputs ("@ntpoff", file);
15435 break;
15436 case UNSPEC_DTPOFF:
15437 output_addr_const (file, op);
15438 fputs ("@dtpoff", file);
15439 break;
15440 case UNSPEC_GOTNTPOFF:
15441 output_addr_const (file, op);
15442 if (TARGET_64BIT)
15443 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15444 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15445 else
15446 fputs ("@gotntpoff", file);
15447 break;
15448 case UNSPEC_INDNTPOFF:
15449 output_addr_const (file, op);
15450 fputs ("@indntpoff", file);
15451 break;
15452 #if TARGET_MACHO
15453 case UNSPEC_MACHOPIC_OFFSET:
15454 output_addr_const (file, op);
15455 putc ('-', file);
15456 machopic_output_function_base_name (file);
15457 break;
15458 #endif
15460 case UNSPEC_STACK_CHECK:
15462 int offset;
15464 gcc_assert (flag_split_stack);
15466 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15467 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15468 #else
15469 gcc_unreachable ();
15470 #endif
15472 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15474 break;
15476 default:
15477 return false;
15480 return true;
15483 /* Split one or more double-mode RTL references into pairs of half-mode
15484 references. The RTL can be REG, offsettable MEM, integer constant, or
15485 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15486 split and "num" is its length. lo_half and hi_half are output arrays
15487 that parallel "operands". */
15489 void
15490 split_double_mode (enum machine_mode mode, rtx operands[],
15491 int num, rtx lo_half[], rtx hi_half[])
15493 enum machine_mode half_mode;
15494 unsigned int byte;
15496 switch (mode)
15498 case TImode:
15499 half_mode = DImode;
15500 break;
15501 case DImode:
15502 half_mode = SImode;
15503 break;
15504 default:
15505 gcc_unreachable ();
15508 byte = GET_MODE_SIZE (half_mode);
15510 while (num--)
15512 rtx op = operands[num];
15514 /* simplify_subreg refuses to split volatile memory addresses,
15515 but we still have to handle them. */
15516 if (MEM_P (op))
15518 lo_half[num] = adjust_address (op, half_mode, 0);
15519 hi_half[num] = adjust_address (op, half_mode, byte);
15521 else
15523 lo_half[num] = simplify_gen_subreg (half_mode, op,
15524 GET_MODE (op) == VOIDmode
15525 ? mode : GET_MODE (op), 0);
15526 hi_half[num] = simplify_gen_subreg (half_mode, op,
15527 GET_MODE (op) == VOIDmode
15528 ? mode : GET_MODE (op), byte);
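/* Illustrative example: splitting a DImode MEM produces an SImode MEM at
   offset 0 in lo_half and an SImode MEM at offset 4 in hi_half; a TImode
   operand splits analogously into DImode halves at offsets 0 and 8.  */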
15533 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15534 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15535 is the expression of the binary operation. The output may either be
15536 emitted here, or returned to the caller, like all output_* functions.
15538 There is no guarantee that the operands are the same mode, as they
15539 might be within FLOAT or FLOAT_EXTEND expressions. */
15541 #ifndef SYSV386_COMPAT
15542 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15543 wants to fix the assemblers because that causes incompatibility
15544 with gcc. No-one wants to fix gcc because that causes
15545 incompatibility with assemblers... You can use the option of
15546 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15547 #define SYSV386_COMPAT 1
15548 #endif
15550 const char *
15551 output_387_binary_op (rtx insn, rtx *operands)
15553 static char buf[40];
15554 const char *p;
15555 const char *ssep;
15556 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15558 #ifdef ENABLE_CHECKING
15559 /* Even if we do not want to check the inputs, this documents input
15560 constraints, which helps in understanding the following code. */
15561 if (STACK_REG_P (operands[0])
15562 && ((REG_P (operands[1])
15563 && REGNO (operands[0]) == REGNO (operands[1])
15564 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15565 || (REG_P (operands[2])
15566 && REGNO (operands[0]) == REGNO (operands[2])
15567 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15568 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15569 ; /* ok */
15570 else
15571 gcc_assert (is_sse);
15572 #endif
15574 switch (GET_CODE (operands[3]))
15576 case PLUS:
15577 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15578 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15579 p = "fiadd";
15580 else
15581 p = "fadd";
15582 ssep = "vadd";
15583 break;
15585 case MINUS:
15586 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15587 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15588 p = "fisub";
15589 else
15590 p = "fsub";
15591 ssep = "vsub";
15592 break;
15594 case MULT:
15595 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15596 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15597 p = "fimul";
15598 else
15599 p = "fmul";
15600 ssep = "vmul";
15601 break;
15603 case DIV:
15604 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15605 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15606 p = "fidiv";
15607 else
15608 p = "fdiv";
15609 ssep = "vdiv";
15610 break;
15612 default:
15613 gcc_unreachable ();
15616 if (is_sse)
15618 if (TARGET_AVX)
15620 strcpy (buf, ssep);
15621 if (GET_MODE (operands[0]) == SFmode)
15622 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15623 else
15624 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15626 else
15628 strcpy (buf, ssep + 1);
15629 if (GET_MODE (operands[0]) == SFmode)
15630 strcat (buf, "ss\t{%2, %0|%0, %2}");
15631 else
15632 strcat (buf, "sd\t{%2, %0|%0, %2}");
15634 return buf;
15636 strcpy (buf, p);
15638 switch (GET_CODE (operands[3]))
15640 case MULT:
15641 case PLUS:
15642 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15644 rtx temp = operands[2];
15645 operands[2] = operands[1];
15646 operands[1] = temp;
15649 /* We know operands[0] == operands[1]. */
15651 if (MEM_P (operands[2]))
15653 p = "%Z2\t%2";
15654 break;
15657 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15659 if (STACK_TOP_P (operands[0]))
15660 /* How is it that we are storing to a dead operand[2]?
15661 Well, presumably operands[1] is dead too. We can't
15662 store the result to st(0) as st(0) gets popped on this
15663 instruction. Instead store to operands[2] (which I
15664 think has to be st(1)). st(1) will be popped later.
15665 gcc <= 2.8.1 didn't have this check and generated
15666 assembly code that the Unixware assembler rejected. */
15667 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15668 else
15669 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15670 break;
15673 if (STACK_TOP_P (operands[0]))
15674 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15675 else
15676 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15677 break;
15679 case MINUS:
15680 case DIV:
15681 if (MEM_P (operands[1]))
15683 p = "r%Z1\t%1";
15684 break;
15687 if (MEM_P (operands[2]))
15689 p = "%Z2\t%2";
15690 break;
15693 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15695 #if SYSV386_COMPAT
15696 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15697 derived assemblers, confusingly reverse the direction of
15698 the operation for fsub{r} and fdiv{r} when the
15699 destination register is not st(0). The Intel assembler
15700 doesn't have this brain damage. Read !SYSV386_COMPAT to
15701 figure out what the hardware really does. */
15702 if (STACK_TOP_P (operands[0]))
15703 p = "{p\t%0, %2|rp\t%2, %0}";
15704 else
15705 p = "{rp\t%2, %0|p\t%0, %2}";
15706 #else
15707 if (STACK_TOP_P (operands[0]))
15708 /* As above for fmul/fadd, we can't store to st(0). */
15709 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15710 else
15711 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15712 #endif
15713 break;
15716 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15718 #if SYSV386_COMPAT
15719 if (STACK_TOP_P (operands[0]))
15720 p = "{rp\t%0, %1|p\t%1, %0}";
15721 else
15722 p = "{p\t%1, %0|rp\t%0, %1}";
15723 #else
15724 if (STACK_TOP_P (operands[0]))
15725 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15726 else
15727 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15728 #endif
15729 break;
15732 if (STACK_TOP_P (operands[0]))
15734 if (STACK_TOP_P (operands[1]))
15735 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15736 else
15737 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15738 break;
15740 else if (STACK_TOP_P (operands[1]))
15742 #if SYSV386_COMPAT
15743 p = "{\t%1, %0|r\t%0, %1}";
15744 #else
15745 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15746 #endif
15748 else
15750 #if SYSV386_COMPAT
15751 p = "{r\t%2, %0|\t%0, %2}";
15752 #else
15753 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15754 #endif
15756 break;
15758 default:
15759 gcc_unreachable ();
15762 strcat (buf, p);
15763 return buf;
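/* Illustrative examples: an AVX SFmode PLUS with SSE operands returns
   "vaddss\t{%2, %1, %0|%0, %1, %2}"; the x87 case with st(0) as both
   destination and first source and a live stack-register second operand
   returns "fadd\t{%y2, %0|%0, %y2}".  */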
15766 /* Check if a 256bit AVX register is referenced inside of EXP. */
15768 static int
15769 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15771 rtx exp = *pexp;
15773 if (GET_CODE (exp) == SUBREG)
15774 exp = SUBREG_REG (exp);
15776 if (REG_P (exp)
15777 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15778 return 1;
15780 return 0;
15783 /* Return needed mode for entity in optimize_mode_switching pass. */
15785 static int
15786 ix86_avx_u128_mode_needed (rtx insn)
15788 if (CALL_P (insn))
15790 rtx link;
15792 /* Needed mode is set to AVX_U128_CLEAN if there are
15793 no 256bit modes used in function arguments. */
15794 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15795 link;
15796 link = XEXP (link, 1))
15798 if (GET_CODE (XEXP (link, 0)) == USE)
15800 rtx arg = XEXP (XEXP (link, 0), 0);
15802 if (ix86_check_avx256_register (&arg, NULL))
15803 return AVX_U128_DIRTY;
15807 return AVX_U128_CLEAN;
15810 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15811 changes state only when a 256bit register is written to, but we need
15812 to prevent the compiler from moving the optimal insertion point above
15813 an eventual read from a 256bit register. */
15814 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15815 return AVX_U128_DIRTY;
15817 return AVX_U128_ANY;
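/* Note (as implied by the CLEAN/DIRTY handling above): the point of this
   tracking is to know where a vzeroupper must be emitted, so that legacy
   SSE code does not pay the AVX/SSE transition penalty when it runs with
   dirty upper halves of the 256bit registers.  */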
15820 /* Return mode that i387 must be switched into
15821 prior to the execution of insn. */
15823 static int
15824 ix86_i387_mode_needed (int entity, rtx insn)
15826 enum attr_i387_cw mode;
15828 /* The mode UNINITIALIZED is used to store the control word after a
15829 function call or ASM pattern. The mode ANY specifies that the function
15830 has no requirements on the control word and makes no changes in the
15831 bits we are interested in. */
15833 if (CALL_P (insn)
15834 || (NONJUMP_INSN_P (insn)
15835 && (asm_noperands (PATTERN (insn)) >= 0
15836 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15837 return I387_CW_UNINITIALIZED;
15839 if (recog_memoized (insn) < 0)
15840 return I387_CW_ANY;
15842 mode = get_attr_i387_cw (insn);
15844 switch (entity)
15846 case I387_TRUNC:
15847 if (mode == I387_CW_TRUNC)
15848 return mode;
15849 break;
15851 case I387_FLOOR:
15852 if (mode == I387_CW_FLOOR)
15853 return mode;
15854 break;
15856 case I387_CEIL:
15857 if (mode == I387_CW_CEIL)
15858 return mode;
15859 break;
15861 case I387_MASK_PM:
15862 if (mode == I387_CW_MASK_PM)
15863 return mode;
15864 break;
15866 default:
15867 gcc_unreachable ();
15870 return I387_CW_ANY;
15873 /* Return mode that entity must be switched into
15874 prior to the execution of insn. */
15877 ix86_mode_needed (int entity, rtx insn)
15879 switch (entity)
15881 case AVX_U128:
15882 return ix86_avx_u128_mode_needed (insn);
15883 case I387_TRUNC:
15884 case I387_FLOOR:
15885 case I387_CEIL:
15886 case I387_MASK_PM:
15887 return ix86_i387_mode_needed (entity, insn);
15888 default:
15889 gcc_unreachable ();
15891 return 0;
15894 /* Check if a 256bit AVX register is referenced in stores. */
15896 static void
15897 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15899 if (ix86_check_avx256_register (&dest, NULL))
15901 bool *used = (bool *) data;
15902 *used = true;
15906 /* Calculate mode of upper 128bit AVX registers after the insn. */
15908 static int
15909 ix86_avx_u128_mode_after (int mode, rtx insn)
15911 rtx pat = PATTERN (insn);
15913 if (vzeroupper_operation (pat, VOIDmode)
15914 || vzeroall_operation (pat, VOIDmode))
15915 return AVX_U128_CLEAN;
15917 /* We know that the state is clean after a CALL insn if there are no
15918 256bit modes used in the function return register. */
15919 if (CALL_P (insn))
15921 bool avx_reg256_found = false;
15922 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15924 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15927 /* Otherwise, return current mode. Remember that if insn
15928 references AVX 256bit registers, the mode was already changed
15929 to DIRTY from MODE_NEEDED. */
15930 return mode;
15933 /* Return the mode that an insn results in. */
15936 ix86_mode_after (int entity, int mode, rtx insn)
15938 switch (entity)
15940 case AVX_U128:
15941 return ix86_avx_u128_mode_after (mode, insn);
15942 case I387_TRUNC:
15943 case I387_FLOOR:
15944 case I387_CEIL:
15945 case I387_MASK_PM:
15946 return mode;
15947 default:
15948 gcc_unreachable ();
15952 static int
15953 ix86_avx_u128_mode_entry (void)
15955 tree arg;
15957 /* Entry mode is set to AVX_U128_DIRTY if there are
15958 256bit modes used in function arguments. */
15959 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15960 arg = TREE_CHAIN (arg))
15962 rtx incoming = DECL_INCOMING_RTL (arg);
15964 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15965 return AVX_U128_DIRTY;
15968 return AVX_U128_CLEAN;
15971 /* Return a mode that ENTITY is assumed to be
15972 switched to at function entry. */
15975 ix86_mode_entry (int entity)
15977 switch (entity)
15979 case AVX_U128:
15980 return ix86_avx_u128_mode_entry ();
15981 case I387_TRUNC:
15982 case I387_FLOOR:
15983 case I387_CEIL:
15984 case I387_MASK_PM:
15985 return I387_CW_ANY;
15986 default:
15987 gcc_unreachable ();
15991 static int
15992 ix86_avx_u128_mode_exit (void)
15994 rtx reg = crtl->return_rtx;
15996 /* Exit mode is set to AVX_U128_DIRTY if there are
15997 256bit modes used in the function return register. */
15998 if (reg && ix86_check_avx256_register (&reg, NULL))
15999 return AVX_U128_DIRTY;
16001 return AVX_U128_CLEAN;
16004 /* Return a mode that ENTITY is assumed to be
16005 switched to at function exit. */
16008 ix86_mode_exit (int entity)
16010 switch (entity)
16012 case AVX_U128:
16013 return ix86_avx_u128_mode_exit ();
16014 case I387_TRUNC:
16015 case I387_FLOOR:
16016 case I387_CEIL:
16017 case I387_MASK_PM:
16018 return I387_CW_ANY;
16019 default:
16020 gcc_unreachable ();
16024 /* Output code to initialize control word copies used by trunc?f?i and
16025 rounding patterns. CURRENT_MODE is set to the current control word,
16026 while NEW_MODE is set to the new control word. */
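/* Background (x87 architecture): bits 10-11 of the control word form the
   rounding control field, so the masks below select 0x0400 (round down),
   0x0800 (round up) and 0x0c00 (truncate); 0x0020 sets the precision
   exception mask bit used for nearbyint.  */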
16028 static void
16029 emit_i387_cw_initialization (int mode)
16031 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
16032 rtx new_mode;
16034 enum ix86_stack_slot slot;
16036 rtx reg = gen_reg_rtx (HImode);
16038 emit_insn (gen_x86_fnstcw_1 (stored_mode));
16039 emit_move_insn (reg, copy_rtx (stored_mode));
16041 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
16042 || optimize_insn_for_size_p ())
16044 switch (mode)
16046 case I387_CW_TRUNC:
16047 /* round toward zero (truncate) */
16048 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
16049 slot = SLOT_CW_TRUNC;
16050 break;
16052 case I387_CW_FLOOR:
16053 /* round down toward -oo */
16054 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16055 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
16056 slot = SLOT_CW_FLOOR;
16057 break;
16059 case I387_CW_CEIL:
16060 /* round up toward +oo */
16061 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16062 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
16063 slot = SLOT_CW_CEIL;
16064 break;
16066 case I387_CW_MASK_PM:
16067 /* mask precision exception for nearbyint() */
16068 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16069 slot = SLOT_CW_MASK_PM;
16070 break;
16072 default:
16073 gcc_unreachable ();
16076 else
16078 switch (mode)
16080 case I387_CW_TRUNC:
16081 /* round toward zero (truncate) */
16082 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
16083 slot = SLOT_CW_TRUNC;
16084 break;
16086 case I387_CW_FLOOR:
16087 /* round down toward -oo */
16088 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
16089 slot = SLOT_CW_FLOOR;
16090 break;
16092 case I387_CW_CEIL:
16093 /* round up toward +oo */
16094 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16095 slot = SLOT_CW_CEIL;
16096 break;
16098 case I387_CW_MASK_PM:
16099 /* mask precision exception for nearbyint() */
16100 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16101 slot = SLOT_CW_MASK_PM;
16102 break;
16104 default:
16105 gcc_unreachable ();
16109 gcc_assert (slot < MAX_386_STACK_LOCALS);
16111 new_mode = assign_386_stack_local (HImode, slot);
16112 emit_move_insn (new_mode, reg);
16115 /* Emit vzeroupper. */
16117 void
16118 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16120 int i;
16122 /* Cancel automatic vzeroupper insertion if there are
16123 live call-saved SSE registers at the insertion point. */
16125 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16126 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16127 return;
16129 if (TARGET_64BIT)
16130 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16131 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16132 return;
16134 emit_insn (gen_avx_vzeroupper ());
16137 /* Generate one or more insns to set ENTITY to MODE. */
16139 void
16140 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16142 switch (entity)
16144 case AVX_U128:
16145 if (mode == AVX_U128_CLEAN)
16146 ix86_avx_emit_vzeroupper (regs_live);
16147 break;
16148 case I387_TRUNC:
16149 case I387_FLOOR:
16150 case I387_CEIL:
16151 case I387_MASK_PM:
16152 if (mode != I387_CW_ANY
16153 && mode != I387_CW_UNINITIALIZED)
16154 emit_i387_cw_initialization (mode);
16155 break;
16156 default:
16157 gcc_unreachable ();
16161 /* Output code for INSN to convert a float to a signed int. OPERANDS
16162 are the insn operands. The output may be [HSD]Imode and the input
16163 operand may be [SDX]Fmode. */
16165 const char *
16166 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16168 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16169 int dimode_p = GET_MODE (operands[0]) == DImode;
16170 int round_mode = get_attr_i387_cw (insn);
16172 /* Jump through a hoop or two for DImode, since the hardware has no
16173 non-popping instruction. We used to do this a different way, but
16174 that was somewhat fragile and broke with post-reload splitters. */
16175 if ((dimode_p || fisttp) && !stack_top_dies)
16176 output_asm_insn ("fld\t%y1", operands);
16178 gcc_assert (STACK_TOP_P (operands[1]));
16179 gcc_assert (MEM_P (operands[0]));
16180 gcc_assert (GET_MODE (operands[1]) != TFmode);
16182 if (fisttp)
16183 output_asm_insn ("fisttp%Z0\t%0", operands);
16184 else
16186 if (round_mode != I387_CW_ANY)
16187 output_asm_insn ("fldcw\t%3", operands);
16188 if (stack_top_dies || dimode_p)
16189 output_asm_insn ("fistp%Z0\t%0", operands);
16190 else
16191 output_asm_insn ("fist%Z0\t%0", operands);
16192 if (round_mode != I387_CW_ANY)
16193 output_asm_insn ("fldcw\t%2", operands);
16196 return "";
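/* Illustrative example: for an SImode destination whose x87 stack top dies
   and a non-default rounding mode, the emitted sequence is
   "fldcw %3; fistpl %0; fldcw %2" (AT&T syntax).  */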
16199 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16200 have the values zero or one, indicates the ffreep insn's operand
16201 from the OPERANDS array. */
16203 static const char *
16204 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16206 if (TARGET_USE_FFREEP)
16207 #ifdef HAVE_AS_IX86_FFREEP
16208 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16209 #else
16211 static char retval[32];
16212 int regno = REGNO (operands[opno]);
16214 gcc_assert (STACK_REGNO_P (regno));
16216 regno -= FIRST_STACK_REG;
16218 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16219 return retval;
16221 #endif
16223 return opno ? "fstp\t%y1" : "fstp\t%y0";
16227 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16228 should be used. UNORDERED_P is true when fucom should be used. */
16230 const char *
16231 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16233 int stack_top_dies;
16234 rtx cmp_op0, cmp_op1;
16235 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16237 if (eflags_p)
16239 cmp_op0 = operands[0];
16240 cmp_op1 = operands[1];
16242 else
16244 cmp_op0 = operands[1];
16245 cmp_op1 = operands[2];
16248 if (is_sse)
16250 if (GET_MODE (operands[0]) == SFmode)
16251 if (unordered_p)
16252 return "%vucomiss\t{%1, %0|%0, %1}";
16253 else
16254 return "%vcomiss\t{%1, %0|%0, %1}";
16255 else
16256 if (unordered_p)
16257 return "%vucomisd\t{%1, %0|%0, %1}";
16258 else
16259 return "%vcomisd\t{%1, %0|%0, %1}";
16262 gcc_assert (STACK_TOP_P (cmp_op0));
16264 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16266 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16268 if (stack_top_dies)
16270 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16271 return output_387_ffreep (operands, 1);
16273 else
16274 return "ftst\n\tfnstsw\t%0";
16277 if (STACK_REG_P (cmp_op1)
16278 && stack_top_dies
16279 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16280 && REGNO (cmp_op1) != FIRST_STACK_REG)
16282 /* If the top of the 387 stack dies, and the other operand
16283 is also a stack register that dies, then this must be a
16284 `fcompp' float compare */
16286 if (eflags_p)
16288 /* There is no double popping fcomi variant. Fortunately,
16289 eflags is immune from the fstp's cc clobbering. */
16290 if (unordered_p)
16291 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16292 else
16293 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16294 return output_387_ffreep (operands, 0);
16296 else
16298 if (unordered_p)
16299 return "fucompp\n\tfnstsw\t%0";
16300 else
16301 return "fcompp\n\tfnstsw\t%0";
16304 else
16306 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16308 static const char * const alt[16] =
16310 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16311 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16312 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16313 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16315 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16316 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16317 NULL,
16318 NULL,
16320 "fcomi\t{%y1, %0|%0, %y1}",
16321 "fcomip\t{%y1, %0|%0, %y1}",
16322 "fucomi\t{%y1, %0|%0, %y1}",
16323 "fucomip\t{%y1, %0|%0, %y1}",
16325 NULL,
16326 NULL,
16327 NULL,
16328 NULL
16331 int mask;
16332 const char *ret;
16334 mask = eflags_p << 3;
16335 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16336 mask |= unordered_p << 1;
16337 mask |= stack_top_dies;
16339 gcc_assert (mask < 16);
16340 ret = alt[mask];
16341 gcc_assert (ret);
16343 return ret;
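/* Illustrative example: with eflags_p, unordered_p and a dying stack top
   (mask 8|2|1 == 11) the table yields "fucomip\t{%y1, %0|%0, %y1}".  */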
16347 void
16348 ix86_output_addr_vec_elt (FILE *file, int value)
16350 const char *directive = ASM_LONG;
16352 #ifdef ASM_QUAD
16353 if (TARGET_LP64)
16354 directive = ASM_QUAD;
16355 #else
16356 gcc_assert (!TARGET_64BIT);
16357 #endif
16359 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16362 void
16363 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16365 const char *directive = ASM_LONG;
16367 #ifdef ASM_QUAD
16368 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16369 directive = ASM_QUAD;
16370 #else
16371 gcc_assert (!TARGET_64BIT);
16372 #endif
16373 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16374 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16375 fprintf (file, "%s%s%d-%s%d\n",
16376 directive, LPREFIX, value, LPREFIX, rel);
16377 else if (HAVE_AS_GOTOFF_IN_DATA)
16378 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16379 #if TARGET_MACHO
16380 else if (TARGET_MACHO)
16382 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16383 machopic_output_function_base_name (file);
16384 putc ('\n', file);
16386 #endif
16387 else
16388 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16389 GOT_SYMBOL_NAME, LPREFIX, value);
16392 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16393 for the target. */
16395 void
16396 ix86_expand_clear (rtx dest)
16398 rtx tmp;
16400 /* We play register width games, which are only valid after reload. */
16401 gcc_assert (reload_completed);
16403 /* Avoid HImode and its attendant prefix byte. */
16404 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16405 dest = gen_rtx_REG (SImode, REGNO (dest));
16406 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16408 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16409 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16411 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16412 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16415 emit_insn (tmp);
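/* Note: the xor form is wrapped in a PARALLEL with a CLOBBER of the flags
   register because xor modifies EFLAGS, whereas the plain "mov $0" form
   leaves the flags untouched.  */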
16418 /* X is an unchanging MEM. If it is a constant pool reference, return
16419 the constant pool rtx, else NULL. */
16422 maybe_get_pool_constant (rtx x)
16424 x = ix86_delegitimize_address (XEXP (x, 0));
16426 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16427 return get_pool_constant (x);
16429 return NULL_RTX;
16432 void
16433 ix86_expand_move (enum machine_mode mode, rtx operands[])
16435 rtx op0, op1;
16436 enum tls_model model;
16438 op0 = operands[0];
16439 op1 = operands[1];
16441 if (GET_CODE (op1) == SYMBOL_REF)
16443 rtx tmp;
16445 model = SYMBOL_REF_TLS_MODEL (op1);
16446 if (model)
16448 op1 = legitimize_tls_address (op1, model, true);
16449 op1 = force_operand (op1, op0);
16450 if (op1 == op0)
16451 return;
16452 op1 = convert_to_mode (mode, op1, 1);
16454 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16455 op1 = tmp;
16457 else if (GET_CODE (op1) == CONST
16458 && GET_CODE (XEXP (op1, 0)) == PLUS
16459 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16461 rtx addend = XEXP (XEXP (op1, 0), 1);
16462 rtx symbol = XEXP (XEXP (op1, 0), 0);
16463 rtx tmp;
16465 model = SYMBOL_REF_TLS_MODEL (symbol);
16466 if (model)
16467 tmp = legitimize_tls_address (symbol, model, true);
16468 else
16469 tmp = legitimize_pe_coff_symbol (symbol, true);
16471 if (tmp)
16473 tmp = force_operand (tmp, NULL);
16474 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16475 op0, 1, OPTAB_DIRECT);
16476 if (tmp == op0)
16477 return;
16478 op1 = convert_to_mode (mode, tmp, 1);
16482 if ((flag_pic || MACHOPIC_INDIRECT)
16483 && symbolic_operand (op1, mode))
16485 if (TARGET_MACHO && !TARGET_64BIT)
16487 #if TARGET_MACHO
16488 /* dynamic-no-pic */
16489 if (MACHOPIC_INDIRECT)
16491 rtx temp = ((reload_in_progress
16492 || ((op0 && REG_P (op0))
16493 && mode == Pmode))
16494 ? op0 : gen_reg_rtx (Pmode));
16495 op1 = machopic_indirect_data_reference (op1, temp);
16496 if (MACHOPIC_PURE)
16497 op1 = machopic_legitimize_pic_address (op1, mode,
16498 temp == op1 ? 0 : temp);
16500 if (op0 != op1 && GET_CODE (op0) != MEM)
16502 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16503 emit_insn (insn);
16504 return;
16506 if (GET_CODE (op0) == MEM)
16507 op1 = force_reg (Pmode, op1);
16508 else
16510 rtx temp = op0;
16511 if (GET_CODE (temp) != REG)
16512 temp = gen_reg_rtx (Pmode);
16513 temp = legitimize_pic_address (op1, temp);
16514 if (temp == op0)
16515 return;
16516 op1 = temp;
16518 /* dynamic-no-pic */
16519 #endif
16521 else
16523 if (MEM_P (op0))
16524 op1 = force_reg (mode, op1);
16525 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16527 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16528 op1 = legitimize_pic_address (op1, reg);
16529 if (op0 == op1)
16530 return;
16531 op1 = convert_to_mode (mode, op1, 1);
16535 else
16537 if (MEM_P (op0)
16538 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16539 || !push_operand (op0, mode))
16540 && MEM_P (op1))
16541 op1 = force_reg (mode, op1);
16543 if (push_operand (op0, mode)
16544 && ! general_no_elim_operand (op1, mode))
16545 op1 = copy_to_mode_reg (mode, op1);
16547 /* Force large constants in 64bit compilation into a register
16548 to get them CSEed. */
16549 if (can_create_pseudo_p ()
16550 && (mode == DImode) && TARGET_64BIT
16551 && immediate_operand (op1, mode)
16552 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16553 && !register_operand (op0, mode)
16554 && optimize)
16555 op1 = copy_to_mode_reg (mode, op1);
16557 if (can_create_pseudo_p ()
16558 && FLOAT_MODE_P (mode)
16559 && GET_CODE (op1) == CONST_DOUBLE)
16561 /* If we are loading a floating point constant to a register,
16562 force the value to memory now, since we'll get better code
16563 out of the back end. */
16565 op1 = validize_mem (force_const_mem (mode, op1));
16566 if (!register_operand (op0, mode))
16568 rtx temp = gen_reg_rtx (mode);
16569 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16570 emit_move_insn (op0, temp);
16571 return;
16576 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16579 void
16580 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16582 rtx op0 = operands[0], op1 = operands[1];
16583 unsigned int align = GET_MODE_ALIGNMENT (mode);
16585 /* Force constants other than zero into memory. We do not know how
16586 the instructions used to build constants modify the upper 64 bits
16587 of the register; once we have that information we may be able
16588 to handle some of them more efficiently. */
16589 if (can_create_pseudo_p ()
16590 && register_operand (op0, mode)
16591 && (CONSTANT_P (op1)
16592 || (GET_CODE (op1) == SUBREG
16593 && CONSTANT_P (SUBREG_REG (op1))))
16594 && !standard_sse_constant_p (op1))
16595 op1 = validize_mem (force_const_mem (mode, op1));
16597 /* We need to check memory alignment for SSE mode since attributes
16598 can make operands unaligned. */
16599 if (can_create_pseudo_p ()
16600 && SSE_REG_MODE_P (mode)
16601 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16602 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16604 rtx tmp[2];
16606 /* ix86_expand_vector_move_misalign() does not like constants ... */
16607 if (CONSTANT_P (op1)
16608 || (GET_CODE (op1) == SUBREG
16609 && CONSTANT_P (SUBREG_REG (op1))))
16610 op1 = validize_mem (force_const_mem (mode, op1));
16612 /* ... nor both arguments in memory. */
16613 if (!register_operand (op0, mode)
16614 && !register_operand (op1, mode))
16615 op1 = force_reg (mode, op1);
16617 tmp[0] = op0; tmp[1] = op1;
16618 ix86_expand_vector_move_misalign (mode, tmp);
16619 return;
16622 /* Make operand1 a register if it isn't already. */
16623 if (can_create_pseudo_p ()
16624 && !register_operand (op0, mode)
16625 && !register_operand (op1, mode))
16627 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16628 return;
16631 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16634 /* Split 32-byte AVX unaligned load and store if needed. */
16636 static void
16637 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16639 rtx m;
16640 rtx (*extract) (rtx, rtx, rtx);
16641 rtx (*load_unaligned) (rtx, rtx);
16642 rtx (*store_unaligned) (rtx, rtx);
16643 enum machine_mode mode;
16645 switch (GET_MODE (op0))
16647 default:
16648 gcc_unreachable ();
16649 case V32QImode:
16650 extract = gen_avx_vextractf128v32qi;
16651 load_unaligned = gen_avx_loaddquv32qi;
16652 store_unaligned = gen_avx_storedquv32qi;
16653 mode = V16QImode;
16654 break;
16655 case V8SFmode:
16656 extract = gen_avx_vextractf128v8sf;
16657 load_unaligned = gen_avx_loadups256;
16658 store_unaligned = gen_avx_storeups256;
16659 mode = V4SFmode;
16660 break;
16661 case V4DFmode:
16662 extract = gen_avx_vextractf128v4df;
16663 load_unaligned = gen_avx_loadupd256;
16664 store_unaligned = gen_avx_storeupd256;
16665 mode = V2DFmode;
16666 break;
16669 if (MEM_P (op1))
16671 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16673 rtx r = gen_reg_rtx (mode);
16674 m = adjust_address (op1, mode, 0);
16675 emit_move_insn (r, m);
16676 m = adjust_address (op1, mode, 16);
16677 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16678 emit_move_insn (op0, r);
16680 /* Normal *mov<mode>_internal pattern will handle
16681 unaligned loads just fine if misaligned_operand
16682 is true, and without the UNSPEC it can be combined
16683 with arithmetic instructions. */
16684 else if (misaligned_operand (op1, GET_MODE (op1)))
16685 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16686 else
16687 emit_insn (load_unaligned (op0, op1));
16689 else if (MEM_P (op0))
16691 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16693 m = adjust_address (op0, mode, 0);
16694 emit_insn (extract (m, op1, const0_rtx));
16695 m = adjust_address (op0, mode, 16);
16696 emit_insn (extract (m, op1, const1_rtx));
16698 else
16699 emit_insn (store_unaligned (op0, op1));
16701 else
16702 gcc_unreachable ();
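/* Illustrative example: with TARGET_AVX256_SPLIT_UNALIGNED_STORE, a
   misaligned V8SFmode store becomes two vextractf128 stores of the low
   and high 128-bit halves; the matching load becomes two 128-bit loads
   whose results are combined with a VEC_CONCAT.  */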
16705 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16706 straight to ix86_expand_vector_move. */
16707 /* Code generation for scalar reg-reg moves of single and double precision data:
16708 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16709 movaps reg, reg
16710 else
16711 movss reg, reg
16712 if (x86_sse_partial_reg_dependency == true)
16713 movapd reg, reg
16714 else
16715 movsd reg, reg
16717 Code generation for scalar loads of double precision data:
16718 if (x86_sse_split_regs == true)
16719 movlpd mem, reg (gas syntax)
16720 else
16721 movsd mem, reg
16723 Code generation for unaligned packed loads of single precision data
16724 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16725 if (x86_sse_unaligned_move_optimal)
16726 movups mem, reg
16728 if (x86_sse_partial_reg_dependency == true)
16730 xorps reg, reg
16731 movlps mem, reg
16732 movhps mem+8, reg
16734 else
16736 movlps mem, reg
16737 movhps mem+8, reg
16740 Code generation for unaligned packed loads of double precision data
16741 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16742 if (x86_sse_unaligned_move_optimal)
16743 movupd mem, reg
16745 if (x86_sse_split_regs == true)
16747 movlpd mem, reg
16748 movhpd mem+8, reg
16750 else
16752 movsd mem, reg
16753 movhpd mem+8, reg
16757 void
16758 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16760 rtx op0, op1, orig_op0 = NULL_RTX, m;
16761 rtx (*load_unaligned) (rtx, rtx);
16762 rtx (*store_unaligned) (rtx, rtx);
16764 op0 = operands[0];
16765 op1 = operands[1];
16767 if (GET_MODE_SIZE (mode) == 64)
16769 switch (GET_MODE_CLASS (mode))
16771 case MODE_VECTOR_INT:
16772 case MODE_INT:
16773 if (GET_MODE (op0) != V16SImode)
16775 if (!MEM_P (op0))
16777 orig_op0 = op0;
16778 op0 = gen_reg_rtx (V16SImode);
16780 else
16781 op0 = gen_lowpart (V16SImode, op0);
16783 op1 = gen_lowpart (V16SImode, op1);
16784 /* FALLTHRU */
16786 case MODE_VECTOR_FLOAT:
16787 switch (GET_MODE (op0))
16789 default:
16790 gcc_unreachable ();
16791 case V16SImode:
16792 load_unaligned = gen_avx512f_loaddquv16si;
16793 store_unaligned = gen_avx512f_storedquv16si;
16794 break;
16795 case V16SFmode:
16796 load_unaligned = gen_avx512f_loadups512;
16797 store_unaligned = gen_avx512f_storeups512;
16798 break;
16799 case V8DFmode:
16800 load_unaligned = gen_avx512f_loadupd512;
16801 store_unaligned = gen_avx512f_storeupd512;
16802 break;
16805 if (MEM_P (op1))
16806 emit_insn (load_unaligned (op0, op1));
16807 else if (MEM_P (op0))
16808 emit_insn (store_unaligned (op0, op1));
16809 else
16810 gcc_unreachable ();
16811 if (orig_op0)
16812 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16813 break;
16815 default:
16816 gcc_unreachable ();
16819 return;
16822 if (TARGET_AVX
16823 && GET_MODE_SIZE (mode) == 32)
16825 switch (GET_MODE_CLASS (mode))
16827 case MODE_VECTOR_INT:
16828 case MODE_INT:
16829 if (GET_MODE (op0) != V32QImode)
16831 if (!MEM_P (op0))
16833 orig_op0 = op0;
16834 op0 = gen_reg_rtx (V32QImode);
16836 else
16837 op0 = gen_lowpart (V32QImode, op0);
16839 op1 = gen_lowpart (V32QImode, op1);
16840 /* FALLTHRU */
16842 case MODE_VECTOR_FLOAT:
16843 ix86_avx256_split_vector_move_misalign (op0, op1);
16844 if (orig_op0)
16845 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16846 break;
16848 default:
16849 gcc_unreachable ();
16852 return;
16855 if (MEM_P (op1))
16857 /* Normal *mov<mode>_internal pattern will handle
16858 unaligned loads just fine if misaligned_operand
16859 is true, and without the UNSPEC it can be combined
16860 with arithmetic instructions. */
16861 if (TARGET_AVX
16862 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16863 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16864 && misaligned_operand (op1, GET_MODE (op1)))
16865 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16866 /* ??? If we have typed data, then it would appear that using
16867 movdqu is the only way to get unaligned data loaded with
16868 integer type. */
16869 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16871 if (GET_MODE (op0) != V16QImode)
16873 orig_op0 = op0;
16874 op0 = gen_reg_rtx (V16QImode);
16876 op1 = gen_lowpart (V16QImode, op1);
16877 /* We will eventually emit movups based on insn attributes. */
16878 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16879 if (orig_op0)
16880 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16882 else if (TARGET_SSE2 && mode == V2DFmode)
16884 rtx zero;
16886 if (TARGET_AVX
16887 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16888 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16889 || optimize_insn_for_size_p ())
16891 /* We will eventually emit movups based on insn attributes. */
16892 emit_insn (gen_sse2_loadupd (op0, op1));
16893 return;
16896 /* When SSE registers are split into halves, we can avoid
16897 writing to the top half twice. */
16898 if (TARGET_SSE_SPLIT_REGS)
16900 emit_clobber (op0);
16901 zero = op0;
16903 else
16905 /* ??? Not sure about the best option for the Intel chips.
16906 The following would seem to satisfy; the register is
16907 entirely cleared, breaking the dependency chain. We
16908 then store to the upper half, with a dependency depth
16909 of one. A rumor has it that Intel recommends two movsd
16910 followed by an unpacklpd, but this is unconfirmed. And
16911 given that the dependency depth of the unpacklpd would
16912 still be one, I'm not sure why this would be better. */
16913 zero = CONST0_RTX (V2DFmode);
16916 m = adjust_address (op1, DFmode, 0);
16917 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16918 m = adjust_address (op1, DFmode, 8);
16919 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16921 else
16923 rtx t;
16925 if (TARGET_AVX
16926 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16927 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16928 || optimize_insn_for_size_p ())
16930 if (GET_MODE (op0) != V4SFmode)
16932 orig_op0 = op0;
16933 op0 = gen_reg_rtx (V4SFmode);
16935 op1 = gen_lowpart (V4SFmode, op1);
16936 emit_insn (gen_sse_loadups (op0, op1));
16937 if (orig_op0)
16938 emit_move_insn (orig_op0,
16939 gen_lowpart (GET_MODE (orig_op0), op0));
16940 return;
16943 if (mode != V4SFmode)
16944 t = gen_reg_rtx (V4SFmode);
16945 else
16946 t = op0;
16948 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16949 emit_move_insn (t, CONST0_RTX (V4SFmode));
16950 else
16951 emit_clobber (t);
16953 m = adjust_address (op1, V2SFmode, 0);
16954 emit_insn (gen_sse_loadlps (t, t, m));
16955 m = adjust_address (op1, V2SFmode, 8);
16956 emit_insn (gen_sse_loadhps (t, t, m));
16957 if (mode != V4SFmode)
16958 emit_move_insn (op0, gen_lowpart (mode, t));
16961 else if (MEM_P (op0))
16963 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16965 op0 = gen_lowpart (V16QImode, op0);
16966 op1 = gen_lowpart (V16QImode, op1);
16967 /* We will eventually emit movups based on insn attributes. */
16968 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16970 else if (TARGET_SSE2 && mode == V2DFmode)
16972 if (TARGET_AVX
16973 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16974 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16975 || optimize_insn_for_size_p ())
16976 /* We will eventually emit movups based on insn attributes. */
16977 emit_insn (gen_sse2_storeupd (op0, op1));
16978 else
16980 m = adjust_address (op0, DFmode, 0);
16981 emit_insn (gen_sse2_storelpd (m, op1));
16982 m = adjust_address (op0, DFmode, 8);
16983 emit_insn (gen_sse2_storehpd (m, op1));
16986 else
16988 if (mode != V4SFmode)
16989 op1 = gen_lowpart (V4SFmode, op1);
16991 if (TARGET_AVX
16992 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16993 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16994 || optimize_insn_for_size_p ())
16996 op0 = gen_lowpart (V4SFmode, op0);
16997 emit_insn (gen_sse_storeups (op0, op1));
16999 else
17001 m = adjust_address (op0, V2SFmode, 0);
17002 emit_insn (gen_sse_storelps (m, op1));
17003 m = adjust_address (op0, V2SFmode, 8);
17004 emit_insn (gen_sse_storehps (m, op1));
17008 else
17009 gcc_unreachable ();
17012 /* Expand a push in MODE. This is some mode for which we do not support
17013 proper push instructions, at least from the registers that we expect
17014 the value to live in. */
17016 void
17017 ix86_expand_push (enum machine_mode mode, rtx x)
17019 rtx tmp;
17021 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
17022 GEN_INT (-GET_MODE_SIZE (mode)),
17023 stack_pointer_rtx, 1, OPTAB_DIRECT);
17024 if (tmp != stack_pointer_rtx)
17025 emit_move_insn (stack_pointer_rtx, tmp);
17027 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
17029 /* When we push an operand onto stack, it has to be aligned at least
17030 at the function argument boundary. However since we don't have
17031 the argument type, we can't determine the actual argument
17032 boundary. */
17033 emit_move_insn (tmp, x);
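/* A rough sketch of what the expansion above produces (registers are
   placeholders): pushing a V4SFmode value this way is equivalent to

       sub    $16, %esp            # adjust the stack pointer by the mode size
       movups %xmm0, (%esp)        # ordinary move to the new stack top

   i.e. an explicit pointer adjustment followed by a plain store, since
   there is no native push for such operands.  */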
17036 /* Helper function of ix86_fixup_binary_operands to canonicalize
17037 operand order. Returns true if the operands should be swapped. */
17039 static bool
17040 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
17041 rtx operands[])
17043 rtx dst = operands[0];
17044 rtx src1 = operands[1];
17045 rtx src2 = operands[2];
17047 /* If the operation is not commutative, we can't do anything. */
17048 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
17049 return false;
17051 /* Highest priority is that src1 should match dst. */
17052 if (rtx_equal_p (dst, src1))
17053 return false;
17054 if (rtx_equal_p (dst, src2))
17055 return true;
17057 /* Next highest priority is that immediate constants come second. */
17058 if (immediate_operand (src2, mode))
17059 return false;
17060 if (immediate_operand (src1, mode))
17061 return true;
17063 /* Lowest priority is that memory references should come second. */
17064 if (MEM_P (src2))
17065 return false;
17066 if (MEM_P (src1))
17067 return true;
17069 return false;
17073 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
17074 destination to use for the operation. If different from the true
17075 destination in operands[0], a copy operation will be required. */
17078 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
17079 rtx operands[])
17081 rtx dst = operands[0];
17082 rtx src1 = operands[1];
17083 rtx src2 = operands[2];
17085 /* Canonicalize operand order. */
17086 if (ix86_swap_binary_operands_p (code, mode, operands))
17088 rtx temp;
17090 /* It is invalid to swap operands of different modes. */
17091 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17093 temp = src1;
17094 src1 = src2;
17095 src2 = temp;
17098 /* Both source operands cannot be in memory. */
17099 if (MEM_P (src1) && MEM_P (src2))
17101 /* Optimization: Only read from memory once. */
17102 if (rtx_equal_p (src1, src2))
17104 src2 = force_reg (mode, src2);
17105 src1 = src2;
17107 else if (rtx_equal_p (dst, src1))
17108 src2 = force_reg (mode, src2);
17109 else
17110 src1 = force_reg (mode, src1);
17113 /* If the destination is memory, and we do not have matching source
17114 operands, do things in registers. */
17115 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17116 dst = gen_reg_rtx (mode);
17118 /* Source 1 cannot be a constant. */
17119 if (CONSTANT_P (src1))
17120 src1 = force_reg (mode, src1);
17122 /* Source 1 cannot be a non-matching memory. */
17123 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17124 src1 = force_reg (mode, src1);
17126 /* Improve address combine. */
17127 if (code == PLUS
17128 && GET_MODE_CLASS (mode) == MODE_INT
17129 && MEM_P (src2))
17130 src2 = force_reg (mode, src2);
17132 operands[1] = src1;
17133 operands[2] = src2;
17134 return dst;
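/* A small example of the fixups above (a sketch, not a dump of real RTL):
   given "mem0 = mem1 + mem2" with no source matching the destination,
   the routine leaves the operands roughly as

       reg1 = mem1;              load src1 into a register (force_reg)
       regD = reg1 + mem2;       compute into a fresh pseudo
       mem0 = regD;              caller copies the result back

   so that at most one memory reference remains in the arithmetic insn.  */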
17137 /* Similarly, but assume that the destination has already been
17138 set up properly. */
17140 void
17141 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17142 enum machine_mode mode, rtx operands[])
17144 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17145 gcc_assert (dst == operands[0]);
17148 /* Attempt to expand a binary operator. Make the expansion closer to the
17149 actual machine, than just general_operand, which would allow 3 separate
17150 memory references (one output, two input) in a single insn. */
17152 void
17153 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17154 rtx operands[])
17156 rtx src1, src2, dst, op, clob;
17158 dst = ix86_fixup_binary_operands (code, mode, operands);
17159 src1 = operands[1];
17160 src2 = operands[2];
17162 /* Emit the instruction. */
17164 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17165 if (reload_in_progress)
17167 /* Reload doesn't know about the flags register, and doesn't know that
17168 it doesn't want to clobber it. We can only do this with PLUS. */
17169 gcc_assert (code == PLUS);
17170 emit_insn (op);
17172 else if (reload_completed
17173 && code == PLUS
17174 && !rtx_equal_p (dst, src1))
17176 /* This is going to be an LEA; avoid splitting it later. */
17177 emit_insn (op);
17179 else
17181 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17182 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17185 /* Fix up the destination if needed. */
17186 if (dst != operands[0])
17187 emit_move_insn (operands[0], dst);
17190 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17191 the given OPERANDS. */
17193 void
17194 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17195 rtx operands[])
17197 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17198 if (GET_CODE (operands[1]) == SUBREG)
17200 op1 = operands[1];
17201 op2 = operands[2];
17203 else if (GET_CODE (operands[2]) == SUBREG)
17205 op1 = operands[2];
17206 op2 = operands[1];
17208 /* Optimize (__m128i) d | (__m128i) e and similar code
17209 when d and e are float vectors into float vector logical
17210 insn. In C/C++ without using intrinsics there is no other way
17211 to express vector logical operation on float vectors than
17212 to cast them temporarily to integer vectors. */
17213 if (op1
17214 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17215 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17216 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17217 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17218 && SUBREG_BYTE (op1) == 0
17219 && (GET_CODE (op2) == CONST_VECTOR
17220 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17221 && SUBREG_BYTE (op2) == 0))
17222 && can_create_pseudo_p ())
17224 rtx dst;
17225 switch (GET_MODE (SUBREG_REG (op1)))
17227 case V4SFmode:
17228 case V8SFmode:
17229 case V2DFmode:
17230 case V4DFmode:
17231 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17232 if (GET_CODE (op2) == CONST_VECTOR)
17234 op2 = gen_lowpart (GET_MODE (dst), op2);
17235 op2 = force_reg (GET_MODE (dst), op2);
17237 else
17239 op1 = operands[1];
17240 op2 = SUBREG_REG (operands[2]);
17241 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17242 op2 = force_reg (GET_MODE (dst), op2);
17244 op1 = SUBREG_REG (op1);
17245 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17246 op1 = force_reg (GET_MODE (dst), op1);
17247 emit_insn (gen_rtx_SET (VOIDmode, dst,
17248 gen_rtx_fmt_ee (code, GET_MODE (dst),
17249 op1, op2)));
17250 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17251 return;
17252 default:
17253 break;
17256 if (!nonimmediate_operand (operands[1], mode))
17257 operands[1] = force_reg (mode, operands[1]);
17258 if (!nonimmediate_operand (operands[2], mode))
17259 operands[2] = force_reg (mode, operands[2]);
17260 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17261 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17262 gen_rtx_fmt_ee (code, mode, operands[1],
17263 operands[2])));
17266 /* Return TRUE or FALSE depending on whether the binary operator meets the
17267 appropriate constraints. */
17269 bool
17270 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17271 rtx operands[3])
17273 rtx dst = operands[0];
17274 rtx src1 = operands[1];
17275 rtx src2 = operands[2];
17277 /* Both source operands cannot be in memory. */
17278 if (MEM_P (src1) && MEM_P (src2))
17279 return false;
17281 /* Canonicalize operand order for commutative operators. */
17282 if (ix86_swap_binary_operands_p (code, mode, operands))
17284 rtx temp = src1;
17285 src1 = src2;
17286 src2 = temp;
17289 /* If the destination is memory, we must have a matching source operand. */
17290 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17291 return false;
17293 /* Source 1 cannot be a constant. */
17294 if (CONSTANT_P (src1))
17295 return false;
17297 /* Source 1 cannot be a non-matching memory. */
17298 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17299 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17300 return (code == AND
17301 && (mode == HImode
17302 || mode == SImode
17303 || (TARGET_64BIT && mode == DImode))
17304 && satisfies_constraint_L (src2));
17306 return true;
17309 /* Attempt to expand a unary operator. Make the expansion closer to the
17310 actual machine, than just general_operand, which would allow 2 separate
17311 memory references (one output, one input) in a single insn. */
17313 void
17314 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17315 rtx operands[])
17317 int matching_memory;
17318 rtx src, dst, op, clob;
17320 dst = operands[0];
17321 src = operands[1];
17323 /* If the destination is memory, and we do not have matching source
17324 operands, do things in registers. */
17325 matching_memory = 0;
17326 if (MEM_P (dst))
17328 if (rtx_equal_p (dst, src))
17329 matching_memory = 1;
17330 else
17331 dst = gen_reg_rtx (mode);
17334 /* When source operand is memory, destination must match. */
17335 if (MEM_P (src) && !matching_memory)
17336 src = force_reg (mode, src);
17338 /* Emit the instruction. */
17340 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17341 if (reload_in_progress || code == NOT)
17343 /* Reload doesn't know about the flags register, and doesn't know that
17344 it doesn't want to clobber it. */
17345 gcc_assert (code == NOT);
17346 emit_insn (op);
17348 else
17350 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17351 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17354 /* Fix up the destination if needed. */
17355 if (dst != operands[0])
17356 emit_move_insn (operands[0], dst);
17359 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17360 divisor are within the range [0-255]. */
17362 void
17363 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17364 bool signed_p)
17366 rtx end_label, qimode_label;
17367 rtx insn, div, mod;
17368 rtx scratch, tmp0, tmp1, tmp2;
17369 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17370 rtx (*gen_zero_extend) (rtx, rtx);
17371 rtx (*gen_test_ccno_1) (rtx, rtx);
17373 switch (mode)
17375 case SImode:
17376 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17377 gen_test_ccno_1 = gen_testsi_ccno_1;
17378 gen_zero_extend = gen_zero_extendqisi2;
17379 break;
17380 case DImode:
17381 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17382 gen_test_ccno_1 = gen_testdi_ccno_1;
17383 gen_zero_extend = gen_zero_extendqidi2;
17384 break;
17385 default:
17386 gcc_unreachable ();
17389 end_label = gen_label_rtx ();
17390 qimode_label = gen_label_rtx ();
17392 scratch = gen_reg_rtx (mode);
17394 /* Use 8bit unsigned divmod if dividend and divisor are within
17395 the range [0-255]. */
17396 emit_move_insn (scratch, operands[2]);
17397 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17398 scratch, 1, OPTAB_DIRECT);
17399 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17400 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17401 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17402 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17403 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17404 pc_rtx);
17405 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17406 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17407 JUMP_LABEL (insn) = qimode_label;
17409 /* Generate original signed/unsigned divmod. */
17410 div = gen_divmod4_1 (operands[0], operands[1],
17411 operands[2], operands[3]);
17412 emit_insn (div);
17414 /* Branch to the end. */
17415 emit_jump_insn (gen_jump (end_label));
17416 emit_barrier ();
17418 /* Generate 8bit unsigned divide. */
17419 emit_label (qimode_label);
17420 /* Don't use operands[0] for result of 8bit divide since not all
17421 registers support QImode ZERO_EXTRACT. */
17422 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17423 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17424 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17425 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17427 if (signed_p)
17429 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17430 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17432 else
17434 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17435 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17438 /* Extract remainder from AH. */
17439 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17440 if (REG_P (operands[1]))
17441 insn = emit_move_insn (operands[1], tmp1);
17442 else
17444 /* Need a new scratch register since the old one has result
17445 of 8bit divide. */
17446 scratch = gen_reg_rtx (mode);
17447 emit_move_insn (scratch, tmp1);
17448 insn = emit_move_insn (operands[1], scratch);
17450 set_unique_reg_note (insn, REG_EQUAL, mod);
17452 /* Zero extend quotient from AL. */
17453 tmp1 = gen_lowpart (QImode, tmp0);
17454 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17455 set_unique_reg_note (insn, REG_EQUAL, div);
17457 emit_label (end_label);
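/* A sketch of the emitted sequence (register names and labels are
   placeholders, not the exact output):

       mov      dividend, scratch
       or       divisor, scratch
       test     $-0x100, scratch
       je       .Lqimode              # both values fit in [0-255]
       div/idiv ...                   # full-width divide
       jmp      .Lend
   .Lqimode:
       divb     ...                   # 8-bit divide: AL = quotient, AH = remainder
       ...                            # zero-extend AL, extract AH
   .Lend:                                                                  */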
17460 /* Whether it is OK to emit CFI directives when emitting asm code. */
17462 bool
17463 ix86_emit_cfi ()
17465 return dwarf2out_do_cfi_asm ();
17468 #define LEA_MAX_STALL (3)
17469 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17471 /* Increase given DISTANCE in half-cycles according to
17472 dependencies between PREV and NEXT instructions.
17473 Add 1 half-cycle if there is no dependency and
17474 go to the next cycle if there is some dependency. */
17476 static unsigned int
17477 increase_distance (rtx prev, rtx next, unsigned int distance)
17479 df_ref *use_rec;
17480 df_ref *def_rec;
17482 if (!prev || !next)
17483 return distance + (distance & 1) + 2;
17485 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17486 return distance + 1;
17488 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17489 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17490 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17491 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17492 return distance + (distance & 1) + 2;
17494 return distance + 1;
17497 /* Function checks if instruction INSN defines register number
17498 REGNO1 or REGNO2. */
17500 static bool
17501 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17502 rtx insn)
17504 df_ref *def_rec;
17506 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17507 if (DF_REF_REG_DEF_P (*def_rec)
17508 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17509 && (regno1 == DF_REF_REGNO (*def_rec)
17510 || regno2 == DF_REF_REGNO (*def_rec)))
17512 return true;
17515 return false;
17518 /* Function checks if instruction INSN uses register number
17519 REGNO as a part of address expression. */
17521 static bool
17522 insn_uses_reg_mem (unsigned int regno, rtx insn)
17524 df_ref *use_rec;
17526 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17527 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17528 return true;
17530 return false;
17533 /* Search backward for non-agu definition of register number REGNO1
17534 or register number REGNO2 in basic block starting from instruction
17535 START up to head of basic block or instruction INSN.
17537 Function puts true value into *FOUND var if definition was found
17538 and false otherwise.
17540 Distance in half-cycles between START and found instruction or head
17541 of BB is added to DISTANCE and returned. */
17543 static int
17544 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17545 rtx insn, int distance,
17546 rtx start, bool *found)
17548 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17549 rtx prev = start;
17550 rtx next = NULL;
17552 *found = false;
17554 while (prev
17555 && prev != insn
17556 && distance < LEA_SEARCH_THRESHOLD)
17558 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17560 distance = increase_distance (prev, next, distance);
17561 if (insn_defines_reg (regno1, regno2, prev))
17563 if (recog_memoized (prev) < 0
17564 || get_attr_type (prev) != TYPE_LEA)
17566 *found = true;
17567 return distance;
17571 next = prev;
17573 if (prev == BB_HEAD (bb))
17574 break;
17576 prev = PREV_INSN (prev);
17579 return distance;
17582 /* Search backward for non-agu definition of register number REGNO1
17583 or register number REGNO2 in INSN's basic block until
17584 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17585 2. Reach neighbour BBs boundary, or
17586 3. Reach agu definition.
17587 Returns the distance between the non-agu definition point and INSN.
17588 If no definition point, returns -1. */
17590 static int
17591 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17592 rtx insn)
17594 basic_block bb = BLOCK_FOR_INSN (insn);
17595 int distance = 0;
17596 bool found = false;
17598 if (insn != BB_HEAD (bb))
17599 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17600 distance, PREV_INSN (insn),
17601 &found);
17603 if (!found && distance < LEA_SEARCH_THRESHOLD)
17605 edge e;
17606 edge_iterator ei;
17607 bool simple_loop = false;
17609 FOR_EACH_EDGE (e, ei, bb->preds)
17610 if (e->src == bb)
17612 simple_loop = true;
17613 break;
17616 if (simple_loop)
17617 distance = distance_non_agu_define_in_bb (regno1, regno2,
17618 insn, distance,
17619 BB_END (bb), &found);
17620 else
17622 int shortest_dist = -1;
17623 bool found_in_bb = false;
17625 FOR_EACH_EDGE (e, ei, bb->preds)
17627 int bb_dist
17628 = distance_non_agu_define_in_bb (regno1, regno2,
17629 insn, distance,
17630 BB_END (e->src),
17631 &found_in_bb);
17632 if (found_in_bb)
17634 if (shortest_dist < 0)
17635 shortest_dist = bb_dist;
17636 else if (bb_dist > 0)
17637 shortest_dist = MIN (bb_dist, shortest_dist);
17639 found = true;
17643 distance = shortest_dist;
17647 /* get_attr_type may modify recog data. We want to make sure
17648 that recog data is valid for instruction INSN, on which
17649 distance_non_agu_define is called. INSN is unchanged here. */
17650 extract_insn_cached (insn);
17652 if (!found)
17653 return -1;
17655 return distance >> 1;
17658 /* Return the distance in half-cycles between INSN and the next
17659 insn that uses register number REGNO in a memory address, added
17660 to DISTANCE. Return -1 if REGNO is set.
17662 Put true value into *FOUND if register usage was found and
17663 false otherwise.
17664 Put true value into *REDEFINED if register redefinition was
17665 found and false otherwise. */
17667 static int
17668 distance_agu_use_in_bb (unsigned int regno,
17669 rtx insn, int distance, rtx start,
17670 bool *found, bool *redefined)
17672 basic_block bb = NULL;
17673 rtx next = start;
17674 rtx prev = NULL;
17676 *found = false;
17677 *redefined = false;
17679 if (start != NULL_RTX)
17681 bb = BLOCK_FOR_INSN (start);
17682 if (start != BB_HEAD (bb))
17683 /* If insn and start belong to the same bb, set prev to insn,
17684 so the call to increase_distance will increase the distance
17685 between insns by 1. */
17686 prev = insn;
17689 while (next
17690 && next != insn
17691 && distance < LEA_SEARCH_THRESHOLD)
17693 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17695 distance = increase_distance(prev, next, distance);
17696 if (insn_uses_reg_mem (regno, next))
17698 /* Return DISTANCE if OP0 is used in memory
17699 address in NEXT. */
17700 *found = true;
17701 return distance;
17704 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17706 /* Return -1 if OP0 is set in NEXT. */
17707 *redefined = true;
17708 return -1;
17711 prev = next;
17714 if (next == BB_END (bb))
17715 break;
17717 next = NEXT_INSN (next);
17720 return distance;
17723 /* Return the distance between INSN and the next insn that uses
17724 register number REGNO0 in a memory address. Return -1 if no such
17725 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17727 static int
17728 distance_agu_use (unsigned int regno0, rtx insn)
17730 basic_block bb = BLOCK_FOR_INSN (insn);
17731 int distance = 0;
17732 bool found = false;
17733 bool redefined = false;
17735 if (insn != BB_END (bb))
17736 distance = distance_agu_use_in_bb (regno0, insn, distance,
17737 NEXT_INSN (insn),
17738 &found, &redefined);
17740 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17742 edge e;
17743 edge_iterator ei;
17744 bool simple_loop = false;
17746 FOR_EACH_EDGE (e, ei, bb->succs)
17747 if (e->dest == bb)
17749 simple_loop = true;
17750 break;
17753 if (simple_loop)
17754 distance = distance_agu_use_in_bb (regno0, insn,
17755 distance, BB_HEAD (bb),
17756 &found, &redefined);
17757 else
17759 int shortest_dist = -1;
17760 bool found_in_bb = false;
17761 bool redefined_in_bb = false;
17763 FOR_EACH_EDGE (e, ei, bb->succs)
17765 int bb_dist
17766 = distance_agu_use_in_bb (regno0, insn,
17767 distance, BB_HEAD (e->dest),
17768 &found_in_bb, &redefined_in_bb);
17769 if (found_in_bb)
17771 if (shortest_dist < 0)
17772 shortest_dist = bb_dist;
17773 else if (bb_dist > 0)
17774 shortest_dist = MIN (bb_dist, shortest_dist);
17776 found = true;
17780 distance = shortest_dist;
17784 if (!found || redefined)
17785 return -1;
17787 return distance >> 1;
17790 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
17791 there is a choice between LEA and ADD:
17792 Negative value: ADD is preferred over LEA
17793 Zero: Neutral
17794 Positive value: LEA is preferred over ADD. */
17795 #define IX86_LEA_PRIORITY 0
17797 /* Return true if usage of lea INSN has performance advantage
17798 over a sequence of instructions. Instructions sequence has
17799 SPLIT_COST cycles higher latency than lea latency. */
17801 static bool
17802 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17803 unsigned int regno2, int split_cost, bool has_scale)
17805 int dist_define, dist_use;
17807 /* For Silvermont, the use of LEA is justified when it is a 2-source or
17808 3-source LEA serving a non-destructive destination, or when the
17809 ability to use SCALE is wanted. */
17810 if (ix86_tune == PROCESSOR_SLM)
17812 if (has_scale)
17813 return true;
17814 if (split_cost < 1)
17815 return false;
17816 if (regno0 == regno1 || regno0 == regno2)
17817 return false;
17818 return true;
17821 dist_define = distance_non_agu_define (regno1, regno2, insn);
17822 dist_use = distance_agu_use (regno0, insn);
17824 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17826 /* If there is no non-AGU operand definition, no AGU
17827 operand usage and the split cost is 0, then both the lea
17828 and non-lea variants have the same priority. Currently
17829 we prefer lea for 64-bit code and non-lea for 32-bit
17830 code. */
17831 if (dist_use < 0 && split_cost == 0)
17832 return TARGET_64BIT || IX86_LEA_PRIORITY;
17833 else
17834 return true;
17837 /* A lea becomes more preferable as the distance to the defining insn
17838 grows. Adjust the distance here to account for the splitting cost
17839 and the lea priority. */
17840 dist_define += split_cost + IX86_LEA_PRIORITY;
17842 /* If there is no use in a memory address then we just check
17843 that the split cost exceeds the AGU stall. */
17844 if (dist_use < 0)
17845 return dist_define > LEA_MAX_STALL;
17847 /* If this insn has both backward non-agu dependence and forward
17848 agu dependence, the one with short distance takes effect. */
17849 return dist_define >= dist_use;
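/* Worked example (illustrative numbers): suppose the inputs of the lea
   were defined dist_define = 1 away and the result feeds an address
   dist_use = 4 away.  With SPLIT_COST = 1 the adjusted definition
   distance is 2, which is still smaller than the use distance, so the
   function returns false and the caller will split the lea into ALU
   instructions instead.  */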
17852 /* Return true if it is legal to clobber flags by INSN and
17853 false otherwise. */
17855 static bool
17856 ix86_ok_to_clobber_flags (rtx insn)
17858 basic_block bb = BLOCK_FOR_INSN (insn);
17859 df_ref *use;
17860 bitmap live;
17862 while (insn)
17864 if (NONDEBUG_INSN_P (insn))
17866 for (use = DF_INSN_USES (insn); *use; use++)
17867 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17868 return false;
17870 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17871 return true;
17874 if (insn == BB_END (bb))
17875 break;
17877 insn = NEXT_INSN (insn);
17880 live = df_get_live_out(bb);
17881 return !REGNO_REG_SET_P (live, FLAGS_REG);
17884 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17885 move and add to avoid AGU stalls. */
17887 bool
17888 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17890 unsigned int regno0, regno1, regno2;
17892 /* Check if we need to optimize. */
17893 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17894 return false;
17896 /* Check it is correct to split here. */
17897 if (!ix86_ok_to_clobber_flags(insn))
17898 return false;
17900 regno0 = true_regnum (operands[0]);
17901 regno1 = true_regnum (operands[1]);
17902 regno2 = true_regnum (operands[2]);
17904 /* We need to split only adds with a non-destructive
17905 destination operand. */
17906 if (regno0 == regno1 || regno0 == regno2)
17907 return false;
17908 else
17909 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
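/* Example of the transformation this enables (a sketch, register names
   are placeholders): a three-operand add that would otherwise be
   emitted as

       lea    (%rbx,%rcx), %rax

   may instead be split into the ALU sequence

       mov    %rbx, %rax
       add    %rcx, %rax

   when ix86_lea_outperforms decides the address-generation unit would
   stall.  */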
17912 /* Return true if we should emit lea instruction instead of mov
17913 instruction. */
17915 bool
17916 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17918 unsigned int regno0, regno1;
17920 /* Check if we need to optimize. */
17921 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17922 return false;
17924 /* Use lea for reg to reg moves only. */
17925 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17926 return false;
17928 regno0 = true_regnum (operands[0]);
17929 regno1 = true_regnum (operands[1]);
17931 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17934 /* Return true if we need to split lea into a sequence of
17935 instructions to avoid AGU stalls. */
17937 bool
17938 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17940 unsigned int regno0, regno1, regno2;
17941 int split_cost;
17942 struct ix86_address parts;
17943 int ok;
17945 /* Check if we need to optimize. */
17946 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17947 return false;
17949 /* Check it is correct to split here. */
17950 if (!ix86_ok_to_clobber_flags(insn))
17951 return false;
17953 ok = ix86_decompose_address (operands[1], &parts);
17954 gcc_assert (ok);
17956 /* There should be at least two components in the address. */
17957 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17958 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17959 return false;
17961 /* We should not split into add if a non-legitimate PIC
17962 operand is used as the displacement. */
17963 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17964 return false;
17966 regno0 = true_regnum (operands[0]) ;
17967 regno1 = INVALID_REGNUM;
17968 regno2 = INVALID_REGNUM;
17970 if (parts.base)
17971 regno1 = true_regnum (parts.base);
17972 if (parts.index)
17973 regno2 = true_regnum (parts.index);
17975 split_cost = 0;
17977 /* Compute how many cycles we will add to the execution time
17978 if we split the lea into a sequence of instructions. */
17979 if (parts.base || parts.index)
17981 /* Have to use a mov instruction if the non-destructive
17982 destination form is used. */
17983 if (regno1 != regno0 && regno2 != regno0)
17984 split_cost += 1;
17986 /* Have to add index to base if both exist. */
17987 if (parts.base && parts.index)
17988 split_cost += 1;
17990 /* Have to use shift and adds if scale is 2 or greater. */
17991 if (parts.scale > 1)
17993 if (regno0 != regno1)
17994 split_cost += 1;
17995 else if (regno2 == regno0)
17996 split_cost += 4;
17997 else
17998 split_cost += parts.scale;
18001 /* Have to use an add instruction with an immediate if
18002 disp is non-zero. */
18003 if (parts.disp && parts.disp != const0_rtx)
18004 split_cost += 1;
18006 /* Subtract the price of lea. */
18007 split_cost -= 1;
18010 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
18011 parts.scale > 1);
18014 /* Emit x86 binary operand CODE in mode MODE, where the first operand
18015 matches destination. RTX includes clobber of FLAGS_REG. */
18017 static void
18018 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
18019 rtx dst, rtx src)
18021 rtx op, clob;
18023 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
18024 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18026 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
18029 /* Return true if regno1 def is nearest to the insn. */
18031 static bool
18032 find_nearest_reg_def (rtx insn, int regno1, int regno2)
18034 rtx prev = insn;
18035 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
18037 if (insn == start)
18038 return false;
18039 while (prev && prev != start)
18041 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
18043 prev = PREV_INSN (prev);
18044 continue;
18046 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
18047 return true;
18048 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
18049 return false;
18050 prev = PREV_INSN (prev);
18053 /* None of the regs is defined in the bb. */
18054 return false;
18057 /* Split lea instructions into a sequence of instructions
18058 which are executed on the ALU to avoid AGU stalls.
18059 It is assumed that it is allowed to clobber the flags register
18060 at the lea position. */
18062 void
18063 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
18065 unsigned int regno0, regno1, regno2;
18066 struct ix86_address parts;
18067 rtx target, tmp;
18068 int ok, adds;
18070 ok = ix86_decompose_address (operands[1], &parts);
18071 gcc_assert (ok);
18073 target = gen_lowpart (mode, operands[0]);
18075 regno0 = true_regnum (target);
18076 regno1 = INVALID_REGNUM;
18077 regno2 = INVALID_REGNUM;
18079 if (parts.base)
18081 parts.base = gen_lowpart (mode, parts.base);
18082 regno1 = true_regnum (parts.base);
18085 if (parts.index)
18087 parts.index = gen_lowpart (mode, parts.index);
18088 regno2 = true_regnum (parts.index);
18091 if (parts.disp)
18092 parts.disp = gen_lowpart (mode, parts.disp);
18094 if (parts.scale > 1)
18096 /* Case r1 = r1 + ... */
18097 if (regno1 == regno0)
18099 /* If we have a case r1 = r1 + C * r1 then we
18100 would have to use multiplication, which is very
18101 expensive. Assume the cost model is wrong if we
18102 have such a case here. */
18103 gcc_assert (regno2 != regno0);
18105 for (adds = parts.scale; adds > 0; adds--)
18106 ix86_emit_binop (PLUS, mode, target, parts.index);
18108 else
18110 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18111 if (regno0 != regno2)
18112 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18114 /* Use shift for scaling. */
18115 ix86_emit_binop (ASHIFT, mode, target,
18116 GEN_INT (exact_log2 (parts.scale)));
18118 if (parts.base)
18119 ix86_emit_binop (PLUS, mode, target, parts.base);
18121 if (parts.disp && parts.disp != const0_rtx)
18122 ix86_emit_binop (PLUS, mode, target, parts.disp);
18125 else if (!parts.base && !parts.index)
18127 gcc_assert(parts.disp);
18128 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18130 else
18132 if (!parts.base)
18134 if (regno0 != regno2)
18135 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18137 else if (!parts.index)
18139 if (regno0 != regno1)
18140 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18142 else
18144 if (regno0 == regno1)
18145 tmp = parts.index;
18146 else if (regno0 == regno2)
18147 tmp = parts.base;
18148 else
18150 rtx tmp1;
18152 /* Find better operand for SET instruction, depending
18153 on which definition is farther from the insn. */
18154 if (find_nearest_reg_def (insn, regno1, regno2))
18155 tmp = parts.index, tmp1 = parts.base;
18156 else
18157 tmp = parts.base, tmp1 = parts.index;
18159 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18161 if (parts.disp && parts.disp != const0_rtx)
18162 ix86_emit_binop (PLUS, mode, target, parts.disp);
18164 ix86_emit_binop (PLUS, mode, target, tmp1);
18165 return;
18168 ix86_emit_binop (PLUS, mode, target, tmp);
18171 if (parts.disp && parts.disp != const0_rtx)
18172 ix86_emit_binop (PLUS, mode, target, parts.disp);
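/* Illustrative sketch of a full split (register names are placeholders):
   an address-style lea such as

       lea    8(%rbx,%rcx,4), %rax

   can be rewritten by the code above as

       mov    %rcx, %rax       # move the index into the destination
       shl    $2, %rax         # scale by a shift
       add    %rbx, %rax       # add the base
       add    $8, %rax         # add the displacement

   while the case where the destination already holds the base uses the
   repeated-addition loop instead of a shift.  */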
18176 /* Return true if it is ok to optimize an ADD operation to LEA
18177 operation to avoid flag register consumption. For most processors,
18178 ADD is faster than LEA. For processors like ATOM, if the
18179 destination register of LEA holds an actual address which will be
18180 used soon, LEA is better; otherwise ADD is better. */
18182 bool
18183 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18185 unsigned int regno0 = true_regnum (operands[0]);
18186 unsigned int regno1 = true_regnum (operands[1]);
18187 unsigned int regno2 = true_regnum (operands[2]);
18189 /* If a = b + c, (a!=b && a!=c), must use lea form. */
18190 if (regno0 != regno1 && regno0 != regno2)
18191 return true;
18193 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18194 return false;
18196 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18199 /* Return true if destination reg of SET_BODY is shift count of
18200 USE_BODY. */
18202 static bool
18203 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18205 rtx set_dest;
18206 rtx shift_rtx;
18207 int i;
18209 /* Retrieve destination of SET_BODY. */
18210 switch (GET_CODE (set_body))
18212 case SET:
18213 set_dest = SET_DEST (set_body);
18214 if (!set_dest || !REG_P (set_dest))
18215 return false;
18216 break;
18217 case PARALLEL:
18218 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18219 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18220 use_body))
18221 return true;
18222 default:
18223 return false;
18224 break;
18227 /* Retrieve shift count of USE_BODY. */
18228 switch (GET_CODE (use_body))
18230 case SET:
18231 shift_rtx = XEXP (use_body, 1);
18232 break;
18233 case PARALLEL:
18234 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18235 if (ix86_dep_by_shift_count_body (set_body,
18236 XVECEXP (use_body, 0, i)))
18237 return true;
18238 default:
18239 return false;
18240 break;
18243 if (shift_rtx
18244 && (GET_CODE (shift_rtx) == ASHIFT
18245 || GET_CODE (shift_rtx) == LSHIFTRT
18246 || GET_CODE (shift_rtx) == ASHIFTRT
18247 || GET_CODE (shift_rtx) == ROTATE
18248 || GET_CODE (shift_rtx) == ROTATERT))
18250 rtx shift_count = XEXP (shift_rtx, 1);
18252 /* Return true if shift count is dest of SET_BODY. */
18253 if (REG_P (shift_count))
18255 /* Add check since it can be invoked before register
18256 allocation in pre-reload schedule. */
18257 if (reload_completed
18258 && true_regnum (set_dest) == true_regnum (shift_count))
18259 return true;
18260 else if (REGNO(set_dest) == REGNO(shift_count))
18261 return true;
18265 return false;
18268 /* Return true if destination reg of SET_INSN is shift count of
18269 USE_INSN. */
18271 bool
18272 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18274 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18275 PATTERN (use_insn));
18278 /* Return TRUE or FALSE depending on whether the unary operator meets the
18279 appropriate constraints. */
18281 bool
18282 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18283 enum machine_mode mode ATTRIBUTE_UNUSED,
18284 rtx operands[2])
18286 /* If one of operands is memory, source and destination must match. */
18287 if ((MEM_P (operands[0])
18288 || MEM_P (operands[1]))
18289 && ! rtx_equal_p (operands[0], operands[1]))
18290 return false;
18291 return true;
18294 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18295 are ok, keeping in mind the possible movddup alternative. */
18297 bool
18298 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18300 if (MEM_P (operands[0]))
18301 return rtx_equal_p (operands[0], operands[1 + high]);
18302 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18303 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18304 return true;
18307 /* Post-reload splitter for converting an SF or DFmode value in an
18308 SSE register into an unsigned SImode. */
18310 void
18311 ix86_split_convert_uns_si_sse (rtx operands[])
18313 enum machine_mode vecmode;
18314 rtx value, large, zero_or_two31, input, two31, x;
18316 large = operands[1];
18317 zero_or_two31 = operands[2];
18318 input = operands[3];
18319 two31 = operands[4];
18320 vecmode = GET_MODE (large);
18321 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18323 /* Load up the value into the low element. We must ensure that the other
18324 elements are valid floats -- zero is the easiest such value. */
18325 if (MEM_P (input))
18327 if (vecmode == V4SFmode)
18328 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18329 else
18330 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18332 else
18334 input = gen_rtx_REG (vecmode, REGNO (input));
18335 emit_move_insn (value, CONST0_RTX (vecmode));
18336 if (vecmode == V4SFmode)
18337 emit_insn (gen_sse_movss (value, value, input));
18338 else
18339 emit_insn (gen_sse2_movsd (value, value, input));
18342 emit_move_insn (large, two31);
18343 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18345 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18346 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18348 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18349 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18351 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18352 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18354 large = gen_rtx_REG (V4SImode, REGNO (large));
18355 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18357 x = gen_rtx_REG (V4SImode, REGNO (value));
18358 if (vecmode == V4SFmode)
18359 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18360 else
18361 emit_insn (gen_sse2_cvttpd2dq (x, value));
18362 value = x;
18364 emit_insn (gen_xorv4si3 (value, value, large));
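/* The idea behind the splitter above, in scalar terms (a sketch): inputs
   below 2**31 are converted directly; larger inputs first have 2**31
   subtracted so the signed cvtt* instruction stays in range, and the
   missing 0x80000000 is xor-ed back into the integer result:

       if (x >= 0x1p31)  result = (unsigned) (int) (x - 0x1p31) ^ 0x80000000;
       else              result = (unsigned) (int) x;

   The compare mask, AND, MINUS and final XOR above implement this,
   branchlessly and on whole vectors.  */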
18367 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18368 Expects the 64-bit DImode to be supplied in a pair of integral
18369 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18370 -mfpmath=sse, !optimize_size only. */
18372 void
18373 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18375 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18376 rtx int_xmm, fp_xmm;
18377 rtx biases, exponents;
18378 rtx x;
18380 int_xmm = gen_reg_rtx (V4SImode);
18381 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18382 emit_insn (gen_movdi_to_sse (int_xmm, input));
18383 else if (TARGET_SSE_SPLIT_REGS)
18385 emit_clobber (int_xmm);
18386 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18388 else
18390 x = gen_reg_rtx (V2DImode);
18391 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18392 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18395 x = gen_rtx_CONST_VECTOR (V4SImode,
18396 gen_rtvec (4, GEN_INT (0x43300000UL),
18397 GEN_INT (0x45300000UL),
18398 const0_rtx, const0_rtx));
18399 exponents = validize_mem (force_const_mem (V4SImode, x));
18401 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18402 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18404 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18405 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18406 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18407 (0x1.0p84 + double(fp_value_hi_xmm) * 0x1.0p32).
18408 Note these exponents differ by 32. */
18410 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18412 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18413 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18414 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18415 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18416 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18417 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18418 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18419 biases = validize_mem (force_const_mem (V2DFmode, biases));
18420 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18422 /* Add the upper and lower DFmode values together. */
18423 if (TARGET_SSE3)
18424 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18425 else
18427 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18428 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18429 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18432 ix86_expand_vector_extract (false, target, fp_xmm, 0);
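/* Worked example of the bias trick (numbers are illustrative): for the
   input 0x0000000100000002 the low word 2 is packed under exponent
   0x43300000, giving the double 0x1.0p52 + 2.0, and the high word 1 is
   packed under 0x45300000, giving 0x1.0p84 + 0x1.0p32.  Subtracting the
   biases 0x1.0p52 and 0x1.0p84 and adding the two lanes yields
   0x1.0p32 + 2.0 = 4294967298.0, exactly double (0x0000000100000002).  */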
18435 /* Not used, but eases macroization of patterns. */
18436 void
18437 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18438 rtx input ATTRIBUTE_UNUSED)
18440 gcc_unreachable ();
18443 /* Convert an unsigned SImode value into a DFmode. Only currently used
18444 for SSE, but applicable anywhere. */
18446 void
18447 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18449 REAL_VALUE_TYPE TWO31r;
18450 rtx x, fp;
18452 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18453 NULL, 1, OPTAB_DIRECT);
18455 fp = gen_reg_rtx (DFmode);
18456 emit_insn (gen_floatsidf2 (fp, x));
18458 real_ldexp (&TWO31r, &dconst1, 31);
18459 x = const_double_from_real_value (TWO31r, DFmode);
18461 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18462 if (x != target)
18463 emit_move_insn (target, x);
18466 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18467 32-bit mode; otherwise we have a direct convert instruction. */
18469 void
18470 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18472 REAL_VALUE_TYPE TWO32r;
18473 rtx fp_lo, fp_hi, x;
18475 fp_lo = gen_reg_rtx (DFmode);
18476 fp_hi = gen_reg_rtx (DFmode);
18478 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18480 real_ldexp (&TWO32r, &dconst1, 32);
18481 x = const_double_from_real_value (TWO32r, DFmode);
18482 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18484 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18486 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18487 0, OPTAB_DIRECT);
18488 if (x != target)
18489 emit_move_insn (target, x);
18492 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18493 For x86_32, -mfpmath=sse, !optimize_size only. */
18494 void
18495 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18497 REAL_VALUE_TYPE ONE16r;
18498 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18500 real_ldexp (&ONE16r, &dconst1, 16);
18501 x = const_double_from_real_value (ONE16r, SFmode);
18502 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18503 NULL, 0, OPTAB_DIRECT);
18504 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18505 NULL, 0, OPTAB_DIRECT);
18506 fp_hi = gen_reg_rtx (SFmode);
18507 fp_lo = gen_reg_rtx (SFmode);
18508 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18509 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18510 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18511 0, OPTAB_DIRECT);
18512 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18513 0, OPTAB_DIRECT);
18514 if (!rtx_equal_p (target, fp_hi))
18515 emit_move_insn (target, fp_hi);
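/* Worked example (illustrative): for the input 0x00012345 the low half
   is 0x2345 = 9029 and the high half is 0x0001 = 1.  Both halves are
   small enough for the signed cvtsi2ss, and recombining gives
   1 * 65536.0f + 9029.0f = 74565.0f, the exact value of 0x12345, without
   ever presenting a value with the sign bit set to the converter.  */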
18518 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18519 a vector of unsigned ints VAL to vector of floats TARGET. */
18521 void
18522 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18524 rtx tmp[8];
18525 REAL_VALUE_TYPE TWO16r;
18526 enum machine_mode intmode = GET_MODE (val);
18527 enum machine_mode fltmode = GET_MODE (target);
18528 rtx (*cvt) (rtx, rtx);
18530 if (intmode == V4SImode)
18531 cvt = gen_floatv4siv4sf2;
18532 else
18533 cvt = gen_floatv8siv8sf2;
18534 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18535 tmp[0] = force_reg (intmode, tmp[0]);
18536 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18537 OPTAB_DIRECT);
18538 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18539 NULL_RTX, 1, OPTAB_DIRECT);
18540 tmp[3] = gen_reg_rtx (fltmode);
18541 emit_insn (cvt (tmp[3], tmp[1]));
18542 tmp[4] = gen_reg_rtx (fltmode);
18543 emit_insn (cvt (tmp[4], tmp[2]));
18544 real_ldexp (&TWO16r, &dconst1, 16);
18545 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18546 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18547 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18548 OPTAB_DIRECT);
18549 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18550 OPTAB_DIRECT);
18551 if (tmp[7] != target)
18552 emit_move_insn (target, tmp[7]);
18555 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18556 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18557 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18558 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18561 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18563 REAL_VALUE_TYPE TWO31r;
18564 rtx two31r, tmp[4];
18565 enum machine_mode mode = GET_MODE (val);
18566 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18567 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18568 rtx (*cmp) (rtx, rtx, rtx, rtx);
18569 int i;
18571 for (i = 0; i < 3; i++)
18572 tmp[i] = gen_reg_rtx (mode);
18573 real_ldexp (&TWO31r, &dconst1, 31);
18574 two31r = const_double_from_real_value (TWO31r, scalarmode);
18575 two31r = ix86_build_const_vector (mode, 1, two31r);
18576 two31r = force_reg (mode, two31r);
18577 switch (mode)
18579 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18580 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18581 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18582 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18583 default: gcc_unreachable ();
18585 tmp[3] = gen_rtx_LE (mode, two31r, val);
18586 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18587 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18588 0, OPTAB_DIRECT);
18589 if (intmode == V4SImode || TARGET_AVX2)
18590 *xorp = expand_simple_binop (intmode, ASHIFT,
18591 gen_lowpart (intmode, tmp[0]),
18592 GEN_INT (31), NULL_RTX, 0,
18593 OPTAB_DIRECT);
18594 else
18596 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18597 two31 = ix86_build_const_vector (intmode, 1, two31);
18598 *xorp = expand_simple_binop (intmode, AND,
18599 gen_lowpart (intmode, tmp[0]),
18600 two31, NULL_RTX, 0,
18601 OPTAB_DIRECT);
18603 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18604 0, OPTAB_DIRECT);
18607 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18608 then replicate the value for all elements of the vector
18609 register. */
18612 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18614 int i, n_elt;
18615 rtvec v;
18616 enum machine_mode scalar_mode;
18618 switch (mode)
18620 case V32QImode:
18621 case V16QImode:
18622 case V16HImode:
18623 case V8HImode:
18624 case V8SImode:
18625 case V4SImode:
18626 case V4DImode:
18627 case V2DImode:
18628 gcc_assert (vect);
18629 case V8SFmode:
18630 case V4SFmode:
18631 case V4DFmode:
18632 case V2DFmode:
18633 n_elt = GET_MODE_NUNITS (mode);
18634 v = rtvec_alloc (n_elt);
18635 scalar_mode = GET_MODE_INNER (mode);
18637 RTVEC_ELT (v, 0) = value;
18639 for (i = 1; i < n_elt; ++i)
18640 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18642 return gen_rtx_CONST_VECTOR (mode, v);
18644 default:
18645 gcc_unreachable ();
18649 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18650 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18651 for an SSE register. If VECT is true, then replicate the mask for
18652 all elements of the vector register. If INVERT is true, then create
18653 a mask excluding the sign bit. */
18656 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18658 enum machine_mode vec_mode, imode;
18659 HOST_WIDE_INT hi, lo;
18660 int shift = 63;
18661 rtx v;
18662 rtx mask;
18664 /* Find the sign bit, sign extended to 2*HWI. */
18665 switch (mode)
18667 case V8SImode:
18668 case V4SImode:
18669 case V8SFmode:
18670 case V4SFmode:
18671 vec_mode = mode;
18672 mode = GET_MODE_INNER (mode);
18673 imode = SImode;
18674 lo = 0x80000000, hi = lo < 0;
18675 break;
18677 case V4DImode:
18678 case V2DImode:
18679 case V4DFmode:
18680 case V2DFmode:
18681 vec_mode = mode;
18682 mode = GET_MODE_INNER (mode);
18683 imode = DImode;
18684 if (HOST_BITS_PER_WIDE_INT >= 64)
18685 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18686 else
18687 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18688 break;
18690 case TImode:
18691 case TFmode:
18692 vec_mode = VOIDmode;
18693 if (HOST_BITS_PER_WIDE_INT >= 64)
18695 imode = TImode;
18696 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18698 else
18700 rtvec vec;
18702 imode = DImode;
18703 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18705 if (invert)
18707 lo = ~lo, hi = ~hi;
18708 v = constm1_rtx;
18710 else
18711 v = const0_rtx;
18713 mask = immed_double_const (lo, hi, imode);
18715 vec = gen_rtvec (2, v, mask);
18716 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18717 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18719 return v;
18721 break;
18723 default:
18724 gcc_unreachable ();
18727 if (invert)
18728 lo = ~lo, hi = ~hi;
18730 /* Force this value into the low part of a fp vector constant. */
18731 mask = immed_double_const (lo, hi, imode);
18732 mask = gen_lowpart (mode, mask);
18734 if (vec_mode == VOIDmode)
18735 return force_reg (mode, mask);
18737 v = ix86_build_const_vector (vec_mode, vect, mask);
18738 return force_reg (vec_mode, v);
18741 /* Generate code for floating point ABS or NEG. */
18743 void
18744 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18745 rtx operands[])
18747 rtx mask, set, dst, src;
18748 bool use_sse = false;
18749 bool vector_mode = VECTOR_MODE_P (mode);
18750 enum machine_mode vmode = mode;
18752 if (vector_mode)
18753 use_sse = true;
18754 else if (mode == TFmode)
18755 use_sse = true;
18756 else if (TARGET_SSE_MATH)
18758 use_sse = SSE_FLOAT_MODE_P (mode);
18759 if (mode == SFmode)
18760 vmode = V4SFmode;
18761 else if (mode == DFmode)
18762 vmode = V2DFmode;
18765 /* NEG and ABS performed with SSE use bitwise mask operations.
18766 Create the appropriate mask now. */
18767 if (use_sse)
18768 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18769 else
18770 mask = NULL_RTX;
18772 dst = operands[0];
18773 src = operands[1];
18775 set = gen_rtx_fmt_e (code, mode, src);
18776 set = gen_rtx_SET (VOIDmode, dst, set);
18778 if (mask)
18780 rtx use, clob;
18781 rtvec par;
18783 use = gen_rtx_USE (VOIDmode, mask);
18784 if (vector_mode)
18785 par = gen_rtvec (2, set, use);
18786 else
18788 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18789 par = gen_rtvec (3, set, use, clob);
18791 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18793 else
18794 emit_insn (set);
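/* Illustrative note (not original text): with SSE math a DFmode negation or
   absolute value reduces to a single bitwise operation against the mask
   built above, roughly
       negation:        xorpd  .LCsign(%rip), %xmm0
       absolute value:  andpd  .LCnotsign(%rip), %xmm0
   where .LCsign / .LCnotsign are hypothetical labels standing for the
   sign-bit and inverted-sign-bit constants from ix86_build_signbit_mask.  */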
18797 /* Expand a copysign operation. Special case operand 0 being a constant. */
18799 void
18800 ix86_expand_copysign (rtx operands[])
18802 enum machine_mode mode, vmode;
18803 rtx dest, op0, op1, mask, nmask;
18805 dest = operands[0];
18806 op0 = operands[1];
18807 op1 = operands[2];
18809 mode = GET_MODE (dest);
18811 if (mode == SFmode)
18812 vmode = V4SFmode;
18813 else if (mode == DFmode)
18814 vmode = V2DFmode;
18815 else
18816 vmode = mode;
18818 if (GET_CODE (op0) == CONST_DOUBLE)
18820 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18822 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18823 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18825 if (mode == SFmode || mode == DFmode)
18827 if (op0 == CONST0_RTX (mode))
18828 op0 = CONST0_RTX (vmode);
18829 else
18831 rtx v = ix86_build_const_vector (vmode, false, op0);
18833 op0 = force_reg (vmode, v);
18836 else if (op0 != CONST0_RTX (mode))
18837 op0 = force_reg (mode, op0);
18839 mask = ix86_build_signbit_mask (vmode, 0, 0);
18841 if (mode == SFmode)
18842 copysign_insn = gen_copysignsf3_const;
18843 else if (mode == DFmode)
18844 copysign_insn = gen_copysigndf3_const;
18845 else
18846 copysign_insn = gen_copysigntf3_const;
18848 emit_insn (copysign_insn (dest, op0, op1, mask));
18850 else
18852 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18854 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18855 mask = ix86_build_signbit_mask (vmode, 0, 0);
18857 if (mode == SFmode)
18858 copysign_insn = gen_copysignsf3_var;
18859 else if (mode == DFmode)
18860 copysign_insn = gen_copysigndf3_var;
18861 else
18862 copysign_insn = gen_copysigntf3_var;
18864 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
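/* For reference (an illustration, not compiler code): the bit-level identity
   the copysign expanders implement, written as plain C on a 64-bit double:

       uint64_t m, s;
       memcpy (&m, &mag, sizeof m);            // magnitude operand
       memcpy (&s, &sgn, sizeof s);            // sign operand
       m = (m & ~SIGNBIT) | (s & SIGNBIT);     // SIGNBIT == 1ull << 63

   The _const/_var splitters below perform exactly this AND/ANDN/IOR dance
   on SSE registers, with the masks supplied by ix86_build_signbit_mask.  */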
18868 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18869 be a constant, and so has already been expanded into a vector constant. */
18871 void
18872 ix86_split_copysign_const (rtx operands[])
18874 enum machine_mode mode, vmode;
18875 rtx dest, op0, mask, x;
18877 dest = operands[0];
18878 op0 = operands[1];
18879 mask = operands[3];
18881 mode = GET_MODE (dest);
18882 vmode = GET_MODE (mask);
18884 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18885 x = gen_rtx_AND (vmode, dest, mask);
18886 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18888 if (op0 != CONST0_RTX (vmode))
18890 x = gen_rtx_IOR (vmode, dest, op0);
18891 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18895 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18896 so we have to do two masks. */
18898 void
18899 ix86_split_copysign_var (rtx operands[])
18901 enum machine_mode mode, vmode;
18902 rtx dest, scratch, op0, op1, mask, nmask, x;
18904 dest = operands[0];
18905 scratch = operands[1];
18906 op0 = operands[2];
18907 op1 = operands[3];
18908 nmask = operands[4];
18909 mask = operands[5];
18911 mode = GET_MODE (dest);
18912 vmode = GET_MODE (mask);
18914 if (rtx_equal_p (op0, op1))
18916 /* Shouldn't happen often (it's useless, obviously), but when it does
18917 we'd generate incorrect code if we continue below. */
18918 emit_move_insn (dest, op0);
18919 return;
18922 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18924 gcc_assert (REGNO (op1) == REGNO (scratch));
18926 x = gen_rtx_AND (vmode, scratch, mask);
18927 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18929 dest = mask;
18930 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18931 x = gen_rtx_NOT (vmode, dest);
18932 x = gen_rtx_AND (vmode, x, op0);
18933 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18935 else
18937 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18939 x = gen_rtx_AND (vmode, scratch, mask);
18941 else /* alternative 2,4 */
18943 gcc_assert (REGNO (mask) == REGNO (scratch));
18944 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18945 x = gen_rtx_AND (vmode, scratch, op1);
18947 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18949 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18951 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18952 x = gen_rtx_AND (vmode, dest, nmask);
18954 else /* alternative 3,4 */
18956 gcc_assert (REGNO (nmask) == REGNO (dest));
18957 dest = nmask;
18958 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18959 x = gen_rtx_AND (vmode, dest, op0);
18961 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18964 x = gen_rtx_IOR (vmode, dest, scratch);
18965 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
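/* Compressed view of the variable case just handled (illustrative, assuming
   vmode == V2DFmode and no operand overlap):
       sign = op1 & mask;        extract the sign bit of op1
       mag  = op0 & ~mask;       strip the sign bit of op0 (nmask or andnot)
       dest = mag | sign;        recombine
   The register alternatives above only change which operand lands in which
   hard register and whether the complement comes from NMASK or an ANDNOT.  */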
18968 /* Return TRUE or FALSE depending on whether the first SET in INSN
18969 has source and destination with matching CC modes, and whether the
18970 CC mode is at least as constrained as REQ_MODE. */
18972 bool
18973 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18975 rtx set;
18976 enum machine_mode set_mode;
18978 set = PATTERN (insn);
18979 if (GET_CODE (set) == PARALLEL)
18980 set = XVECEXP (set, 0, 0);
18981 gcc_assert (GET_CODE (set) == SET);
18982 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18984 set_mode = GET_MODE (SET_DEST (set));
18985 switch (set_mode)
18987 case CCNOmode:
18988 if (req_mode != CCNOmode
18989 && (req_mode != CCmode
18990 || XEXP (SET_SRC (set), 1) != const0_rtx))
18991 return false;
18992 break;
18993 case CCmode:
18994 if (req_mode == CCGCmode)
18995 return false;
18996 /* FALLTHRU */
18997 case CCGCmode:
18998 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18999 return false;
19000 /* FALLTHRU */
19001 case CCGOCmode:
19002 if (req_mode == CCZmode)
19003 return false;
19004 /* FALLTHRU */
19005 case CCZmode:
19006 break;
19008 case CCAmode:
19009 case CCCmode:
19010 case CCOmode:
19011 case CCSmode:
19012 if (set_mode != req_mode)
19013 return false;
19014 break;
19016 default:
19017 gcc_unreachable ();
19020 return GET_MODE (SET_SRC (set)) == set_mode;
19023 /* Generate insn patterns to do an integer compare of OPERANDS. */
19025 static rtx
19026 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
19028 enum machine_mode cmpmode;
19029 rtx tmp, flags;
19031 cmpmode = SELECT_CC_MODE (code, op0, op1);
19032 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
19034 /* This is very simple, but making the interface the same as in the
19035 FP case makes the rest of the code easier. */
19036 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
19037 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
19039 /* Return the test that should be put into the flags user, i.e.
19040 the bcc, scc, or cmov instruction. */
19041 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
19044 /* Figure out whether to use ordered or unordered fp comparisons.
19045 Return the appropriate mode to use. */
19047 enum machine_mode
19048 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
19050 /* ??? In order to make all comparisons reversible, we do all comparisons
19051 non-trapping when compiling for IEEE. Once gcc is able to distinguish
19052 all forms of trapping and nontrapping comparisons, we can make inequality
19053 comparisons trapping again, since it results in better code when using
19054 FCOM based compares. */
19055 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
19058 enum machine_mode
19059 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
19061 enum machine_mode mode = GET_MODE (op0);
19063 if (SCALAR_FLOAT_MODE_P (mode))
19065 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19066 return ix86_fp_compare_mode (code);
19069 switch (code)
19071 /* Only zero flag is needed. */
19072 case EQ: /* ZF=0 */
19073 case NE: /* ZF!=0 */
19074 return CCZmode;
19075 /* Codes needing carry flag. */
19076 case GEU: /* CF=0 */
19077 case LTU: /* CF=1 */
19078 /* Detect overflow checks. They need just the carry flag. */
19079 if (GET_CODE (op0) == PLUS
19080 && rtx_equal_p (op1, XEXP (op0, 0)))
19081 return CCCmode;
19082 else
19083 return CCmode;
19084 case GTU: /* CF=0 & ZF=0 */
19085 case LEU: /* CF=1 | ZF=1 */
19086 return CCmode;
19087 /* Codes possibly doable only with sign flag when
19088 comparing against zero. */
19089 case GE: /* SF=OF or SF=0 */
19090 case LT: /* SF<>OF or SF=1 */
19091 if (op1 == const0_rtx)
19092 return CCGOCmode;
19093 else
19094 /* For other cases Carry flag is not required. */
19095 return CCGCmode;
19096 /* Codes doable only with sign flag when comparing
19097 against zero, but we lack a jump instruction for it,
19098 so we need to use relational tests against overflow,
19099 which thus needs to be zero. */
19100 case GT: /* ZF=0 & SF=OF */
19101 case LE: /* ZF=1 | SF<>OF */
19102 if (op1 == const0_rtx)
19103 return CCNOmode;
19104 else
19105 return CCGCmode;
19106 /* The strcmp pattern does (use flags) and combine may ask us for the
19107 proper mode. */
19108 case USE:
19109 return CCmode;
19110 default:
19111 gcc_unreachable ();
19115 /* Return the fixed registers used for condition codes. */
19117 static bool
19118 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19120 *p1 = FLAGS_REG;
19121 *p2 = FPSR_REG;
19122 return true;
19125 /* If two condition code modes are compatible, return a condition code
19126 mode which is compatible with both. Otherwise, return
19127 VOIDmode. */
19129 static enum machine_mode
19130 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19132 if (m1 == m2)
19133 return m1;
19135 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19136 return VOIDmode;
19138 if ((m1 == CCGCmode && m2 == CCGOCmode)
19139 || (m1 == CCGOCmode && m2 == CCGCmode))
19140 return CCGCmode;
19142 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19143 return m2;
19144 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19145 return m1;
19147 switch (m1)
19149 default:
19150 gcc_unreachable ();
19152 case CCmode:
19153 case CCGCmode:
19154 case CCGOCmode:
19155 case CCNOmode:
19156 case CCAmode:
19157 case CCCmode:
19158 case CCOmode:
19159 case CCSmode:
19160 case CCZmode:
19161 switch (m2)
19163 default:
19164 return VOIDmode;
19166 case CCmode:
19167 case CCGCmode:
19168 case CCGOCmode:
19169 case CCNOmode:
19170 case CCAmode:
19171 case CCCmode:
19172 case CCOmode:
19173 case CCSmode:
19174 case CCZmode:
19175 return CCmode;
19178 case CCFPmode:
19179 case CCFPUmode:
19180 /* These are only compatible with themselves, which we already
19181 checked above. */
19182 return VOIDmode;
19187 /* Return a comparison we can do that is equivalent to
19188 swap_condition (code), apart possibly from orderedness.
19189 But never change orderedness if TARGET_IEEE_FP, returning
19190 UNKNOWN in that case if necessary. */
19192 static enum rtx_code
19193 ix86_fp_swap_condition (enum rtx_code code)
19195 switch (code)
19197 case GT: /* GTU - CF=0 & ZF=0 */
19198 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19199 case GE: /* GEU - CF=0 */
19200 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19201 case UNLT: /* LTU - CF=1 */
19202 return TARGET_IEEE_FP ? UNKNOWN : GT;
19203 case UNLE: /* LEU - CF=1 | ZF=1 */
19204 return TARGET_IEEE_FP ? UNKNOWN : GE;
19205 default:
19206 return swap_condition (code);
19210 /* Return cost of comparison CODE using the best strategy for performance.
19211 All following functions use the number of instructions as the cost metric.
19212 In the future this should be tweaked to compute bytes for optimize_size and
19213 take into account performance of various instructions on various CPUs. */
19215 static int
19216 ix86_fp_comparison_cost (enum rtx_code code)
19218 int arith_cost;
19220 /* The cost of code using bit-twiddling on %ah. */
19221 switch (code)
19223 case UNLE:
19224 case UNLT:
19225 case LTGT:
19226 case GT:
19227 case GE:
19228 case UNORDERED:
19229 case ORDERED:
19230 case UNEQ:
19231 arith_cost = 4;
19232 break;
19233 case LT:
19234 case NE:
19235 case EQ:
19236 case UNGE:
19237 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19238 break;
19239 case LE:
19240 case UNGT:
19241 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19242 break;
19243 default:
19244 gcc_unreachable ();
19247 switch (ix86_fp_comparison_strategy (code))
19249 case IX86_FPCMP_COMI:
19250 return arith_cost > 4 ? 3 : 2;
19251 case IX86_FPCMP_SAHF:
19252 return arith_cost > 4 ? 4 : 3;
19253 default:
19254 return arith_cost;
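/* Worked example (derived from the tables above, not measured): UNGT with
   TARGET_IEEE_FP costs 6 in the arithmetic fallback, so a COMI-capable
   target reports 3 and a SAHF-capable one 4; plain GT costs 4 in the
   fallback and therefore 2 with COMI.  */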
19258 /* Return strategy to use for floating-point. We assume that fcomi is always
19259 preferable where available, since that is also true when looking at size
19260 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19262 enum ix86_fpcmp_strategy
19263 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19265 /* Do fcomi/sahf based test when profitable. */
19267 if (TARGET_CMOVE)
19268 return IX86_FPCMP_COMI;
19270 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19271 return IX86_FPCMP_SAHF;
19273 return IX86_FPCMP_ARITH;
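/* Rough shape of the three strategies (a sketch, not the exact insns this
   file emits):
     IX86_FPCMP_COMI:   fcomi %st(1), %st            flags set directly
     IX86_FPCMP_SAHF:   fcom; fnstsw %ax; sahf
     IX86_FPCMP_ARITH:  fcom; fnstsw %ax; test/and on %ah (see below)
   which is why COMI is preferred whenever cmov-class hardware is present.  */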
19276 /* Swap, force into registers, or otherwise massage the two operands
19277 to a fp comparison. The operands are updated in place; the new
19278 comparison code is returned. */
19280 static enum rtx_code
19281 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19283 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19284 rtx op0 = *pop0, op1 = *pop1;
19285 enum machine_mode op_mode = GET_MODE (op0);
19286 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19288 /* All of the unordered compare instructions only work on registers.
19289 The same is true of the fcomi compare instructions. The XFmode
19290 compare instructions require registers except when comparing
19291 against zero or when converting operand 1 from fixed point to
19292 floating point. */
19294 if (!is_sse
19295 && (fpcmp_mode == CCFPUmode
19296 || (op_mode == XFmode
19297 && ! (standard_80387_constant_p (op0) == 1
19298 || standard_80387_constant_p (op1) == 1)
19299 && GET_CODE (op1) != FLOAT)
19300 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19302 op0 = force_reg (op_mode, op0);
19303 op1 = force_reg (op_mode, op1);
19305 else
19307 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19308 things around if they appear profitable, otherwise force op0
19309 into a register. */
19311 if (standard_80387_constant_p (op0) == 0
19312 || (MEM_P (op0)
19313 && ! (standard_80387_constant_p (op1) == 0
19314 || MEM_P (op1))))
19316 enum rtx_code new_code = ix86_fp_swap_condition (code);
19317 if (new_code != UNKNOWN)
19319 rtx tmp;
19320 tmp = op0, op0 = op1, op1 = tmp;
19321 code = new_code;
19325 if (!REG_P (op0))
19326 op0 = force_reg (op_mode, op0);
19328 if (CONSTANT_P (op1))
19330 int tmp = standard_80387_constant_p (op1);
19331 if (tmp == 0)
19332 op1 = validize_mem (force_const_mem (op_mode, op1));
19333 else if (tmp == 1)
19335 if (TARGET_CMOVE)
19336 op1 = force_reg (op_mode, op1);
19338 else
19339 op1 = force_reg (op_mode, op1);
19343 /* Try to rearrange the comparison to make it cheaper. */
19344 if (ix86_fp_comparison_cost (code)
19345 > ix86_fp_comparison_cost (swap_condition (code))
19346 && (REG_P (op1) || can_create_pseudo_p ()))
19348 rtx tmp;
19349 tmp = op0, op0 = op1, op1 = tmp;
19350 code = swap_condition (code);
19351 if (!REG_P (op0))
19352 op0 = force_reg (op_mode, op0);
19355 *pop0 = op0;
19356 *pop1 = op1;
19357 return code;
19360 /* Convert comparison codes we use to represent FP comparison to an integer
19361 code that will result in a proper branch. Return UNKNOWN if no such code
19362 is available. */
19364 enum rtx_code
19365 ix86_fp_compare_code_to_integer (enum rtx_code code)
19367 switch (code)
19369 case GT:
19370 return GTU;
19371 case GE:
19372 return GEU;
19373 case ORDERED:
19374 case UNORDERED:
19375 return code;
19376 break;
19377 case UNEQ:
19378 return EQ;
19379 break;
19380 case UNLT:
19381 return LTU;
19382 break;
19383 case UNLE:
19384 return LEU;
19385 break;
19386 case LTGT:
19387 return NE;
19388 break;
19389 default:
19390 return UNKNOWN;
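/* Informational note: the mapping above works because fcomi / (u)comiss set
   ZF, PF and CF exactly like an unsigned integer compare, so GT/GE/UNLT/UNLE
   on the FP operands become GTU/GEU/LTU/LEU tests on the flags.  */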
19394 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19396 static rtx
19397 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19399 enum machine_mode fpcmp_mode, intcmp_mode;
19400 rtx tmp, tmp2;
19402 fpcmp_mode = ix86_fp_compare_mode (code);
19403 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19405 /* Do fcomi/sahf based test when profitable. */
19406 switch (ix86_fp_comparison_strategy (code))
19408 case IX86_FPCMP_COMI:
19409 intcmp_mode = fpcmp_mode;
19410 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19411 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19412 tmp);
19413 emit_insn (tmp);
19414 break;
19416 case IX86_FPCMP_SAHF:
19417 intcmp_mode = fpcmp_mode;
19418 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19419 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19420 tmp);
19422 if (!scratch)
19423 scratch = gen_reg_rtx (HImode);
19424 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19425 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19426 break;
19428 case IX86_FPCMP_ARITH:
19429 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19430 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19431 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19432 if (!scratch)
19433 scratch = gen_reg_rtx (HImode);
19434 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19436 /* In the unordered case, we have to check C2 for NaNs, which
19437 doesn't happen to work out to anything nice combination-wise.
19438 So do some bit twiddling on the value we've got in AH to come
19439 up with an appropriate set of condition codes. */
19441 intcmp_mode = CCNOmode;
19442 switch (code)
19444 case GT:
19445 case UNGT:
19446 if (code == GT || !TARGET_IEEE_FP)
19448 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19449 code = EQ;
19451 else
19453 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19454 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19455 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19456 intcmp_mode = CCmode;
19457 code = GEU;
19459 break;
19460 case LT:
19461 case UNLT:
19462 if (code == LT && TARGET_IEEE_FP)
19464 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19465 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19466 intcmp_mode = CCmode;
19467 code = EQ;
19469 else
19471 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19472 code = NE;
19474 break;
19475 case GE:
19476 case UNGE:
19477 if (code == GE || !TARGET_IEEE_FP)
19479 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19480 code = EQ;
19482 else
19484 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19485 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19486 code = NE;
19488 break;
19489 case LE:
19490 case UNLE:
19491 if (code == LE && TARGET_IEEE_FP)
19493 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19494 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19495 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19496 intcmp_mode = CCmode;
19497 code = LTU;
19499 else
19501 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19502 code = NE;
19504 break;
19505 case EQ:
19506 case UNEQ:
19507 if (code == EQ && TARGET_IEEE_FP)
19509 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19510 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19511 intcmp_mode = CCmode;
19512 code = EQ;
19514 else
19516 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19517 code = NE;
19519 break;
19520 case NE:
19521 case LTGT:
19522 if (code == NE && TARGET_IEEE_FP)
19524 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19525 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19526 GEN_INT (0x40)));
19527 code = NE;
19529 else
19531 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19532 code = EQ;
19534 break;
19536 case UNORDERED:
19537 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19538 code = NE;
19539 break;
19540 case ORDERED:
19541 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19542 code = EQ;
19543 break;
19545 default:
19546 gcc_unreachable ();
19548 break;
19550 default:
19551 gcc_unreachable();
19554 /* Return the test that should be put into the flags user, i.e.
19555 the bcc, scc, or cmov instruction. */
19556 return gen_rtx_fmt_ee (code, VOIDmode,
19557 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19558 const0_rtx);
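/* Note on the magic constants used above (informational): after fnstsw %ax
   the high byte of the FPU status word sits in %ah, where the condition bits
   are C0 = 0x01, C2 = 0x04 and C3 = 0x40.  So 0x45 tests C0|C2|C3, 0x05
   tests C0|C2, and 0x44/0x40 are the values compared against once C2 (the
   "unordered" bit) has been folded in.  */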
19561 static rtx
19562 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19564 rtx ret;
19566 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19567 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19569 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19571 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19572 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19574 else
19575 ret = ix86_expand_int_compare (code, op0, op1);
19577 return ret;
19580 void
19581 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19583 enum machine_mode mode = GET_MODE (op0);
19584 rtx tmp;
19586 switch (mode)
19588 case SFmode:
19589 case DFmode:
19590 case XFmode:
19591 case QImode:
19592 case HImode:
19593 case SImode:
19594 simple:
19595 tmp = ix86_expand_compare (code, op0, op1);
19596 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19597 gen_rtx_LABEL_REF (VOIDmode, label),
19598 pc_rtx);
19599 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19600 return;
19602 case DImode:
19603 if (TARGET_64BIT)
19604 goto simple;
19605 case TImode:
19606 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
19608 rtx lo[2], hi[2], label2;
19609 enum rtx_code code1, code2, code3;
19610 enum machine_mode submode;
19612 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19614 tmp = op0, op0 = op1, op1 = tmp;
19615 code = swap_condition (code);
19618 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19619 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19621 submode = mode == DImode ? SImode : DImode;
19623 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19624 avoid two branches. This costs one extra insn, so disable when
19625 optimizing for size. */
19627 if ((code == EQ || code == NE)
19628 && (!optimize_insn_for_size_p ()
19629 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19631 rtx xor0, xor1;
19633 xor1 = hi[0];
19634 if (hi[1] != const0_rtx)
19635 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19636 NULL_RTX, 0, OPTAB_WIDEN);
19638 xor0 = lo[0];
19639 if (lo[1] != const0_rtx)
19640 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19641 NULL_RTX, 0, OPTAB_WIDEN);
19643 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19644 NULL_RTX, 0, OPTAB_WIDEN);
19646 ix86_expand_branch (code, tmp, const0_rtx, label);
19647 return;
19650 /* Otherwise, if we are doing less-than or greater-than-or-equal,
19651 op1 is a constant and the low word is zero, then we can just
19652 examine the high word. Similarly for a low word of -1 and
19653 less-than-or-equal or greater-than. */
19655 if (CONST_INT_P (hi[1]))
19656 switch (code)
19658 case LT: case LTU: case GE: case GEU:
19659 if (lo[1] == const0_rtx)
19661 ix86_expand_branch (code, hi[0], hi[1], label);
19662 return;
19664 break;
19665 case LE: case LEU: case GT: case GTU:
19666 if (lo[1] == constm1_rtx)
19668 ix86_expand_branch (code, hi[0], hi[1], label);
19669 return;
19671 break;
19672 default:
19673 break;
19676 /* Otherwise, we need two or three jumps. */
19678 label2 = gen_label_rtx ();
19680 code1 = code;
19681 code2 = swap_condition (code);
19682 code3 = unsigned_condition (code);
19684 switch (code)
19686 case LT: case GT: case LTU: case GTU:
19687 break;
19689 case LE: code1 = LT; code2 = GT; break;
19690 case GE: code1 = GT; code2 = LT; break;
19691 case LEU: code1 = LTU; code2 = GTU; break;
19692 case GEU: code1 = GTU; code2 = LTU; break;
19694 case EQ: code1 = UNKNOWN; code2 = NE; break;
19695 case NE: code2 = UNKNOWN; break;
19697 default:
19698 gcc_unreachable ();
19702 * a < b =>
19703 * if (hi(a) < hi(b)) goto true;
19704 * if (hi(a) > hi(b)) goto false;
19705 * if (lo(a) < lo(b)) goto true;
19706 * false:
19709 if (code1 != UNKNOWN)
19710 ix86_expand_branch (code1, hi[0], hi[1], label);
19711 if (code2 != UNKNOWN)
19712 ix86_expand_branch (code2, hi[0], hi[1], label2);
19714 ix86_expand_branch (code3, lo[0], lo[1], label);
19716 if (code2 != UNKNOWN)
19717 emit_label (label2);
19718 return;
19721 default:
19722 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19723 goto simple;
19727 /* Split branch based on floating point condition. */
19728 void
19729 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19730 rtx target1, rtx target2, rtx tmp, rtx pushed)
19732 rtx condition;
19733 rtx i;
19735 if (target2 != pc_rtx)
19737 rtx tmp = target2;
19738 code = reverse_condition_maybe_unordered (code);
19739 target2 = target1;
19740 target1 = tmp;
19743 condition = ix86_expand_fp_compare (code, op1, op2,
19744 tmp);
19746 /* Remove pushed operand from stack. */
19747 if (pushed)
19748 ix86_free_from_memory (GET_MODE (pushed));
19750 i = emit_jump_insn (gen_rtx_SET
19751 (VOIDmode, pc_rtx,
19752 gen_rtx_IF_THEN_ELSE (VOIDmode,
19753 condition, target1, target2)));
19754 if (split_branch_probability >= 0)
19755 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19758 void
19759 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19761 rtx ret;
19763 gcc_assert (GET_MODE (dest) == QImode);
19765 ret = ix86_expand_compare (code, op0, op1);
19766 PUT_MODE (ret, QImode);
19767 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19770 /* Expand a comparison setting or clearing the carry flag. Return true when
19771 successful and set POP for the operation. */
19772 static bool
19773 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19775 enum machine_mode mode =
19776 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19778 /* Do not handle double-mode compares that go through the special path. */
19779 if (mode == (TARGET_64BIT ? TImode : DImode))
19780 return false;
19782 if (SCALAR_FLOAT_MODE_P (mode))
19784 rtx compare_op, compare_seq;
19786 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19788 /* Shortcut: the following common codes never translate
19789 into carry flag compares. */
19790 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19791 || code == ORDERED || code == UNORDERED)
19792 return false;
19794 /* These comparisons require the zero flag; swap operands so they don't. */
19795 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19796 && !TARGET_IEEE_FP)
19798 rtx tmp = op0;
19799 op0 = op1;
19800 op1 = tmp;
19801 code = swap_condition (code);
19804 /* Try to expand the comparison and verify that we end up with
19805 carry flag based comparison. This fails to be true only when
19806 we decide to expand the comparison using arithmetic, which is not
19807 a common scenario. */
19808 start_sequence ();
19809 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19810 compare_seq = get_insns ();
19811 end_sequence ();
19813 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19814 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19815 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19816 else
19817 code = GET_CODE (compare_op);
19819 if (code != LTU && code != GEU)
19820 return false;
19822 emit_insn (compare_seq);
19823 *pop = compare_op;
19824 return true;
19827 if (!INTEGRAL_MODE_P (mode))
19828 return false;
19830 switch (code)
19832 case LTU:
19833 case GEU:
19834 break;
19836 /* Convert a==0 into (unsigned)a<1. */
19837 case EQ:
19838 case NE:
19839 if (op1 != const0_rtx)
19840 return false;
19841 op1 = const1_rtx;
19842 code = (code == EQ ? LTU : GEU);
19843 break;
19845 /* Convert a>b into b<a or a>=b+1. */
19846 case GTU:
19847 case LEU:
19848 if (CONST_INT_P (op1))
19850 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19851 /* Bail out on overflow. We can still swap operands, but that
19852 would force loading of the constant into a register. */
19853 if (op1 == const0_rtx
19854 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19855 return false;
19856 code = (code == GTU ? GEU : LTU);
19858 else
19860 rtx tmp = op1;
19861 op1 = op0;
19862 op0 = tmp;
19863 code = (code == GTU ? LTU : GEU);
19865 break;
19867 /* Convert a>=0 into (unsigned)a<0x80000000. */
19868 case LT:
19869 case GE:
19870 if (mode == DImode || op1 != const0_rtx)
19871 return false;
19872 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19873 code = (code == LT ? GEU : LTU);
19874 break;
19875 case LE:
19876 case GT:
19877 if (mode == DImode || op1 != constm1_rtx)
19878 return false;
19879 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19880 code = (code == LE ? GEU : LTU);
19881 break;
19883 default:
19884 return false;
19886 /* Swapping operands may cause a constant to appear as the first operand. */
19887 if (!nonimmediate_operand (op0, VOIDmode))
19889 if (!can_create_pseudo_p ())
19890 return false;
19891 op0 = force_reg (mode, op0);
19893 *pop = ix86_expand_compare (code, op0, op1);
19894 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19895 return true;
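/* A few of the rewrites performed above, spelled out (illustrative):
     a == 0             ->  (unsigned) a < 1            carry set iff a == 0
     a >  b, b constant ->  (unsigned) a >= b + 1
     a >= 0             ->  (unsigned) a < 0x80000000
   each of which leaves the answer purely in the carry flag, ready for the
   sbb/adc style sequences the callers build.  */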
19898 bool
19899 ix86_expand_int_movcc (rtx operands[])
19901 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19902 rtx compare_seq, compare_op;
19903 enum machine_mode mode = GET_MODE (operands[0]);
19904 bool sign_bit_compare_p = false;
19905 rtx op0 = XEXP (operands[1], 0);
19906 rtx op1 = XEXP (operands[1], 1);
19908 if (GET_MODE (op0) == TImode
19909 || (GET_MODE (op0) == DImode
19910 && !TARGET_64BIT))
19911 return false;
19913 start_sequence ();
19914 compare_op = ix86_expand_compare (code, op0, op1);
19915 compare_seq = get_insns ();
19916 end_sequence ();
19918 compare_code = GET_CODE (compare_op);
19920 if ((op1 == const0_rtx && (code == GE || code == LT))
19921 || (op1 == constm1_rtx && (code == GT || code == LE)))
19922 sign_bit_compare_p = true;
19924 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19925 HImode insns, we'd be swallowed in word prefix ops. */
19927 if ((mode != HImode || TARGET_FAST_PREFIX)
19928 && (mode != (TARGET_64BIT ? TImode : DImode))
19929 && CONST_INT_P (operands[2])
19930 && CONST_INT_P (operands[3]))
19932 rtx out = operands[0];
19933 HOST_WIDE_INT ct = INTVAL (operands[2]);
19934 HOST_WIDE_INT cf = INTVAL (operands[3]);
19935 HOST_WIDE_INT diff;
19937 diff = ct - cf;
19938 /* Sign bit compares are better done using shifts than by using
19939 sbb. */
19940 if (sign_bit_compare_p
19941 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19943 /* Detect overlap between destination and compare sources. */
19944 rtx tmp = out;
19946 if (!sign_bit_compare_p)
19948 rtx flags;
19949 bool fpcmp = false;
19951 compare_code = GET_CODE (compare_op);
19953 flags = XEXP (compare_op, 0);
19955 if (GET_MODE (flags) == CCFPmode
19956 || GET_MODE (flags) == CCFPUmode)
19958 fpcmp = true;
19959 compare_code
19960 = ix86_fp_compare_code_to_integer (compare_code);
19963 /* To simplify the rest of the code, restrict to the GEU case. */
19964 if (compare_code == LTU)
19966 HOST_WIDE_INT tmp = ct;
19967 ct = cf;
19968 cf = tmp;
19969 compare_code = reverse_condition (compare_code);
19970 code = reverse_condition (code);
19972 else
19974 if (fpcmp)
19975 PUT_CODE (compare_op,
19976 reverse_condition_maybe_unordered
19977 (GET_CODE (compare_op)));
19978 else
19979 PUT_CODE (compare_op,
19980 reverse_condition (GET_CODE (compare_op)));
19982 diff = ct - cf;
19984 if (reg_overlap_mentioned_p (out, op0)
19985 || reg_overlap_mentioned_p (out, op1))
19986 tmp = gen_reg_rtx (mode);
19988 if (mode == DImode)
19989 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19990 else
19991 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19992 flags, compare_op));
19994 else
19996 if (code == GT || code == GE)
19997 code = reverse_condition (code);
19998 else
20000 HOST_WIDE_INT tmp = ct;
20001 ct = cf;
20002 cf = tmp;
20003 diff = ct - cf;
20005 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
20008 if (diff == 1)
20011 * cmpl op0,op1
20012 * sbbl dest,dest
20013 * [addl dest, ct]
20015 * Size 5 - 8.
20017 if (ct)
20018 tmp = expand_simple_binop (mode, PLUS,
20019 tmp, GEN_INT (ct),
20020 copy_rtx (tmp), 1, OPTAB_DIRECT);
20022 else if (cf == -1)
20025 * cmpl op0,op1
20026 * sbbl dest,dest
20027 * orl $ct, dest
20029 * Size 8.
20031 tmp = expand_simple_binop (mode, IOR,
20032 tmp, GEN_INT (ct),
20033 copy_rtx (tmp), 1, OPTAB_DIRECT);
20035 else if (diff == -1 && ct)
20038 * cmpl op0,op1
20039 * sbbl dest,dest
20040 * notl dest
20041 * [addl dest, cf]
20043 * Size 8 - 11.
20045 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20046 if (cf)
20047 tmp = expand_simple_binop (mode, PLUS,
20048 copy_rtx (tmp), GEN_INT (cf),
20049 copy_rtx (tmp), 1, OPTAB_DIRECT);
20051 else
20054 * cmpl op0,op1
20055 * sbbl dest,dest
20056 * [notl dest]
20057 * andl cf - ct, dest
20058 * [addl dest, ct]
20060 * Size 8 - 11.
20063 if (cf == 0)
20065 cf = ct;
20066 ct = 0;
20067 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20070 tmp = expand_simple_binop (mode, AND,
20071 copy_rtx (tmp),
20072 gen_int_mode (cf - ct, mode),
20073 copy_rtx (tmp), 1, OPTAB_DIRECT);
20074 if (ct)
20075 tmp = expand_simple_binop (mode, PLUS,
20076 copy_rtx (tmp), GEN_INT (ct),
20077 copy_rtx (tmp), 1, OPTAB_DIRECT);
20080 if (!rtx_equal_p (tmp, out))
20081 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
20083 return true;
20086 if (diff < 0)
20088 enum machine_mode cmp_mode = GET_MODE (op0);
20090 HOST_WIDE_INT tmp;
20091 tmp = ct, ct = cf, cf = tmp;
20092 diff = -diff;
20094 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20096 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20098 /* We may be reversing an unordered compare to a normal compare, which
20099 is not valid in general (we may convert a non-trapping condition
20100 to a trapping one); however, on i386 we currently emit all
20101 comparisons unordered. */
20102 compare_code = reverse_condition_maybe_unordered (compare_code);
20103 code = reverse_condition_maybe_unordered (code);
20105 else
20107 compare_code = reverse_condition (compare_code);
20108 code = reverse_condition (code);
20112 compare_code = UNKNOWN;
20113 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20114 && CONST_INT_P (op1))
20116 if (op1 == const0_rtx
20117 && (code == LT || code == GE))
20118 compare_code = code;
20119 else if (op1 == constm1_rtx)
20121 if (code == LE)
20122 compare_code = LT;
20123 else if (code == GT)
20124 compare_code = GE;
20128 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20129 if (compare_code != UNKNOWN
20130 && GET_MODE (op0) == GET_MODE (out)
20131 && (cf == -1 || ct == -1))
20133 /* If lea code below could be used, only optimize
20134 if it results in a 2 insn sequence. */
20136 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20137 || diff == 3 || diff == 5 || diff == 9)
20138 || (compare_code == LT && ct == -1)
20139 || (compare_code == GE && cf == -1))
20142 * notl op1 (if necessary)
20143 * sarl $31, op1
20144 * orl cf, op1
20146 if (ct != -1)
20148 cf = ct;
20149 ct = -1;
20150 code = reverse_condition (code);
20153 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20155 out = expand_simple_binop (mode, IOR,
20156 out, GEN_INT (cf),
20157 out, 1, OPTAB_DIRECT);
20158 if (out != operands[0])
20159 emit_move_insn (operands[0], out);
20161 return true;
20166 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20167 || diff == 3 || diff == 5 || diff == 9)
20168 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20169 && (mode != DImode
20170 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20173 * xorl dest,dest
20174 * cmpl op1,op2
20175 * setcc dest
20176 * lea cf(dest*(ct-cf)),dest
20178 * Size 14.
20180 * This also catches the degenerate setcc-only case.
20183 rtx tmp;
20184 int nops;
20186 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20188 nops = 0;
20189 /* On x86_64 the lea instruction operates on Pmode, so we need
20190 to get the arithmetic done in the proper mode to match. */
20191 if (diff == 1)
20192 tmp = copy_rtx (out);
20193 else
20195 rtx out1;
20196 out1 = copy_rtx (out);
20197 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20198 nops++;
20199 if (diff & 1)
20201 tmp = gen_rtx_PLUS (mode, tmp, out1);
20202 nops++;
20205 if (cf != 0)
20207 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20208 nops++;
20210 if (!rtx_equal_p (tmp, out))
20212 if (nops == 1)
20213 out = force_operand (tmp, copy_rtx (out));
20214 else
20215 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20217 if (!rtx_equal_p (out, operands[0]))
20218 emit_move_insn (operands[0], copy_rtx (out));
20220 return true;
20224 * General case: Jumpful:
20225 * xorl dest,dest cmpl op1, op2
20226 * cmpl op1, op2 movl ct, dest
20227 * setcc dest jcc 1f
20228 * decl dest movl cf, dest
20229 * andl (cf-ct),dest 1:
20230 * addl ct,dest
20232 * Size 20. Size 14.
20234 * This is reasonably steep, but branch mispredict costs are
20235 * high on modern CPUs, so consider failing only if optimizing
20236 * for space.
20239 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20240 && BRANCH_COST (optimize_insn_for_speed_p (),
20241 false) >= 2)
20243 if (cf == 0)
20245 enum machine_mode cmp_mode = GET_MODE (op0);
20247 cf = ct;
20248 ct = 0;
20250 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20252 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20254 /* We may be reversing an unordered compare to a normal compare,
20255 which is not valid in general (we may convert a non-trapping
20256 condition to a trapping one); however, on i386 we currently
20257 emit all comparisons unordered. */
20258 code = reverse_condition_maybe_unordered (code);
20260 else
20262 code = reverse_condition (code);
20263 if (compare_code != UNKNOWN)
20264 compare_code = reverse_condition (compare_code);
20268 if (compare_code != UNKNOWN)
20270 /* notl op1 (if needed)
20271 sarl $31, op1
20272 andl (cf-ct), op1
20273 addl ct, op1
20275 For x < 0 (resp. x <= -1) there will be no notl,
20276 so if possible swap the constants to get rid of the
20277 complement.
20278 True/false will be -1/0 while code below (store flag
20279 followed by decrement) is 0/-1, so the constants need
20280 to be exchanged once more. */
20282 if (compare_code == GE || !cf)
20284 code = reverse_condition (code);
20285 compare_code = LT;
20287 else
20289 HOST_WIDE_INT tmp = cf;
20290 cf = ct;
20291 ct = tmp;
20294 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20296 else
20298 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20300 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20301 constm1_rtx,
20302 copy_rtx (out), 1, OPTAB_DIRECT);
20305 out = expand_simple_binop (mode, AND, copy_rtx (out),
20306 gen_int_mode (cf - ct, mode),
20307 copy_rtx (out), 1, OPTAB_DIRECT);
20308 if (ct)
20309 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20310 copy_rtx (out), 1, OPTAB_DIRECT);
20311 if (!rtx_equal_p (out, operands[0]))
20312 emit_move_insn (operands[0], copy_rtx (out));
20314 return true;
20318 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20320 /* Try a few more things with specific constants and a variable. */
20322 optab op;
20323 rtx var, orig_out, out, tmp;
20325 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20326 return false;
20328 /* If one of the two operands is an interesting constant, load a
20329 constant with the above and mask it in with a logical operation. */
20331 if (CONST_INT_P (operands[2]))
20333 var = operands[3];
20334 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20335 operands[3] = constm1_rtx, op = and_optab;
20336 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20337 operands[3] = const0_rtx, op = ior_optab;
20338 else
20339 return false;
20341 else if (CONST_INT_P (operands[3]))
20343 var = operands[2];
20344 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20345 operands[2] = constm1_rtx, op = and_optab;
20346 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20347 operands[2] = const0_rtx, op = ior_optab;
20348 else
20349 return false;
20351 else
20352 return false;
20354 orig_out = operands[0];
20355 tmp = gen_reg_rtx (mode);
20356 operands[0] = tmp;
20358 /* Recurse to get the constant loaded. */
20359 if (ix86_expand_int_movcc (operands) == 0)
20360 return false;
20362 /* Mask in the interesting variable. */
20363 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20364 OPTAB_WIDEN);
20365 if (!rtx_equal_p (out, orig_out))
20366 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20368 return true;
20372 * For comparison with above,
20374 * movl cf,dest
20375 * movl ct,tmp
20376 * cmpl op1,op2
20377 * cmovcc tmp,dest
20379 * Size 15.
20382 if (! nonimmediate_operand (operands[2], mode))
20383 operands[2] = force_reg (mode, operands[2]);
20384 if (! nonimmediate_operand (operands[3], mode))
20385 operands[3] = force_reg (mode, operands[3]);
20387 if (! register_operand (operands[2], VOIDmode)
20388 && (mode == QImode
20389 || ! register_operand (operands[3], VOIDmode)))
20390 operands[2] = force_reg (mode, operands[2]);
20392 if (mode == QImode
20393 && ! register_operand (operands[3], VOIDmode))
20394 operands[3] = force_reg (mode, operands[3]);
20396 emit_insn (compare_seq);
20397 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20398 gen_rtx_IF_THEN_ELSE (mode,
20399 compare_op, operands[2],
20400 operands[3])));
20401 return true;
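/* Arithmetic behind the branch-free "General case" recipe above (a sketch):
   with t = (condition ? 1 : 0) from setcc,
       t - 1               is 0 when true, -1 when false
       (t - 1) & (cf - ct) is 0 or cf - ct
       ... + ct            yields ct when true, cf when false
   so the whole select is one compare, a setcc and three ALU ops, no branch.  */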
20404 /* Swap, force into registers, or otherwise massage the two operands
20405 to an sse comparison with a mask result. Thus we differ a bit from
20406 ix86_prepare_fp_compare_args which expects to produce a flags result.
20408 The DEST operand exists to help determine whether to commute commutative
20409 operators. The POP0/POP1 operands are updated in place. The new
20410 comparison code is returned, or UNKNOWN if not implementable. */
20412 static enum rtx_code
20413 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20414 rtx *pop0, rtx *pop1)
20416 rtx tmp;
20418 switch (code)
20420 case LTGT:
20421 case UNEQ:
20422 /* AVX supports all the needed comparisons. */
20423 if (TARGET_AVX)
20424 break;
20425 /* We have no LTGT as an operator. We could implement it with
20426 NE & ORDERED, but this requires an extra temporary. It's
20427 not clear that it's worth it. */
20428 return UNKNOWN;
20430 case LT:
20431 case LE:
20432 case UNGT:
20433 case UNGE:
20434 /* These are supported directly. */
20435 break;
20437 case EQ:
20438 case NE:
20439 case UNORDERED:
20440 case ORDERED:
20441 /* AVX has 3 operand comparisons, no need to swap anything. */
20442 if (TARGET_AVX)
20443 break;
20444 /* For commutative operators, try to canonicalize the destination
20445 operand to be first in the comparison - this helps reload to
20446 avoid extra moves. */
20447 if (!dest || !rtx_equal_p (dest, *pop1))
20448 break;
20449 /* FALLTHRU */
20451 case GE:
20452 case GT:
20453 case UNLE:
20454 case UNLT:
20455 /* These are not supported directly before AVX, and furthermore
20456 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20457 comparison operands to transform into something that is
20458 supported. */
20459 tmp = *pop0;
20460 *pop0 = *pop1;
20461 *pop1 = tmp;
20462 code = swap_condition (code);
20463 break;
20465 default:
20466 gcc_unreachable ();
20469 return code;
20472 /* Detect conditional moves that exactly match min/max operational
20473 semantics. Note that this is IEEE safe, as long as we don't
20474 interchange the operands.
20476 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20477 and TRUE if the operation is successful and instructions are emitted. */
20479 static bool
20480 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20481 rtx cmp_op1, rtx if_true, rtx if_false)
20483 enum machine_mode mode;
20484 bool is_min;
20485 rtx tmp;
20487 if (code == LT)
20489 else if (code == UNGE)
20491 tmp = if_true;
20492 if_true = if_false;
20493 if_false = tmp;
20495 else
20496 return false;
20498 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20499 is_min = true;
20500 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20501 is_min = false;
20502 else
20503 return false;
20505 mode = GET_MODE (dest);
20507 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20508 but MODE may be a vector mode and thus not appropriate. */
20509 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20511 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20512 rtvec v;
20514 if_true = force_reg (mode, if_true);
20515 v = gen_rtvec (2, if_true, if_false);
20516 tmp = gen_rtx_UNSPEC (mode, v, u);
20518 else
20520 code = is_min ? SMIN : SMAX;
20521 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20524 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20525 return true;
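/* Informational note on the UNSPEC path above: minps/minpd return their
   second source operand when either input is a NaN, so MIN/MAX are only
   IEEE-safe if the operand order implied by the user's comparison is kept,
   which the generic SMIN/SMAX codes would not guarantee.  */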
20528 /* Expand an sse vector comparison. Return the register with the result. */
20530 static rtx
20531 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20532 rtx op_true, rtx op_false)
20534 enum machine_mode mode = GET_MODE (dest);
20535 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20536 rtx x;
20538 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20539 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20540 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20542 if (optimize
20543 || reg_overlap_mentioned_p (dest, op_true)
20544 || reg_overlap_mentioned_p (dest, op_false))
20545 dest = gen_reg_rtx (mode);
20547 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20548 if (cmp_mode != mode)
20550 x = force_reg (cmp_mode, x);
20551 convert_move (dest, x, false);
20553 else
20554 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20556 return dest;
20559 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20560 operations. This is used for both scalar and vector conditional moves. */
20562 static void
20563 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20565 enum machine_mode mode = GET_MODE (dest);
20566 rtx t2, t3, x;
20568 if (vector_all_ones_operand (op_true, mode)
20569 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20571 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20573 else if (op_false == CONST0_RTX (mode))
20575 op_true = force_reg (mode, op_true);
20576 x = gen_rtx_AND (mode, cmp, op_true);
20577 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20579 else if (op_true == CONST0_RTX (mode))
20581 op_false = force_reg (mode, op_false);
20582 x = gen_rtx_NOT (mode, cmp);
20583 x = gen_rtx_AND (mode, x, op_false);
20584 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20586 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20588 op_false = force_reg (mode, op_false);
20589 x = gen_rtx_IOR (mode, cmp, op_false);
20590 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20592 else if (TARGET_XOP)
20594 op_true = force_reg (mode, op_true);
20596 if (!nonimmediate_operand (op_false, mode))
20597 op_false = force_reg (mode, op_false);
20599 emit_insn (gen_rtx_SET (mode, dest,
20600 gen_rtx_IF_THEN_ELSE (mode, cmp,
20601 op_true,
20602 op_false)));
20604 else
20606 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20607 rtx d = dest;
20609 if (!nonimmediate_operand (op_true, mode))
20610 op_true = force_reg (mode, op_true);
20612 op_false = force_reg (mode, op_false);
20614 switch (mode)
20616 case V4SFmode:
20617 if (TARGET_SSE4_1)
20618 gen = gen_sse4_1_blendvps;
20619 break;
20620 case V2DFmode:
20621 if (TARGET_SSE4_1)
20622 gen = gen_sse4_1_blendvpd;
20623 break;
20624 case V16QImode:
20625 case V8HImode:
20626 case V4SImode:
20627 case V2DImode:
20628 if (TARGET_SSE4_1)
20630 gen = gen_sse4_1_pblendvb;
20631 if (mode != V16QImode)
20632 d = gen_reg_rtx (V16QImode);
20633 op_false = gen_lowpart (V16QImode, op_false);
20634 op_true = gen_lowpart (V16QImode, op_true);
20635 cmp = gen_lowpart (V16QImode, cmp);
20637 break;
20638 case V8SFmode:
20639 if (TARGET_AVX)
20640 gen = gen_avx_blendvps256;
20641 break;
20642 case V4DFmode:
20643 if (TARGET_AVX)
20644 gen = gen_avx_blendvpd256;
20645 break;
20646 case V32QImode:
20647 case V16HImode:
20648 case V8SImode:
20649 case V4DImode:
20650 if (TARGET_AVX2)
20652 gen = gen_avx2_pblendvb;
20653 if (mode != V32QImode)
20654 d = gen_reg_rtx (V32QImode);
20655 op_false = gen_lowpart (V32QImode, op_false);
20656 op_true = gen_lowpart (V32QImode, op_true);
20657 cmp = gen_lowpart (V32QImode, cmp);
20659 break;
20660 default:
20661 break;
20664 if (gen != NULL)
20666 emit_insn (gen (d, op_false, op_true, cmp));
20667 if (d != dest)
20668 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20670 else
20672 op_true = force_reg (mode, op_true);
20674 t2 = gen_reg_rtx (mode);
20675 if (optimize)
20676 t3 = gen_reg_rtx (mode);
20677 else
20678 t3 = dest;
20680 x = gen_rtx_AND (mode, op_true, cmp);
20681 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20683 x = gen_rtx_NOT (mode, cmp);
20684 x = gen_rtx_AND (mode, x, op_false);
20685 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20687 x = gen_rtx_IOR (mode, t3, t2);
20688 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
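/* The fallback above is the classic mask-blend identity, shown for
   reference:  dest = (cmp & op_true) | (~cmp & op_false).  It is exact
   because CMP is an all-ones / all-zeros per-element mask produced by the
   SSE compare; SSE4.1/AVX targets replace the three ops with one blendv.  */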
20693 /* Expand a floating-point conditional move. Return true if successful. */
20695 bool
20696 ix86_expand_fp_movcc (rtx operands[])
20698 enum machine_mode mode = GET_MODE (operands[0]);
20699 enum rtx_code code = GET_CODE (operands[1]);
20700 rtx tmp, compare_op;
20701 rtx op0 = XEXP (operands[1], 0);
20702 rtx op1 = XEXP (operands[1], 1);
20704 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20706 enum machine_mode cmode;
20708 /* Since we have no cmove for SSE registers, don't force bad register
20709 allocation just to gain access to it. Deny movcc when the
20710 comparison mode doesn't match the move mode. */
20711 cmode = GET_MODE (op0);
20712 if (cmode == VOIDmode)
20713 cmode = GET_MODE (op1);
20714 if (cmode != mode)
20715 return false;
20717 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20718 if (code == UNKNOWN)
20719 return false;
20721 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20722 operands[2], operands[3]))
20723 return true;
20725 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20726 operands[2], operands[3]);
20727 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20728 return true;
20731 if (GET_MODE (op0) == TImode
20732 || (GET_MODE (op0) == DImode
20733 && !TARGET_64BIT))
20734 return false;
20736 /* The floating point conditional move instructions don't directly
20737 support conditions resulting from a signed integer comparison. */
20739 compare_op = ix86_expand_compare (code, op0, op1);
20740 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20742 tmp = gen_reg_rtx (QImode);
20743 ix86_expand_setcc (tmp, code, op0, op1);
20745 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20748 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20749 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20750 operands[2], operands[3])));
20752 return true;
20755 /* Expand a floating-point vector conditional move; a vcond operation
20756 rather than a movcc operation. */
20758 bool
20759 ix86_expand_fp_vcond (rtx operands[])
20761 enum rtx_code code = GET_CODE (operands[3]);
20762 rtx cmp;
20764 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20765 &operands[4], &operands[5]);
20766 if (code == UNKNOWN)
20768 rtx temp;
20769 switch (GET_CODE (operands[3]))
20771 case LTGT:
20772 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20773 operands[5], operands[0], operands[0]);
20774 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20775 operands[5], operands[1], operands[2]);
20776 code = AND;
20777 break;
20778 case UNEQ:
20779 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20780 operands[5], operands[0], operands[0]);
20781 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20782 operands[5], operands[1], operands[2]);
20783 code = IOR;
20784 break;
20785 default:
20786 gcc_unreachable ();
20788 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20789 OPTAB_DIRECT);
20790 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20791 return true;
20794 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20795 operands[5], operands[1], operands[2]))
20796 return true;
20798 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20799 operands[1], operands[2]);
20800 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20801 return true;
20804 /* Expand a signed/unsigned integral vector conditional move. */
20806 bool
20807 ix86_expand_int_vcond (rtx operands[])
20809 enum machine_mode data_mode = GET_MODE (operands[0]);
20810 enum machine_mode mode = GET_MODE (operands[4]);
20811 enum rtx_code code = GET_CODE (operands[3]);
20812 bool negate = false;
20813 rtx x, cop0, cop1;
20815 cop0 = operands[4];
20816 cop1 = operands[5];
20818 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20819 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20820 if ((code == LT || code == GE)
20821 && data_mode == mode
20822 && cop1 == CONST0_RTX (mode)
20823 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20824 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20825 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20826 && (GET_MODE_SIZE (data_mode) == 16
20827 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20829 rtx negop = operands[2 - (code == LT)];
20830 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20831 if (negop == CONST1_RTX (data_mode))
20833 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20834 operands[0], 1, OPTAB_DIRECT);
20835 if (res != operands[0])
20836 emit_move_insn (operands[0], res);
20837 return true;
20839 else if (GET_MODE_INNER (data_mode) != DImode
20840 && vector_all_ones_operand (negop, data_mode))
20842 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20843 operands[0], 0, OPTAB_DIRECT);
20844 if (res != operands[0])
20845 emit_move_insn (operands[0], res);
20846 return true;
20850 if (!nonimmediate_operand (cop1, mode))
20851 cop1 = force_reg (mode, cop1);
20852 if (!general_operand (operands[1], data_mode))
20853 operands[1] = force_reg (data_mode, operands[1]);
20854 if (!general_operand (operands[2], data_mode))
20855 operands[2] = force_reg (data_mode, operands[2]);
20857 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20858 if (TARGET_XOP
20859 && (mode == V16QImode || mode == V8HImode
20860 || mode == V4SImode || mode == V2DImode))
20862 else
20864 /* Canonicalize the comparison to EQ, GT, GTU. */
20865 switch (code)
20867 case EQ:
20868 case GT:
20869 case GTU:
20870 break;
20872 case NE:
20873 case LE:
20874 case LEU:
20875 code = reverse_condition (code);
20876 negate = true;
20877 break;
20879 case GE:
20880 case GEU:
20881 code = reverse_condition (code);
20882 negate = true;
20883 /* FALLTHRU */
20885 case LT:
20886 case LTU:
20887 code = swap_condition (code);
20888 x = cop0, cop0 = cop1, cop1 = x;
20889 break;
20891 default:
20892 gcc_unreachable ();
20895 /* Only SSE4.1/SSE4.2 support V2DImode. */
20896 if (mode == V2DImode)
20898 switch (code)
20900 case EQ:
20901 /* SSE4.1 supports EQ. */
20902 if (!TARGET_SSE4_1)
20903 return false;
20904 break;
20906 case GT:
20907 case GTU:
20908 /* SSE4.2 supports GT/GTU. */
20909 if (!TARGET_SSE4_2)
20910 return false;
20911 break;
20913 default:
20914 gcc_unreachable ();
20918 /* Unsigned parallel compare is not supported by the hardware.
20919 Play some tricks to turn this into a signed comparison
20920 against 0. */
20921 if (code == GTU)
20923 cop0 = force_reg (mode, cop0);
20925 switch (mode)
20927 case V8SImode:
20928 case V4DImode:
20929 case V4SImode:
20930 case V2DImode:
20932 rtx t1, t2, mask;
20933 rtx (*gen_sub3) (rtx, rtx, rtx);
20935 switch (mode)
20937 case V8SImode: gen_sub3 = gen_subv8si3; break;
20938 case V4DImode: gen_sub3 = gen_subv4di3; break;
20939 case V4SImode: gen_sub3 = gen_subv4si3; break;
20940 case V2DImode: gen_sub3 = gen_subv2di3; break;
20941 default:
20942 gcc_unreachable ();
20944 /* Subtract (-(INT MAX) - 1) from both operands to make
20945 them signed. */
20946 mask = ix86_build_signbit_mask (mode, true, false);
20947 t1 = gen_reg_rtx (mode);
20948 emit_insn (gen_sub3 (t1, cop0, mask));
20950 t2 = gen_reg_rtx (mode);
20951 emit_insn (gen_sub3 (t2, cop1, mask));
20953 cop0 = t1;
20954 cop1 = t2;
20955 code = GT;
20957 break;
20959 case V32QImode:
20960 case V16HImode:
20961 case V16QImode:
20962 case V8HImode:
20963 /* Perform a parallel unsigned saturating subtraction. */
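/* E.g. for unsigned bytes, a GTU b holds exactly when the saturating
   difference a - b is nonzero, so we compare that difference EQ
   against zero and flip NEGATE to undo the inversion. */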
20964 x = gen_reg_rtx (mode);
20965 emit_insn (gen_rtx_SET (VOIDmode, x,
20966 gen_rtx_US_MINUS (mode, cop0, cop1)));
20968 cop0 = x;
20969 cop1 = CONST0_RTX (mode);
20970 code = EQ;
20971 negate = !negate;
20972 break;
20974 default:
20975 gcc_unreachable ();
20980 /* Allow the comparison to be done in one mode, but the movcc to
20981 happen in another mode. */
20982 if (data_mode == mode)
20984 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20985 operands[1+negate], operands[2-negate]);
20987 else
20989 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20990 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
20991 operands[1+negate], operands[2-negate]);
20992 x = gen_lowpart (data_mode, x);
20995 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20996 operands[2-negate]);
20997 return true;
21000 /* Expand a variable vector permutation. */
21002 void
21003 ix86_expand_vec_perm (rtx operands[])
21005 rtx target = operands[0];
21006 rtx op0 = operands[1];
21007 rtx op1 = operands[2];
21008 rtx mask = operands[3];
21009 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
21010 enum machine_mode mode = GET_MODE (op0);
21011 enum machine_mode maskmode = GET_MODE (mask);
21012 int w, e, i;
21013 bool one_operand_shuffle = rtx_equal_p (op0, op1);
21015 /* Number of elements in the vector. */
21016 w = GET_MODE_NUNITS (mode);
21017 e = GET_MODE_UNIT_SIZE (mode);
21018 gcc_assert (w <= 32);
21020 if (TARGET_AVX2)
21022 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
21024 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
21025 a constant shuffle operand. With a tiny bit of effort we can
21026 use VPERMD instead. A re-interpretation stall for V4DFmode is
21027 unfortunate but there's no avoiding it.
21028 Similarly for V16HImode we don't have instructions for variable
21029 shuffling, while for V32QImode we can use vpshufb; vpshufb;
21030 vpermq; vpor after preparing suitable masks. */
21032 if (mode == V16HImode)
21034 maskmode = mode = V32QImode;
21035 w = 32;
21036 e = 1;
21038 else
21040 maskmode = mode = V8SImode;
21041 w = 8;
21042 e = 4;
21044 t1 = gen_reg_rtx (maskmode);
21046 /* Replicate the low bits of the V4DImode mask into V8SImode:
21047 mask = { A B C D }
21048 t1 = { A A B B C C D D }. */
21049 for (i = 0; i < w / 2; ++i)
21050 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
21051 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21052 vt = force_reg (maskmode, vt);
21053 mask = gen_lowpart (maskmode, mask);
21054 if (maskmode == V8SImode)
21055 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
21056 else
21057 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
21059 /* Multiply the shuffle indices by two. */
21060 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
21061 OPTAB_DIRECT);
21063 /* Add one to the odd shuffle indices:
21064 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
21065 for (i = 0; i < w / 2; ++i)
21067 vec[i * 2] = const0_rtx;
21068 vec[i * 2 + 1] = const1_rtx;
21070 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21071 vt = validize_mem (force_const_mem (maskmode, vt));
21072 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
21073 OPTAB_DIRECT);
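/* For instance, a V4DImode mask { 3 0 2 1 } has now become the V8SImode
   mask { 6 7 0 1 4 5 2 3 }, selecting the same 64-bit elements as pairs
   of 32-bit halves. */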
21075 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
21076 operands[3] = mask = t1;
21077 target = gen_reg_rtx (mode);
21078 op0 = gen_lowpart (mode, op0);
21079 op1 = gen_lowpart (mode, op1);
21082 switch (mode)
21084 case V8SImode:
21085 /* The VPERMD and VPERMPS instructions already properly ignore
21086 the high bits of the shuffle elements. No need for us to
21087 perform an AND ourselves. */
21088 if (one_operand_shuffle)
21090 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21091 if (target != operands[0])
21092 emit_move_insn (operands[0],
21093 gen_lowpart (GET_MODE (operands[0]), target));
21095 else
21097 t1 = gen_reg_rtx (V8SImode);
21098 t2 = gen_reg_rtx (V8SImode);
21099 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21100 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21101 goto merge_two;
21103 return;
21105 case V8SFmode:
21106 mask = gen_lowpart (V8SFmode, mask);
21107 if (one_operand_shuffle)
21108 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21109 else
21111 t1 = gen_reg_rtx (V8SFmode);
21112 t2 = gen_reg_rtx (V8SFmode);
21113 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21114 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21115 goto merge_two;
21117 return;
21119 case V4SImode:
21120 /* By combining the two 128-bit input vectors into one 256-bit
21121 input vector, we can use VPERMD and VPERMPS for the full
21122 two-operand shuffle. */
21123 t1 = gen_reg_rtx (V8SImode);
21124 t2 = gen_reg_rtx (V8SImode);
21125 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21126 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21127 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21128 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21129 return;
21131 case V4SFmode:
21132 t1 = gen_reg_rtx (V8SFmode);
21133 t2 = gen_reg_rtx (V8SImode);
21134 mask = gen_lowpart (V4SImode, mask);
21135 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21136 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21137 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21138 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21139 return;
21141 case V32QImode:
21142 t1 = gen_reg_rtx (V32QImode);
21143 t2 = gen_reg_rtx (V32QImode);
21144 t3 = gen_reg_rtx (V32QImode);
21145 vt2 = GEN_INT (128);
21146 for (i = 0; i < 32; i++)
21147 vec[i] = vt2;
21148 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21149 vt = force_reg (V32QImode, vt);
21150 for (i = 0; i < 32; i++)
21151 vec[i] = i < 16 ? vt2 : const0_rtx;
21152 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21153 vt2 = force_reg (V32QImode, vt2);
21154 /* From mask create two adjusted masks, which contain the same
21155 bits as mask in the low 7 bits of each vector element.
21156 The first mask will have the most significant bit clear
21157 if it requests element from the same 128-bit lane
21158 and MSB set if it requests element from the other 128-bit lane.
21159 The second mask will have the opposite values of the MSB,
21160 and additionally will have its 128-bit lanes swapped.
21161 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21162 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21163 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21164 stands for other 12 bytes. */
21165 /* The bit that tells whether an element is from the same lane or the
21166 other lane is bit 4, so shift it up by 3 to the MSB position. */
21167 t5 = gen_reg_rtx (V4DImode);
21168 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21169 GEN_INT (3)));
21170 /* Clear MSB bits from the mask just in case it had them set. */
21171 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21172 /* After this t1 will have MSB set for elements from other lane. */
21173 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21174 /* Clear bits other than MSB. */
21175 emit_insn (gen_andv32qi3 (t1, t1, vt));
21176 /* Or in the lower bits from mask into t3. */
21177 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21178 /* And invert MSB bits in t1, so MSB is set for elements from the same
21179 lane. */
21180 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21181 /* Swap 128-bit lanes in t3. */
21182 t6 = gen_reg_rtx (V4DImode);
21183 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21184 const2_rtx, GEN_INT (3),
21185 const0_rtx, const1_rtx));
21186 /* And or in the lower bits from mask into t1. */
21187 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21188 if (one_operand_shuffle)
21190 /* Each of these shuffles will put 0s in places where
21191 element from the other 128-bit lane is needed, otherwise
21192 will shuffle in the requested value. */
21193 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21194 gen_lowpart (V32QImode, t6)));
21195 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21196 /* For t3 the 128-bit lanes are swapped again. */
21197 t7 = gen_reg_rtx (V4DImode);
21198 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21199 const2_rtx, GEN_INT (3),
21200 const0_rtx, const1_rtx));
21201 /* And oring both together leads to the result. */
21202 emit_insn (gen_iorv32qi3 (target, t1,
21203 gen_lowpart (V32QImode, t7)));
21204 if (target != operands[0])
21205 emit_move_insn (operands[0],
21206 gen_lowpart (GET_MODE (operands[0]), target));
21207 return;
21210 t4 = gen_reg_rtx (V32QImode);
21211 /* Similar to the above one_operand_shuffle code, just
21212 repeated twice for each operand. The merge_two:
21213 code below will merge the two results together. */
21214 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21215 gen_lowpart (V32QImode, t6)));
21216 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21217 gen_lowpart (V32QImode, t6)));
21218 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21219 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21220 t7 = gen_reg_rtx (V4DImode);
21221 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21222 const2_rtx, GEN_INT (3),
21223 const0_rtx, const1_rtx));
21224 t8 = gen_reg_rtx (V4DImode);
21225 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21226 const2_rtx, GEN_INT (3),
21227 const0_rtx, const1_rtx));
21228 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21229 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21230 t1 = t4;
21231 t2 = t3;
21232 goto merge_two;
21234 default:
21235 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21236 break;
21240 if (TARGET_XOP)
21242 /* The XOP VPPERM insn supports three inputs. By ignoring the
21243 one_operand_shuffle special case, we avoid creating another
21244 set of constant vectors in memory. */
21245 one_operand_shuffle = false;
21247 /* mask = mask & {2*w-1, ...} */
21248 vt = GEN_INT (2*w - 1);
21250 else
21252 /* mask = mask & {w-1, ...} */
21253 vt = GEN_INT (w - 1);
21256 for (i = 0; i < w; i++)
21257 vec[i] = vt;
21258 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21259 mask = expand_simple_binop (maskmode, AND, mask, vt,
21260 NULL_RTX, 0, OPTAB_DIRECT);
21262 /* For non-QImode operations, convert the word permutation control
21263 into a byte permutation control. */
21264 if (mode != V16QImode)
21266 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21267 GEN_INT (exact_log2 (e)),
21268 NULL_RTX, 0, OPTAB_DIRECT);
21270 /* Convert mask to vector of chars. */
21271 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21273 /* Replicate each of the input bytes into byte positions:
21274 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21275 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21276 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21277 for (i = 0; i < 16; ++i)
21278 vec[i] = GEN_INT (i/e * e);
21279 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21280 vt = validize_mem (force_const_mem (V16QImode, vt));
21281 if (TARGET_XOP)
21282 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21283 else
21284 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21286 /* Convert it into the byte positions by doing
21287 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
21288 for (i = 0; i < 16; ++i)
21289 vec[i] = GEN_INT (i % e);
21290 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21291 vt = validize_mem (force_const_mem (V16QImode, vt));
21292 emit_insn (gen_addv16qi3 (mask, mask, vt));
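/* E.g. for V4SImode a word index of 2 has now become the byte indices
   { 8 9 10 11 }, ready for the byte-wise shuffles below. */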
21295 /* The actual shuffle operations all operate on V16QImode. */
21296 op0 = gen_lowpart (V16QImode, op0);
21297 op1 = gen_lowpart (V16QImode, op1);
21299 if (TARGET_XOP)
21301 if (GET_MODE (target) != V16QImode)
21302 target = gen_reg_rtx (V16QImode);
21303 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21304 if (target != operands[0])
21305 emit_move_insn (operands[0],
21306 gen_lowpart (GET_MODE (operands[0]), target));
21308 else if (one_operand_shuffle)
21310 if (GET_MODE (target) != V16QImode)
21311 target = gen_reg_rtx (V16QImode);
21312 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21313 if (target != operands[0])
21314 emit_move_insn (operands[0],
21315 gen_lowpart (GET_MODE (operands[0]), target));
21317 else
21319 rtx xops[6];
21320 bool ok;
21322 /* Shuffle the two input vectors independently. */
21323 t1 = gen_reg_rtx (V16QImode);
21324 t2 = gen_reg_rtx (V16QImode);
21325 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21326 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21328 merge_two:
21329 /* Then merge them together. The key is whether any given control
21330 element contained a bit set that indicates the second word. */
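/* Masking the original control elements with W isolates that bit: where
   it is set, the result is taken from T2 (the shuffle of the second
   operand), otherwise from T1. */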
21331 mask = operands[3];
21332 vt = GEN_INT (w);
21333 if (maskmode == V2DImode && !TARGET_SSE4_1)
21335 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21336 more shuffle to convert the V2DI input mask into a V4SI
21337 input mask. At that point the masking that expand_int_vcond
21338 performs will work as desired. */
21339 rtx t3 = gen_reg_rtx (V4SImode);
21340 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21341 const0_rtx, const0_rtx,
21342 const2_rtx, const2_rtx));
21343 mask = t3;
21344 maskmode = V4SImode;
21345 e = w = 4;
21348 for (i = 0; i < w; i++)
21349 vec[i] = vt;
21350 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21351 vt = force_reg (maskmode, vt);
21352 mask = expand_simple_binop (maskmode, AND, mask, vt,
21353 NULL_RTX, 0, OPTAB_DIRECT);
21355 if (GET_MODE (target) != mode)
21356 target = gen_reg_rtx (mode);
21357 xops[0] = target;
21358 xops[1] = gen_lowpart (mode, t2);
21359 xops[2] = gen_lowpart (mode, t1);
21360 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21361 xops[4] = mask;
21362 xops[5] = vt;
21363 ok = ix86_expand_int_vcond (xops);
21364 gcc_assert (ok);
21365 if (target != operands[0])
21366 emit_move_insn (operands[0],
21367 gen_lowpart (GET_MODE (operands[0]), target));
21371 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
21372 true if we should do zero extension, else sign extension. HIGH_P is
21373 true if we want the N/2 high elements, else the low elements. */
21375 void
21376 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21378 enum machine_mode imode = GET_MODE (src);
21379 rtx tmp;
21381 if (TARGET_SSE4_1)
21383 rtx (*unpack)(rtx, rtx);
21384 rtx (*extract)(rtx, rtx) = NULL;
21385 enum machine_mode halfmode = BLKmode;
21387 switch (imode)
21389 case V32QImode:
21390 if (unsigned_p)
21391 unpack = gen_avx2_zero_extendv16qiv16hi2;
21392 else
21393 unpack = gen_avx2_sign_extendv16qiv16hi2;
21394 halfmode = V16QImode;
21395 extract
21396 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21397 break;
21398 case V16HImode:
21399 if (unsigned_p)
21400 unpack = gen_avx2_zero_extendv8hiv8si2;
21401 else
21402 unpack = gen_avx2_sign_extendv8hiv8si2;
21403 halfmode = V8HImode;
21404 extract
21405 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21406 break;
21407 case V8SImode:
21408 if (unsigned_p)
21409 unpack = gen_avx2_zero_extendv4siv4di2;
21410 else
21411 unpack = gen_avx2_sign_extendv4siv4di2;
21412 halfmode = V4SImode;
21413 extract
21414 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21415 break;
21416 case V16QImode:
21417 if (unsigned_p)
21418 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21419 else
21420 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21421 break;
21422 case V8HImode:
21423 if (unsigned_p)
21424 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21425 else
21426 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21427 break;
21428 case V4SImode:
21429 if (unsigned_p)
21430 unpack = gen_sse4_1_zero_extendv2siv2di2;
21431 else
21432 unpack = gen_sse4_1_sign_extendv2siv2di2;
21433 break;
21434 default:
21435 gcc_unreachable ();
21438 if (GET_MODE_SIZE (imode) == 32)
21440 tmp = gen_reg_rtx (halfmode);
21441 emit_insn (extract (tmp, src));
21443 else if (high_p)
21445 /* Shift higher 8 bytes to lower 8 bytes. */
21446 tmp = gen_reg_rtx (V1TImode);
21447 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21448 GEN_INT (64)));
21449 tmp = gen_lowpart (imode, tmp);
21451 else
21452 tmp = src;
21454 emit_insn (unpack (dest, tmp));
21456 else
21458 rtx (*unpack)(rtx, rtx, rtx);
21460 switch (imode)
21462 case V16QImode:
21463 if (high_p)
21464 unpack = gen_vec_interleave_highv16qi;
21465 else
21466 unpack = gen_vec_interleave_lowv16qi;
21467 break;
21468 case V8HImode:
21469 if (high_p)
21470 unpack = gen_vec_interleave_highv8hi;
21471 else
21472 unpack = gen_vec_interleave_lowv8hi;
21473 break;
21474 case V4SImode:
21475 if (high_p)
21476 unpack = gen_vec_interleave_highv4si;
21477 else
21478 unpack = gen_vec_interleave_lowv4si;
21479 break;
21480 default:
21481 gcc_unreachable ();
21484 if (unsigned_p)
21485 tmp = force_reg (imode, CONST0_RTX (imode));
21486 else
21487 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21488 src, pc_rtx, pc_rtx);
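/* The comparison 0 > SRC yields all-ones exactly for negative elements,
   so interleaving SRC with TMP widens each element with the proper
   sign (or, in the unsigned case above, zero) extension. */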
21490 rtx tmp2 = gen_reg_rtx (imode);
21491 emit_insn (unpack (tmp2, src, tmp));
21492 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21496 /* Expand conditional increment or decrement using adc/sbb instructions.
21497 The default case using setcc followed by the conditional move can be
21498 done by generic code. */
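/* E.g. a conditional increment such as x += (a < b) on unsigned operands
   can be expanded to roughly cmp a, b; adc x, 0, with no setcc or cmov. */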
21499 bool
21500 ix86_expand_int_addcc (rtx operands[])
21502 enum rtx_code code = GET_CODE (operands[1]);
21503 rtx flags;
21504 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21505 rtx compare_op;
21506 rtx val = const0_rtx;
21507 bool fpcmp = false;
21508 enum machine_mode mode;
21509 rtx op0 = XEXP (operands[1], 0);
21510 rtx op1 = XEXP (operands[1], 1);
21512 if (operands[3] != const1_rtx
21513 && operands[3] != constm1_rtx)
21514 return false;
21515 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21516 return false;
21517 code = GET_CODE (compare_op);
21519 flags = XEXP (compare_op, 0);
21521 if (GET_MODE (flags) == CCFPmode
21522 || GET_MODE (flags) == CCFPUmode)
21524 fpcmp = true;
21525 code = ix86_fp_compare_code_to_integer (code);
21528 if (code != LTU)
21530 val = constm1_rtx;
21531 if (fpcmp)
21532 PUT_CODE (compare_op,
21533 reverse_condition_maybe_unordered
21534 (GET_CODE (compare_op)));
21535 else
21536 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21539 mode = GET_MODE (operands[0]);
21541 /* Construct either adc or sbb insn. */
21542 if ((code == LTU) == (operands[3] == constm1_rtx))
21544 switch (mode)
21546 case QImode:
21547 insn = gen_subqi3_carry;
21548 break;
21549 case HImode:
21550 insn = gen_subhi3_carry;
21551 break;
21552 case SImode:
21553 insn = gen_subsi3_carry;
21554 break;
21555 case DImode:
21556 insn = gen_subdi3_carry;
21557 break;
21558 default:
21559 gcc_unreachable ();
21562 else
21564 switch (mode)
21566 case QImode:
21567 insn = gen_addqi3_carry;
21568 break;
21569 case HImode:
21570 insn = gen_addhi3_carry;
21571 break;
21572 case SImode:
21573 insn = gen_addsi3_carry;
21574 break;
21575 case DImode:
21576 insn = gen_adddi3_carry;
21577 break;
21578 default:
21579 gcc_unreachable ();
21582 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21584 return true;
21588 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21589 but works for floating point parameters and non-offsettable memories.
21590 For pushes, it returns just stack offsets; the values will be saved
21591 in the right order. At most four parts are generated. */
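/* E.g. on a 32-bit target a DFmode value is split into two SImode parts,
   an XFmode value into three and a TFmode value into four. */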
21593 static int
21594 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21596 int size;
21598 if (!TARGET_64BIT)
21599 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21600 else
21601 size = (GET_MODE_SIZE (mode) + 4) / 8;
21603 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21604 gcc_assert (size >= 2 && size <= 4);
21606 /* Optimize constant pool reference to immediates. This is used by fp
21607 moves, that force all constants to memory to allow combining. */
21608 if (MEM_P (operand) && MEM_READONLY_P (operand))
21610 rtx tmp = maybe_get_pool_constant (operand);
21611 if (tmp)
21612 operand = tmp;
21615 if (MEM_P (operand) && !offsettable_memref_p (operand))
21617 /* The only non-offsettable memories we handle are pushes. */
21618 int ok = push_operand (operand, VOIDmode);
21620 gcc_assert (ok);
21622 operand = copy_rtx (operand);
21623 PUT_MODE (operand, word_mode);
21624 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21625 return size;
21628 if (GET_CODE (operand) == CONST_VECTOR)
21630 enum machine_mode imode = int_mode_for_mode (mode);
21631 /* Caution: if we looked through a constant pool memory above,
21632 the operand may actually have a different mode now. That's
21633 ok, since we want to pun this all the way back to an integer. */
21634 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21635 gcc_assert (operand != NULL);
21636 mode = imode;
21639 if (!TARGET_64BIT)
21641 if (mode == DImode)
21642 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21643 else
21645 int i;
21647 if (REG_P (operand))
21649 gcc_assert (reload_completed);
21650 for (i = 0; i < size; i++)
21651 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21653 else if (offsettable_memref_p (operand))
21655 operand = adjust_address (operand, SImode, 0);
21656 parts[0] = operand;
21657 for (i = 1; i < size; i++)
21658 parts[i] = adjust_address (operand, SImode, 4 * i);
21660 else if (GET_CODE (operand) == CONST_DOUBLE)
21662 REAL_VALUE_TYPE r;
21663 long l[4];
21665 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21666 switch (mode)
21668 case TFmode:
21669 real_to_target (l, &r, mode);
21670 parts[3] = gen_int_mode (l[3], SImode);
21671 parts[2] = gen_int_mode (l[2], SImode);
21672 break;
21673 case XFmode:
21674 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21675 long double may not be 80-bit. */
21676 real_to_target (l, &r, mode);
21677 parts[2] = gen_int_mode (l[2], SImode);
21678 break;
21679 case DFmode:
21680 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21681 break;
21682 default:
21683 gcc_unreachable ();
21685 parts[1] = gen_int_mode (l[1], SImode);
21686 parts[0] = gen_int_mode (l[0], SImode);
21688 else
21689 gcc_unreachable ();
21692 else
21694 if (mode == TImode)
21695 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21696 if (mode == XFmode || mode == TFmode)
21698 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21699 if (REG_P (operand))
21701 gcc_assert (reload_completed);
21702 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21703 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21705 else if (offsettable_memref_p (operand))
21707 operand = adjust_address (operand, DImode, 0);
21708 parts[0] = operand;
21709 parts[1] = adjust_address (operand, upper_mode, 8);
21711 else if (GET_CODE (operand) == CONST_DOUBLE)
21713 REAL_VALUE_TYPE r;
21714 long l[4];
21716 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21717 real_to_target (l, &r, mode);
21719 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21720 if (HOST_BITS_PER_WIDE_INT >= 64)
21721 parts[0]
21722 = gen_int_mode
21723 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21724 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21725 DImode);
21726 else
21727 parts[0] = immed_double_const (l[0], l[1], DImode);
21729 if (upper_mode == SImode)
21730 parts[1] = gen_int_mode (l[2], SImode);
21731 else if (HOST_BITS_PER_WIDE_INT >= 64)
21732 parts[1]
21733 = gen_int_mode
21734 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21735 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21736 DImode);
21737 else
21738 parts[1] = immed_double_const (l[2], l[3], DImode);
21740 else
21741 gcc_unreachable ();
21745 return size;
21748 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21749 All required insns are emitted directly. Operands 2-4 contain the
21750 input values in the correct order; operands 5-7 contain the output
21751 values. */
21753 void
21754 ix86_split_long_move (rtx operands[])
21756 rtx part[2][4];
21757 int nparts, i, j;
21758 int push = 0;
21759 int collisions = 0;
21760 enum machine_mode mode = GET_MODE (operands[0]);
21761 bool collisionparts[4];
21763 /* The DFmode expanders may ask us to move a double.
21764 For a 64-bit target this is a single move. By hiding that fact
21765 here we simplify the i386.md splitters. */
21766 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21768 /* Optimize constant pool reference to immediates. This is used by
21769 fp moves, that force all constants to memory to allow combining. */
21771 if (MEM_P (operands[1])
21772 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21773 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21774 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21775 if (push_operand (operands[0], VOIDmode))
21777 operands[0] = copy_rtx (operands[0]);
21778 PUT_MODE (operands[0], word_mode);
21780 else
21781 operands[0] = gen_lowpart (DImode, operands[0]);
21782 operands[1] = gen_lowpart (DImode, operands[1]);
21783 emit_move_insn (operands[0], operands[1]);
21784 return;
21787 /* The only non-offsettable memory we handle is push. */
21788 if (push_operand (operands[0], VOIDmode))
21789 push = 1;
21790 else
21791 gcc_assert (!MEM_P (operands[0])
21792 || offsettable_memref_p (operands[0]));
21794 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21795 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21797 /* When emitting push, take care for source operands on the stack. */
21798 if (push && MEM_P (operands[1])
21799 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21801 rtx src_base = XEXP (part[1][nparts - 1], 0);
21803 /* Compensate for the stack decrement by 4. */
21804 if (!TARGET_64BIT && nparts == 3
21805 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21806 src_base = plus_constant (Pmode, src_base, 4);
21808 /* src_base refers to the stack pointer and is
21809 automatically decreased by emitted push. */
21810 for (i = 0; i < nparts; i++)
21811 part[1][i] = change_address (part[1][i],
21812 GET_MODE (part[1][i]), src_base);
21815 /* We need to do the copy in the right order in case an address register
21816 of the source overlaps the destination. */
21817 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21819 rtx tmp;
21821 for (i = 0; i < nparts; i++)
21823 collisionparts[i]
21824 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21825 if (collisionparts[i])
21826 collisions++;
21829 /* Collision in the middle part can be handled by reordering. */
21830 if (collisions == 1 && nparts == 3 && collisionparts [1])
21832 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21833 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21835 else if (collisions == 1
21836 && nparts == 4
21837 && (collisionparts [1] || collisionparts [2]))
21839 if (collisionparts [1])
21841 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21842 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21844 else
21846 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21847 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21851 /* If there are more collisions, we can't handle it by reordering.
21852 Do an lea to the last part and use only one colliding move. */
21853 else if (collisions > 1)
21855 rtx base;
21857 collisions = 1;
21859 base = part[0][nparts - 1];
21861 /* Handle the case when the last part isn't valid for lea.
21862 Happens in 64-bit mode storing the 12-byte XFmode. */
21863 if (GET_MODE (base) != Pmode)
21864 base = gen_rtx_REG (Pmode, REGNO (base));
21866 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21867 part[1][0] = replace_equiv_address (part[1][0], base);
21868 for (i = 1; i < nparts; i++)
21870 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21871 part[1][i] = replace_equiv_address (part[1][i], tmp);
21876 if (push)
21878 if (!TARGET_64BIT)
21880 if (nparts == 3)
21882 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21883 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21884 stack_pointer_rtx, GEN_INT (-4)));
21885 emit_move_insn (part[0][2], part[1][2]);
21887 else if (nparts == 4)
21889 emit_move_insn (part[0][3], part[1][3]);
21890 emit_move_insn (part[0][2], part[1][2]);
21893 else
21895 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21896 a register, it is OK - we will just use the larger counterpart. We also
21897 retype memory - this comes from an attempt to avoid a REX prefix on
21898 moving the second half of a TFmode value. */
21899 if (GET_MODE (part[1][1]) == SImode)
21901 switch (GET_CODE (part[1][1]))
21903 case MEM:
21904 part[1][1] = adjust_address (part[1][1], DImode, 0);
21905 break;
21907 case REG:
21908 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21909 break;
21911 default:
21912 gcc_unreachable ();
21915 if (GET_MODE (part[1][0]) == SImode)
21916 part[1][0] = part[1][1];
21919 emit_move_insn (part[0][1], part[1][1]);
21920 emit_move_insn (part[0][0], part[1][0]);
21921 return;
21924 /* Choose correct order to not overwrite the source before it is copied. */
21925 if ((REG_P (part[0][0])
21926 && REG_P (part[1][1])
21927 && (REGNO (part[0][0]) == REGNO (part[1][1])
21928 || (nparts == 3
21929 && REGNO (part[0][0]) == REGNO (part[1][2]))
21930 || (nparts == 4
21931 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21932 || (collisions > 0
21933 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21935 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21937 operands[2 + i] = part[0][j];
21938 operands[6 + i] = part[1][j];
21941 else
21943 for (i = 0; i < nparts; i++)
21945 operands[2 + i] = part[0][i];
21946 operands[6 + i] = part[1][i];
21950 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21951 if (optimize_insn_for_size_p ())
21953 for (j = 0; j < nparts - 1; j++)
21954 if (CONST_INT_P (operands[6 + j])
21955 && operands[6 + j] != const0_rtx
21956 && REG_P (operands[2 + j]))
21957 for (i = j; i < nparts - 1; i++)
21958 if (CONST_INT_P (operands[7 + i])
21959 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21960 operands[7 + i] = operands[2 + j];
21963 for (i = 0; i < nparts; i++)
21964 emit_move_insn (operands[2 + i], operands[6 + i]);
21966 return;
21969 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21970 left shift by a constant, either using a single shift or
21971 a sequence of add instructions. */
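/* E.g. a left shift by 1 is always emitted as a single add of the operand
   to itself; larger counts use repeated adds only while that stays no more
   expensive than a shift. */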
21973 static void
21974 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21976 rtx (*insn)(rtx, rtx, rtx);
21978 if (count == 1
21979 || (count * ix86_cost->add <= ix86_cost->shift_const
21980 && !optimize_insn_for_size_p ()))
21982 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21983 while (count-- > 0)
21984 emit_insn (insn (operand, operand, operand));
21986 else
21988 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21989 emit_insn (insn (operand, operand, GEN_INT (count)));
21993 void
21994 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21996 rtx (*gen_ashl3)(rtx, rtx, rtx);
21997 rtx (*gen_shld)(rtx, rtx, rtx);
21998 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22000 rtx low[2], high[2];
22001 int count;
22003 if (CONST_INT_P (operands[2]))
22005 split_double_mode (mode, operands, 2, low, high);
22006 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22008 if (count >= half_width)
22010 emit_move_insn (high[0], low[1]);
22011 emit_move_insn (low[0], const0_rtx);
22013 if (count > half_width)
22014 ix86_expand_ashl_const (high[0], count - half_width, mode);
22016 else
22018 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22020 if (!rtx_equal_p (operands[0], operands[1]))
22021 emit_move_insn (operands[0], operands[1]);
22023 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
22024 ix86_expand_ashl_const (low[0], count, mode);
22026 return;
22029 split_double_mode (mode, operands, 1, low, high);
22031 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22033 if (operands[1] == const1_rtx)
22035 /* Assuming we've chosen QImode-capable registers, 1 << N
22036 can be done with two 32/64-bit shifts, no branches, no cmoves. */
22037 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
22039 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
22041 ix86_expand_clear (low[0]);
22042 ix86_expand_clear (high[0]);
22043 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
22045 d = gen_lowpart (QImode, low[0]);
22046 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22047 s = gen_rtx_EQ (QImode, flags, const0_rtx);
22048 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22050 d = gen_lowpart (QImode, high[0]);
22051 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22052 s = gen_rtx_NE (QImode, flags, const0_rtx);
22053 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22056 /* Otherwise, we can get the same results by manually performing
22057 a bit extract operation on bit 5/6, and then performing the two
22058 shifts. The two methods of getting 0/1 into low/high are exactly
22059 the same size. Avoiding the shift in the bit extract case helps
22060 pentium4 a bit; no one else seems to care much either way. */
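/* E.g. for a DImode shift on a 32-bit target BITS is 5: bit 5 of the count
   says whether the single set bit belongs in the high word, so it is
   extracted into high[0] and its complement placed into low[0] before the
   two final shifts (which only use the low five bits of the count). */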
22061 else
22063 enum machine_mode half_mode;
22064 rtx (*gen_lshr3)(rtx, rtx, rtx);
22065 rtx (*gen_and3)(rtx, rtx, rtx);
22066 rtx (*gen_xor3)(rtx, rtx, rtx);
22067 HOST_WIDE_INT bits;
22068 rtx x;
22070 if (mode == DImode)
22072 half_mode = SImode;
22073 gen_lshr3 = gen_lshrsi3;
22074 gen_and3 = gen_andsi3;
22075 gen_xor3 = gen_xorsi3;
22076 bits = 5;
22078 else
22080 half_mode = DImode;
22081 gen_lshr3 = gen_lshrdi3;
22082 gen_and3 = gen_anddi3;
22083 gen_xor3 = gen_xordi3;
22084 bits = 6;
22087 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
22088 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
22089 else
22090 x = gen_lowpart (half_mode, operands[2]);
22091 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
22093 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
22094 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
22095 emit_move_insn (low[0], high[0]);
22096 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
22099 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22100 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
22101 return;
22104 if (operands[1] == constm1_rtx)
22106 /* For -1 << N, we can avoid the shld instruction, because we
22107 know that we're shifting 0...31/63 ones into a -1. */
22108 emit_move_insn (low[0], constm1_rtx);
22109 if (optimize_insn_for_size_p ())
22110 emit_move_insn (high[0], low[0]);
22111 else
22112 emit_move_insn (high[0], constm1_rtx);
22114 else
22116 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22118 if (!rtx_equal_p (operands[0], operands[1]))
22119 emit_move_insn (operands[0], operands[1]);
22121 split_double_mode (mode, operands, 1, low, high);
22122 emit_insn (gen_shld (high[0], low[0], operands[2]));
22125 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22127 if (TARGET_CMOVE && scratch)
22129 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22130 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22132 ix86_expand_clear (scratch);
22133 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22135 else
22137 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22138 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22140 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22144 void
22145 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22147 rtx (*gen_ashr3)(rtx, rtx, rtx)
22148 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22149 rtx (*gen_shrd)(rtx, rtx, rtx);
22150 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22152 rtx low[2], high[2];
22153 int count;
22155 if (CONST_INT_P (operands[2]))
22157 split_double_mode (mode, operands, 2, low, high);
22158 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22160 if (count == GET_MODE_BITSIZE (mode) - 1)
22162 emit_move_insn (high[0], high[1]);
22163 emit_insn (gen_ashr3 (high[0], high[0],
22164 GEN_INT (half_width - 1)));
22165 emit_move_insn (low[0], high[0]);
22168 else if (count >= half_width)
22170 emit_move_insn (low[0], high[1]);
22171 emit_move_insn (high[0], low[0]);
22172 emit_insn (gen_ashr3 (high[0], high[0],
22173 GEN_INT (half_width - 1)));
22175 if (count > half_width)
22176 emit_insn (gen_ashr3 (low[0], low[0],
22177 GEN_INT (count - half_width)));
22179 else
22181 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22183 if (!rtx_equal_p (operands[0], operands[1]))
22184 emit_move_insn (operands[0], operands[1]);
22186 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22187 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22190 else
22192 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22194 if (!rtx_equal_p (operands[0], operands[1]))
22195 emit_move_insn (operands[0], operands[1]);
22197 split_double_mode (mode, operands, 1, low, high);
22199 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22200 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22202 if (TARGET_CMOVE && scratch)
22204 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22205 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22207 emit_move_insn (scratch, high[0]);
22208 emit_insn (gen_ashr3 (scratch, scratch,
22209 GEN_INT (half_width - 1)));
22210 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22211 scratch));
22213 else
22215 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22216 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22218 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22223 void
22224 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22226 rtx (*gen_lshr3)(rtx, rtx, rtx)
22227 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22228 rtx (*gen_shrd)(rtx, rtx, rtx);
22229 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22231 rtx low[2], high[2];
22232 int count;
22234 if (CONST_INT_P (operands[2]))
22236 split_double_mode (mode, operands, 2, low, high);
22237 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22239 if (count >= half_width)
22241 emit_move_insn (low[0], high[1]);
22242 ix86_expand_clear (high[0]);
22244 if (count > half_width)
22245 emit_insn (gen_lshr3 (low[0], low[0],
22246 GEN_INT (count - half_width)));
22248 else
22250 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22252 if (!rtx_equal_p (operands[0], operands[1]))
22253 emit_move_insn (operands[0], operands[1]);
22255 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22256 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22259 else
22261 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22263 if (!rtx_equal_p (operands[0], operands[1]))
22264 emit_move_insn (operands[0], operands[1]);
22266 split_double_mode (mode, operands, 1, low, high);
22268 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22269 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22271 if (TARGET_CMOVE && scratch)
22273 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22274 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22276 ix86_expand_clear (scratch);
22277 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22278 scratch));
22280 else
22282 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22283 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22285 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22290 /* Predict just emitted jump instruction to be taken with probability PROB. */
22291 static void
22292 predict_jump (int prob)
22294 rtx insn = get_last_insn ();
22295 gcc_assert (JUMP_P (insn));
22296 add_int_reg_note (insn, REG_BR_PROB, prob);
22299 /* Helper function for the string operations below. Test VARIABLE whether
22300 it is aligned to VALUE bytes. If true, jump to the label. */
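/* The branch to the returned label is taken when the tested bits of
   VARIABLE are clear, so the code the caller emits between here and the
   label is skipped in that case. */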
22301 static rtx
22302 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22304 rtx label = gen_label_rtx ();
22305 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22306 if (GET_MODE (variable) == DImode)
22307 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22308 else
22309 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22310 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22311 1, label);
22312 if (epilogue)
22313 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22314 else
22315 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22316 return label;
22319 /* Decrease COUNTREG by VALUE. */
22320 static void
22321 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22323 rtx (*gen_add)(rtx, rtx, rtx)
22324 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22326 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22329 /* Zero extend possibly SImode EXP to Pmode register. */
22331 ix86_zero_extend_to_Pmode (rtx exp)
22333 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22336 /* Divide COUNTREG by SCALE. */
22337 static rtx
22338 scale_counter (rtx countreg, int scale)
22340 rtx sc;
22342 if (scale == 1)
22343 return countreg;
22344 if (CONST_INT_P (countreg))
22345 return GEN_INT (INTVAL (countreg) / scale);
22346 gcc_assert (REG_P (countreg));
22348 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22349 GEN_INT (exact_log2 (scale)),
22350 NULL, 1, OPTAB_DIRECT);
22351 return sc;
22354 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22355 DImode for constant loop counts. */
22357 static enum machine_mode
22358 counter_mode (rtx count_exp)
22360 if (GET_MODE (count_exp) != VOIDmode)
22361 return GET_MODE (count_exp);
22362 if (!CONST_INT_P (count_exp))
22363 return Pmode;
22364 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22365 return DImode;
22366 return SImode;
22369 /* Copy the address to a Pmode register. This is used for x32 to
22370 truncate DImode TLS address to a SImode register. */
22372 static rtx
22373 ix86_copy_addr_to_reg (rtx addr)
22375 if (GET_MODE (addr) == Pmode)
22376 return copy_addr_to_reg (addr);
22377 else
22379 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22380 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22384 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
22385 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size
22386 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
22387 loop to set memory with VALUE (assumed to be in MODE).
22389 The size is rounded down to a whole number of chunks moved at once.
22390 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
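/* Roughly, for the move case the emitted code has the shape
     size = count & ~(piece_size - 1);  iter = 0;
   top:
     copy piece_size bytes from srcptr + iter to destptr + iter
       (as UNROLL chunks of MODE);
     iter += piece_size;  if (iter < size) goto top;
     destptr += iter;  srcptr += iter;
   where piece_size = UNROLL * GET_MODE_SIZE (MODE).  */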
22393 static void
22394 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22395 rtx destptr, rtx srcptr, rtx value,
22396 rtx count, enum machine_mode mode, int unroll,
22397 int expected_size, bool issetmem)
22399 rtx out_label, top_label, iter, tmp;
22400 enum machine_mode iter_mode = counter_mode (count);
22401 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22402 rtx piece_size = GEN_INT (piece_size_n);
22403 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22404 rtx size;
22405 int i;
22407 top_label = gen_label_rtx ();
22408 out_label = gen_label_rtx ();
22409 iter = gen_reg_rtx (iter_mode);
22411 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22412 NULL, 1, OPTAB_DIRECT);
22413 /* Those two should combine. */
22414 if (piece_size == const1_rtx)
22416 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22417 true, out_label);
22418 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22420 emit_move_insn (iter, const0_rtx);
22422 emit_label (top_label);
22424 tmp = convert_modes (Pmode, iter_mode, iter, true);
22426 /* This assert could be relaxed - in that case we'd need to compute
22427 the smallest power of two containing PIECE_SIZE_N and pass it to
22428 offset_address. */
22429 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22430 destmem = offset_address (destmem, tmp, piece_size_n);
22431 destmem = adjust_address (destmem, mode, 0);
22433 if (!issetmem)
22435 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22436 srcmem = adjust_address (srcmem, mode, 0);
22438 /* When unrolling for chips that reorder memory reads and writes,
22439 we can save registers by using a single temporary.
22440 Also, using 4 temporaries is overkill in 32-bit mode. */
22441 if (!TARGET_64BIT && 0)
22443 for (i = 0; i < unroll; i++)
22445 if (i)
22447 destmem =
22448 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22449 srcmem =
22450 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22452 emit_move_insn (destmem, srcmem);
22455 else
22457 rtx tmpreg[4];
22458 gcc_assert (unroll <= 4);
22459 for (i = 0; i < unroll; i++)
22461 tmpreg[i] = gen_reg_rtx (mode);
22462 if (i)
22464 srcmem =
22465 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22467 emit_move_insn (tmpreg[i], srcmem);
22469 for (i = 0; i < unroll; i++)
22471 if (i)
22473 destmem =
22474 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22476 emit_move_insn (destmem, tmpreg[i]);
22480 else
22481 for (i = 0; i < unroll; i++)
22483 if (i)
22484 destmem =
22485 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22486 emit_move_insn (destmem, value);
22489 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22490 true, OPTAB_LIB_WIDEN);
22491 if (tmp != iter)
22492 emit_move_insn (iter, tmp);
22494 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22495 true, top_label);
22496 if (expected_size != -1)
22498 expected_size /= GET_MODE_SIZE (mode) * unroll;
22499 if (expected_size == 0)
22500 predict_jump (0);
22501 else if (expected_size > REG_BR_PROB_BASE)
22502 predict_jump (REG_BR_PROB_BASE - 1);
22503 else
22504 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22506 else
22507 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22508 iter = ix86_zero_extend_to_Pmode (iter);
22509 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22510 true, OPTAB_LIB_WIDEN);
22511 if (tmp != destptr)
22512 emit_move_insn (destptr, tmp);
22513 if (!issetmem)
22515 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22516 true, OPTAB_LIB_WIDEN);
22517 if (tmp != srcptr)
22518 emit_move_insn (srcptr, tmp);
22520 emit_label (out_label);
22523 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
22524 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22525 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22526 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
22527 ORIG_VALUE is the original value passed to memset to fill the memory with.
22528 Other arguments have the same meaning as for the previous function. */
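/* E.g. a memset of a constant byte count that is a multiple of 4 with the
   value zero is upgraded from QImode to SImode below, i.e. rep stosd is
   used instead of rep stosb. */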
22530 static void
22531 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22532 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22533 rtx count,
22534 enum machine_mode mode, bool issetmem)
22536 rtx destexp;
22537 rtx srcexp;
22538 rtx countreg;
22539 HOST_WIDE_INT rounded_count;
22541 /* If possible, it is shorter to use rep movs.
22542 TODO: Maybe it is better to move this logic to decide_alg. */
22543 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22544 && (!issetmem || orig_value == const0_rtx))
22545 mode = SImode;
22547 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22548 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22550 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22551 GET_MODE_SIZE (mode)));
22552 if (mode != QImode)
22554 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22555 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22556 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22558 else
22559 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22560 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22562 rounded_count = (INTVAL (count)
22563 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22564 destmem = shallow_copy_rtx (destmem);
22565 set_mem_size (destmem, rounded_count);
22567 else if (MEM_SIZE_KNOWN_P (destmem))
22568 clear_mem_size (destmem);
22570 if (issetmem)
22572 value = force_reg (mode, gen_lowpart (mode, value));
22573 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22575 else
22577 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22578 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22579 if (mode != QImode)
22581 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22582 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22583 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22585 else
22586 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22587 if (CONST_INT_P (count))
22589 rounded_count = (INTVAL (count)
22590 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22591 srcmem = shallow_copy_rtx (srcmem);
22592 set_mem_size (srcmem, rounded_count);
22594 else
22596 if (MEM_SIZE_KNOWN_P (srcmem))
22597 clear_mem_size (srcmem);
22599 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22600 destexp, srcexp));
22604 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22605 DESTMEM.
22606 SRCMEM is passed by pointer and is updated on return.
22607 The return value is the updated DESTMEM. */
22608 static rtx
22609 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22610 HOST_WIDE_INT size_to_move)
22612 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22613 enum insn_code code;
22614 enum machine_mode move_mode;
22615 int piece_size, i;
22617 /* Find the widest mode in which we could perform moves.
22618 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
22619 it until a move of that size is supported. */
22620 piece_size = 1 << floor_log2 (size_to_move);
22621 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22622 code = optab_handler (mov_optab, move_mode);
22623 while (code == CODE_FOR_nothing && piece_size > 1)
22625 piece_size >>= 1;
22626 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22627 code = optab_handler (mov_optab, move_mode);
22630 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22631 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22632 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22634 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22635 move_mode = mode_for_vector (word_mode, nunits);
22636 code = optab_handler (mov_optab, move_mode);
22637 if (code == CODE_FOR_nothing)
22639 move_mode = word_mode;
22640 piece_size = GET_MODE_SIZE (move_mode);
22641 code = optab_handler (mov_optab, move_mode);
22644 gcc_assert (code != CODE_FOR_nothing);
22646 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22647 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22649 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22650 gcc_assert (size_to_move % piece_size == 0);
22651 adjust = GEN_INT (piece_size);
22652 for (i = 0; i < size_to_move; i += piece_size)
22654 /* We move from memory to memory, so we'll need to do it via
22655 a temporary register. */
22656 tempreg = gen_reg_rtx (move_mode);
22657 emit_insn (GEN_FCN (code) (tempreg, src));
22658 emit_insn (GEN_FCN (code) (dst, tempreg));
22660 emit_move_insn (destptr,
22661 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22662 emit_move_insn (srcptr,
22663 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22665 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22666 piece_size);
22667 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22668 piece_size);
22671 /* Update DST and SRC rtx. */
22672 *srcmem = src;
22673 return dst;
22676 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22677 static void
22678 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22679 rtx destptr, rtx srcptr, rtx count, int max_size)
22681 rtx src, dest;
22682 if (CONST_INT_P (count))
22684 HOST_WIDE_INT countval = INTVAL (count);
22685 HOST_WIDE_INT epilogue_size = countval % max_size;
22686 int i;
22688 /* For now MAX_SIZE should be a power of 2. This assert could be
22689 relaxed, but it'll require a bit more complicated epilogue
22690 expanding. */
22691 gcc_assert ((max_size & (max_size - 1)) == 0);
22692 for (i = max_size; i >= 1; i >>= 1)
22694 if (epilogue_size & i)
22695 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22697 return;
22699 if (max_size > 8)
22701 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22702 count, 1, OPTAB_DIRECT);
22703 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22704 count, QImode, 1, 4, false);
22705 return;
22708 /* When there are stringops, we can cheaply increase dest and src pointers.
22709 Otherwise we save code size by maintaining an offset (zero is readily
22710 available from the preceding rep operation) and using x86 addressing modes.
22712 if (TARGET_SINGLE_STRINGOP)
22714 if (max_size > 4)
22716 rtx label = ix86_expand_aligntest (count, 4, true);
22717 src = change_address (srcmem, SImode, srcptr);
22718 dest = change_address (destmem, SImode, destptr);
22719 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22720 emit_label (label);
22721 LABEL_NUSES (label) = 1;
22723 if (max_size > 2)
22725 rtx label = ix86_expand_aligntest (count, 2, true);
22726 src = change_address (srcmem, HImode, srcptr);
22727 dest = change_address (destmem, HImode, destptr);
22728 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22729 emit_label (label);
22730 LABEL_NUSES (label) = 1;
22732 if (max_size > 1)
22734 rtx label = ix86_expand_aligntest (count, 1, true);
22735 src = change_address (srcmem, QImode, srcptr);
22736 dest = change_address (destmem, QImode, destptr);
22737 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22738 emit_label (label);
22739 LABEL_NUSES (label) = 1;
22742 else
22744 rtx offset = force_reg (Pmode, const0_rtx);
22745 rtx tmp;
22747 if (max_size > 4)
22749 rtx label = ix86_expand_aligntest (count, 4, true);
22750 src = change_address (srcmem, SImode, srcptr);
22751 dest = change_address (destmem, SImode, destptr);
22752 emit_move_insn (dest, src);
22753 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22754 true, OPTAB_LIB_WIDEN);
22755 if (tmp != offset)
22756 emit_move_insn (offset, tmp);
22757 emit_label (label);
22758 LABEL_NUSES (label) = 1;
22760 if (max_size > 2)
22762 rtx label = ix86_expand_aligntest (count, 2, true);
22763 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22764 src = change_address (srcmem, HImode, tmp);
22765 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22766 dest = change_address (destmem, HImode, tmp);
22767 emit_move_insn (dest, src);
22768 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22769 true, OPTAB_LIB_WIDEN);
22770 if (tmp != offset)
22771 emit_move_insn (offset, tmp);
22772 emit_label (label);
22773 LABEL_NUSES (label) = 1;
22775 if (max_size > 1)
22777 rtx label = ix86_expand_aligntest (count, 1, true);
22778 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22779 src = change_address (srcmem, QImode, tmp);
22780 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22781 dest = change_address (destmem, QImode, tmp);
22782 emit_move_insn (dest, src);
22783 emit_label (label);
22784 LABEL_NUSES (label) = 1;
22789 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
22790 with value PROMOTED_VAL.
22791 DESTPTR is advanced as the memory is filled.
22792 The return value is the updated DESTMEM. */
22793 static rtx
22794 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22795 HOST_WIDE_INT size_to_move)
22797 rtx dst = destmem, adjust;
22798 enum insn_code code;
22799 enum machine_mode move_mode;
22800 int piece_size, i;
22802 /* Choose the mode in which to perform the moves: normally the mode of
22803 PROMOTED_VAL, narrowed when SIZE_TO_MOVE is smaller than that
22804 mode's size. */
22805 move_mode = GET_MODE (promoted_val);
22806 if (move_mode == VOIDmode)
22807 move_mode = QImode;
22808 if (size_to_move < GET_MODE_SIZE (move_mode))
22810 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22811 promoted_val = gen_lowpart (move_mode, promoted_val);
22813 piece_size = GET_MODE_SIZE (move_mode);
22814 code = optab_handler (mov_optab, move_mode);
22815 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22817 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22819 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22820 gcc_assert (size_to_move % piece_size == 0);
22821 adjust = GEN_INT (piece_size);
22822 for (i = 0; i < size_to_move; i += piece_size)
22824 if (piece_size <= GET_MODE_SIZE (word_mode))
22826 emit_insn (gen_strset (destptr, dst, promoted_val));
22827 continue;
22830 emit_insn (GEN_FCN (code) (dst, promoted_val));
22832 emit_move_insn (destptr,
22833 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22835 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22836 piece_size);
22839 /* Update DST rtx. */
22840 return dst;
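/* Editorial illustration (host-side sketch, not used by the compiler): the
   store sequence emitted above is equivalent to

     static void
     memset_pieces (unsigned char *dst, unsigned long long broadcast_val,
                    size_t size, size_t piece_size)
     {
       size_t i;
       // assumes piece_size <= sizeof (broadcast_val) and divides size
       for (i = 0; i < size; i += piece_size)
         memcpy (dst + i, &broadcast_val, piece_size);
     }

   where emit_memset uses gen_strset (which also advances DESTPTR) for
   pieces up to the word size, and an ordinary move plus an explicit pointer
   increment for wider vector pieces. */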
22842 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22843 static void
22844 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22845 rtx count, int max_size)
22847 count =
22848 expand_simple_binop (counter_mode (count), AND, count,
22849 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22850 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22851 gen_lowpart (QImode, value), count, QImode,
22852 1, max_size / 2, true);
22855 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22856 static void
22857 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22858 rtx count, int max_size)
22860 rtx dest;
22862 if (CONST_INT_P (count))
22864 HOST_WIDE_INT countval = INTVAL (count);
22865 HOST_WIDE_INT epilogue_size = countval % max_size;
22866 int i;
22868 /* For now MAX_SIZE should be a power of 2. This assert could be
22869 relaxed, but that would require a somewhat more complicated epilogue
22870 expansion. */
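/* Editorial example: with MAX_SIZE == 16 and COUNTVAL == 29, EPILOGUE_SIZE
   is 29 % 16 = 13 = 8 + 4 + 1, so the loop below emits one 8-byte, one
   4-byte and one 1-byte store. */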
22871 gcc_assert ((max_size & (max_size - 1)) == 0);
22872 for (i = max_size; i >= 1; i >>= 1)
22874 if (epilogue_size & i)
22876 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22877 destmem = emit_memset (destmem, destptr, vec_value, i);
22878 else
22879 destmem = emit_memset (destmem, destptr, value, i);
22882 return;
22884 if (max_size > 32)
22886 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22887 return;
22889 if (max_size > 16)
22891 rtx label = ix86_expand_aligntest (count, 16, true);
22892 if (TARGET_64BIT)
22894 dest = change_address (destmem, DImode, destptr);
22895 emit_insn (gen_strset (destptr, dest, value));
22896 emit_insn (gen_strset (destptr, dest, value));
22898 else
22900 dest = change_address (destmem, SImode, destptr);
22901 emit_insn (gen_strset (destptr, dest, value));
22902 emit_insn (gen_strset (destptr, dest, value));
22903 emit_insn (gen_strset (destptr, dest, value));
22904 emit_insn (gen_strset (destptr, dest, value));
22906 emit_label (label);
22907 LABEL_NUSES (label) = 1;
22909 if (max_size > 8)
22911 rtx label = ix86_expand_aligntest (count, 8, true);
22912 if (TARGET_64BIT)
22914 dest = change_address (destmem, DImode, destptr);
22915 emit_insn (gen_strset (destptr, dest, value));
22917 else
22919 dest = change_address (destmem, SImode, destptr);
22920 emit_insn (gen_strset (destptr, dest, value));
22921 emit_insn (gen_strset (destptr, dest, value));
22923 emit_label (label);
22924 LABEL_NUSES (label) = 1;
22926 if (max_size > 4)
22928 rtx label = ix86_expand_aligntest (count, 4, true);
22929 dest = change_address (destmem, SImode, destptr);
22930 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22931 emit_label (label);
22932 LABEL_NUSES (label) = 1;
22934 if (max_size > 2)
22936 rtx label = ix86_expand_aligntest (count, 2, true);
22937 dest = change_address (destmem, HImode, destptr);
22938 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22939 emit_label (label);
22940 LABEL_NUSES (label) = 1;
22942 if (max_size > 1)
22944 rtx label = ix86_expand_aligntest (count, 1, true);
22945 dest = change_address (destmem, QImode, destptr);
22946 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22947 emit_label (label);
22948 LABEL_NUSES (label) = 1;
22952 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM or
22953 store enough bytes to DESTMEM to align it to DESIRED_ALIGNMENT. The original
22954 alignment is ALIGN. Depending on ISSETMEM, either the SRCMEM/SRCPTR or the
22955 VALUE/VEC_VALUE arguments are ignored.
22956 Return value is the updated DESTMEM. */
22957 static rtx
22958 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22959 rtx destptr, rtx srcptr, rtx value,
22960 rtx vec_value, rtx count, int align,
22961 int desired_alignment, bool issetmem)
22963 int i;
22964 for (i = 1; i < desired_alignment; i <<= 1)
22966 if (align <= i)
22968 rtx label = ix86_expand_aligntest (destptr, i, false);
22969 if (issetmem)
22971 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22972 destmem = emit_memset (destmem, destptr, vec_value, i);
22973 else
22974 destmem = emit_memset (destmem, destptr, value, i);
22976 else
22977 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22978 ix86_adjust_counter (count, i);
22979 emit_label (label);
22980 LABEL_NUSES (label) = 1;
22981 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22984 return destmem;
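/* Editorial illustration: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop
   above emits three conditional blocks, testing bits 1, 2 and 4 of DESTPTR
   and copying (or storing) 1, 2 and 4 bytes respectively, so at most
   1 + 2 + 4 = 7 bytes are handled before DESTPTR is known to be 8-byte
   aligned; COUNT is reduced by ix86_adjust_counter on each taken path. */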
22987 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
22988 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
22989 and jump to DONE_LABEL. */
22990 static void
22991 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
22992 rtx destptr, rtx srcptr,
22993 rtx value, rtx vec_value,
22994 rtx count, int size,
22995 rtx done_label, bool issetmem)
22997 rtx label = ix86_expand_aligntest (count, size, false);
22998 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
22999 rtx modesize;
23000 int n;
23002 /* If we do not have a vector value to copy, we must reduce the mode size. */
23003 if (issetmem)
23005 if (!vec_value)
23007 if (GET_MODE (value) == VOIDmode && size > 8)
23008 mode = Pmode;
23009 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
23010 mode = GET_MODE (value);
23012 else
23013 mode = GET_MODE (vec_value), value = vec_value;
23015 else
23017 /* Choose appropriate vector mode. */
23018 if (size >= 32)
23019 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
23020 else if (size >= 16)
23021 mode = TARGET_SSE ? V16QImode : DImode;
23022 srcmem = change_address (srcmem, mode, srcptr);
23024 destmem = change_address (destmem, mode, destptr);
23025 modesize = GEN_INT (GET_MODE_SIZE (mode));
23026 gcc_assert (GET_MODE_SIZE (mode) <= size);
23027 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
23029 if (issetmem)
23030 emit_move_insn (destmem, gen_lowpart (mode, value));
23031 else
23033 emit_move_insn (destmem, srcmem);
23034 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23036 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23039 destmem = offset_address (destmem, count, 1);
23040 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23041 GET_MODE_SIZE (mode));
23042 if (issetmem)
23043 emit_move_insn (destmem, gen_lowpart (mode, value));
23044 else
23046 srcmem = offset_address (srcmem, count, 1);
23047 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23048 GET_MODE_SIZE (mode));
23049 emit_move_insn (destmem, srcmem);
23051 emit_jump_insn (gen_jump (done_label));
23052 emit_barrier ();
23054 emit_label (label);
23055 LABEL_NUSES (label) = 1;
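/* Editorial illustration: for SIZE == 4 and a runtime count of 6, the block
   above copies bytes 0..3 and then bytes COUNT-4..COUNT-1 (here 2..5); the
   two possibly overlapping moves cover every count in the 4..7 range without
   any further branching. */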
23058 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
23059 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
23060 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
23061 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
23062 DONE_LABEL is a label after the whole copying sequence. The label is created
23063 on demand if *DONE_LABEL is NULL.
23064 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
23065 bounds after the initial copies.
23067 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
23068 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
23069 we will dispatch to a library call for large blocks.
23071 In pseudocode we do:
23073 if (COUNT < SIZE)
23075 Assume that SIZE is 4. Bigger sizes are handled analogously
23076 if (COUNT & 4)
23078 copy 4 bytes from SRCPTR to DESTPTR
23079 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
23080 goto done_label
23082 if (!COUNT)
23083 goto done_label;
23084 copy 1 byte from SRCPTR to DESTPTR
23085 if (COUNT & 2)
23087 copy 2 bytes from SRCPTR to DESTPTR
23088 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
23091 else
23093 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
23094 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
23096 OLD_DESTPTR = DESTPTR;
23097 Align DESTPTR up to DESIRED_ALIGN
23098 SRCPTR += DESTPTR - OLD_DESTPTR
23099 COUNT -= DESTPTR - OLD_DESTPTR
23100 if (DYNAMIC_CHECK)
23101 Round COUNT down to multiple of SIZE
23102 << optional caller supplied zero size guard is here >>
23103 << optional caller supplied dynamic check is here >>
23104 << caller supplied main copy loop is here >>
23106 done_label:  */
23108 static void
23109 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23110 rtx *destptr, rtx *srcptr,
23111 enum machine_mode mode,
23112 rtx value, rtx vec_value,
23113 rtx *count,
23114 rtx *done_label,
23115 int size,
23116 int desired_align,
23117 int align,
23118 unsigned HOST_WIDE_INT *min_size,
23119 bool dynamic_check,
23120 bool issetmem)
23122 rtx loop_label = NULL, label;
23123 int n;
23124 rtx modesize;
23125 int prolog_size = 0;
23126 rtx mode_value;
23128 /* Choose the proper value to copy. */
23129 if (issetmem && VECTOR_MODE_P (mode))
23130 mode_value = vec_value;
23131 else
23132 mode_value = value;
23133 gcc_assert (GET_MODE_SIZE (mode) <= size);
23135 /* See if block is big or small, handle small blocks. */
23136 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23138 int size2 = size;
23139 loop_label = gen_label_rtx ();
23141 if (!*done_label)
23142 *done_label = gen_label_rtx ();
23144 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23145 1, loop_label);
23146 size2 >>= 1;
23148 /* Handle sizes > 3. */
23149 for (;size2 > 2; size2 >>= 1)
23150 expand_small_movmem_or_setmem (destmem, srcmem,
23151 *destptr, *srcptr,
23152 value, vec_value,
23153 *count,
23154 size2, *done_label, issetmem);
23155 /* Nothing to copy? Jump to DONE_LABEL if so. */
23156 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23157 1, *done_label);
23159 /* Do a byte copy. */
23160 destmem = change_address (destmem, QImode, *destptr);
23161 if (issetmem)
23162 emit_move_insn (destmem, gen_lowpart (QImode, value));
23163 else
23165 srcmem = change_address (srcmem, QImode, *srcptr);
23166 emit_move_insn (destmem, srcmem);
23169 /* Handle sizes 2 and 3. */
23170 label = ix86_expand_aligntest (*count, 2, false);
23171 destmem = change_address (destmem, HImode, *destptr);
23172 destmem = offset_address (destmem, *count, 1);
23173 destmem = offset_address (destmem, GEN_INT (-2), 2);
23174 if (issetmem)
23175 emit_move_insn (destmem, gen_lowpart (HImode, value));
23176 else
23178 srcmem = change_address (srcmem, HImode, *srcptr);
23179 srcmem = offset_address (srcmem, *count, 1);
23180 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23181 emit_move_insn (destmem, srcmem);
23184 emit_label (label);
23185 LABEL_NUSES (label) = 1;
23186 emit_jump_insn (gen_jump (*done_label));
23187 emit_barrier ();
23189 else
23190 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23191 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23193 /* Start memcpy for COUNT >= SIZE. */
23194 if (loop_label)
23196 emit_label (loop_label);
23197 LABEL_NUSES (loop_label) = 1;
23200 /* Copy first desired_align bytes. */
23201 if (!issetmem)
23202 srcmem = change_address (srcmem, mode, *srcptr);
23203 destmem = change_address (destmem, mode, *destptr);
23204 modesize = GEN_INT (GET_MODE_SIZE (mode));
23205 for (n = 0; prolog_size < desired_align - align; n++)
23207 if (issetmem)
23208 emit_move_insn (destmem, mode_value);
23209 else
23211 emit_move_insn (destmem, srcmem);
23212 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23214 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23215 prolog_size += GET_MODE_SIZE (mode);
23219 /* Copy last SIZE bytes. */
23220 destmem = offset_address (destmem, *count, 1);
23221 destmem = offset_address (destmem,
23222 GEN_INT (-size - prolog_size),
23224 if (issetmem)
23225 emit_move_insn (destmem, mode_value);
23226 else
23228 srcmem = offset_address (srcmem, *count, 1);
23229 srcmem = offset_address (srcmem,
23230 GEN_INT (-size - prolog_size),
23232 emit_move_insn (destmem, srcmem);
23234 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23236 destmem = offset_address (destmem, modesize, 1);
23237 if (issetmem)
23238 emit_move_insn (destmem, mode_value);
23239 else
23241 srcmem = offset_address (srcmem, modesize, 1);
23242 emit_move_insn (destmem, srcmem);
23246 /* Align destination. */
23247 if (desired_align > 1 && desired_align > align)
23249 rtx saveddest = *destptr;
23251 gcc_assert (desired_align <= size);
23252 /* Align destptr up, place it to new register. */
23253 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23254 GEN_INT (prolog_size),
23255 NULL_RTX, 1, OPTAB_DIRECT);
23256 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23257 GEN_INT (-desired_align),
23258 *destptr, 1, OPTAB_DIRECT);
23259 /* See how many bytes we skipped. */
23260 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23261 *destptr,
23262 saveddest, 1, OPTAB_DIRECT);
23263 /* Adjust srcptr and count. */
23264 if (!issetmem)
23265 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23266 *srcptr, 1, OPTAB_DIRECT);
23267 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23268 saveddest, *count, 1, OPTAB_DIRECT);
23269 /* We copied at most size + prolog_size. */
23270 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23271 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23272 else
23273 *min_size = 0;
23275 /* Our loops always round down the block size, but for dispatch to the library
23276 we need the precise value. */
23277 if (dynamic_check)
23278 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23279 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23281 else
23283 gcc_assert (prolog_size == 0);
23284 /* Decrease count, so we won't end up copying the last word twice. */
23285 if (!CONST_INT_P (*count))
23286 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23287 constm1_rtx, *count, 1, OPTAB_DIRECT);
23288 else
23289 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23290 if (*min_size)
23291 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
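/* Editorial illustration of the COUNT >= SIZE path above, for a memcpy with
   SIZE == 16, ALIGN == 1, DESIRED_ALIGN == 16 and a runtime count of 70:
   one misaligned 16-byte move copies bytes 0..15 and another copies bytes
   54..69; DESTPTR is then rounded up to the next 16-byte boundary (say 9
   bytes further), SRCPTR is advanced by the same 9 bytes and COUNT drops to
   61, so the caller's main loop only has to handle aligned 16-byte chunks;
   any tail bytes it rounds away were already written by the second move. */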
23296 /* This function is like the previous one, except here we know how many bytes
23297 need to be copied. That allows us to update alignment not only of DST, which
23298 is returned, but also of SRC, which is passed as a pointer for that
23299 reason. */
23300 static rtx
23301 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23302 rtx srcreg, rtx value, rtx vec_value,
23303 int desired_align, int align_bytes,
23304 bool issetmem)
23306 rtx src = NULL;
23307 rtx orig_dst = dst;
23308 rtx orig_src = NULL;
23309 int piece_size = 1;
23310 int copied_bytes = 0;
23312 if (!issetmem)
23314 gcc_assert (srcp != NULL);
23315 src = *srcp;
23316 orig_src = src;
23319 for (piece_size = 1;
23320 piece_size <= desired_align && copied_bytes < align_bytes;
23321 piece_size <<= 1)
23323 if (align_bytes & piece_size)
23325 if (issetmem)
23327 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23328 dst = emit_memset (dst, destreg, vec_value, piece_size);
23329 else
23330 dst = emit_memset (dst, destreg, value, piece_size);
23332 else
23333 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23334 copied_bytes += piece_size;
23337 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23338 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23339 if (MEM_SIZE_KNOWN_P (orig_dst))
23340 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23342 if (!issetmem)
23344 int src_align_bytes = get_mem_align_offset (src, desired_align
23345 * BITS_PER_UNIT);
23346 if (src_align_bytes >= 0)
23347 src_align_bytes = desired_align - src_align_bytes;
23348 if (src_align_bytes >= 0)
23350 unsigned int src_align;
23351 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23353 if ((src_align_bytes & (src_align - 1))
23354 == (align_bytes & (src_align - 1)))
23355 break;
23357 if (src_align > (unsigned int) desired_align)
23358 src_align = desired_align;
23359 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23360 set_mem_align (src, src_align * BITS_PER_UNIT);
23362 if (MEM_SIZE_KNOWN_P (orig_src))
23363 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23364 *srcp = src;
23367 return dst;
23370 /* Return true if ALG can be used in current context.
23371 Assume we expand memset if MEMSET is true. */
23372 static bool
23373 alg_usable_p (enum stringop_alg alg, bool memset)
23375 if (alg == no_stringop)
23376 return false;
23377 if (alg == vector_loop)
23378 return TARGET_SSE || TARGET_AVX;
23379 /* Algorithms using the rep prefix want at least edi and ecx;
23380 additionally, memset wants eax and memcpy wants esi. Don't
23381 consider such algorithms if the user has appropriated those
23382 registers for their own purposes. */
23383 if (alg == rep_prefix_1_byte
23384 || alg == rep_prefix_4_byte
23385 || alg == rep_prefix_8_byte)
23386 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23387 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23388 return true;
23391 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23392 static enum stringop_alg
23393 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23394 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23395 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23397 const struct stringop_algs * algs;
23398 bool optimize_for_speed;
23399 int max = -1;
23400 const struct processor_costs *cost;
23401 int i;
23402 bool any_alg_usable_p = false;
23404 *noalign = false;
23405 *dynamic_check = -1;
23407 /* Even if the string operation call is cold, we still might spend a lot
23408 of time processing large blocks. */
23409 if (optimize_function_for_size_p (cfun)
23410 || (optimize_insn_for_size_p ()
23411 && (max_size < 256
23412 || (expected_size != -1 && expected_size < 256))))
23413 optimize_for_speed = false;
23414 else
23415 optimize_for_speed = true;
23417 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23418 if (memset)
23419 algs = &cost->memset[TARGET_64BIT != 0];
23420 else
23421 algs = &cost->memcpy[TARGET_64BIT != 0];
23423 /* Find the maximal size covered by some non-libcall algorithm in the cost table. */
23424 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23426 enum stringop_alg candidate = algs->size[i].alg;
23427 bool usable = alg_usable_p (candidate, memset);
23428 any_alg_usable_p |= usable;
23430 if (candidate != libcall && candidate && usable)
23431 max = algs->size[i].max;
23434 /* If the expected size is not known but the max size is small enough
23435 that the inline version is a win, set the expected size into
23436 the range. */
23437 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23438 expected_size = min_size / 2 + max_size / 2;
23440 /* If the user specified the algorithm, honor it if possible. */
23441 if (ix86_stringop_alg != no_stringop
23442 && alg_usable_p (ix86_stringop_alg, memset))
23443 return ix86_stringop_alg;
23444 /* rep; movq or rep; movl is the smallest variant. */
23445 else if (!optimize_for_speed)
23447 *noalign = true;
23448 if (!count || (count & 3) || (memset && !zero_memset))
23449 return alg_usable_p (rep_prefix_1_byte, memset)
23450 ? rep_prefix_1_byte : loop_1_byte;
23451 else
23452 return alg_usable_p (rep_prefix_4_byte, memset)
23453 ? rep_prefix_4_byte : loop;
23455 /* Very tiny blocks are best handled via the loop; REP is expensive to
23456 set up. */
23457 else if (expected_size != -1 && expected_size < 4)
23458 return loop_1_byte;
23459 else if (expected_size != -1)
23461 enum stringop_alg alg = libcall;
23462 bool alg_noalign = false;
23463 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23465 /* We get here if the algorithms that were not libcall-based
23466 were rep-prefix based and we are unable to use rep prefixes
23467 based on global register usage. Break out of the loop and
23468 use the heuristic below. */
23469 if (algs->size[i].max == 0)
23470 break;
23471 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23473 enum stringop_alg candidate = algs->size[i].alg;
23475 if (candidate != libcall && alg_usable_p (candidate, memset))
23477 alg = candidate;
23478 alg_noalign = algs->size[i].noalign;
23480 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23481 last non-libcall inline algorithm. */
23482 if (TARGET_INLINE_ALL_STRINGOPS)
23484 /* When the current size is best to be copied by a libcall,
23485 but we are still forced to inline, run the heuristic below
23486 that will pick code for medium sized blocks. */
23487 if (alg != libcall)
23489 *noalign = alg_noalign;
23490 return alg;
23492 break;
23494 else if (alg_usable_p (candidate, memset))
23496 *noalign = algs->size[i].noalign;
23497 return candidate;
23502 /* When asked to inline the call anyway, try to pick a meaningful choice.
23503 We look for the maximal size of a block that is faster to copy by hand and
23504 take blocks of at most that size, guessing that the average size will
23505 be roughly half of the block.
23507 If this turns out to be bad, we might simply specify the preferred
23508 choice in ix86_costs. */
23509 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23510 && (algs->unknown_size == libcall
23511 || !alg_usable_p (algs->unknown_size, memset)))
23513 enum stringop_alg alg;
23515 /* If there aren't any usable algorithms, then recursing on
23516 smaller sizes isn't going to find anything. Just return the
23517 simple byte-at-a-time copy loop. */
23518 if (!any_alg_usable_p)
23520 /* Pick something reasonable. */
23521 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23522 *dynamic_check = 128;
23523 return loop_1_byte;
23525 if (max == -1)
23526 max = 4096;
23527 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23528 zero_memset, dynamic_check, noalign);
23529 gcc_assert (*dynamic_check == -1);
23530 gcc_assert (alg != libcall);
23531 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23532 *dynamic_check = max;
23533 return alg;
23535 return (alg_usable_p (algs->unknown_size, memset)
23536 ? algs->unknown_size : libcall);
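/* Editorial illustration (hypothetical tuning entry, not a real table): a
   cost-table entry such as

     static stringop_algs example_memcpy =
       {libcall,
        {{24, loop, false},
         {128, rep_prefix_8_byte, false},
         {-1, libcall, false}}};

   would make the table walk in decide_alg pick "loop" for known expected
   sizes of up to 24 bytes (beyond the tiny-block case handled earlier),
   "rep_prefix_8_byte" for sizes up to 128 bytes, and fall back to a library
   call (or, when inlining is forced, to the recursive heuristic) above
   that. */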
23539 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23540 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23541 static int
23542 decide_alignment (int align,
23543 enum stringop_alg alg,
23544 int expected_size,
23545 enum machine_mode move_mode)
23547 int desired_align = 0;
23549 gcc_assert (alg != no_stringop);
23551 if (alg == libcall)
23552 return 0;
23553 if (move_mode == VOIDmode)
23554 return 0;
23556 desired_align = GET_MODE_SIZE (move_mode);
23557 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
23558 copying a whole cache line at once. */
23559 if (TARGET_PENTIUMPRO
23560 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23561 desired_align = 8;
23563 if (optimize_size)
23564 desired_align = 1;
23565 if (desired_align < align)
23566 desired_align = align;
23567 if (expected_size != -1 && expected_size < 4)
23568 desired_align = align;
23570 return desired_align;
23574 /* Helper function for memset. For QImode value 0xXY produce
23575 0xXYXYXYXY of the width specified by MODE. This is essentially
23576 a multiplication by 0x01010101, but we can do slightly better than
23577 synth_mult by unwinding the sequence by hand on CPUs with a
23578 slow multiply. */
23579 static rtx
23580 promote_duplicated_reg (enum machine_mode mode, rtx val)
23582 enum machine_mode valmode = GET_MODE (val);
23583 rtx tmp;
23584 int nops = mode == DImode ? 3 : 2;
23586 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23587 if (val == const0_rtx)
23588 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23589 if (CONST_INT_P (val))
23591 HOST_WIDE_INT v = INTVAL (val) & 255;
23593 v |= v << 8;
23594 v |= v << 16;
23595 if (mode == DImode)
23596 v |= (v << 16) << 16;
23597 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23600 if (valmode == VOIDmode)
23601 valmode = QImode;
23602 if (valmode != QImode)
23603 val = gen_lowpart (QImode, val);
23604 if (mode == QImode)
23605 return val;
23606 if (!TARGET_PARTIAL_REG_STALL)
23607 nops--;
23608 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23609 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23610 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23611 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23613 rtx reg = convert_modes (mode, QImode, val, true);
23614 tmp = promote_duplicated_reg (mode, const1_rtx);
23615 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23616 OPTAB_DIRECT);
23618 else
23620 rtx reg = convert_modes (mode, QImode, val, true);
23622 if (!TARGET_PARTIAL_REG_STALL)
23623 if (mode == SImode)
23624 emit_insn (gen_movsi_insv_1 (reg, reg));
23625 else
23626 emit_insn (gen_movdi_insv_1 (reg, reg));
23627 else
23629 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23630 NULL, 1, OPTAB_DIRECT);
23631 reg =
23632 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23634 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23635 NULL, 1, OPTAB_DIRECT);
23636 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23637 if (mode == SImode)
23638 return reg;
23639 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23640 NULL, 1, OPTAB_DIRECT);
23641 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23642 return reg;
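/* Editorial illustration (host-side sketch, not used by the compiler): the
   shift-and-or expansion above computes the same value as

     static unsigned long long
     broadcast_byte (unsigned char b, int bytes)  // bytes is 4 or 8
     {
       unsigned long long v = b;
       v |= v << 8;            // 0x00XY -> 0xXYXY
       v |= v << 16;           // 0xXYXY -> 0xXYXYXYXY
       if (bytes == 8)
         v |= v << 32;         // widen to 0xXYXYXYXYXYXYXYXY
       return v;
     }

   i.e. B multiplied by 0x01010101 (or 0x0101010101010101 for DImode), which
   is exactly the alternative the cost comparison above may prefer on CPUs
   with a fast multiply. */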
23646 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
23647 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
23648 alignment from ALIGN to DESIRED_ALIGN. */
23649 static rtx
23650 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23651 int align)
23653 rtx promoted_val;
23655 if (TARGET_64BIT
23656 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23657 promoted_val = promote_duplicated_reg (DImode, val);
23658 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23659 promoted_val = promote_duplicated_reg (SImode, val);
23660 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23661 promoted_val = promote_duplicated_reg (HImode, val);
23662 else
23663 promoted_val = val;
23665 return promoted_val;
23668 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23669 operations when profitable. The code depends upon architecture, block size
23670 and alignment, but always has one of the following overall structures:
23672 Aligned move sequence:
23674 1) Prologue guard: Conditional that jumps up to epilogues for small
23675 blocks that can be handled by epilogue alone. This is faster
23676 but also needed for correctness, since the prologue assumes the block
23677 is larger than the desired alignment.
23679 Optional dynamic check for size and libcall for large
23680 blocks is emitted here too, with -minline-stringops-dynamically.
23682 2) Prologue: copy first few bytes in order to get destination
23683 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23684 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23685 copied. We emit either a jump tree on power of two sized
23686 blocks, or a byte loop.
23688 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23689 with specified algorithm.
23691 4) Epilogue: code copying tail of the block that is too small to be
23692 handled by main body (or up to size guarded by prologue guard).
23694 Misaligned move sequence
23696 1) misaligned move prologue/epilogue containing:
23697 a) Prologue handling small memory blocks and jumping to done_label
23698 (skipped if blocks are known to be large enough)
23699 b) Copy of the first DESIRED_ALIGN-ALIGN bytes, if alignment is
23700 needed, by a single possibly misaligned move
23701 (skipped if alignment is not needed)
23702 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23704 2) Zero size guard dispatching to done_label, if needed
23706 3) dispatch to library call, if needed
23708 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23709 with specified algorithm. */
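/* Editorial illustration: on a 64-bit target a memcpy of unknown size using
   the unrolled_loop algorithm has MOVE_MODE == DImode and UNROLL_FACTOR == 4,
   so SIZE_NEEDED == 32; the aligned sequence then typically consists of a
   guard jumping to the epilogue for counts below 32, an optional prologue
   aligning the destination, a main loop moving 32 bytes per iteration, and an
   epilogue handling the remaining (at most 31) tail bytes. */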
23710 bool
23711 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23712 rtx align_exp, rtx expected_align_exp,
23713 rtx expected_size_exp, rtx min_size_exp,
23714 rtx max_size_exp, bool issetmem)
23716 rtx destreg;
23717 rtx srcreg = NULL;
23718 rtx label = NULL;
23719 rtx tmp;
23720 rtx jump_around_label = NULL;
23721 HOST_WIDE_INT align = 1;
23722 unsigned HOST_WIDE_INT count = 0;
23723 HOST_WIDE_INT expected_size = -1;
23724 int size_needed = 0, epilogue_size_needed;
23725 int desired_align = 0, align_bytes = 0;
23726 enum stringop_alg alg;
23727 rtx promoted_val = NULL;
23728 rtx vec_promoted_val = NULL;
23729 bool force_loopy_epilogue = false;
23730 int dynamic_check;
23731 bool need_zero_guard = false;
23732 bool noalign;
23733 enum machine_mode move_mode = VOIDmode;
23734 int unroll_factor = 1;
23735 /* TODO: Once value ranges are available, fill in proper data. */
23736 unsigned HOST_WIDE_INT min_size = 0;
23737 unsigned HOST_WIDE_INT max_size = -1;
23738 bool misaligned_prologue_used = false;
23740 if (CONST_INT_P (align_exp))
23741 align = INTVAL (align_exp);
23742 /* i386 can do misaligned access at a reasonable increase in cost. */
23743 if (CONST_INT_P (expected_align_exp)
23744 && INTVAL (expected_align_exp) > align)
23745 align = INTVAL (expected_align_exp);
23746 /* ALIGN is the minimum of destination and source alignment, but we care here
23747 just about destination alignment. */
23748 else if (!issetmem
23749 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23750 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23752 if (CONST_INT_P (count_exp))
23753 min_size = max_size = count = expected_size = INTVAL (count_exp);
23754 if (min_size_exp)
23755 min_size = INTVAL (min_size_exp);
23756 if (max_size_exp)
23757 max_size = INTVAL (max_size_exp);
23758 if (CONST_INT_P (expected_size_exp) && count == 0)
23759 expected_size = INTVAL (expected_size_exp);
23761 /* Make sure we don't need to care about overflow later on. */
23762 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23763 return false;
23765 /* Step 0: Decide on preferred algorithm, desired alignment and
23766 size of chunks to be copied by main loop. */
23767 alg = decide_alg (count, expected_size, min_size, max_size, issetmem,
23768 issetmem && val_exp == const0_rtx,
23769 &dynamic_check, &noalign);
23770 if (alg == libcall)
23771 return false;
23772 gcc_assert (alg != no_stringop);
23774 /* For now the vector version of memset is generated only for memory zeroing, as
23775 creating the promoted vector value is very cheap in this case. */
23776 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23777 alg = unrolled_loop;
23779 if (!count)
23780 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23781 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23782 if (!issetmem)
23783 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23785 unroll_factor = 1;
23786 move_mode = word_mode;
23787 switch (alg)
23789 case libcall:
23790 case no_stringop:
23791 case last_alg:
23792 gcc_unreachable ();
23793 case loop_1_byte:
23794 need_zero_guard = true;
23795 move_mode = QImode;
23796 break;
23797 case loop:
23798 need_zero_guard = true;
23799 break;
23800 case unrolled_loop:
23801 need_zero_guard = true;
23802 unroll_factor = (TARGET_64BIT ? 4 : 2);
23803 break;
23804 case vector_loop:
23805 need_zero_guard = true;
23806 unroll_factor = 4;
23807 /* Find the widest supported mode. */
23808 move_mode = word_mode;
23809 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23810 != CODE_FOR_nothing)
23811 move_mode = GET_MODE_WIDER_MODE (move_mode);
23813 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23814 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23815 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23817 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23818 move_mode = mode_for_vector (word_mode, nunits);
23819 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23820 move_mode = word_mode;
23822 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23823 break;
23824 case rep_prefix_8_byte:
23825 move_mode = DImode;
23826 break;
23827 case rep_prefix_4_byte:
23828 move_mode = SImode;
23829 break;
23830 case rep_prefix_1_byte:
23831 move_mode = QImode;
23832 break;
23834 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23835 epilogue_size_needed = size_needed;
23837 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23838 if (!TARGET_ALIGN_STRINGOPS || noalign)
23839 align = desired_align;
23841 /* Step 1: Prologue guard. */
23843 /* Alignment code needs count to be in register. */
23844 if (CONST_INT_P (count_exp) && desired_align > align)
23846 if (INTVAL (count_exp) > desired_align
23847 && INTVAL (count_exp) > size_needed)
23849 align_bytes
23850 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23851 if (align_bytes <= 0)
23852 align_bytes = 0;
23853 else
23854 align_bytes = desired_align - align_bytes;
23856 if (align_bytes == 0)
23857 count_exp = force_reg (counter_mode (count_exp), count_exp);
23859 gcc_assert (desired_align >= 1 && align >= 1);
23861 /* Misaligned move sequences handle both prologue and epilogue at once.
23862 Default code generation results in smaller code for large alignments
23863 and also avoids redundant work when sizes are known precisely. */
23864 misaligned_prologue_used
23865 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
23866 && MAX (desired_align, epilogue_size_needed) <= 32
23867 && desired_align <= epilogue_size_needed
23868 && ((desired_align > align && !align_bytes)
23869 || (!count && epilogue_size_needed > 1)));
23871 /* Do the cheap promotion to allow better CSE across the
23872 main loop and epilogue (i.e. one load of the big constant in
23873 front of all code).
23874 For now the misaligned move sequences do not have a fast path
23875 without broadcasting. */
23876 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23878 if (alg == vector_loop)
23880 gcc_assert (val_exp == const0_rtx);
23881 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23882 promoted_val = promote_duplicated_reg_to_size (val_exp,
23883 GET_MODE_SIZE (word_mode),
23884 desired_align, align);
23886 else
23888 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23889 desired_align, align);
23892 /* Misaligned move sequences handle both prologues and epilogues at once.
23893 Default code generation results in smaller code for large alignments and
23894 also avoids redundant work when sizes are known precisely. */
23895 if (misaligned_prologue_used)
23897 /* Misaligned move prologue handles small blocks by itself. */
23898 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23899 (dst, src, &destreg, &srcreg,
23900 move_mode, promoted_val, vec_promoted_val,
23901 &count_exp,
23902 &jump_around_label,
23903 desired_align < align
23904 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23905 desired_align, align, &min_size, dynamic_check, issetmem);
23906 if (!issetmem)
23907 src = change_address (src, BLKmode, srcreg);
23908 dst = change_address (dst, BLKmode, destreg);
23909 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23910 epilogue_size_needed = 0;
23911 if (need_zero_guard && !min_size)
23913 /* It is possible that we copied enough so the main loop will not
23914 execute. */
23915 gcc_assert (size_needed > 1);
23916 if (jump_around_label == NULL_RTX)
23917 jump_around_label = gen_label_rtx ();
23918 emit_cmp_and_jump_insns (count_exp,
23919 GEN_INT (size_needed),
23920 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23921 if (expected_size == -1
23922 || expected_size < (desired_align - align) / 2 + size_needed)
23923 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23924 else
23925 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23928 /* Ensure that alignment prologue won't copy past end of block. */
23929 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23931 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23932 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
23933 Make sure it is power of 2. */
23934 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23936 /* To improve performance of small blocks, we jump around the VAL
23937 promoting code. This means that if the promoted VAL is not constant,
23938 we might not use it in the epilogue and have to use the byte
23939 loop variant. */
23940 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23941 force_loopy_epilogue = true;
23942 if (count)
23944 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23946 /* If main algorithm works on QImode, no epilogue is needed.
23947 For small sizes just don't align anything. */
23948 if (size_needed == 1)
23949 desired_align = align;
23950 else
23951 goto epilogue;
23954 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23956 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23957 label = gen_label_rtx ();
23958 emit_cmp_and_jump_insns (count_exp,
23959 GEN_INT (epilogue_size_needed),
23960 LTU, 0, counter_mode (count_exp), 1, label);
23961 if (expected_size == -1 || expected_size < epilogue_size_needed)
23962 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23963 else
23964 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23968 /* Emit code to decide at runtime whether a library call or inline code should be
23969 used. */
23970 if (dynamic_check != -1)
23972 if (!issetmem && CONST_INT_P (count_exp))
23974 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23976 emit_block_move_via_libcall (dst, src, count_exp, false);
23977 count_exp = const0_rtx;
23978 goto epilogue;
23981 else
23983 rtx hot_label = gen_label_rtx ();
23984 jump_around_label = gen_label_rtx ();
23985 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23986 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23987 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23988 if (issetmem)
23989 set_storage_via_libcall (dst, count_exp, val_exp, false);
23990 else
23991 emit_block_move_via_libcall (dst, src, count_exp, false);
23992 emit_jump (jump_around_label);
23993 emit_label (hot_label);
23997 /* Step 2: Alignment prologue. */
23998 /* Do the expensive promotion once we branched off the small blocks. */
23999 if (issetmem && !promoted_val)
24000 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
24001 desired_align, align);
24003 if (desired_align > align && !misaligned_prologue_used)
24005 if (align_bytes == 0)
24007 /* Except for the first move in the prologue, we no longer know
24008 the constant offset in aliasing info. It doesn't seem worth
24009 the pain to maintain it for the first move, so throw away
24010 the info early. */
24011 dst = change_address (dst, BLKmode, destreg);
24012 if (!issetmem)
24013 src = change_address (src, BLKmode, srcreg);
24014 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
24015 promoted_val, vec_promoted_val,
24016 count_exp, align, desired_align,
24017 issetmem);
24018 /* At most desired_align - align bytes are copied. */
24019 if (min_size < (unsigned)(desired_align - align))
24020 min_size = 0;
24021 else
24022 min_size -= desired_align - align;
24024 else
24026 /* If we know how many bytes need to be stored before dst is
24027 sufficiently aligned, maintain aliasing info accurately. */
24028 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
24029 srcreg,
24030 promoted_val,
24031 vec_promoted_val,
24032 desired_align,
24033 align_bytes,
24034 issetmem);
24036 count_exp = plus_constant (counter_mode (count_exp),
24037 count_exp, -align_bytes);
24038 count -= align_bytes;
24039 min_size -= align_bytes;
24040 max_size -= align_bytes;
24042 if (need_zero_guard
24043 && !min_size
24044 && (count < (unsigned HOST_WIDE_INT) size_needed
24045 || (align_bytes == 0
24046 && count < ((unsigned HOST_WIDE_INT) size_needed
24047 + desired_align - align))))
24049 /* It is possible that we copied enough so the main loop will not
24050 execute. */
24051 gcc_assert (size_needed > 1);
24052 if (label == NULL_RTX)
24053 label = gen_label_rtx ();
24054 emit_cmp_and_jump_insns (count_exp,
24055 GEN_INT (size_needed),
24056 LTU, 0, counter_mode (count_exp), 1, label);
24057 if (expected_size == -1
24058 || expected_size < (desired_align - align) / 2 + size_needed)
24059 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24060 else
24061 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24064 if (label && size_needed == 1)
24066 emit_label (label);
24067 LABEL_NUSES (label) = 1;
24068 label = NULL;
24069 epilogue_size_needed = 1;
24070 if (issetmem)
24071 promoted_val = val_exp;
24073 else if (label == NULL_RTX && !misaligned_prologue_used)
24074 epilogue_size_needed = size_needed;
24076 /* Step 3: Main loop. */
24078 switch (alg)
24080 case libcall:
24081 case no_stringop:
24082 case last_alg:
24083 gcc_unreachable ();
24084 case loop_1_byte:
24085 case loop:
24086 case unrolled_loop:
24087 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
24088 count_exp, move_mode, unroll_factor,
24089 expected_size, issetmem);
24090 break;
24091 case vector_loop:
24092 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
24093 vec_promoted_val, count_exp, move_mode,
24094 unroll_factor, expected_size, issetmem);
24095 break;
24096 case rep_prefix_8_byte:
24097 case rep_prefix_4_byte:
24098 case rep_prefix_1_byte:
24099 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
24100 val_exp, count_exp, move_mode, issetmem);
24101 break;
24103 /* Adjust properly the offset of src and dest memory for aliasing. */
24104 if (CONST_INT_P (count_exp))
24106 if (!issetmem)
24107 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
24108 (count / size_needed) * size_needed);
24109 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
24110 (count / size_needed) * size_needed);
24112 else
24114 if (!issetmem)
24115 src = change_address (src, BLKmode, srcreg);
24116 dst = change_address (dst, BLKmode, destreg);
24119 /* Step 4: Epilogue to copy the remaining bytes. */
24120 epilogue:
24121 if (label)
24123 /* When the main loop is done, COUNT_EXP might hold the original count,
24124 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
24125 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
24126 bytes. Compensate if needed. */
24128 if (size_needed < epilogue_size_needed)
24130 tmp =
24131 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24132 GEN_INT (size_needed - 1), count_exp, 1,
24133 OPTAB_DIRECT);
24134 if (tmp != count_exp)
24135 emit_move_insn (count_exp, tmp);
24137 emit_label (label);
24138 LABEL_NUSES (label) = 1;
24141 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24143 if (force_loopy_epilogue)
24144 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24145 epilogue_size_needed);
24146 else
24148 if (issetmem)
24149 expand_setmem_epilogue (dst, destreg, promoted_val,
24150 vec_promoted_val, count_exp,
24151 epilogue_size_needed);
24152 else
24153 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24154 epilogue_size_needed);
24157 if (jump_around_label)
24158 emit_label (jump_around_label);
24159 return true;
24163 /* Expand the appropriate insns for doing strlen if not just doing
24164 repnz; scasb
24166 out = result, initialized with the start address
24167 align_rtx = alignment of the address.
24168 scratch = scratch register, initialized with the start address when
24169 not aligned, otherwise undefined
24171 This is just the body. It needs the initializations mentioned above and
24172 some address computing at the end. These things are done in i386.md. */
24174 static void
24175 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24177 int align;
24178 rtx tmp;
24179 rtx align_2_label = NULL_RTX;
24180 rtx align_3_label = NULL_RTX;
24181 rtx align_4_label = gen_label_rtx ();
24182 rtx end_0_label = gen_label_rtx ();
24183 rtx mem;
24184 rtx tmpreg = gen_reg_rtx (SImode);
24185 rtx scratch = gen_reg_rtx (SImode);
24186 rtx cmp;
24188 align = 0;
24189 if (CONST_INT_P (align_rtx))
24190 align = INTVAL (align_rtx);
24192 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24194 /* Is there a known alignment and is it less than 4? */
24195 if (align < 4)
24197 rtx scratch1 = gen_reg_rtx (Pmode);
24198 emit_move_insn (scratch1, out);
24199 /* Is there a known alignment and is it not 2? */
24200 if (align != 2)
24202 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24203 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24205 /* Leave just the 3 lower bits. */
24206 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24207 NULL_RTX, 0, OPTAB_WIDEN);
24209 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24210 Pmode, 1, align_4_label);
24211 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24212 Pmode, 1, align_2_label);
24213 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24214 Pmode, 1, align_3_label);
24216 else
24218 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24219 check if it is aligned to 4 bytes. */
24221 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24222 NULL_RTX, 0, OPTAB_WIDEN);
24224 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24225 Pmode, 1, align_4_label);
24228 mem = change_address (src, QImode, out);
24230 /* Now compare the bytes. */
24232 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
24233 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24234 QImode, 1, end_0_label);
24236 /* Increment the address. */
24237 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24239 /* Not needed with an alignment of 2 */
24240 if (align != 2)
24242 emit_label (align_2_label);
24244 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24245 end_0_label);
24247 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24249 emit_label (align_3_label);
24252 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24253 end_0_label);
24255 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24258 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24259 align this loop; that only makes programs bigger and does not help to
24260 speed them up. */
24261 emit_label (align_4_label);
24263 mem = change_address (src, SImode, out);
24264 emit_move_insn (scratch, mem);
24265 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24267 /* This formula yields a nonzero result iff one of the bytes is zero.
24268 This saves three branches inside the loop and many cycles. */
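/* Editorial example: the instructions below compute the classic
   (v - 0x01010101) & ~v & 0x80808080 zero-byte test. For v = 0x41420043
   (byte 1 is zero) the result is 0x00008000, so the loop exits; for
   v = 0x41424344 the result is 0, so the loop continues. */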
24270 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24271 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24272 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24273 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24274 gen_int_mode (0x80808080, SImode)));
24275 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24276 align_4_label);
24278 if (TARGET_CMOVE)
24280 rtx reg = gen_reg_rtx (SImode);
24281 rtx reg2 = gen_reg_rtx (Pmode);
24282 emit_move_insn (reg, tmpreg);
24283 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24285 /* If zero is not in the first two bytes, move two bytes forward. */
24286 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24287 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24288 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24289 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24290 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24291 reg,
24292 tmpreg)));
24293 /* Emit lea manually to avoid clobbering of flags. */
24294 emit_insn (gen_rtx_SET (SImode, reg2,
24295 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24297 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24298 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24299 emit_insn (gen_rtx_SET (VOIDmode, out,
24300 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24301 reg2,
24302 out)));
24304 else
24306 rtx end_2_label = gen_label_rtx ();
24307 /* Is zero in the first two bytes? */
24309 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24310 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24311 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24312 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24313 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24314 pc_rtx);
24315 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24316 JUMP_LABEL (tmp) = end_2_label;
24318 /* Not in the first two. Move two bytes forward. */
24319 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24320 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24322 emit_label (end_2_label);
24326 /* Avoid branch in fixing the byte. */
24327 tmpreg = gen_lowpart (QImode, tmpreg);
24328 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24329 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24330 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24331 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24333 emit_label (end_0_label);
24336 /* Expand strlen. */
24338 bool
24339 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24341 rtx addr, scratch1, scratch2, scratch3, scratch4;
24343 /* The generic case of the strlen expander is long. Avoid expanding
24344 it unless TARGET_INLINE_ALL_STRINGOPS. */
24346 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24347 && !TARGET_INLINE_ALL_STRINGOPS
24348 && !optimize_insn_for_size_p ()
24349 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24350 return false;
24352 addr = force_reg (Pmode, XEXP (src, 0));
24353 scratch1 = gen_reg_rtx (Pmode);
24355 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24356 && !optimize_insn_for_size_p ())
24358 /* Well, it seems that some optimizer does not combine a call like
24359 foo(strlen(bar), strlen(bar));
24360 when the move and the subtraction are done here. It does calculate
24361 the length just once when these instructions are done inside
24362 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
24363 often used and I use one fewer register for the lifetime of
24364 output_strlen_unroll(), this is better. */
24366 emit_move_insn (out, addr);
24368 ix86_expand_strlensi_unroll_1 (out, src, align);
24370 /* strlensi_unroll_1 returns the address of the zero at the end of
24371 the string, like memchr(), so compute the length by subtracting
24372 the start address. */
24373 emit_insn (ix86_gen_sub3 (out, out, addr));
24375 else
24377 rtx unspec;
24379 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24380 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24381 return false;
24383 scratch2 = gen_reg_rtx (Pmode);
24384 scratch3 = gen_reg_rtx (Pmode);
24385 scratch4 = force_reg (Pmode, constm1_rtx);
24387 emit_move_insn (scratch3, addr);
24388 eoschar = force_reg (QImode, eoschar);
24390 src = replace_equiv_address_nv (src, scratch3);
24392 /* If .md starts supporting :P, this can be done in .md. */
24393 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24394 scratch4), UNSPEC_SCAS);
24395 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24396 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24397 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24399 return true;
24402 /* For a given symbol (function), construct code to compute the address of its
24403 PLT entry in the large x86-64 PIC model. */
24404 static rtx
24405 construct_plt_address (rtx symbol)
24407 rtx tmp, unspec;
24409 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24410 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24411 gcc_assert (Pmode == DImode);
24413 tmp = gen_reg_rtx (Pmode);
24414 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24416 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24417 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24418 return tmp;
24422 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24423 rtx callarg2,
24424 rtx pop, bool sibcall)
24426 unsigned int const cregs_size
24427 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24428 rtx vec[3 + cregs_size];
24429 rtx use = NULL, call;
24430 unsigned int vec_len = 0;
24432 if (pop == const0_rtx)
24433 pop = NULL;
24434 gcc_assert (!TARGET_64BIT || !pop);
24436 if (TARGET_MACHO && !TARGET_64BIT)
24438 #if TARGET_MACHO
24439 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24440 fnaddr = machopic_indirect_call_target (fnaddr);
24441 #endif
24443 else
24445 /* Static functions and indirect calls don't need the pic register. */
24446 if (flag_pic
24447 && (!TARGET_64BIT
24448 || (ix86_cmodel == CM_LARGE_PIC
24449 && DEFAULT_ABI != MS_ABI))
24450 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24451 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24452 use_reg (&use, pic_offset_table_rtx);
24455 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24457 rtx al = gen_rtx_REG (QImode, AX_REG);
24458 emit_move_insn (al, callarg2);
24459 use_reg (&use, al);
24462 if (ix86_cmodel == CM_LARGE_PIC
24463 && !TARGET_PECOFF
24464 && MEM_P (fnaddr)
24465 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24466 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24467 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24468 else if (sibcall
24469 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24470 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24472 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24473 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24476 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24477 if (retval)
24478 call = gen_rtx_SET (VOIDmode, retval, call);
24479 vec[vec_len++] = call;
24481 if (pop)
24483 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24484 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24485 vec[vec_len++] = pop;
24488 if (TARGET_64BIT_MS_ABI
24489 && (!callarg2 || INTVAL (callarg2) != -2))
24491 unsigned i;
24493 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24494 UNSPEC_MS_TO_SYSV_CALL);
24496 for (i = 0; i < cregs_size; i++)
24498 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24499 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24501 vec[vec_len++]
24502 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24506 if (vec_len > 1)
24507 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24508 call = emit_call_insn (call);
24509 if (use)
24510 CALL_INSN_FUNCTION_USAGE (call) = use;
24512 return call;
24515 /* Output the assembly for a call instruction. */
24517 const char *
24518 ix86_output_call_insn (rtx insn, rtx call_op)
24520 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24521 bool seh_nop_p = false;
24522 const char *xasm;
24524 if (SIBLING_CALL_P (insn))
24526 if (direct_p)
24527 xasm = "%!jmp\t%P0";
24528 /* SEH epilogue detection requires the indirect branch case
24529 to include REX.W. */
24530 else if (TARGET_SEH)
24531 xasm = "%!rex.W jmp %A0";
24532 else
24533 xasm = "%!jmp\t%A0";
24535 output_asm_insn (xasm, &call_op);
24536 return "";
24539 /* SEH unwinding can require an extra nop to be emitted in several
24540 circumstances. Determine if we have one of those. */
24541 if (TARGET_SEH)
24543 rtx i;
24545 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24547 /* If we get to another real insn, we don't need the nop. */
24548 if (INSN_P (i))
24549 break;
24551 /* If we get to the epilogue note, prevent a catch region from
24552 being adjacent to the standard epilogue sequence. With non-call
24553 exceptions, we'll have done this during epilogue emission. */
24554 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24555 && !flag_non_call_exceptions
24556 && !can_throw_internal (insn))
24558 seh_nop_p = true;
24559 break;
24563 /* If we didn't find a real insn following the call, prevent the
24564 unwinder from looking into the next function. */
24565 if (i == NULL)
24566 seh_nop_p = true;
24569 if (direct_p)
24570 xasm = "%!call\t%P0";
24571 else
24572 xasm = "%!call\t%A0";
24574 output_asm_insn (xasm, &call_op);
24576 if (seh_nop_p)
24577 return "nop";
24579 return "";
24582 /* Clear stack slot assignments remembered from previous functions.
24583 This is called from INIT_EXPANDERS once before RTL is emitted for each
24584 function. */
24586 static struct machine_function *
24587 ix86_init_machine_status (void)
24589 struct machine_function *f;
24591 f = ggc_alloc_cleared_machine_function ();
24592 f->use_fast_prologue_epilogue_nregs = -1;
24593 f->call_abi = ix86_abi;
24595 return f;
24598 /* Return a MEM corresponding to a stack slot with mode MODE.
24599 Allocate a new slot if necessary.
24601 The RTL for a function can have several slots available: N is
24602 which slot to use. */
24604 rtx
24605 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24607 struct stack_local_entry *s;
24609 gcc_assert (n < MAX_386_STACK_LOCALS);
24611 for (s = ix86_stack_locals; s; s = s->next)
24612 if (s->mode == mode && s->n == n)
24613 return validize_mem (copy_rtx (s->rtl));
24615 s = ggc_alloc_stack_local_entry ();
24616 s->n = n;
24617 s->mode = mode;
24618 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24620 s->next = ix86_stack_locals;
24621 ix86_stack_locals = s;
24622 return validize_mem (s->rtl);
24625 static void
24626 ix86_instantiate_decls (void)
24628 struct stack_local_entry *s;
24630 for (s = ix86_stack_locals; s; s = s->next)
24631 if (s->rtl != NULL_RTX)
24632 instantiate_decl_rtl (s->rtl);
24635 /* Check whether x86 address PARTS is a pc-relative address. */
24637 static bool
24638 rip_relative_addr_p (struct ix86_address *parts)
24640 rtx base, index, disp;
24642 base = parts->base;
24643 index = parts->index;
24644 disp = parts->disp;
24646 if (disp && !base && !index)
24648 if (TARGET_64BIT)
24650 rtx symbol = disp;
24652 if (GET_CODE (disp) == CONST)
24653 symbol = XEXP (disp, 0);
24654 if (GET_CODE (symbol) == PLUS
24655 && CONST_INT_P (XEXP (symbol, 1)))
24656 symbol = XEXP (symbol, 0);
24658 if (GET_CODE (symbol) == LABEL_REF
24659 || (GET_CODE (symbol) == SYMBOL_REF
24660 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24661 || (GET_CODE (symbol) == UNSPEC
24662 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24663 || XINT (symbol, 1) == UNSPEC_PCREL
24664 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24665 return true;
24668 return false;
24671 /* Calculate the length of the memory address in the instruction encoding.
24672 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24673 or other prefixes. We never generate addr32 prefix for LEA insn. */
24675 int
24676 memory_address_length (rtx addr, bool lea)
24678 struct ix86_address parts;
24679 rtx base, index, disp;
24680 int len;
24681 int ok;
24683 if (GET_CODE (addr) == PRE_DEC
24684 || GET_CODE (addr) == POST_INC
24685 || GET_CODE (addr) == PRE_MODIFY
24686 || GET_CODE (addr) == POST_MODIFY)
24687 return 0;
24689 ok = ix86_decompose_address (addr, &parts);
24690 gcc_assert (ok);
24692 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24694 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
24695 if (TARGET_64BIT && !lea
24696 && (SImode_address_operand (addr, VOIDmode)
24697 || (parts.base && GET_MODE (parts.base) == SImode)
24698 || (parts.index && GET_MODE (parts.index) == SImode)))
24699 len++;
24701 base = parts.base;
24702 index = parts.index;
24703 disp = parts.disp;
24705 if (base && GET_CODE (base) == SUBREG)
24706 base = SUBREG_REG (base);
24707 if (index && GET_CODE (index) == SUBREG)
24708 index = SUBREG_REG (index);
24710 gcc_assert (base == NULL_RTX || REG_P (base));
24711 gcc_assert (index == NULL_RTX || REG_P (index));
24713 /* Rule of thumb:
24714 - esp as the base always wants an index,
24715 - ebp as the base always wants a displacement,
24716 - r12 as the base always wants an index,
24717 - r13 as the base always wants a displacement. */
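/* A few worked examples (illustrative only, lengths as counted by this
   function): (%eax) encodes in the modrm byte alone, 0 extra bytes;
   (%esp) needs a SIB byte, 1 extra byte; 8(%ebp) needs a disp8, 1 extra
   byte; 1024(%eax) needs a disp32, 4 extra bytes; a bare symbolic
   displacement in 32-bit code is a disp32, 4 extra bytes.  A non-default
   segment adds one prefix byte on top of any of these.  */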
24719 /* Register Indirect. */
24720 if (base && !index && !disp)
24722 /* esp (for its index) and ebp (for its displacement) need
24723 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24724 code. */
24725 if (base == arg_pointer_rtx
24726 || base == frame_pointer_rtx
24727 || REGNO (base) == SP_REG
24728 || REGNO (base) == BP_REG
24729 || REGNO (base) == R12_REG
24730 || REGNO (base) == R13_REG)
24731 len++;
24734 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24735 is not disp32, but disp32(%rip), so for disp32
24736 SIB byte is needed, unless print_operand_address
24737 optimizes it into disp32(%rip) or (%rip) is implied
24738 by UNSPEC. */
24739 else if (disp && !base && !index)
24741 len += 4;
24742 if (rip_relative_addr_p (&parts))
24743 len++;
24745 else
24747 /* Find the length of the displacement constant. */
24748 if (disp)
24750 if (base && satisfies_constraint_K (disp))
24751 len += 1;
24752 else
24753 len += 4;
24755 /* ebp always wants a displacement. Similarly r13. */
24756 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24757 len++;
24759 /* An index requires the two-byte modrm form.... */
24760 if (index
24761 /* ...like esp (or r12), which always wants an index. */
24762 || base == arg_pointer_rtx
24763 || base == frame_pointer_rtx
24764 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24765 len++;
24768 return len;
24771 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24772 is set, expect that the insn has an 8-bit immediate alternative. */
24773 int
24774 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24776 int len = 0;
24777 int i;
24778 extract_insn_cached (insn);
24779 for (i = recog_data.n_operands - 1; i >= 0; --i)
24780 if (CONSTANT_P (recog_data.operand[i]))
24782 enum attr_mode mode = get_attr_mode (insn);
24784 gcc_assert (!len);
24785 if (shortform && CONST_INT_P (recog_data.operand[i]))
24787 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24788 switch (mode)
24790 case MODE_QI:
24791 len = 1;
24792 continue;
24793 case MODE_HI:
24794 ival = trunc_int_for_mode (ival, HImode);
24795 break;
24796 case MODE_SI:
24797 ival = trunc_int_for_mode (ival, SImode);
24798 break;
24799 default:
24800 break;
24802 if (IN_RANGE (ival, -128, 127))
24804 len = 1;
24805 continue;
24808 switch (mode)
24810 case MODE_QI:
24811 len = 1;
24812 break;
24813 case MODE_HI:
24814 len = 2;
24815 break;
24816 case MODE_SI:
24817 len = 4;
24818 break;
24819 /* Immediates for DImode instructions are encoded
24820 as 32bit sign extended values. */
24821 case MODE_DI:
24822 len = 4;
24823 break;
24824 default:
24825 fatal_insn ("unknown insn mode", insn);
24828 return len;
24831 /* Compute default value for "length_address" attribute. */
24832 int
24833 ix86_attr_length_address_default (rtx insn)
24835 int i;
24837 if (get_attr_type (insn) == TYPE_LEA)
24839 rtx set = PATTERN (insn), addr;
24841 if (GET_CODE (set) == PARALLEL)
24842 set = XVECEXP (set, 0, 0);
24844 gcc_assert (GET_CODE (set) == SET);
24846 addr = SET_SRC (set);
24848 return memory_address_length (addr, true);
24851 extract_insn_cached (insn);
24852 for (i = recog_data.n_operands - 1; i >= 0; --i)
24853 if (MEM_P (recog_data.operand[i]))
24855 constrain_operands_cached (reload_completed);
24856 if (which_alternative != -1)
24858 const char *constraints = recog_data.constraints[i];
24859 int alt = which_alternative;
24861 while (*constraints == '=' || *constraints == '+')
24862 constraints++;
24863 while (alt-- > 0)
24864 while (*constraints++ != ',')
24866 /* Skip ignored operands. */
24867 if (*constraints == 'X')
24868 continue;
24870 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24872 return 0;
24875 /* Compute default value for "length_vex" attribute. It includes
24876 2 or 3 byte VEX prefix and 1 opcode byte. */
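/* For illustration: vaddps %xmm1, %xmm2, %xmm3 can use the two-byte C5 VEX
   prefix, so length_vex is 2 + 1 = 3, while an encoding that needs VEX.W, an
   opcode map other than 0f, or the X/B bits (e.g. a memory operand using
   %r8-%r15) must use the three-byte C4 form, so length_vex is 3 + 1 = 4.  */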
24878 int
24879 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24881 int i;
24883 /* Only the 0f opcode map can use the 2-byte VEX prefix; setting the VEX W
24884 bit requires the 3-byte VEX prefix. */
24885 if (!has_0f_opcode || has_vex_w)
24886 return 3 + 1;
24888 /* We can always use 2 byte VEX prefix in 32bit. */
24889 if (!TARGET_64BIT)
24890 return 2 + 1;
24892 extract_insn_cached (insn);
24894 for (i = recog_data.n_operands - 1; i >= 0; --i)
24895 if (REG_P (recog_data.operand[i]))
24897 /* REX.W bit uses 3 byte VEX prefix. */
24898 if (GET_MODE (recog_data.operand[i]) == DImode
24899 && GENERAL_REG_P (recog_data.operand[i]))
24900 return 3 + 1;
24902 else
24904 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24905 if (MEM_P (recog_data.operand[i])
24906 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24907 return 3 + 1;
24910 return 2 + 1;
24913 /* Return the maximum number of instructions a cpu can issue. */
24915 static int
24916 ix86_issue_rate (void)
24918 switch (ix86_tune)
24920 case PROCESSOR_PENTIUM:
24921 case PROCESSOR_ATOM:
24922 case PROCESSOR_SLM:
24923 case PROCESSOR_K6:
24924 case PROCESSOR_BTVER2:
24925 case PROCESSOR_PENTIUM4:
24926 case PROCESSOR_NOCONA:
24927 return 2;
24929 case PROCESSOR_PENTIUMPRO:
24930 case PROCESSOR_ATHLON:
24931 case PROCESSOR_K8:
24932 case PROCESSOR_AMDFAM10:
24933 case PROCESSOR_GENERIC:
24934 case PROCESSOR_BTVER1:
24935 return 3;
24937 case PROCESSOR_BDVER1:
24938 case PROCESSOR_BDVER2:
24939 case PROCESSOR_BDVER3:
24940 case PROCESSOR_BDVER4:
24941 case PROCESSOR_CORE2:
24942 case PROCESSOR_COREI7:
24943 case PROCESSOR_COREI7_AVX:
24944 case PROCESSOR_HASWELL:
24945 return 4;
24947 default:
24948 return 1;
24952 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24953 by DEP_INSN and nothing else set by DEP_INSN. */
24955 static bool
24956 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24958 rtx set, set2;
24960 /* Simplify the test for uninteresting insns. */
24961 if (insn_type != TYPE_SETCC
24962 && insn_type != TYPE_ICMOV
24963 && insn_type != TYPE_FCMOV
24964 && insn_type != TYPE_IBR)
24965 return false;
24967 if ((set = single_set (dep_insn)) != 0)
24969 set = SET_DEST (set);
24970 set2 = NULL_RTX;
24972 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24973 && XVECLEN (PATTERN (dep_insn), 0) == 2
24974 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24975 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24977 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24978 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24980 else
24981 return false;
24983 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24984 return false;
24986 /* This test is true if the dependent insn reads the flags but
24987 not any other potentially set register. */
24988 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24989 return false;
24991 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24992 return false;
24994 return true;
24997 /* Return true iff USE_INSN has a memory address with operands set by
24998 SET_INSN. */
25000 bool
25001 ix86_agi_dependent (rtx set_insn, rtx use_insn)
25003 int i;
25004 extract_insn_cached (use_insn);
25005 for (i = recog_data.n_operands - 1; i >= 0; --i)
25006 if (MEM_P (recog_data.operand[i]))
25008 rtx addr = XEXP (recog_data.operand[i], 0);
25009 return modified_in_p (addr, set_insn) != 0;
25011 return false;
25014 /* Helper function for exact_store_load_dependency.
25015 Return true if addr is found in insn. */
25016 static bool
25017 exact_dependency_1 (rtx addr, rtx insn)
25019 enum rtx_code code;
25020 const char *format_ptr;
25021 int i, j;
25023 code = GET_CODE (insn);
25024 switch (code)
25026 case MEM:
25027 if (rtx_equal_p (addr, insn))
25028 return true;
25029 break;
25030 case REG:
25031 CASE_CONST_ANY:
25032 case SYMBOL_REF:
25033 case CODE_LABEL:
25034 case PC:
25035 case CC0:
25036 case EXPR_LIST:
25037 return false;
25038 default:
25039 break;
25042 format_ptr = GET_RTX_FORMAT (code);
25043 for (i = 0; i < GET_RTX_LENGTH (code); i++)
25045 switch (*format_ptr++)
25047 case 'e':
25048 if (exact_dependency_1 (addr, XEXP (insn, i)))
25049 return true;
25050 break;
25051 case 'E':
25052 for (j = 0; j < XVECLEN (insn, i); j++)
25053 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
25054 return true;
25055 break;
25058 return false;
25061 /* Return true if there exists exact dependency for store & load, i.e.
25062 the same memory address is used in them. */
25063 static bool
25064 exact_store_load_dependency (rtx store, rtx load)
25066 rtx set1, set2;
25068 set1 = single_set (store);
25069 if (!set1)
25070 return false;
25071 if (!MEM_P (SET_DEST (set1)))
25072 return false;
25073 set2 = single_set (load);
25074 if (!set2)
25075 return false;
25076 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
25077 return true;
25078 return false;
25081 static int
25082 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
25084 enum attr_type insn_type, dep_insn_type;
25085 enum attr_memory memory;
25086 rtx set, set2;
25087 int dep_insn_code_number;
25089 /* Anti and output dependencies have zero cost on all CPUs. */
25090 if (REG_NOTE_KIND (link) != 0)
25091 return 0;
25093 dep_insn_code_number = recog_memoized (dep_insn);
25095 /* If we can't recognize the insns, we can't really do anything. */
25096 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
25097 return cost;
25099 insn_type = get_attr_type (insn);
25100 dep_insn_type = get_attr_type (dep_insn);
25102 switch (ix86_tune)
25104 case PROCESSOR_PENTIUM:
25105 /* Address Generation Interlock adds a cycle of latency. */
25106 if (insn_type == TYPE_LEA)
25108 rtx addr = PATTERN (insn);
25110 if (GET_CODE (addr) == PARALLEL)
25111 addr = XVECEXP (addr, 0, 0);
25113 gcc_assert (GET_CODE (addr) == SET);
25115 addr = SET_SRC (addr);
25116 if (modified_in_p (addr, dep_insn))
25117 cost += 1;
25119 else if (ix86_agi_dependent (dep_insn, insn))
25120 cost += 1;
25122 /* ??? Compares pair with jump/setcc. */
25123 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25124 cost = 0;
25126 /* Floating point stores require value to be ready one cycle earlier. */
25127 if (insn_type == TYPE_FMOV
25128 && get_attr_memory (insn) == MEMORY_STORE
25129 && !ix86_agi_dependent (dep_insn, insn))
25130 cost += 1;
25131 break;
25133 case PROCESSOR_PENTIUMPRO:
25134 memory = get_attr_memory (insn);
25136 /* INT->FP conversion is expensive. */
25137 if (get_attr_fp_int_src (dep_insn))
25138 cost += 5;
25140 /* There is one cycle extra latency between an FP op and a store. */
25141 if (insn_type == TYPE_FMOV
25142 && (set = single_set (dep_insn)) != NULL_RTX
25143 && (set2 = single_set (insn)) != NULL_RTX
25144 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25145 && MEM_P (SET_DEST (set2)))
25146 cost += 1;
25148 /* Show ability of reorder buffer to hide latency of load by executing
25149 in parallel with previous instruction in case
25150 previous instruction is not needed to compute the address. */
25151 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25152 && !ix86_agi_dependent (dep_insn, insn))
25154 /* Claim moves to take one cycle, as the core can issue one load
25155 at a time and the next load can start a cycle later. */
25156 if (dep_insn_type == TYPE_IMOV
25157 || dep_insn_type == TYPE_FMOV)
25158 cost = 1;
25159 else if (cost > 1)
25160 cost--;
25162 break;
25164 case PROCESSOR_K6:
25165 memory = get_attr_memory (insn);
25167 /* The esp dependency is resolved before the instruction is really
25168 finished. */
25169 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25170 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25171 return 1;
25173 /* INT->FP conversion is expensive. */
25174 if (get_attr_fp_int_src (dep_insn))
25175 cost += 5;
25177 /* Show ability of reorder buffer to hide latency of load by executing
25178 in parallel with previous instruction in case
25179 previous instruction is not needed to compute the address. */
25180 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25181 && !ix86_agi_dependent (dep_insn, insn))
25183 /* Claim moves to take one cycle, as the core can issue one load
25184 at a time and the next load can start a cycle later. */
25185 if (dep_insn_type == TYPE_IMOV
25186 || dep_insn_type == TYPE_FMOV)
25187 cost = 1;
25188 else if (cost > 2)
25189 cost -= 2;
25190 else
25191 cost = 1;
25193 break;
25195 case PROCESSOR_ATHLON:
25196 case PROCESSOR_K8:
25197 case PROCESSOR_AMDFAM10:
25198 case PROCESSOR_BDVER1:
25199 case PROCESSOR_BDVER2:
25200 case PROCESSOR_BDVER3:
25201 case PROCESSOR_BDVER4:
25202 case PROCESSOR_BTVER1:
25203 case PROCESSOR_BTVER2:
25204 case PROCESSOR_GENERIC:
25205 memory = get_attr_memory (insn);
25207 /* The stack engine allows push and pop instructions to execute in parallel. */
25208 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25209 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25210 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25211 return 0;
25213 /* Show ability of reorder buffer to hide latency of load by executing
25214 in parallel with previous instruction in case
25215 previous instruction is not needed to compute the address. */
25216 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25217 && !ix86_agi_dependent (dep_insn, insn))
25219 enum attr_unit unit = get_attr_unit (insn);
25220 int loadcost = 3;
25222 /* Because the integer and floating point unit pipelines differ in the
25223 length of their preparation stages, memory operands are cheaper for
25224 floating point.
25226 ??? For Athlon the difference is most probably 2. */
25227 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25228 loadcost = 3;
25229 else
25230 loadcost = TARGET_ATHLON ? 2 : 0;
25232 if (cost >= loadcost)
25233 cost -= loadcost;
25234 else
25235 cost = 0;
25237 break;
25239 case PROCESSOR_CORE2:
25240 case PROCESSOR_COREI7:
25241 case PROCESSOR_COREI7_AVX:
25242 case PROCESSOR_HASWELL:
25243 memory = get_attr_memory (insn);
25245 /* The stack engine allows push and pop instructions to execute in parallel. */
25246 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25247 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25248 return 0;
25250 /* Show ability of reorder buffer to hide latency of load by executing
25251 in parallel with previous instruction in case
25252 previous instruction is not needed to compute the address. */
25253 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25254 && !ix86_agi_dependent (dep_insn, insn))
25256 if (cost >= 4)
25257 cost -= 4;
25258 else
25259 cost = 0;
25261 break;
25263 case PROCESSOR_SLM:
25264 if (!reload_completed)
25265 return cost;
25267 /* Increase cost of integer loads. */
25268 memory = get_attr_memory (dep_insn);
25269 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25271 enum attr_unit unit = get_attr_unit (dep_insn);
25272 if (unit == UNIT_INTEGER && cost == 1)
25274 if (memory == MEMORY_LOAD)
25275 cost = 3;
25276 else
25278 /* Increase cost of ld/st for short int types only
25279 because of store forwarding issue. */
25280 rtx set = single_set (dep_insn);
25281 if (set && (GET_MODE (SET_DEST (set)) == QImode
25282 || GET_MODE (SET_DEST (set)) == HImode))
25284 /* Increase cost of store/load insn if exact
25285 dependence exists and it is load insn. */
25286 enum attr_memory insn_memory = get_attr_memory (insn);
25287 if (insn_memory == MEMORY_LOAD
25288 && exact_store_load_dependency (dep_insn, insn))
25289 cost = 3;
25295 default:
25296 break;
25299 return cost;
25302 /* How many alternative schedules to try. This should be as wide as the
25303 scheduling freedom in the DFA, but no wider. Making this value too
25304 large results in extra work for the scheduler. */
25306 static int
25307 ia32_multipass_dfa_lookahead (void)
25309 switch (ix86_tune)
25311 case PROCESSOR_PENTIUM:
25312 return 2;
25314 case PROCESSOR_PENTIUMPRO:
25315 case PROCESSOR_K6:
25316 return 1;
25318 case PROCESSOR_BDVER1:
25319 case PROCESSOR_BDVER2:
25320 case PROCESSOR_BDVER3:
25321 case PROCESSOR_BDVER4:
25322 /* We use a lookahead value of 4 for BD in both the pre- and post-reload
25323 schedulers. The plan is to include a value of 8 for -O3. */
25324 return 4;
25326 case PROCESSOR_CORE2:
25327 case PROCESSOR_COREI7:
25328 case PROCESSOR_COREI7_AVX:
25329 case PROCESSOR_HASWELL:
25330 case PROCESSOR_ATOM:
25331 case PROCESSOR_SLM:
25332 /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
25333 number of instructions that can be executed in one cycle, i.e.,
25334 issue_rate. I wonder why tuning for many CPUs does not do this. */
25335 if (reload_completed)
25336 return ix86_issue_rate ();
25337 /* Don't use lookahead for pre-reload schedule to save compile time. */
25338 return 0;
25340 default:
25341 return 0;
25345 /* Return true if target platform supports macro-fusion. */
25347 static bool
25348 ix86_macro_fusion_p ()
25350 return TARGET_FUSE_CMP_AND_BRANCH;
25353 /* Check whether the current microarchitecture supports macro fusion
25354 for insn pair "CONDGEN + CONDJMP". Refer to
25355 "Intel Architectures Optimization Reference Manual". */
25357 static bool
25358 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25360 rtx src, dest;
25361 rtx single_set = single_set (condgen);
25362 enum rtx_code ccode;
25363 rtx compare_set = NULL_RTX, test_if, cond;
25364 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25366 if (get_attr_type (condgen) != TYPE_TEST
25367 && get_attr_type (condgen) != TYPE_ICMP
25368 && get_attr_type (condgen) != TYPE_INCDEC
25369 && get_attr_type (condgen) != TYPE_ALU)
25370 return false;
25372 if (single_set == NULL_RTX
25373 && !TARGET_FUSE_ALU_AND_BRANCH)
25374 return false;
25376 if (single_set != NULL_RTX)
25377 compare_set = single_set;
25378 else
25380 int i;
25381 rtx pat = PATTERN (condgen);
25382 for (i = 0; i < XVECLEN (pat, 0); i++)
25383 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25385 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25386 if (GET_CODE (set_src) == COMPARE)
25387 compare_set = XVECEXP (pat, 0, i);
25388 else
25389 alu_set = XVECEXP (pat, 0, i);
25392 if (compare_set == NULL_RTX)
25393 return false;
25394 src = SET_SRC (compare_set);
25395 if (GET_CODE (src) != COMPARE)
25396 return false;
25398 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25399 supported. */
25400 if ((MEM_P (XEXP (src, 0))
25401 && CONST_INT_P (XEXP (src, 1)))
25402 || (MEM_P (XEXP (src, 1))
25403 && CONST_INT_P (XEXP (src, 0))))
25404 return false;
25406 /* No fusion for RIP-relative address. */
25407 if (MEM_P (XEXP (src, 0)))
25408 addr = XEXP (XEXP (src, 0), 0);
25409 else if (MEM_P (XEXP (src, 1)))
25410 addr = XEXP (XEXP (src, 1), 0);
25412 if (addr) {
25413 ix86_address parts;
25414 int ok = ix86_decompose_address (addr, &parts);
25415 gcc_assert (ok);
25417 if (rip_relative_addr_p (&parts))
25418 return false;
25421 test_if = SET_SRC (pc_set (condjmp));
25422 cond = XEXP (test_if, 0);
25423 ccode = GET_CODE (cond);
25424 /* Check whether the conditional jump uses the Sign or Overflow flags. */
25425 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25426 && (ccode == GE
25427 || ccode == GT
25428 || ccode == LE
25429 || ccode == LT))
25430 return false;
25432 /* Return true for TYPE_TEST and TYPE_ICMP. */
25433 if (get_attr_type (condgen) == TYPE_TEST
25434 || get_attr_type (condgen) == TYPE_ICMP)
25435 return true;
25437 /* What remains is the case of macro-fusion for alu + jmp. */
25438 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25439 return false;
25441 /* No fusion for alu op with memory destination operand. */
25442 dest = SET_DEST (alu_set);
25443 if (MEM_P (dest))
25444 return false;
25446 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25447 supported. */
25448 if (get_attr_type (condgen) == TYPE_INCDEC
25449 && (ccode == GEU
25450 || ccode == GTU
25451 || ccode == LEU
25452 || ccode == LTU))
25453 return false;
25455 return true;
25458 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
25459 execution. It is applied if
25460 (1) an IMUL instruction is at the top of the ready list;
25461 (2) there is exactly one producer of an independent IMUL instruction in
25462 the ready list.
25463 Return index of IMUL producer if it was found and -1 otherwise. */
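/* In other words (an illustrative restatement): if an IMUL is on top of the
   ready list and somewhere below it sits the lone producer of an operand of
   another, not yet ready IMUL, that producer is reported so the caller can
   move it to the top and keep Atom's pipelined IMUL unit busy on consecutive
   cycles.  */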
25464 static int
25465 do_reorder_for_imul (rtx *ready, int n_ready)
25467 rtx insn, set, insn1, insn2;
25468 sd_iterator_def sd_it;
25469 dep_t dep;
25470 int index = -1;
25471 int i;
25473 if (ix86_tune != PROCESSOR_ATOM)
25474 return index;
25476 /* Check that IMUL instruction is on the top of ready list. */
25477 insn = ready[n_ready - 1];
25478 set = single_set (insn);
25479 if (!set)
25480 return index;
25481 if (!(GET_CODE (SET_SRC (set)) == MULT
25482 && GET_MODE (SET_SRC (set)) == SImode))
25483 return index;
25485 /* Search for producer of independent IMUL instruction. */
25486 for (i = n_ready - 2; i >= 0; i--)
25488 insn = ready[i];
25489 if (!NONDEBUG_INSN_P (insn))
25490 continue;
25491 /* Skip IMUL instruction. */
25492 insn2 = PATTERN (insn);
25493 if (GET_CODE (insn2) == PARALLEL)
25494 insn2 = XVECEXP (insn2, 0, 0);
25495 if (GET_CODE (insn2) == SET
25496 && GET_CODE (SET_SRC (insn2)) == MULT
25497 && GET_MODE (SET_SRC (insn2)) == SImode)
25498 continue;
25500 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25502 rtx con;
25503 con = DEP_CON (dep);
25504 if (!NONDEBUG_INSN_P (con))
25505 continue;
25506 insn1 = PATTERN (con);
25507 if (GET_CODE (insn1) == PARALLEL)
25508 insn1 = XVECEXP (insn1, 0, 0);
25510 if (GET_CODE (insn1) == SET
25511 && GET_CODE (SET_SRC (insn1)) == MULT
25512 && GET_MODE (SET_SRC (insn1)) == SImode)
25514 sd_iterator_def sd_it1;
25515 dep_t dep1;
25516 /* Check that INSN is the only producer of that IMUL. */
25517 index = i;
25518 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25520 rtx pro;
25521 pro = DEP_PRO (dep1);
25522 if (!NONDEBUG_INSN_P (pro))
25523 continue;
25524 if (pro != insn)
25525 index = -1;
25527 if (index >= 0)
25528 break;
25531 if (index >= 0)
25532 break;
25534 return index;
25537 /* Try to find the best candidate for the top of the ready list when two
25538 insns have the same priority - the better candidate is the one whose
25539 producers were scheduled earlier. Applied for Silvermont only.
25540 Return true if the top 2 insns must be interchanged. */
25541 static bool
25542 swap_top_of_ready_list (rtx *ready, int n_ready)
25544 rtx top = ready[n_ready - 1];
25545 rtx next = ready[n_ready - 2];
25546 rtx set;
25547 sd_iterator_def sd_it;
25548 dep_t dep;
25549 int clock1 = -1;
25550 int clock2 = -1;
25551 #define INSN_TICK(INSN) (HID (INSN)->tick)
25553 if (ix86_tune != PROCESSOR_SLM)
25554 return false;
25556 if (!NONDEBUG_INSN_P (top))
25557 return false;
25558 if (!NONJUMP_INSN_P (top))
25559 return false;
25560 if (!NONDEBUG_INSN_P (next))
25561 return false;
25562 if (!NONJUMP_INSN_P (next))
25563 return false;
25564 set = single_set (top);
25565 if (!set)
25566 return false;
25567 set = single_set (next);
25568 if (!set)
25569 return false;
25571 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25573 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25574 return false;
25575 /* Determine the winner more precisely. */
25576 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25578 rtx pro;
25579 pro = DEP_PRO (dep);
25580 if (!NONDEBUG_INSN_P (pro))
25581 continue;
25582 if (INSN_TICK (pro) > clock1)
25583 clock1 = INSN_TICK (pro);
25585 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25587 rtx pro;
25588 pro = DEP_PRO (dep);
25589 if (!NONDEBUG_INSN_P (pro))
25590 continue;
25591 if (INSN_TICK (pro) > clock2)
25592 clock2 = INSN_TICK (pro);
25595 if (clock1 == clock2)
25597 /* Determine winner - load must win. */
25598 enum attr_memory memory1, memory2;
25599 memory1 = get_attr_memory (top);
25600 memory2 = get_attr_memory (next);
25601 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25602 return true;
25604 return (bool) (clock2 < clock1);
25606 return false;
25607 #undef INSN_TICK
25610 /* Perform possible reordering of the ready list for Atom/Silvermont only.
25611 Return issue rate. */
25612 static int
25613 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25614 int clock_var)
25616 int issue_rate = -1;
25617 int n_ready = *pn_ready;
25618 int i;
25619 rtx insn;
25620 int index = -1;
25622 /* Set up issue rate. */
25623 issue_rate = ix86_issue_rate ();
25625 /* Do reordering for Atom/SLM only. */
25626 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25627 return issue_rate;
25629 /* Nothing to do if ready list contains only 1 instruction. */
25630 if (n_ready <= 1)
25631 return issue_rate;
25633 /* Do reordering for the post-reload scheduler only. */
25634 if (!reload_completed)
25635 return issue_rate;
25637 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25639 if (sched_verbose > 1)
25640 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25641 INSN_UID (ready[index]));
25643 /* Put IMUL producer (ready[index]) at the top of ready list. */
25644 insn = ready[index];
25645 for (i = index; i < n_ready - 1; i++)
25646 ready[i] = ready[i + 1];
25647 ready[n_ready - 1] = insn;
25648 return issue_rate;
25650 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25652 if (sched_verbose > 1)
25653 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25654 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25655 /* Swap 2 top elements of ready list. */
25656 insn = ready[n_ready - 1];
25657 ready[n_ready - 1] = ready[n_ready - 2];
25658 ready[n_ready - 2] = insn;
25660 return issue_rate;
25663 static bool
25664 ix86_class_likely_spilled_p (reg_class_t);
25666 /* Return true if the lhs of INSN is a HW function argument register; set
25667 *is_spilled to true if it is a likely-spilled HW register. */
25668 static bool
25669 insn_is_function_arg (rtx insn, bool* is_spilled)
25671 rtx dst;
25673 if (!NONDEBUG_INSN_P (insn))
25674 return false;
25675 /* Call instructions are not movable; ignore them. */
25676 if (CALL_P (insn))
25677 return false;
25678 insn = PATTERN (insn);
25679 if (GET_CODE (insn) == PARALLEL)
25680 insn = XVECEXP (insn, 0, 0);
25681 if (GET_CODE (insn) != SET)
25682 return false;
25683 dst = SET_DEST (insn);
25684 if (REG_P (dst) && HARD_REGISTER_P (dst)
25685 && ix86_function_arg_regno_p (REGNO (dst)))
25687 /* Is it likely spilled HW register? */
25688 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25689 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25690 *is_spilled = true;
25691 return true;
25693 return false;
25696 /* Add output dependencies for a chain of adjacent function-argument moves,
25697 but only if there is a move to a likely-spilled HW register. Return the
25698 first argument if at least one dependence was added, or NULL otherwise. */
25699 static rtx
25700 add_parameter_dependencies (rtx call, rtx head)
25702 rtx insn;
25703 rtx last = call;
25704 rtx first_arg = NULL;
25705 bool is_spilled = false;
25707 head = PREV_INSN (head);
25709 /* Find the argument-passing instruction nearest to the call. */
25710 while (true)
25712 last = PREV_INSN (last);
25713 if (last == head)
25714 return NULL;
25715 if (!NONDEBUG_INSN_P (last))
25716 continue;
25717 if (insn_is_function_arg (last, &is_spilled))
25718 break;
25719 return NULL;
25722 first_arg = last;
25723 while (true)
25725 insn = PREV_INSN (last);
25726 if (!INSN_P (insn))
25727 break;
25728 if (insn == head)
25729 break;
25730 if (!NONDEBUG_INSN_P (insn))
25732 last = insn;
25733 continue;
25735 if (insn_is_function_arg (insn, &is_spilled))
25737 /* Add an output dependence between two function arguments if the chain
25738 of argument moves contains likely-spilled HW registers. */
25739 if (is_spilled)
25740 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25741 first_arg = last = insn;
25743 else
25744 break;
25746 if (!is_spilled)
25747 return NULL;
25748 return first_arg;
25751 /* Add output or anti dependency from insn to first_arg to restrict its code
25752 motion. */
25753 static void
25754 avoid_func_arg_motion (rtx first_arg, rtx insn)
25756 rtx set;
25757 rtx tmp;
25759 set = single_set (insn);
25760 if (!set)
25761 return;
25762 tmp = SET_DEST (set);
25763 if (REG_P (tmp))
25765 /* Add output dependency to the first function argument. */
25766 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25767 return;
25769 /* Add anti dependency. */
25770 add_dependence (first_arg, insn, REG_DEP_ANTI);
25773 /* Avoid cross-block motion of a function argument by adding a dependency
25774 from the first non-jump instruction in bb. */
25775 static void
25776 add_dependee_for_func_arg (rtx arg, basic_block bb)
25778 rtx insn = BB_END (bb);
25780 while (insn)
25782 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25784 rtx set = single_set (insn);
25785 if (set)
25787 avoid_func_arg_motion (arg, insn);
25788 return;
25791 if (insn == BB_HEAD (bb))
25792 return;
25793 insn = PREV_INSN (insn);
25797 /* Hook for pre-reload schedule - avoid motion of function arguments
25798 passed in likely spilled HW registers. */
25799 static void
25800 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25802 rtx insn;
25803 rtx first_arg = NULL;
25804 if (reload_completed)
25805 return;
25806 while (head != tail && DEBUG_INSN_P (head))
25807 head = NEXT_INSN (head);
25808 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25809 if (INSN_P (insn) && CALL_P (insn))
25811 first_arg = add_parameter_dependencies (insn, head);
25812 if (first_arg)
25814 /* Add a dependee for the first argument to predecessor blocks, but only
25815 if the region contains more than one block. */
25816 basic_block bb = BLOCK_FOR_INSN (insn);
25817 int rgn = CONTAINING_RGN (bb->index);
25818 int nr_blks = RGN_NR_BLOCKS (rgn);
25819 /* Skip trivial regions and region head blocks that can have
25820 predecessors outside of region. */
25821 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25823 edge e;
25824 edge_iterator ei;
25825 /* Assume that region is SCC, i.e. all immediate predecessors
25826 of non-head block are in the same region. */
25827 FOR_EACH_EDGE (e, ei, bb->preds)
25829 /* Avoid creating loop-carried dependencies by using the topological
25830 ordering of the region. */
25831 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25832 add_dependee_for_func_arg (first_arg, e->src);
25835 insn = first_arg;
25836 if (insn == head)
25837 break;
25840 else if (first_arg)
25841 avoid_func_arg_motion (first_arg, insn);
25844 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25845 HW registers to maximum, to schedule them as soon as possible. These are
25846 moves from function argument registers at the top of the function entry
25847 and moves from function return value registers after call. */
25848 static int
25849 ix86_adjust_priority (rtx insn, int priority)
25851 rtx set;
25853 if (reload_completed)
25854 return priority;
25856 if (!NONDEBUG_INSN_P (insn))
25857 return priority;
25859 set = single_set (insn);
25860 if (set)
25862 rtx tmp = SET_SRC (set);
25863 if (REG_P (tmp)
25864 && HARD_REGISTER_P (tmp)
25865 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25866 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25867 return current_sched_info->sched_max_insns_priority;
25870 return priority;
25873 /* Model decoder of Core 2/i7.
25874 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25875 track the instruction fetch block boundaries and make sure that long
25876 (9+ bytes) instructions are assigned to D0. */
25878 /* Maximum length of an insn that can be handled by
25879 a secondary decoder unit. '8' for Core 2/i7. */
25880 static int core2i7_secondary_decoder_max_insn_size;
25882 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25883 '16' for Core 2/i7. */
25884 static int core2i7_ifetch_block_size;
25886 /* Maximum number of instructions decoder can handle per cycle.
25887 '6' for Core 2/i7. */
25888 static int core2i7_ifetch_block_max_insns;
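/* Taken together (a sketch of the model implemented below): in one cycle the
   decoders issue at most core2i7_ifetch_block_max_insns insns whose encoded
   lengths sum to at most core2i7_ifetch_block_size bytes, and only the first
   insn issued in a cycle may exceed core2i7_secondary_decoder_max_insn_size
   bytes, since only decoder D0 handles such insns.  E.g., after issuing 12
   bytes worth of insns, only insns of at most 4 bytes remain issuable in
   that cycle.  */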
25890 typedef struct ix86_first_cycle_multipass_data_ *
25891 ix86_first_cycle_multipass_data_t;
25892 typedef const struct ix86_first_cycle_multipass_data_ *
25893 const_ix86_first_cycle_multipass_data_t;
25895 /* A variable to store target state across calls to max_issue within
25896 one cycle. */
25897 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25898 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25900 /* Initialize DATA. */
25901 static void
25902 core2i7_first_cycle_multipass_init (void *_data)
25904 ix86_first_cycle_multipass_data_t data
25905 = (ix86_first_cycle_multipass_data_t) _data;
25907 data->ifetch_block_len = 0;
25908 data->ifetch_block_n_insns = 0;
25909 data->ready_try_change = NULL;
25910 data->ready_try_change_size = 0;
25913 /* Advancing the cycle; reset ifetch block counts. */
25914 static void
25915 core2i7_dfa_post_advance_cycle (void)
25917 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25919 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25921 data->ifetch_block_len = 0;
25922 data->ifetch_block_n_insns = 0;
25925 static int min_insn_size (rtx);
25927 /* Filter out insns from ready_try that the core will not be able to issue
25928 on current cycle due to decoder. */
25929 static void
25930 core2i7_first_cycle_multipass_filter_ready_try
25931 (const_ix86_first_cycle_multipass_data_t data,
25932 char *ready_try, int n_ready, bool first_cycle_insn_p)
25934 while (n_ready--)
25936 rtx insn;
25937 int insn_size;
25939 if (ready_try[n_ready])
25940 continue;
25942 insn = get_ready_element (n_ready);
25943 insn_size = min_insn_size (insn);
25945 if (/* If this insn is too long for a secondary decoder ... */
25946 (!first_cycle_insn_p
25947 && insn_size > core2i7_secondary_decoder_max_insn_size)
25948 /* ... or it would not fit into the ifetch block ... */
25949 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25950 /* ... or the decoder is full already ... */
25951 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25952 /* ... mask the insn out. */
25954 ready_try[n_ready] = 1;
25956 if (data->ready_try_change)
25957 bitmap_set_bit (data->ready_try_change, n_ready);
25962 /* Prepare for a new round of multipass lookahead scheduling. */
25963 static void
25964 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25965 bool first_cycle_insn_p)
25967 ix86_first_cycle_multipass_data_t data
25968 = (ix86_first_cycle_multipass_data_t) _data;
25969 const_ix86_first_cycle_multipass_data_t prev_data
25970 = ix86_first_cycle_multipass_data;
25972 /* Restore the state from the end of the previous round. */
25973 data->ifetch_block_len = prev_data->ifetch_block_len;
25974 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25976 /* Filter instructions that cannot be issued on current cycle due to
25977 decoder restrictions. */
25978 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25979 first_cycle_insn_p);
25982 /* INSN is being issued in current solution. Account for its impact on
25983 the decoder model. */
25984 static void
25985 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25986 rtx insn, const void *_prev_data)
25988 ix86_first_cycle_multipass_data_t data
25989 = (ix86_first_cycle_multipass_data_t) _data;
25990 const_ix86_first_cycle_multipass_data_t prev_data
25991 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25993 int insn_size = min_insn_size (insn);
25995 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25996 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25997 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25998 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
26000 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
26001 if (!data->ready_try_change)
26003 data->ready_try_change = sbitmap_alloc (n_ready);
26004 data->ready_try_change_size = n_ready;
26006 else if (data->ready_try_change_size < n_ready)
26008 data->ready_try_change = sbitmap_resize (data->ready_try_change,
26009 n_ready, 0);
26010 data->ready_try_change_size = n_ready;
26012 bitmap_clear (data->ready_try_change);
26014 /* Filter out insns from ready_try that the core will not be able to issue
26015 on current cycle due to decoder. */
26016 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26017 false);
26020 /* Revert the effect on ready_try. */
26021 static void
26022 core2i7_first_cycle_multipass_backtrack (const void *_data,
26023 char *ready_try,
26024 int n_ready ATTRIBUTE_UNUSED)
26026 const_ix86_first_cycle_multipass_data_t data
26027 = (const_ix86_first_cycle_multipass_data_t) _data;
26028 unsigned int i = 0;
26029 sbitmap_iterator sbi;
26031 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
26032 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
26034 ready_try[i] = 0;
26038 /* Save the result of multipass lookahead scheduling for the next round. */
26039 static void
26040 core2i7_first_cycle_multipass_end (const void *_data)
26042 const_ix86_first_cycle_multipass_data_t data
26043 = (const_ix86_first_cycle_multipass_data_t) _data;
26044 ix86_first_cycle_multipass_data_t next_data
26045 = ix86_first_cycle_multipass_data;
26047 if (data != NULL)
26049 next_data->ifetch_block_len = data->ifetch_block_len;
26050 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
26054 /* Deallocate target data. */
26055 static void
26056 core2i7_first_cycle_multipass_fini (void *_data)
26058 ix86_first_cycle_multipass_data_t data
26059 = (ix86_first_cycle_multipass_data_t) _data;
26061 if (data->ready_try_change)
26063 sbitmap_free (data->ready_try_change);
26064 data->ready_try_change = NULL;
26065 data->ready_try_change_size = 0;
26069 /* Prepare for scheduling pass. */
26070 static void
26071 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
26072 int verbose ATTRIBUTE_UNUSED,
26073 int max_uid ATTRIBUTE_UNUSED)
26075 /* Install scheduling hooks for current CPU. Some of these hooks are used
26076 in time-critical parts of the scheduler, so we only set them up when
26077 they are actually used. */
26078 switch (ix86_tune)
26080 case PROCESSOR_CORE2:
26081 case PROCESSOR_COREI7:
26082 case PROCESSOR_COREI7_AVX:
26083 case PROCESSOR_HASWELL:
26084 /* Do not perform multipass scheduling for pre-reload schedule
26085 to save compile time. */
26086 if (reload_completed)
26088 targetm.sched.dfa_post_advance_cycle
26089 = core2i7_dfa_post_advance_cycle;
26090 targetm.sched.first_cycle_multipass_init
26091 = core2i7_first_cycle_multipass_init;
26092 targetm.sched.first_cycle_multipass_begin
26093 = core2i7_first_cycle_multipass_begin;
26094 targetm.sched.first_cycle_multipass_issue
26095 = core2i7_first_cycle_multipass_issue;
26096 targetm.sched.first_cycle_multipass_backtrack
26097 = core2i7_first_cycle_multipass_backtrack;
26098 targetm.sched.first_cycle_multipass_end
26099 = core2i7_first_cycle_multipass_end;
26100 targetm.sched.first_cycle_multipass_fini
26101 = core2i7_first_cycle_multipass_fini;
26103 /* Set decoder parameters. */
26104 core2i7_secondary_decoder_max_insn_size = 8;
26105 core2i7_ifetch_block_size = 16;
26106 core2i7_ifetch_block_max_insns = 6;
26107 break;
26109 /* ... Fall through ... */
26110 default:
26111 targetm.sched.dfa_post_advance_cycle = NULL;
26112 targetm.sched.first_cycle_multipass_init = NULL;
26113 targetm.sched.first_cycle_multipass_begin = NULL;
26114 targetm.sched.first_cycle_multipass_issue = NULL;
26115 targetm.sched.first_cycle_multipass_backtrack = NULL;
26116 targetm.sched.first_cycle_multipass_end = NULL;
26117 targetm.sched.first_cycle_multipass_fini = NULL;
26118 break;
26123 /* Compute the alignment given to a constant that is being placed in memory.
26124 EXP is the constant and ALIGN is the alignment that the object would
26125 ordinarily have.
26126 The value of this function is used instead of that alignment to align
26127 the object. */
26129 int
26130 ix86_constant_alignment (tree exp, int align)
26132 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26133 || TREE_CODE (exp) == INTEGER_CST)
26135 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26136 return 64;
26137 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26138 return 128;
26140 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26141 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26142 return BITS_PER_WORD;
26144 return align;
26147 /* Compute the alignment for a static variable.
26148 TYPE is the data type, and ALIGN is the alignment that
26149 the object would ordinarily have. The value of this function is used
26150 instead of that alignment to align the object. */
26152 int
26153 ix86_data_alignment (tree type, int align, bool opt)
26155 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26157 if (opt
26158 && AGGREGATE_TYPE_P (type)
26159 && TYPE_SIZE (type)
26160 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26161 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
26162 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
26163 && align < max_align)
26164 align = max_align;
26166 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned to a
26167 16-byte boundary. */
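/* For example, a global "char buf[32]" is given at least 16-byte alignment
   under this rule, while "char buf[8]" keeps its default alignment.  */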
26168 if (TARGET_64BIT)
26170 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26171 && TYPE_SIZE (type)
26172 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26173 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
26174 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26175 return 128;
26178 if (!opt)
26179 return align;
26181 if (TREE_CODE (type) == ARRAY_TYPE)
26183 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26184 return 64;
26185 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26186 return 128;
26188 else if (TREE_CODE (type) == COMPLEX_TYPE)
26191 if (TYPE_MODE (type) == DCmode && align < 64)
26192 return 64;
26193 if ((TYPE_MODE (type) == XCmode
26194 || TYPE_MODE (type) == TCmode) && align < 128)
26195 return 128;
26197 else if ((TREE_CODE (type) == RECORD_TYPE
26198 || TREE_CODE (type) == UNION_TYPE
26199 || TREE_CODE (type) == QUAL_UNION_TYPE)
26200 && TYPE_FIELDS (type))
26202 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26203 return 64;
26204 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26205 return 128;
26207 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26208 || TREE_CODE (type) == INTEGER_TYPE)
26210 if (TYPE_MODE (type) == DFmode && align < 64)
26211 return 64;
26212 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26213 return 128;
26216 return align;
26219 /* Compute the alignment for a local variable or a stack slot. EXP is
26220 the data type or decl itself, MODE is the widest mode available and
26221 ALIGN is the alignment that the object would ordinarily have. The
26222 value of this macro is used instead of that alignment to align the
26223 object. */
26225 unsigned int
26226 ix86_local_alignment (tree exp, enum machine_mode mode,
26227 unsigned int align)
26229 tree type, decl;
26231 if (exp && DECL_P (exp))
26233 type = TREE_TYPE (exp);
26234 decl = exp;
26236 else
26238 type = exp;
26239 decl = NULL;
26242 /* Don't do dynamic stack realignment for long long objects with
26243 -mpreferred-stack-boundary=2. */
26244 if (!TARGET_64BIT
26245 && align == 64
26246 && ix86_preferred_stack_boundary < 64
26247 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26248 && (!type || !TYPE_USER_ALIGN (type))
26249 && (!decl || !DECL_USER_ALIGN (decl)))
26250 align = 32;
26252 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26253 register in MODE. We will return the largest alignment of XF
26254 and DF. */
26255 if (!type)
26257 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26258 align = GET_MODE_ALIGNMENT (DFmode);
26259 return align;
26262 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned to a
26263 16-byte boundary. Exact wording is:
26265 An array uses the same alignment as its elements, except that a local or
26266 global array variable of length at least 16 bytes or
26267 a C99 variable-length array variable always has alignment of at least 16 bytes.
26269 This was added to allow use of aligned SSE instructions on arrays. This
26270 rule is meant for static storage (where the compiler cannot do the analysis
26271 by itself). We follow it for automatic variables only when convenient.
26272 We fully control everything in the function being compiled, and functions
26273 from other units cannot rely on the alignment.
26275 Exclude the va_list type. It is the common case of a local array where
26276 we cannot benefit from the alignment.
26278 TODO: Probably one should optimize for size only when the variable does not escape. */
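/* For example, when compiling for speed with SSE enabled on x86-64, a local
   "double buf[4]" (32 bytes) gets 128-bit alignment from the check below,
   while a local va_list keeps its default alignment.  */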
26279 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26280 && TARGET_SSE)
26282 if (AGGREGATE_TYPE_P (type)
26283 && (va_list_type_node == NULL_TREE
26284 || (TYPE_MAIN_VARIANT (type)
26285 != TYPE_MAIN_VARIANT (va_list_type_node)))
26286 && TYPE_SIZE (type)
26287 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26288 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
26289 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26290 return 128;
26292 if (TREE_CODE (type) == ARRAY_TYPE)
26294 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26295 return 64;
26296 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26297 return 128;
26299 else if (TREE_CODE (type) == COMPLEX_TYPE)
26301 if (TYPE_MODE (type) == DCmode && align < 64)
26302 return 64;
26303 if ((TYPE_MODE (type) == XCmode
26304 || TYPE_MODE (type) == TCmode) && align < 128)
26305 return 128;
26307 else if ((TREE_CODE (type) == RECORD_TYPE
26308 || TREE_CODE (type) == UNION_TYPE
26309 || TREE_CODE (type) == QUAL_UNION_TYPE)
26310 && TYPE_FIELDS (type))
26312 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26313 return 64;
26314 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26315 return 128;
26317 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26318 || TREE_CODE (type) == INTEGER_TYPE)
26321 if (TYPE_MODE (type) == DFmode && align < 64)
26322 return 64;
26323 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26324 return 128;
26326 return align;
26329 /* Compute the minimum required alignment for dynamic stack realignment
26330 purposes for a local variable, parameter or a stack slot. EXP is
26331 the data type or decl itself, MODE is its mode and ALIGN is the
26332 alignment that the object would ordinarily have. */
26334 unsigned int
26335 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26336 unsigned int align)
26338 tree type, decl;
26340 if (exp && DECL_P (exp))
26342 type = TREE_TYPE (exp);
26343 decl = exp;
26345 else
26347 type = exp;
26348 decl = NULL;
26351 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26352 return align;
26354 /* Don't do dynamic stack realignment for long long objects with
26355 -mpreferred-stack-boundary=2. */
26356 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26357 && (!type || !TYPE_USER_ALIGN (type))
26358 && (!decl || !DECL_USER_ALIGN (decl)))
26359 return 32;
26361 return align;
26364 /* Find a location for the static chain incoming to a nested function.
26365 This is a register, unless all free registers are used by arguments. */
26367 static rtx
26368 ix86_static_chain (const_tree fndecl, bool incoming_p)
26370 unsigned regno;
26372 if (!DECL_STATIC_CHAIN (fndecl))
26373 return NULL;
26375 if (TARGET_64BIT)
26377 /* We always use R10 in 64-bit mode. */
26378 regno = R10_REG;
26380 else
26382 tree fntype;
26383 unsigned int ccvt;
26385 /* By default in 32-bit mode we use ECX to pass the static chain. */
26386 regno = CX_REG;
26388 fntype = TREE_TYPE (fndecl);
26389 ccvt = ix86_get_callcvt (fntype);
26390 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26392 /* Fastcall functions use ecx/edx for arguments, which leaves
26393 us with EAX for the static chain.
26394 Thiscall functions use ecx for arguments, which also
26395 leaves us with EAX for the static chain. */
26396 regno = AX_REG;
26398 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26400 /* Thiscall functions use ecx for arguments, which leaves
26401 us with EAX and EDX for the static chain.
26402 For ABI compatibility we use EAX. */
26403 regno = AX_REG;
26405 else if (ix86_function_regparm (fntype, fndecl) == 3)
26407 /* For regparm 3, we have no free call-clobbered registers in
26408 which to store the static chain. In order to implement this,
26409 we have the trampoline push the static chain to the stack.
26410 However, we can't push a value below the return address when
26411 we call the nested function directly, so we have to use an
26412 alternate entry point. For this we use ESI, and have the
26413 alternate entry point push ESI, so that things appear the
26414 same once we're executing the nested function. */
26415 if (incoming_p)
26417 if (fndecl == current_function_decl)
26418 ix86_static_chain_on_stack = true;
26419 return gen_frame_mem (SImode,
26420 plus_constant (Pmode,
26421 arg_pointer_rtx, -8));
26423 regno = SI_REG;
26427 return gen_rtx_REG (Pmode, regno);
26430 /* Emit RTL insns to initialize the variable parts of a trampoline.
26431 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26432 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26433 to be passed to the target function. */
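/* For illustration (assuming ptr_mode == DImode, so the movabs forms are
   used), the 64-bit trampoline written below is:
       49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
       49 ba <8-byte chain>     movabs $chain, %r10
       49 ff e3                 jmp *%r11
       90                       nop (pads the final 32-bit store)
   which corresponds to the 0xbb49, 0xba49 and 0x90e3ff49 constants stored
   by ix86_trampoline_init.  */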
26435 static void
26436 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26438 rtx mem, fnaddr;
26439 int opcode;
26440 int offset = 0;
26442 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26444 if (TARGET_64BIT)
26446 int size;
26448 /* Load the function address to r11. Try to load address using
26449 the shorter movl instead of movabs. We may want to support
26450 movq for kernel mode, but kernel does not use trampolines at
26451 the moment. FNADDR is a 32bit address and may not be in
26452 DImode when ptr_mode == SImode. Always use movl in this
26453 case. */
26454 if (ptr_mode == SImode
26455 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26457 fnaddr = copy_addr_to_reg (fnaddr);
26459 mem = adjust_address (m_tramp, HImode, offset);
26460 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26462 mem = adjust_address (m_tramp, SImode, offset + 2);
26463 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26464 offset += 6;
26466 else
26468 mem = adjust_address (m_tramp, HImode, offset);
26469 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26471 mem = adjust_address (m_tramp, DImode, offset + 2);
26472 emit_move_insn (mem, fnaddr);
26473 offset += 10;
26476 /* Load static chain using movabs to r10. Use the shorter movl
26477 instead of movabs when ptr_mode == SImode. */
26478 if (ptr_mode == SImode)
26480 opcode = 0xba41;
26481 size = 6;
26483 else
26485 opcode = 0xba49;
26486 size = 10;
26489 mem = adjust_address (m_tramp, HImode, offset);
26490 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26492 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26493 emit_move_insn (mem, chain_value);
26494 offset += size;
26496 /* Jump to r11; the last (unused) byte is a nop, only there to
26497 pad the write out to a single 32-bit store. */
26498 mem = adjust_address (m_tramp, SImode, offset);
26499 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26500 offset += 4;
26502 else
26504 rtx disp, chain;
26506 /* Depending on the static chain location, either load a register
26507 with a constant, or push the constant to the stack. All of the
26508 instructions are the same size. */
26509 chain = ix86_static_chain (fndecl, true);
26510 if (REG_P (chain))
26512 switch (REGNO (chain))
26514 case AX_REG:
26515 opcode = 0xb8; break;
26516 case CX_REG:
26517 opcode = 0xb9; break;
26518 default:
26519 gcc_unreachable ();
26522 else
26523 opcode = 0x68;
26525 mem = adjust_address (m_tramp, QImode, offset);
26526 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26528 mem = adjust_address (m_tramp, SImode, offset + 1);
26529 emit_move_insn (mem, chain_value);
26530 offset += 5;
26532 mem = adjust_address (m_tramp, QImode, offset);
26533 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26535 mem = adjust_address (m_tramp, SImode, offset + 1);
26537 /* Compute offset from the end of the jmp to the target function.
26538 In the case in which the trampoline stores the static chain on
26539 the stack, we need to skip the first insn which pushes the
26540 (call-saved) register static chain; this push is 1 byte. */
26541 offset += 5;
26542 disp = expand_binop (SImode, sub_optab, fnaddr,
26543 plus_constant (Pmode, XEXP (m_tramp, 0),
26544 offset - (MEM_P (chain) ? 1 : 0)),
26545 NULL_RTX, 1, OPTAB_DIRECT);
26546 emit_move_insn (mem, disp);
26549 gcc_assert (offset <= TRAMPOLINE_SIZE);
26551 #ifdef HAVE_ENABLE_EXECUTE_STACK
26552 #ifdef CHECK_EXECUTE_STACK_ENABLED
26553 if (CHECK_EXECUTE_STACK_ENABLED)
26554 #endif
26555 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26556 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26557 #endif
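/* Illustrative sketch (not part of GCC): the bytes the 64-bit movabs path
   above deposits in the trampoline, written out by hand.  The opcode bytes
   mirror the HImode/SImode/DImode stores in ix86_trampoline_init (0xbb49,
   0xba49 and 0x90e3ff49, stored little-endian); the helper name and the raw
   byte-buffer interface are hypothetical.  The ptr_mode == SImode path and
   the 32-bit mov/push-plus-jmp path are analogous, with shorter immediates.  */
#if 0
#include <stddef.h>
#include <stdint.h>

static size_t
write_x86_64_trampoline (unsigned char *buf, uint64_t fnaddr, uint64_t chain)
{
  size_t off = 0;
  int i;

  /* movabs $fnaddr, %r11  -- the 0xbb49 HImode store plus the DImode store.  */
  buf[off++] = 0x49; buf[off++] = 0xbb;
  for (i = 0; i < 8; i++)
    buf[off++] = (fnaddr >> (8 * i)) & 0xff;

  /* movabs $chain, %r10  -- the 0xba49 HImode store plus the ptr_mode store.  */
  buf[off++] = 0x49; buf[off++] = 0xba;
  for (i = 0; i < 8; i++)
    buf[off++] = (chain >> (8 * i)) & 0xff;

  /* jmp *%r11 ; nop  -- the single 0x90e3ff49 SImode store.  */
  buf[off++] = 0x49; buf[off++] = 0xff; buf[off++] = 0xe3; buf[off++] = 0x90;

  return off;  /* 24 bytes; the gcc_assert above requires <= TRAMPOLINE_SIZE.  */
}
#endif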
26560 /* The following file contains several enumerations and data structures
26561 built from the definitions in i386-builtin-types.def. */
26563 #include "i386-builtin-types.inc"
26565 /* Table for the ix86 builtin non-function types. */
26566 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26568 /* Retrieve an element from the above table, building some of
26569 the types lazily. */
26571 static tree
26572 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26574 unsigned int index;
26575 tree type, itype;
26577 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26579 type = ix86_builtin_type_tab[(int) tcode];
26580 if (type != NULL)
26581 return type;
26583 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26584 if (tcode <= IX86_BT_LAST_VECT)
26586 enum machine_mode mode;
26588 index = tcode - IX86_BT_LAST_PRIM - 1;
26589 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26590 mode = ix86_builtin_type_vect_mode[index];
26592 type = build_vector_type_for_mode (itype, mode);
26594 else
26596 int quals;
26598 index = tcode - IX86_BT_LAST_VECT - 1;
26599 if (tcode <= IX86_BT_LAST_PTR)
26600 quals = TYPE_UNQUALIFIED;
26601 else
26602 quals = TYPE_QUAL_CONST;
26604 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26605 if (quals != TYPE_UNQUALIFIED)
26606 itype = build_qualified_type (itype, quals);
26608 type = build_pointer_type (itype);
26611 ix86_builtin_type_tab[(int) tcode] = type;
26612 return type;
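/* Example use (a sketch): assuming the generated i386-builtin-types.inc
   defines a vector code such as IX86_BT_V4SF (base type FLOAT, mode
   V4SFmode) and a const-pointer code such as IX86_BT_PCFLOAT, the first
   call below builds and caches the 4 x float vector type via
   build_vector_type_for_mode, and the second builds "const float *" via
   build_qualified_type + build_pointer_type; repeated calls simply return
   the cached nodes:

     tree v4sf  = ix86_get_builtin_type (IX86_BT_V4SF);
     tree pcflt = ix86_get_builtin_type (IX86_BT_PCFLOAT);

   The exact type-code names come from the generated file and are assumed
   here only for illustration.  */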
26615 /* Table for the ix86 builtin function types. */
26616 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26618 /* Retrieve an element from the above table, building some of
26619 the types lazily. */
26621 static tree
26622 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26624 tree type;
26626 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26628 type = ix86_builtin_func_type_tab[(int) tcode];
26629 if (type != NULL)
26630 return type;
26632 if (tcode <= IX86_BT_LAST_FUNC)
26634 unsigned start = ix86_builtin_func_start[(int) tcode];
26635 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26636 tree rtype, atype, args = void_list_node;
26637 unsigned i;
26639 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26640 for (i = after - 1; i > start; --i)
26642 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26643 args = tree_cons (NULL, atype, args);
26646 type = build_function_type (rtype, args);
26648 else
26650 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26651 enum ix86_builtin_func_type icode;
26653 icode = ix86_builtin_func_alias_base[index];
26654 type = ix86_get_builtin_func_type (icode);
26657 ix86_builtin_func_type_tab[(int) tcode] = type;
26658 return type;
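/* Worked example (derived from the loop above): for a function type whose
   slice of ix86_builtin_func_args is { V4SF, V4SF, PCV2SF } -- return type
   first, as for the V4SF_FTYPE_V4SF_PCV2SF code used in the tables below --
   the loop visits i = start + 2, then start + 1, prepending each argument
   with tree_cons, so the final TREE_LIST reads

     V4SF -> PCV2SF -> void_list_node

   i.e. the arguments end up in declaration order (terminated by
   void_list_node) even though they are visited back to front.  */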
26662 /* Codes for all the SSE/MMX builtins. */
26663 enum ix86_builtins
26665 IX86_BUILTIN_ADDPS,
26666 IX86_BUILTIN_ADDSS,
26667 IX86_BUILTIN_DIVPS,
26668 IX86_BUILTIN_DIVSS,
26669 IX86_BUILTIN_MULPS,
26670 IX86_BUILTIN_MULSS,
26671 IX86_BUILTIN_SUBPS,
26672 IX86_BUILTIN_SUBSS,
26674 IX86_BUILTIN_CMPEQPS,
26675 IX86_BUILTIN_CMPLTPS,
26676 IX86_BUILTIN_CMPLEPS,
26677 IX86_BUILTIN_CMPGTPS,
26678 IX86_BUILTIN_CMPGEPS,
26679 IX86_BUILTIN_CMPNEQPS,
26680 IX86_BUILTIN_CMPNLTPS,
26681 IX86_BUILTIN_CMPNLEPS,
26682 IX86_BUILTIN_CMPNGTPS,
26683 IX86_BUILTIN_CMPNGEPS,
26684 IX86_BUILTIN_CMPORDPS,
26685 IX86_BUILTIN_CMPUNORDPS,
26686 IX86_BUILTIN_CMPEQSS,
26687 IX86_BUILTIN_CMPLTSS,
26688 IX86_BUILTIN_CMPLESS,
26689 IX86_BUILTIN_CMPNEQSS,
26690 IX86_BUILTIN_CMPNLTSS,
26691 IX86_BUILTIN_CMPNLESS,
26692 IX86_BUILTIN_CMPORDSS,
26693 IX86_BUILTIN_CMPUNORDSS,
26695 IX86_BUILTIN_COMIEQSS,
26696 IX86_BUILTIN_COMILTSS,
26697 IX86_BUILTIN_COMILESS,
26698 IX86_BUILTIN_COMIGTSS,
26699 IX86_BUILTIN_COMIGESS,
26700 IX86_BUILTIN_COMINEQSS,
26701 IX86_BUILTIN_UCOMIEQSS,
26702 IX86_BUILTIN_UCOMILTSS,
26703 IX86_BUILTIN_UCOMILESS,
26704 IX86_BUILTIN_UCOMIGTSS,
26705 IX86_BUILTIN_UCOMIGESS,
26706 IX86_BUILTIN_UCOMINEQSS,
26708 IX86_BUILTIN_CVTPI2PS,
26709 IX86_BUILTIN_CVTPS2PI,
26710 IX86_BUILTIN_CVTSI2SS,
26711 IX86_BUILTIN_CVTSI642SS,
26712 IX86_BUILTIN_CVTSS2SI,
26713 IX86_BUILTIN_CVTSS2SI64,
26714 IX86_BUILTIN_CVTTPS2PI,
26715 IX86_BUILTIN_CVTTSS2SI,
26716 IX86_BUILTIN_CVTTSS2SI64,
26718 IX86_BUILTIN_MAXPS,
26719 IX86_BUILTIN_MAXSS,
26720 IX86_BUILTIN_MINPS,
26721 IX86_BUILTIN_MINSS,
26723 IX86_BUILTIN_LOADUPS,
26724 IX86_BUILTIN_STOREUPS,
26725 IX86_BUILTIN_MOVSS,
26727 IX86_BUILTIN_MOVHLPS,
26728 IX86_BUILTIN_MOVLHPS,
26729 IX86_BUILTIN_LOADHPS,
26730 IX86_BUILTIN_LOADLPS,
26731 IX86_BUILTIN_STOREHPS,
26732 IX86_BUILTIN_STORELPS,
26734 IX86_BUILTIN_MASKMOVQ,
26735 IX86_BUILTIN_MOVMSKPS,
26736 IX86_BUILTIN_PMOVMSKB,
26738 IX86_BUILTIN_MOVNTPS,
26739 IX86_BUILTIN_MOVNTQ,
26741 IX86_BUILTIN_LOADDQU,
26742 IX86_BUILTIN_STOREDQU,
26744 IX86_BUILTIN_PACKSSWB,
26745 IX86_BUILTIN_PACKSSDW,
26746 IX86_BUILTIN_PACKUSWB,
26748 IX86_BUILTIN_PADDB,
26749 IX86_BUILTIN_PADDW,
26750 IX86_BUILTIN_PADDD,
26751 IX86_BUILTIN_PADDQ,
26752 IX86_BUILTIN_PADDSB,
26753 IX86_BUILTIN_PADDSW,
26754 IX86_BUILTIN_PADDUSB,
26755 IX86_BUILTIN_PADDUSW,
26756 IX86_BUILTIN_PSUBB,
26757 IX86_BUILTIN_PSUBW,
26758 IX86_BUILTIN_PSUBD,
26759 IX86_BUILTIN_PSUBQ,
26760 IX86_BUILTIN_PSUBSB,
26761 IX86_BUILTIN_PSUBSW,
26762 IX86_BUILTIN_PSUBUSB,
26763 IX86_BUILTIN_PSUBUSW,
26765 IX86_BUILTIN_PAND,
26766 IX86_BUILTIN_PANDN,
26767 IX86_BUILTIN_POR,
26768 IX86_BUILTIN_PXOR,
26770 IX86_BUILTIN_PAVGB,
26771 IX86_BUILTIN_PAVGW,
26773 IX86_BUILTIN_PCMPEQB,
26774 IX86_BUILTIN_PCMPEQW,
26775 IX86_BUILTIN_PCMPEQD,
26776 IX86_BUILTIN_PCMPGTB,
26777 IX86_BUILTIN_PCMPGTW,
26778 IX86_BUILTIN_PCMPGTD,
26780 IX86_BUILTIN_PMADDWD,
26782 IX86_BUILTIN_PMAXSW,
26783 IX86_BUILTIN_PMAXUB,
26784 IX86_BUILTIN_PMINSW,
26785 IX86_BUILTIN_PMINUB,
26787 IX86_BUILTIN_PMULHUW,
26788 IX86_BUILTIN_PMULHW,
26789 IX86_BUILTIN_PMULLW,
26791 IX86_BUILTIN_PSADBW,
26792 IX86_BUILTIN_PSHUFW,
26794 IX86_BUILTIN_PSLLW,
26795 IX86_BUILTIN_PSLLD,
26796 IX86_BUILTIN_PSLLQ,
26797 IX86_BUILTIN_PSRAW,
26798 IX86_BUILTIN_PSRAD,
26799 IX86_BUILTIN_PSRLW,
26800 IX86_BUILTIN_PSRLD,
26801 IX86_BUILTIN_PSRLQ,
26802 IX86_BUILTIN_PSLLWI,
26803 IX86_BUILTIN_PSLLDI,
26804 IX86_BUILTIN_PSLLQI,
26805 IX86_BUILTIN_PSRAWI,
26806 IX86_BUILTIN_PSRADI,
26807 IX86_BUILTIN_PSRLWI,
26808 IX86_BUILTIN_PSRLDI,
26809 IX86_BUILTIN_PSRLQI,
26811 IX86_BUILTIN_PUNPCKHBW,
26812 IX86_BUILTIN_PUNPCKHWD,
26813 IX86_BUILTIN_PUNPCKHDQ,
26814 IX86_BUILTIN_PUNPCKLBW,
26815 IX86_BUILTIN_PUNPCKLWD,
26816 IX86_BUILTIN_PUNPCKLDQ,
26818 IX86_BUILTIN_SHUFPS,
26820 IX86_BUILTIN_RCPPS,
26821 IX86_BUILTIN_RCPSS,
26822 IX86_BUILTIN_RSQRTPS,
26823 IX86_BUILTIN_RSQRTPS_NR,
26824 IX86_BUILTIN_RSQRTSS,
26825 IX86_BUILTIN_RSQRTF,
26826 IX86_BUILTIN_SQRTPS,
26827 IX86_BUILTIN_SQRTPS_NR,
26828 IX86_BUILTIN_SQRTSS,
26830 IX86_BUILTIN_UNPCKHPS,
26831 IX86_BUILTIN_UNPCKLPS,
26833 IX86_BUILTIN_ANDPS,
26834 IX86_BUILTIN_ANDNPS,
26835 IX86_BUILTIN_ORPS,
26836 IX86_BUILTIN_XORPS,
26838 IX86_BUILTIN_EMMS,
26839 IX86_BUILTIN_LDMXCSR,
26840 IX86_BUILTIN_STMXCSR,
26841 IX86_BUILTIN_SFENCE,
26843 IX86_BUILTIN_FXSAVE,
26844 IX86_BUILTIN_FXRSTOR,
26845 IX86_BUILTIN_FXSAVE64,
26846 IX86_BUILTIN_FXRSTOR64,
26848 IX86_BUILTIN_XSAVE,
26849 IX86_BUILTIN_XRSTOR,
26850 IX86_BUILTIN_XSAVE64,
26851 IX86_BUILTIN_XRSTOR64,
26853 IX86_BUILTIN_XSAVEOPT,
26854 IX86_BUILTIN_XSAVEOPT64,
26856 /* 3DNow! Original */
26857 IX86_BUILTIN_FEMMS,
26858 IX86_BUILTIN_PAVGUSB,
26859 IX86_BUILTIN_PF2ID,
26860 IX86_BUILTIN_PFACC,
26861 IX86_BUILTIN_PFADD,
26862 IX86_BUILTIN_PFCMPEQ,
26863 IX86_BUILTIN_PFCMPGE,
26864 IX86_BUILTIN_PFCMPGT,
26865 IX86_BUILTIN_PFMAX,
26866 IX86_BUILTIN_PFMIN,
26867 IX86_BUILTIN_PFMUL,
26868 IX86_BUILTIN_PFRCP,
26869 IX86_BUILTIN_PFRCPIT1,
26870 IX86_BUILTIN_PFRCPIT2,
26871 IX86_BUILTIN_PFRSQIT1,
26872 IX86_BUILTIN_PFRSQRT,
26873 IX86_BUILTIN_PFSUB,
26874 IX86_BUILTIN_PFSUBR,
26875 IX86_BUILTIN_PI2FD,
26876 IX86_BUILTIN_PMULHRW,
26878 /* 3DNow! Athlon Extensions */
26879 IX86_BUILTIN_PF2IW,
26880 IX86_BUILTIN_PFNACC,
26881 IX86_BUILTIN_PFPNACC,
26882 IX86_BUILTIN_PI2FW,
26883 IX86_BUILTIN_PSWAPDSI,
26884 IX86_BUILTIN_PSWAPDSF,
26886 /* SSE2 */
26887 IX86_BUILTIN_ADDPD,
26888 IX86_BUILTIN_ADDSD,
26889 IX86_BUILTIN_DIVPD,
26890 IX86_BUILTIN_DIVSD,
26891 IX86_BUILTIN_MULPD,
26892 IX86_BUILTIN_MULSD,
26893 IX86_BUILTIN_SUBPD,
26894 IX86_BUILTIN_SUBSD,
26896 IX86_BUILTIN_CMPEQPD,
26897 IX86_BUILTIN_CMPLTPD,
26898 IX86_BUILTIN_CMPLEPD,
26899 IX86_BUILTIN_CMPGTPD,
26900 IX86_BUILTIN_CMPGEPD,
26901 IX86_BUILTIN_CMPNEQPD,
26902 IX86_BUILTIN_CMPNLTPD,
26903 IX86_BUILTIN_CMPNLEPD,
26904 IX86_BUILTIN_CMPNGTPD,
26905 IX86_BUILTIN_CMPNGEPD,
26906 IX86_BUILTIN_CMPORDPD,
26907 IX86_BUILTIN_CMPUNORDPD,
26908 IX86_BUILTIN_CMPEQSD,
26909 IX86_BUILTIN_CMPLTSD,
26910 IX86_BUILTIN_CMPLESD,
26911 IX86_BUILTIN_CMPNEQSD,
26912 IX86_BUILTIN_CMPNLTSD,
26913 IX86_BUILTIN_CMPNLESD,
26914 IX86_BUILTIN_CMPORDSD,
26915 IX86_BUILTIN_CMPUNORDSD,
26917 IX86_BUILTIN_COMIEQSD,
26918 IX86_BUILTIN_COMILTSD,
26919 IX86_BUILTIN_COMILESD,
26920 IX86_BUILTIN_COMIGTSD,
26921 IX86_BUILTIN_COMIGESD,
26922 IX86_BUILTIN_COMINEQSD,
26923 IX86_BUILTIN_UCOMIEQSD,
26924 IX86_BUILTIN_UCOMILTSD,
26925 IX86_BUILTIN_UCOMILESD,
26926 IX86_BUILTIN_UCOMIGTSD,
26927 IX86_BUILTIN_UCOMIGESD,
26928 IX86_BUILTIN_UCOMINEQSD,
26930 IX86_BUILTIN_MAXPD,
26931 IX86_BUILTIN_MAXSD,
26932 IX86_BUILTIN_MINPD,
26933 IX86_BUILTIN_MINSD,
26935 IX86_BUILTIN_ANDPD,
26936 IX86_BUILTIN_ANDNPD,
26937 IX86_BUILTIN_ORPD,
26938 IX86_BUILTIN_XORPD,
26940 IX86_BUILTIN_SQRTPD,
26941 IX86_BUILTIN_SQRTSD,
26943 IX86_BUILTIN_UNPCKHPD,
26944 IX86_BUILTIN_UNPCKLPD,
26946 IX86_BUILTIN_SHUFPD,
26948 IX86_BUILTIN_LOADUPD,
26949 IX86_BUILTIN_STOREUPD,
26950 IX86_BUILTIN_MOVSD,
26952 IX86_BUILTIN_LOADHPD,
26953 IX86_BUILTIN_LOADLPD,
26955 IX86_BUILTIN_CVTDQ2PD,
26956 IX86_BUILTIN_CVTDQ2PS,
26958 IX86_BUILTIN_CVTPD2DQ,
26959 IX86_BUILTIN_CVTPD2PI,
26960 IX86_BUILTIN_CVTPD2PS,
26961 IX86_BUILTIN_CVTTPD2DQ,
26962 IX86_BUILTIN_CVTTPD2PI,
26964 IX86_BUILTIN_CVTPI2PD,
26965 IX86_BUILTIN_CVTSI2SD,
26966 IX86_BUILTIN_CVTSI642SD,
26968 IX86_BUILTIN_CVTSD2SI,
26969 IX86_BUILTIN_CVTSD2SI64,
26970 IX86_BUILTIN_CVTSD2SS,
26971 IX86_BUILTIN_CVTSS2SD,
26972 IX86_BUILTIN_CVTTSD2SI,
26973 IX86_BUILTIN_CVTTSD2SI64,
26975 IX86_BUILTIN_CVTPS2DQ,
26976 IX86_BUILTIN_CVTPS2PD,
26977 IX86_BUILTIN_CVTTPS2DQ,
26979 IX86_BUILTIN_MOVNTI,
26980 IX86_BUILTIN_MOVNTI64,
26981 IX86_BUILTIN_MOVNTPD,
26982 IX86_BUILTIN_MOVNTDQ,
26984 IX86_BUILTIN_MOVQ128,
26986 /* SSE2 MMX */
26987 IX86_BUILTIN_MASKMOVDQU,
26988 IX86_BUILTIN_MOVMSKPD,
26989 IX86_BUILTIN_PMOVMSKB128,
26991 IX86_BUILTIN_PACKSSWB128,
26992 IX86_BUILTIN_PACKSSDW128,
26993 IX86_BUILTIN_PACKUSWB128,
26995 IX86_BUILTIN_PADDB128,
26996 IX86_BUILTIN_PADDW128,
26997 IX86_BUILTIN_PADDD128,
26998 IX86_BUILTIN_PADDQ128,
26999 IX86_BUILTIN_PADDSB128,
27000 IX86_BUILTIN_PADDSW128,
27001 IX86_BUILTIN_PADDUSB128,
27002 IX86_BUILTIN_PADDUSW128,
27003 IX86_BUILTIN_PSUBB128,
27004 IX86_BUILTIN_PSUBW128,
27005 IX86_BUILTIN_PSUBD128,
27006 IX86_BUILTIN_PSUBQ128,
27007 IX86_BUILTIN_PSUBSB128,
27008 IX86_BUILTIN_PSUBSW128,
27009 IX86_BUILTIN_PSUBUSB128,
27010 IX86_BUILTIN_PSUBUSW128,
27012 IX86_BUILTIN_PAND128,
27013 IX86_BUILTIN_PANDN128,
27014 IX86_BUILTIN_POR128,
27015 IX86_BUILTIN_PXOR128,
27017 IX86_BUILTIN_PAVGB128,
27018 IX86_BUILTIN_PAVGW128,
27020 IX86_BUILTIN_PCMPEQB128,
27021 IX86_BUILTIN_PCMPEQW128,
27022 IX86_BUILTIN_PCMPEQD128,
27023 IX86_BUILTIN_PCMPGTB128,
27024 IX86_BUILTIN_PCMPGTW128,
27025 IX86_BUILTIN_PCMPGTD128,
27027 IX86_BUILTIN_PMADDWD128,
27029 IX86_BUILTIN_PMAXSW128,
27030 IX86_BUILTIN_PMAXUB128,
27031 IX86_BUILTIN_PMINSW128,
27032 IX86_BUILTIN_PMINUB128,
27034 IX86_BUILTIN_PMULUDQ,
27035 IX86_BUILTIN_PMULUDQ128,
27036 IX86_BUILTIN_PMULHUW128,
27037 IX86_BUILTIN_PMULHW128,
27038 IX86_BUILTIN_PMULLW128,
27040 IX86_BUILTIN_PSADBW128,
27041 IX86_BUILTIN_PSHUFHW,
27042 IX86_BUILTIN_PSHUFLW,
27043 IX86_BUILTIN_PSHUFD,
27045 IX86_BUILTIN_PSLLDQI128,
27046 IX86_BUILTIN_PSLLWI128,
27047 IX86_BUILTIN_PSLLDI128,
27048 IX86_BUILTIN_PSLLQI128,
27049 IX86_BUILTIN_PSRAWI128,
27050 IX86_BUILTIN_PSRADI128,
27051 IX86_BUILTIN_PSRLDQI128,
27052 IX86_BUILTIN_PSRLWI128,
27053 IX86_BUILTIN_PSRLDI128,
27054 IX86_BUILTIN_PSRLQI128,
27056 IX86_BUILTIN_PSLLDQ128,
27057 IX86_BUILTIN_PSLLW128,
27058 IX86_BUILTIN_PSLLD128,
27059 IX86_BUILTIN_PSLLQ128,
27060 IX86_BUILTIN_PSRAW128,
27061 IX86_BUILTIN_PSRAD128,
27062 IX86_BUILTIN_PSRLW128,
27063 IX86_BUILTIN_PSRLD128,
27064 IX86_BUILTIN_PSRLQ128,
27066 IX86_BUILTIN_PUNPCKHBW128,
27067 IX86_BUILTIN_PUNPCKHWD128,
27068 IX86_BUILTIN_PUNPCKHDQ128,
27069 IX86_BUILTIN_PUNPCKHQDQ128,
27070 IX86_BUILTIN_PUNPCKLBW128,
27071 IX86_BUILTIN_PUNPCKLWD128,
27072 IX86_BUILTIN_PUNPCKLDQ128,
27073 IX86_BUILTIN_PUNPCKLQDQ128,
27075 IX86_BUILTIN_CLFLUSH,
27076 IX86_BUILTIN_MFENCE,
27077 IX86_BUILTIN_LFENCE,
27078 IX86_BUILTIN_PAUSE,
27080 IX86_BUILTIN_FNSTENV,
27081 IX86_BUILTIN_FLDENV,
27082 IX86_BUILTIN_FNSTSW,
27083 IX86_BUILTIN_FNCLEX,
27085 IX86_BUILTIN_BSRSI,
27086 IX86_BUILTIN_BSRDI,
27087 IX86_BUILTIN_RDPMC,
27088 IX86_BUILTIN_RDTSC,
27089 IX86_BUILTIN_RDTSCP,
27090 IX86_BUILTIN_ROLQI,
27091 IX86_BUILTIN_ROLHI,
27092 IX86_BUILTIN_RORQI,
27093 IX86_BUILTIN_RORHI,
27095 /* SSE3. */
27096 IX86_BUILTIN_ADDSUBPS,
27097 IX86_BUILTIN_HADDPS,
27098 IX86_BUILTIN_HSUBPS,
27099 IX86_BUILTIN_MOVSHDUP,
27100 IX86_BUILTIN_MOVSLDUP,
27101 IX86_BUILTIN_ADDSUBPD,
27102 IX86_BUILTIN_HADDPD,
27103 IX86_BUILTIN_HSUBPD,
27104 IX86_BUILTIN_LDDQU,
27106 IX86_BUILTIN_MONITOR,
27107 IX86_BUILTIN_MWAIT,
27109 /* SSSE3. */
27110 IX86_BUILTIN_PHADDW,
27111 IX86_BUILTIN_PHADDD,
27112 IX86_BUILTIN_PHADDSW,
27113 IX86_BUILTIN_PHSUBW,
27114 IX86_BUILTIN_PHSUBD,
27115 IX86_BUILTIN_PHSUBSW,
27116 IX86_BUILTIN_PMADDUBSW,
27117 IX86_BUILTIN_PMULHRSW,
27118 IX86_BUILTIN_PSHUFB,
27119 IX86_BUILTIN_PSIGNB,
27120 IX86_BUILTIN_PSIGNW,
27121 IX86_BUILTIN_PSIGND,
27122 IX86_BUILTIN_PALIGNR,
27123 IX86_BUILTIN_PABSB,
27124 IX86_BUILTIN_PABSW,
27125 IX86_BUILTIN_PABSD,
27127 IX86_BUILTIN_PHADDW128,
27128 IX86_BUILTIN_PHADDD128,
27129 IX86_BUILTIN_PHADDSW128,
27130 IX86_BUILTIN_PHSUBW128,
27131 IX86_BUILTIN_PHSUBD128,
27132 IX86_BUILTIN_PHSUBSW128,
27133 IX86_BUILTIN_PMADDUBSW128,
27134 IX86_BUILTIN_PMULHRSW128,
27135 IX86_BUILTIN_PSHUFB128,
27136 IX86_BUILTIN_PSIGNB128,
27137 IX86_BUILTIN_PSIGNW128,
27138 IX86_BUILTIN_PSIGND128,
27139 IX86_BUILTIN_PALIGNR128,
27140 IX86_BUILTIN_PABSB128,
27141 IX86_BUILTIN_PABSW128,
27142 IX86_BUILTIN_PABSD128,
27144 /* AMDFAM10 - SSE4A New Instructions. */
27145 IX86_BUILTIN_MOVNTSD,
27146 IX86_BUILTIN_MOVNTSS,
27147 IX86_BUILTIN_EXTRQI,
27148 IX86_BUILTIN_EXTRQ,
27149 IX86_BUILTIN_INSERTQI,
27150 IX86_BUILTIN_INSERTQ,
27152 /* SSE4.1. */
27153 IX86_BUILTIN_BLENDPD,
27154 IX86_BUILTIN_BLENDPS,
27155 IX86_BUILTIN_BLENDVPD,
27156 IX86_BUILTIN_BLENDVPS,
27157 IX86_BUILTIN_PBLENDVB128,
27158 IX86_BUILTIN_PBLENDW128,
27160 IX86_BUILTIN_DPPD,
27161 IX86_BUILTIN_DPPS,
27163 IX86_BUILTIN_INSERTPS128,
27165 IX86_BUILTIN_MOVNTDQA,
27166 IX86_BUILTIN_MPSADBW128,
27167 IX86_BUILTIN_PACKUSDW128,
27168 IX86_BUILTIN_PCMPEQQ,
27169 IX86_BUILTIN_PHMINPOSUW128,
27171 IX86_BUILTIN_PMAXSB128,
27172 IX86_BUILTIN_PMAXSD128,
27173 IX86_BUILTIN_PMAXUD128,
27174 IX86_BUILTIN_PMAXUW128,
27176 IX86_BUILTIN_PMINSB128,
27177 IX86_BUILTIN_PMINSD128,
27178 IX86_BUILTIN_PMINUD128,
27179 IX86_BUILTIN_PMINUW128,
27181 IX86_BUILTIN_PMOVSXBW128,
27182 IX86_BUILTIN_PMOVSXBD128,
27183 IX86_BUILTIN_PMOVSXBQ128,
27184 IX86_BUILTIN_PMOVSXWD128,
27185 IX86_BUILTIN_PMOVSXWQ128,
27186 IX86_BUILTIN_PMOVSXDQ128,
27188 IX86_BUILTIN_PMOVZXBW128,
27189 IX86_BUILTIN_PMOVZXBD128,
27190 IX86_BUILTIN_PMOVZXBQ128,
27191 IX86_BUILTIN_PMOVZXWD128,
27192 IX86_BUILTIN_PMOVZXWQ128,
27193 IX86_BUILTIN_PMOVZXDQ128,
27195 IX86_BUILTIN_PMULDQ128,
27196 IX86_BUILTIN_PMULLD128,
27198 IX86_BUILTIN_ROUNDSD,
27199 IX86_BUILTIN_ROUNDSS,
27201 IX86_BUILTIN_ROUNDPD,
27202 IX86_BUILTIN_ROUNDPS,
27204 IX86_BUILTIN_FLOORPD,
27205 IX86_BUILTIN_CEILPD,
27206 IX86_BUILTIN_TRUNCPD,
27207 IX86_BUILTIN_RINTPD,
27208 IX86_BUILTIN_ROUNDPD_AZ,
27210 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27211 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27212 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27214 IX86_BUILTIN_FLOORPS,
27215 IX86_BUILTIN_CEILPS,
27216 IX86_BUILTIN_TRUNCPS,
27217 IX86_BUILTIN_RINTPS,
27218 IX86_BUILTIN_ROUNDPS_AZ,
27220 IX86_BUILTIN_FLOORPS_SFIX,
27221 IX86_BUILTIN_CEILPS_SFIX,
27222 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27224 IX86_BUILTIN_PTESTZ,
27225 IX86_BUILTIN_PTESTC,
27226 IX86_BUILTIN_PTESTNZC,
27228 IX86_BUILTIN_VEC_INIT_V2SI,
27229 IX86_BUILTIN_VEC_INIT_V4HI,
27230 IX86_BUILTIN_VEC_INIT_V8QI,
27231 IX86_BUILTIN_VEC_EXT_V2DF,
27232 IX86_BUILTIN_VEC_EXT_V2DI,
27233 IX86_BUILTIN_VEC_EXT_V4SF,
27234 IX86_BUILTIN_VEC_EXT_V4SI,
27235 IX86_BUILTIN_VEC_EXT_V8HI,
27236 IX86_BUILTIN_VEC_EXT_V2SI,
27237 IX86_BUILTIN_VEC_EXT_V4HI,
27238 IX86_BUILTIN_VEC_EXT_V16QI,
27239 IX86_BUILTIN_VEC_SET_V2DI,
27240 IX86_BUILTIN_VEC_SET_V4SF,
27241 IX86_BUILTIN_VEC_SET_V4SI,
27242 IX86_BUILTIN_VEC_SET_V8HI,
27243 IX86_BUILTIN_VEC_SET_V4HI,
27244 IX86_BUILTIN_VEC_SET_V16QI,
27246 IX86_BUILTIN_VEC_PACK_SFIX,
27247 IX86_BUILTIN_VEC_PACK_SFIX256,
27249 /* SSE4.2. */
27250 IX86_BUILTIN_CRC32QI,
27251 IX86_BUILTIN_CRC32HI,
27252 IX86_BUILTIN_CRC32SI,
27253 IX86_BUILTIN_CRC32DI,
27255 IX86_BUILTIN_PCMPESTRI128,
27256 IX86_BUILTIN_PCMPESTRM128,
27257 IX86_BUILTIN_PCMPESTRA128,
27258 IX86_BUILTIN_PCMPESTRC128,
27259 IX86_BUILTIN_PCMPESTRO128,
27260 IX86_BUILTIN_PCMPESTRS128,
27261 IX86_BUILTIN_PCMPESTRZ128,
27262 IX86_BUILTIN_PCMPISTRI128,
27263 IX86_BUILTIN_PCMPISTRM128,
27264 IX86_BUILTIN_PCMPISTRA128,
27265 IX86_BUILTIN_PCMPISTRC128,
27266 IX86_BUILTIN_PCMPISTRO128,
27267 IX86_BUILTIN_PCMPISTRS128,
27268 IX86_BUILTIN_PCMPISTRZ128,
27270 IX86_BUILTIN_PCMPGTQ,
27272 /* AES instructions */
27273 IX86_BUILTIN_AESENC128,
27274 IX86_BUILTIN_AESENCLAST128,
27275 IX86_BUILTIN_AESDEC128,
27276 IX86_BUILTIN_AESDECLAST128,
27277 IX86_BUILTIN_AESIMC128,
27278 IX86_BUILTIN_AESKEYGENASSIST128,
27280 /* PCLMUL instruction */
27281 IX86_BUILTIN_PCLMULQDQ128,
27283 /* AVX */
27284 IX86_BUILTIN_ADDPD256,
27285 IX86_BUILTIN_ADDPS256,
27286 IX86_BUILTIN_ADDSUBPD256,
27287 IX86_BUILTIN_ADDSUBPS256,
27288 IX86_BUILTIN_ANDPD256,
27289 IX86_BUILTIN_ANDPS256,
27290 IX86_BUILTIN_ANDNPD256,
27291 IX86_BUILTIN_ANDNPS256,
27292 IX86_BUILTIN_BLENDPD256,
27293 IX86_BUILTIN_BLENDPS256,
27294 IX86_BUILTIN_BLENDVPD256,
27295 IX86_BUILTIN_BLENDVPS256,
27296 IX86_BUILTIN_DIVPD256,
27297 IX86_BUILTIN_DIVPS256,
27298 IX86_BUILTIN_DPPS256,
27299 IX86_BUILTIN_HADDPD256,
27300 IX86_BUILTIN_HADDPS256,
27301 IX86_BUILTIN_HSUBPD256,
27302 IX86_BUILTIN_HSUBPS256,
27303 IX86_BUILTIN_MAXPD256,
27304 IX86_BUILTIN_MAXPS256,
27305 IX86_BUILTIN_MINPD256,
27306 IX86_BUILTIN_MINPS256,
27307 IX86_BUILTIN_MULPD256,
27308 IX86_BUILTIN_MULPS256,
27309 IX86_BUILTIN_ORPD256,
27310 IX86_BUILTIN_ORPS256,
27311 IX86_BUILTIN_SHUFPD256,
27312 IX86_BUILTIN_SHUFPS256,
27313 IX86_BUILTIN_SUBPD256,
27314 IX86_BUILTIN_SUBPS256,
27315 IX86_BUILTIN_XORPD256,
27316 IX86_BUILTIN_XORPS256,
27317 IX86_BUILTIN_CMPSD,
27318 IX86_BUILTIN_CMPSS,
27319 IX86_BUILTIN_CMPPD,
27320 IX86_BUILTIN_CMPPS,
27321 IX86_BUILTIN_CMPPD256,
27322 IX86_BUILTIN_CMPPS256,
27323 IX86_BUILTIN_CVTDQ2PD256,
27324 IX86_BUILTIN_CVTDQ2PS256,
27325 IX86_BUILTIN_CVTPD2PS256,
27326 IX86_BUILTIN_CVTPS2DQ256,
27327 IX86_BUILTIN_CVTPS2PD256,
27328 IX86_BUILTIN_CVTTPD2DQ256,
27329 IX86_BUILTIN_CVTPD2DQ256,
27330 IX86_BUILTIN_CVTTPS2DQ256,
27331 IX86_BUILTIN_EXTRACTF128PD256,
27332 IX86_BUILTIN_EXTRACTF128PS256,
27333 IX86_BUILTIN_EXTRACTF128SI256,
27334 IX86_BUILTIN_VZEROALL,
27335 IX86_BUILTIN_VZEROUPPER,
27336 IX86_BUILTIN_VPERMILVARPD,
27337 IX86_BUILTIN_VPERMILVARPS,
27338 IX86_BUILTIN_VPERMILVARPD256,
27339 IX86_BUILTIN_VPERMILVARPS256,
27340 IX86_BUILTIN_VPERMILPD,
27341 IX86_BUILTIN_VPERMILPS,
27342 IX86_BUILTIN_VPERMILPD256,
27343 IX86_BUILTIN_VPERMILPS256,
27344 IX86_BUILTIN_VPERMIL2PD,
27345 IX86_BUILTIN_VPERMIL2PS,
27346 IX86_BUILTIN_VPERMIL2PD256,
27347 IX86_BUILTIN_VPERMIL2PS256,
27348 IX86_BUILTIN_VPERM2F128PD256,
27349 IX86_BUILTIN_VPERM2F128PS256,
27350 IX86_BUILTIN_VPERM2F128SI256,
27351 IX86_BUILTIN_VBROADCASTSS,
27352 IX86_BUILTIN_VBROADCASTSD256,
27353 IX86_BUILTIN_VBROADCASTSS256,
27354 IX86_BUILTIN_VBROADCASTPD256,
27355 IX86_BUILTIN_VBROADCASTPS256,
27356 IX86_BUILTIN_VINSERTF128PD256,
27357 IX86_BUILTIN_VINSERTF128PS256,
27358 IX86_BUILTIN_VINSERTF128SI256,
27359 IX86_BUILTIN_LOADUPD256,
27360 IX86_BUILTIN_LOADUPS256,
27361 IX86_BUILTIN_STOREUPD256,
27362 IX86_BUILTIN_STOREUPS256,
27363 IX86_BUILTIN_LDDQU256,
27364 IX86_BUILTIN_MOVNTDQ256,
27365 IX86_BUILTIN_MOVNTPD256,
27366 IX86_BUILTIN_MOVNTPS256,
27367 IX86_BUILTIN_LOADDQU256,
27368 IX86_BUILTIN_STOREDQU256,
27369 IX86_BUILTIN_MASKLOADPD,
27370 IX86_BUILTIN_MASKLOADPS,
27371 IX86_BUILTIN_MASKSTOREPD,
27372 IX86_BUILTIN_MASKSTOREPS,
27373 IX86_BUILTIN_MASKLOADPD256,
27374 IX86_BUILTIN_MASKLOADPS256,
27375 IX86_BUILTIN_MASKSTOREPD256,
27376 IX86_BUILTIN_MASKSTOREPS256,
27377 IX86_BUILTIN_MOVSHDUP256,
27378 IX86_BUILTIN_MOVSLDUP256,
27379 IX86_BUILTIN_MOVDDUP256,
27381 IX86_BUILTIN_SQRTPD256,
27382 IX86_BUILTIN_SQRTPS256,
27383 IX86_BUILTIN_SQRTPS_NR256,
27384 IX86_BUILTIN_RSQRTPS256,
27385 IX86_BUILTIN_RSQRTPS_NR256,
27387 IX86_BUILTIN_RCPPS256,
27389 IX86_BUILTIN_ROUNDPD256,
27390 IX86_BUILTIN_ROUNDPS256,
27392 IX86_BUILTIN_FLOORPD256,
27393 IX86_BUILTIN_CEILPD256,
27394 IX86_BUILTIN_TRUNCPD256,
27395 IX86_BUILTIN_RINTPD256,
27396 IX86_BUILTIN_ROUNDPD_AZ256,
27398 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27399 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27400 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27402 IX86_BUILTIN_FLOORPS256,
27403 IX86_BUILTIN_CEILPS256,
27404 IX86_BUILTIN_TRUNCPS256,
27405 IX86_BUILTIN_RINTPS256,
27406 IX86_BUILTIN_ROUNDPS_AZ256,
27408 IX86_BUILTIN_FLOORPS_SFIX256,
27409 IX86_BUILTIN_CEILPS_SFIX256,
27410 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27412 IX86_BUILTIN_UNPCKHPD256,
27413 IX86_BUILTIN_UNPCKLPD256,
27414 IX86_BUILTIN_UNPCKHPS256,
27415 IX86_BUILTIN_UNPCKLPS256,
27417 IX86_BUILTIN_SI256_SI,
27418 IX86_BUILTIN_PS256_PS,
27419 IX86_BUILTIN_PD256_PD,
27420 IX86_BUILTIN_SI_SI256,
27421 IX86_BUILTIN_PS_PS256,
27422 IX86_BUILTIN_PD_PD256,
27424 IX86_BUILTIN_VTESTZPD,
27425 IX86_BUILTIN_VTESTCPD,
27426 IX86_BUILTIN_VTESTNZCPD,
27427 IX86_BUILTIN_VTESTZPS,
27428 IX86_BUILTIN_VTESTCPS,
27429 IX86_BUILTIN_VTESTNZCPS,
27430 IX86_BUILTIN_VTESTZPD256,
27431 IX86_BUILTIN_VTESTCPD256,
27432 IX86_BUILTIN_VTESTNZCPD256,
27433 IX86_BUILTIN_VTESTZPS256,
27434 IX86_BUILTIN_VTESTCPS256,
27435 IX86_BUILTIN_VTESTNZCPS256,
27436 IX86_BUILTIN_PTESTZ256,
27437 IX86_BUILTIN_PTESTC256,
27438 IX86_BUILTIN_PTESTNZC256,
27440 IX86_BUILTIN_MOVMSKPD256,
27441 IX86_BUILTIN_MOVMSKPS256,
27443 /* AVX2 */
27444 IX86_BUILTIN_MPSADBW256,
27445 IX86_BUILTIN_PABSB256,
27446 IX86_BUILTIN_PABSW256,
27447 IX86_BUILTIN_PABSD256,
27448 IX86_BUILTIN_PACKSSDW256,
27449 IX86_BUILTIN_PACKSSWB256,
27450 IX86_BUILTIN_PACKUSDW256,
27451 IX86_BUILTIN_PACKUSWB256,
27452 IX86_BUILTIN_PADDB256,
27453 IX86_BUILTIN_PADDW256,
27454 IX86_BUILTIN_PADDD256,
27455 IX86_BUILTIN_PADDQ256,
27456 IX86_BUILTIN_PADDSB256,
27457 IX86_BUILTIN_PADDSW256,
27458 IX86_BUILTIN_PADDUSB256,
27459 IX86_BUILTIN_PADDUSW256,
27460 IX86_BUILTIN_PALIGNR256,
27461 IX86_BUILTIN_AND256I,
27462 IX86_BUILTIN_ANDNOT256I,
27463 IX86_BUILTIN_PAVGB256,
27464 IX86_BUILTIN_PAVGW256,
27465 IX86_BUILTIN_PBLENDVB256,
27466 IX86_BUILTIN_PBLENDVW256,
27467 IX86_BUILTIN_PCMPEQB256,
27468 IX86_BUILTIN_PCMPEQW256,
27469 IX86_BUILTIN_PCMPEQD256,
27470 IX86_BUILTIN_PCMPEQQ256,
27471 IX86_BUILTIN_PCMPGTB256,
27472 IX86_BUILTIN_PCMPGTW256,
27473 IX86_BUILTIN_PCMPGTD256,
27474 IX86_BUILTIN_PCMPGTQ256,
27475 IX86_BUILTIN_PHADDW256,
27476 IX86_BUILTIN_PHADDD256,
27477 IX86_BUILTIN_PHADDSW256,
27478 IX86_BUILTIN_PHSUBW256,
27479 IX86_BUILTIN_PHSUBD256,
27480 IX86_BUILTIN_PHSUBSW256,
27481 IX86_BUILTIN_PMADDUBSW256,
27482 IX86_BUILTIN_PMADDWD256,
27483 IX86_BUILTIN_PMAXSB256,
27484 IX86_BUILTIN_PMAXSW256,
27485 IX86_BUILTIN_PMAXSD256,
27486 IX86_BUILTIN_PMAXUB256,
27487 IX86_BUILTIN_PMAXUW256,
27488 IX86_BUILTIN_PMAXUD256,
27489 IX86_BUILTIN_PMINSB256,
27490 IX86_BUILTIN_PMINSW256,
27491 IX86_BUILTIN_PMINSD256,
27492 IX86_BUILTIN_PMINUB256,
27493 IX86_BUILTIN_PMINUW256,
27494 IX86_BUILTIN_PMINUD256,
27495 IX86_BUILTIN_PMOVMSKB256,
27496 IX86_BUILTIN_PMOVSXBW256,
27497 IX86_BUILTIN_PMOVSXBD256,
27498 IX86_BUILTIN_PMOVSXBQ256,
27499 IX86_BUILTIN_PMOVSXWD256,
27500 IX86_BUILTIN_PMOVSXWQ256,
27501 IX86_BUILTIN_PMOVSXDQ256,
27502 IX86_BUILTIN_PMOVZXBW256,
27503 IX86_BUILTIN_PMOVZXBD256,
27504 IX86_BUILTIN_PMOVZXBQ256,
27505 IX86_BUILTIN_PMOVZXWD256,
27506 IX86_BUILTIN_PMOVZXWQ256,
27507 IX86_BUILTIN_PMOVZXDQ256,
27508 IX86_BUILTIN_PMULDQ256,
27509 IX86_BUILTIN_PMULHRSW256,
27510 IX86_BUILTIN_PMULHUW256,
27511 IX86_BUILTIN_PMULHW256,
27512 IX86_BUILTIN_PMULLW256,
27513 IX86_BUILTIN_PMULLD256,
27514 IX86_BUILTIN_PMULUDQ256,
27515 IX86_BUILTIN_POR256,
27516 IX86_BUILTIN_PSADBW256,
27517 IX86_BUILTIN_PSHUFB256,
27518 IX86_BUILTIN_PSHUFD256,
27519 IX86_BUILTIN_PSHUFHW256,
27520 IX86_BUILTIN_PSHUFLW256,
27521 IX86_BUILTIN_PSIGNB256,
27522 IX86_BUILTIN_PSIGNW256,
27523 IX86_BUILTIN_PSIGND256,
27524 IX86_BUILTIN_PSLLDQI256,
27525 IX86_BUILTIN_PSLLWI256,
27526 IX86_BUILTIN_PSLLW256,
27527 IX86_BUILTIN_PSLLDI256,
27528 IX86_BUILTIN_PSLLD256,
27529 IX86_BUILTIN_PSLLQI256,
27530 IX86_BUILTIN_PSLLQ256,
27531 IX86_BUILTIN_PSRAWI256,
27532 IX86_BUILTIN_PSRAW256,
27533 IX86_BUILTIN_PSRADI256,
27534 IX86_BUILTIN_PSRAD256,
27535 IX86_BUILTIN_PSRLDQI256,
27536 IX86_BUILTIN_PSRLWI256,
27537 IX86_BUILTIN_PSRLW256,
27538 IX86_BUILTIN_PSRLDI256,
27539 IX86_BUILTIN_PSRLD256,
27540 IX86_BUILTIN_PSRLQI256,
27541 IX86_BUILTIN_PSRLQ256,
27542 IX86_BUILTIN_PSUBB256,
27543 IX86_BUILTIN_PSUBW256,
27544 IX86_BUILTIN_PSUBD256,
27545 IX86_BUILTIN_PSUBQ256,
27546 IX86_BUILTIN_PSUBSB256,
27547 IX86_BUILTIN_PSUBSW256,
27548 IX86_BUILTIN_PSUBUSB256,
27549 IX86_BUILTIN_PSUBUSW256,
27550 IX86_BUILTIN_PUNPCKHBW256,
27551 IX86_BUILTIN_PUNPCKHWD256,
27552 IX86_BUILTIN_PUNPCKHDQ256,
27553 IX86_BUILTIN_PUNPCKHQDQ256,
27554 IX86_BUILTIN_PUNPCKLBW256,
27555 IX86_BUILTIN_PUNPCKLWD256,
27556 IX86_BUILTIN_PUNPCKLDQ256,
27557 IX86_BUILTIN_PUNPCKLQDQ256,
27558 IX86_BUILTIN_PXOR256,
27559 IX86_BUILTIN_MOVNTDQA256,
27560 IX86_BUILTIN_VBROADCASTSS_PS,
27561 IX86_BUILTIN_VBROADCASTSS_PS256,
27562 IX86_BUILTIN_VBROADCASTSD_PD256,
27563 IX86_BUILTIN_VBROADCASTSI256,
27564 IX86_BUILTIN_PBLENDD256,
27565 IX86_BUILTIN_PBLENDD128,
27566 IX86_BUILTIN_PBROADCASTB256,
27567 IX86_BUILTIN_PBROADCASTW256,
27568 IX86_BUILTIN_PBROADCASTD256,
27569 IX86_BUILTIN_PBROADCASTQ256,
27570 IX86_BUILTIN_PBROADCASTB128,
27571 IX86_BUILTIN_PBROADCASTW128,
27572 IX86_BUILTIN_PBROADCASTD128,
27573 IX86_BUILTIN_PBROADCASTQ128,
27574 IX86_BUILTIN_VPERMVARSI256,
27575 IX86_BUILTIN_VPERMDF256,
27576 IX86_BUILTIN_VPERMVARSF256,
27577 IX86_BUILTIN_VPERMDI256,
27578 IX86_BUILTIN_VPERMTI256,
27579 IX86_BUILTIN_VEXTRACT128I256,
27580 IX86_BUILTIN_VINSERT128I256,
27581 IX86_BUILTIN_MASKLOADD,
27582 IX86_BUILTIN_MASKLOADQ,
27583 IX86_BUILTIN_MASKLOADD256,
27584 IX86_BUILTIN_MASKLOADQ256,
27585 IX86_BUILTIN_MASKSTORED,
27586 IX86_BUILTIN_MASKSTOREQ,
27587 IX86_BUILTIN_MASKSTORED256,
27588 IX86_BUILTIN_MASKSTOREQ256,
27589 IX86_BUILTIN_PSLLVV4DI,
27590 IX86_BUILTIN_PSLLVV2DI,
27591 IX86_BUILTIN_PSLLVV8SI,
27592 IX86_BUILTIN_PSLLVV4SI,
27593 IX86_BUILTIN_PSRAVV8SI,
27594 IX86_BUILTIN_PSRAVV4SI,
27595 IX86_BUILTIN_PSRLVV4DI,
27596 IX86_BUILTIN_PSRLVV2DI,
27597 IX86_BUILTIN_PSRLVV8SI,
27598 IX86_BUILTIN_PSRLVV4SI,
27600 IX86_BUILTIN_GATHERSIV2DF,
27601 IX86_BUILTIN_GATHERSIV4DF,
27602 IX86_BUILTIN_GATHERDIV2DF,
27603 IX86_BUILTIN_GATHERDIV4DF,
27604 IX86_BUILTIN_GATHERSIV4SF,
27605 IX86_BUILTIN_GATHERSIV8SF,
27606 IX86_BUILTIN_GATHERDIV4SF,
27607 IX86_BUILTIN_GATHERDIV8SF,
27608 IX86_BUILTIN_GATHERSIV2DI,
27609 IX86_BUILTIN_GATHERSIV4DI,
27610 IX86_BUILTIN_GATHERDIV2DI,
27611 IX86_BUILTIN_GATHERDIV4DI,
27612 IX86_BUILTIN_GATHERSIV4SI,
27613 IX86_BUILTIN_GATHERSIV8SI,
27614 IX86_BUILTIN_GATHERDIV4SI,
27615 IX86_BUILTIN_GATHERDIV8SI,
27617 /* Alternate 4-element gather for the vectorizer, where
27618 all operands are 32 bytes wide. */
27619 IX86_BUILTIN_GATHERALTSIV4DF,
27620 IX86_BUILTIN_GATHERALTDIV8SF,
27621 IX86_BUILTIN_GATHERALTSIV4DI,
27622 IX86_BUILTIN_GATHERALTDIV8SI,
27624 /* TFmode support builtins. */
27625 IX86_BUILTIN_INFQ,
27626 IX86_BUILTIN_HUGE_VALQ,
27627 IX86_BUILTIN_FABSQ,
27628 IX86_BUILTIN_COPYSIGNQ,
27630 /* Vectorizer support builtins. */
27631 IX86_BUILTIN_CPYSGNPS,
27632 IX86_BUILTIN_CPYSGNPD,
27633 IX86_BUILTIN_CPYSGNPS256,
27634 IX86_BUILTIN_CPYSGNPD256,
27636 /* FMA4 instructions. */
27637 IX86_BUILTIN_VFMADDSS,
27638 IX86_BUILTIN_VFMADDSD,
27639 IX86_BUILTIN_VFMADDPS,
27640 IX86_BUILTIN_VFMADDPD,
27641 IX86_BUILTIN_VFMADDPS256,
27642 IX86_BUILTIN_VFMADDPD256,
27643 IX86_BUILTIN_VFMADDSUBPS,
27644 IX86_BUILTIN_VFMADDSUBPD,
27645 IX86_BUILTIN_VFMADDSUBPS256,
27646 IX86_BUILTIN_VFMADDSUBPD256,
27648 /* FMA3 instructions. */
27649 IX86_BUILTIN_VFMADDSS3,
27650 IX86_BUILTIN_VFMADDSD3,
27652 /* XOP instructions. */
27653 IX86_BUILTIN_VPCMOV,
27654 IX86_BUILTIN_VPCMOV_V2DI,
27655 IX86_BUILTIN_VPCMOV_V4SI,
27656 IX86_BUILTIN_VPCMOV_V8HI,
27657 IX86_BUILTIN_VPCMOV_V16QI,
27658 IX86_BUILTIN_VPCMOV_V4SF,
27659 IX86_BUILTIN_VPCMOV_V2DF,
27660 IX86_BUILTIN_VPCMOV256,
27661 IX86_BUILTIN_VPCMOV_V4DI256,
27662 IX86_BUILTIN_VPCMOV_V8SI256,
27663 IX86_BUILTIN_VPCMOV_V16HI256,
27664 IX86_BUILTIN_VPCMOV_V32QI256,
27665 IX86_BUILTIN_VPCMOV_V8SF256,
27666 IX86_BUILTIN_VPCMOV_V4DF256,
27668 IX86_BUILTIN_VPPERM,
27670 IX86_BUILTIN_VPMACSSWW,
27671 IX86_BUILTIN_VPMACSWW,
27672 IX86_BUILTIN_VPMACSSWD,
27673 IX86_BUILTIN_VPMACSWD,
27674 IX86_BUILTIN_VPMACSSDD,
27675 IX86_BUILTIN_VPMACSDD,
27676 IX86_BUILTIN_VPMACSSDQL,
27677 IX86_BUILTIN_VPMACSSDQH,
27678 IX86_BUILTIN_VPMACSDQL,
27679 IX86_BUILTIN_VPMACSDQH,
27680 IX86_BUILTIN_VPMADCSSWD,
27681 IX86_BUILTIN_VPMADCSWD,
27683 IX86_BUILTIN_VPHADDBW,
27684 IX86_BUILTIN_VPHADDBD,
27685 IX86_BUILTIN_VPHADDBQ,
27686 IX86_BUILTIN_VPHADDWD,
27687 IX86_BUILTIN_VPHADDWQ,
27688 IX86_BUILTIN_VPHADDDQ,
27689 IX86_BUILTIN_VPHADDUBW,
27690 IX86_BUILTIN_VPHADDUBD,
27691 IX86_BUILTIN_VPHADDUBQ,
27692 IX86_BUILTIN_VPHADDUWD,
27693 IX86_BUILTIN_VPHADDUWQ,
27694 IX86_BUILTIN_VPHADDUDQ,
27695 IX86_BUILTIN_VPHSUBBW,
27696 IX86_BUILTIN_VPHSUBWD,
27697 IX86_BUILTIN_VPHSUBDQ,
27699 IX86_BUILTIN_VPROTB,
27700 IX86_BUILTIN_VPROTW,
27701 IX86_BUILTIN_VPROTD,
27702 IX86_BUILTIN_VPROTQ,
27703 IX86_BUILTIN_VPROTB_IMM,
27704 IX86_BUILTIN_VPROTW_IMM,
27705 IX86_BUILTIN_VPROTD_IMM,
27706 IX86_BUILTIN_VPROTQ_IMM,
27708 IX86_BUILTIN_VPSHLB,
27709 IX86_BUILTIN_VPSHLW,
27710 IX86_BUILTIN_VPSHLD,
27711 IX86_BUILTIN_VPSHLQ,
27712 IX86_BUILTIN_VPSHAB,
27713 IX86_BUILTIN_VPSHAW,
27714 IX86_BUILTIN_VPSHAD,
27715 IX86_BUILTIN_VPSHAQ,
27717 IX86_BUILTIN_VFRCZSS,
27718 IX86_BUILTIN_VFRCZSD,
27719 IX86_BUILTIN_VFRCZPS,
27720 IX86_BUILTIN_VFRCZPD,
27721 IX86_BUILTIN_VFRCZPS256,
27722 IX86_BUILTIN_VFRCZPD256,
27724 IX86_BUILTIN_VPCOMEQUB,
27725 IX86_BUILTIN_VPCOMNEUB,
27726 IX86_BUILTIN_VPCOMLTUB,
27727 IX86_BUILTIN_VPCOMLEUB,
27728 IX86_BUILTIN_VPCOMGTUB,
27729 IX86_BUILTIN_VPCOMGEUB,
27730 IX86_BUILTIN_VPCOMFALSEUB,
27731 IX86_BUILTIN_VPCOMTRUEUB,
27733 IX86_BUILTIN_VPCOMEQUW,
27734 IX86_BUILTIN_VPCOMNEUW,
27735 IX86_BUILTIN_VPCOMLTUW,
27736 IX86_BUILTIN_VPCOMLEUW,
27737 IX86_BUILTIN_VPCOMGTUW,
27738 IX86_BUILTIN_VPCOMGEUW,
27739 IX86_BUILTIN_VPCOMFALSEUW,
27740 IX86_BUILTIN_VPCOMTRUEUW,
27742 IX86_BUILTIN_VPCOMEQUD,
27743 IX86_BUILTIN_VPCOMNEUD,
27744 IX86_BUILTIN_VPCOMLTUD,
27745 IX86_BUILTIN_VPCOMLEUD,
27746 IX86_BUILTIN_VPCOMGTUD,
27747 IX86_BUILTIN_VPCOMGEUD,
27748 IX86_BUILTIN_VPCOMFALSEUD,
27749 IX86_BUILTIN_VPCOMTRUEUD,
27751 IX86_BUILTIN_VPCOMEQUQ,
27752 IX86_BUILTIN_VPCOMNEUQ,
27753 IX86_BUILTIN_VPCOMLTUQ,
27754 IX86_BUILTIN_VPCOMLEUQ,
27755 IX86_BUILTIN_VPCOMGTUQ,
27756 IX86_BUILTIN_VPCOMGEUQ,
27757 IX86_BUILTIN_VPCOMFALSEUQ,
27758 IX86_BUILTIN_VPCOMTRUEUQ,
27760 IX86_BUILTIN_VPCOMEQB,
27761 IX86_BUILTIN_VPCOMNEB,
27762 IX86_BUILTIN_VPCOMLTB,
27763 IX86_BUILTIN_VPCOMLEB,
27764 IX86_BUILTIN_VPCOMGTB,
27765 IX86_BUILTIN_VPCOMGEB,
27766 IX86_BUILTIN_VPCOMFALSEB,
27767 IX86_BUILTIN_VPCOMTRUEB,
27769 IX86_BUILTIN_VPCOMEQW,
27770 IX86_BUILTIN_VPCOMNEW,
27771 IX86_BUILTIN_VPCOMLTW,
27772 IX86_BUILTIN_VPCOMLEW,
27773 IX86_BUILTIN_VPCOMGTW,
27774 IX86_BUILTIN_VPCOMGEW,
27775 IX86_BUILTIN_VPCOMFALSEW,
27776 IX86_BUILTIN_VPCOMTRUEW,
27778 IX86_BUILTIN_VPCOMEQD,
27779 IX86_BUILTIN_VPCOMNED,
27780 IX86_BUILTIN_VPCOMLTD,
27781 IX86_BUILTIN_VPCOMLED,
27782 IX86_BUILTIN_VPCOMGTD,
27783 IX86_BUILTIN_VPCOMGED,
27784 IX86_BUILTIN_VPCOMFALSED,
27785 IX86_BUILTIN_VPCOMTRUED,
27787 IX86_BUILTIN_VPCOMEQQ,
27788 IX86_BUILTIN_VPCOMNEQ,
27789 IX86_BUILTIN_VPCOMLTQ,
27790 IX86_BUILTIN_VPCOMLEQ,
27791 IX86_BUILTIN_VPCOMGTQ,
27792 IX86_BUILTIN_VPCOMGEQ,
27793 IX86_BUILTIN_VPCOMFALSEQ,
27794 IX86_BUILTIN_VPCOMTRUEQ,
27796 /* LWP instructions. */
27797 IX86_BUILTIN_LLWPCB,
27798 IX86_BUILTIN_SLWPCB,
27799 IX86_BUILTIN_LWPVAL32,
27800 IX86_BUILTIN_LWPVAL64,
27801 IX86_BUILTIN_LWPINS32,
27802 IX86_BUILTIN_LWPINS64,
27804 IX86_BUILTIN_CLZS,
27806 /* RTM */
27807 IX86_BUILTIN_XBEGIN,
27808 IX86_BUILTIN_XEND,
27809 IX86_BUILTIN_XABORT,
27810 IX86_BUILTIN_XTEST,
27812 /* BMI instructions. */
27813 IX86_BUILTIN_BEXTR32,
27814 IX86_BUILTIN_BEXTR64,
27815 IX86_BUILTIN_CTZS,
27817 /* TBM instructions. */
27818 IX86_BUILTIN_BEXTRI32,
27819 IX86_BUILTIN_BEXTRI64,
27821 /* BMI2 instructions. */
27822 IX86_BUILTIN_BZHI32,
27823 IX86_BUILTIN_BZHI64,
27824 IX86_BUILTIN_PDEP32,
27825 IX86_BUILTIN_PDEP64,
27826 IX86_BUILTIN_PEXT32,
27827 IX86_BUILTIN_PEXT64,
27829 /* ADX instructions. */
27830 IX86_BUILTIN_ADDCARRYX32,
27831 IX86_BUILTIN_ADDCARRYX64,
27833 /* FSGSBASE instructions. */
27834 IX86_BUILTIN_RDFSBASE32,
27835 IX86_BUILTIN_RDFSBASE64,
27836 IX86_BUILTIN_RDGSBASE32,
27837 IX86_BUILTIN_RDGSBASE64,
27838 IX86_BUILTIN_WRFSBASE32,
27839 IX86_BUILTIN_WRFSBASE64,
27840 IX86_BUILTIN_WRGSBASE32,
27841 IX86_BUILTIN_WRGSBASE64,
27843 /* RDRND instructions. */
27844 IX86_BUILTIN_RDRAND16_STEP,
27845 IX86_BUILTIN_RDRAND32_STEP,
27846 IX86_BUILTIN_RDRAND64_STEP,
27848 /* RDSEED instructions. */
27849 IX86_BUILTIN_RDSEED16_STEP,
27850 IX86_BUILTIN_RDSEED32_STEP,
27851 IX86_BUILTIN_RDSEED64_STEP,
27853 /* F16C instructions. */
27854 IX86_BUILTIN_CVTPH2PS,
27855 IX86_BUILTIN_CVTPH2PS256,
27856 IX86_BUILTIN_CVTPS2PH,
27857 IX86_BUILTIN_CVTPS2PH256,
27859 /* CFString built-in for darwin */
27860 IX86_BUILTIN_CFSTRING,
27862 /* Builtins to get CPU type and supported features. */
27863 IX86_BUILTIN_CPU_INIT,
27864 IX86_BUILTIN_CPU_IS,
27865 IX86_BUILTIN_CPU_SUPPORTS,
27867 IX86_BUILTIN_MAX
27870 /* Table for the ix86 builtin decls. */
27871 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27873 /* Table of all of the builtin functions that are possible with different ISAs
27874 but are waiting to be built until a function is declared to use that
27875 ISA. */
27876 struct builtin_isa {
27877 const char *name; /* function name */
27878 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27879 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27880 bool const_p; /* true if the declaration is constant */
27881 bool set_and_not_built_p; /* true if recorded here but the decl is not yet built */
27884 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27887 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27888 of which isa_flags to use in the ix86_builtins_isa array. Store the
27889 function decl in the ix86_builtins array. Return the function decl, or
27890 NULL_TREE if the builtin was not added.
27892 If the front end has a special hook for builtin functions, delay adding
27893 builtin functions that aren't in the current ISA until the ISA is changed
27894 with function-specific optimization. Doing so can save about 300K for the
27895 default compiler. When the builtin is expanded, check at that time whether
27896 it is valid.
27898 If the front end doesn't have a special hook, record all builtins, even
27899 those whose instruction set isn't in the current ISA, in case the user uses
27900 function-specific options for a different ISA, so that we don't get scope
27901 errors if a builtin is added in the middle of a function scope. */
27903 static inline tree
27904 def_builtin (HOST_WIDE_INT mask, const char *name,
27905 enum ix86_builtin_func_type tcode,
27906 enum ix86_builtins code)
27908 tree decl = NULL_TREE;
27910 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27912 ix86_builtins_isa[(int) code].isa = mask;
27914 mask &= ~OPTION_MASK_ISA_64BIT;
27915 if (mask == 0
27916 || (mask & ix86_isa_flags) != 0
27917 || (lang_hooks.builtin_function
27918 == lang_hooks.builtin_function_ext_scope))
27921 tree type = ix86_get_builtin_func_type (tcode);
27922 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27923 NULL, NULL_TREE);
27924 ix86_builtins[(int) code] = decl;
27925 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27927 else
27929 ix86_builtins[(int) code] = NULL_TREE;
27930 ix86_builtins_isa[(int) code].tcode = tcode;
27931 ix86_builtins_isa[(int) code].name = name;
27932 ix86_builtins_isa[(int) code].const_p = false;
27933 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27937 return decl;
27940 /* Like def_builtin, but also marks the function decl "const". */
27942 static inline tree
27943 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27944 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27946 tree decl = def_builtin (mask, name, tcode, code);
27947 if (decl)
27948 TREE_READONLY (decl) = 1;
27949 else
27950 ix86_builtins_isa[(int) code].const_p = true;
27952 return decl;
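/* Example of how the two helpers above are used (a sketch; the real
   registrations are made later in this file, largely by walking the
   bdesc_* tables).  The builtin names and function-type codes are the
   kind that appear in those tables; the wrapper function is hypothetical.  */
#if 0
static void
example_register_builtins (void)
{
  /* Deferred unless -msse is enabled (or the front end has the
     ext-scope builtin hook).  */
  def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

  /* As above, but the resulting decl is also marked TREE_READONLY.  */
  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
		     V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);
}
#endif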
27955 /* Add any new builtin functions for a given ISA that may not have been
27956 declared. This saves a bit of space compared to adding all of the
27957 declarations to the tree up front, whether or not they are used. */
27959 static void
27960 ix86_add_new_builtins (HOST_WIDE_INT isa)
27962 int i;
27964 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27966 if ((ix86_builtins_isa[i].isa & isa) != 0
27967 && ix86_builtins_isa[i].set_and_not_built_p)
27969 tree decl, type;
27971 /* Don't define the builtin again. */
27972 ix86_builtins_isa[i].set_and_not_built_p = false;
27974 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27975 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27976 type, i, BUILT_IN_MD, NULL,
27977 NULL_TREE);
27979 ix86_builtins[i] = decl;
27980 if (ix86_builtins_isa[i].const_p)
27981 TREE_READONLY (decl) = 1;
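/* Lifecycle sketch (assumed flow): with the C family front ends, a builtin
   gated on, say, OPTION_MASK_ISA_AVX2 is merely recorded in
   ix86_builtins_isa by def_builtin when -mavx2 is not enabled.  If a
   function later selects that ISA, e.g.

     __attribute__((target ("avx2"))) void f (void) { ... }

   the target-attribute handling recomputes isa_flags and calls
   ix86_add_new_builtins, at which point the deferred decl is finally
   built in the extended (file) scope.  */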
27986 /* Bits for builtin_description.flag. */
27988 /* Set when we don't support the comparison natively, and should
27989 swap the comparison operands in order to support it. */
27990 #define BUILTIN_DESC_SWAP_OPERANDS 1
27992 struct builtin_description
27994 const HOST_WIDE_INT mask;
27995 const enum insn_code icode;
27996 const char *const name;
27997 const enum ix86_builtins code;
27998 const enum rtx_code comparison;
27999 const int flag;
28002 static const struct builtin_description bdesc_comi[] =
28004 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
28005 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
28006 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
28007 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
28008 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
28009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
28010 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
28011 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
28012 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
28013 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
28014 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
28015 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
28016 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
28017 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
28018 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
28019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
28020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
28021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
28022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
28023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
28024 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
28025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
28026 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
28027 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
28030 static const struct builtin_description bdesc_pcmpestr[] =
28032 /* SSE4.2 */
28033 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
28034 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
28035 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
28036 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
28037 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
28038 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
28039 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
28042 static const struct builtin_description bdesc_pcmpistr[] =
28044 /* SSE4.2 */
28045 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
28046 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
28047 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
28048 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
28049 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
28050 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
28051 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
28054 /* Special builtins with variable number of arguments. */
28055 static const struct builtin_description bdesc_special_args[] =
28057 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
28058 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
28059 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
28061 /* 80387 (for use internally for atomic compound assignment). */
28062 { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
28063 { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
28064 { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
28065 { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
28067 /* MMX */
28068 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28070 /* 3DNow! */
28071 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28073 /* FXSR, XSAVE and XSAVEOPT */
28074 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
28075 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
28076 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28077 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28078 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28080 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28081 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28082 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28083 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28084 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28086 /* SSE */
28087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28096 /* SSE or 3DNow!A */
28097 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28098 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
28100 /* SSE2 */
28101 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28102 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28103 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28104 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
28105 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28106 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
28107 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
28108 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
28109 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
28110 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28112 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28115 /* SSE3 */
28116 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28118 /* SSE4.1 */
28119 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
28121 /* SSE4A */
28122 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28123 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28125 /* AVX */
28126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
28127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
28129 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28130 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28131 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28132 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28135 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28138 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28139 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28140 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28148 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28149 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28154 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28156 /* AVX2 */
28157 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28158 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28159 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28160 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28161 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28162 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28163 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28164 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28165 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28167 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28168 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28169 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28170 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28171 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28172 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28174 /* FSGSBASE */
28175 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28176 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28177 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28178 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28179 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28180 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28181 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28182 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28184 /* RTM */
28185 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28186 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28187 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28188 };
28190 /* Builtins with variable number of arguments. */
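/* Each entry below supplies the ISA mask that must be enabled for the
   builtin, the insn pattern used to expand it, the user-visible builtin
   name, its IX86_BUILTIN_* code, an optional comparison/rounding sub-code,
   and the function type describing the prototype.  As an illustration
   only (not part of the table), user code such as

     typedef float v4sf __attribute__ ((vector_size (16)));
     v4sf add_ps (v4sf a, v4sf b) { return __builtin_ia32_addps (a, b); }

   compiled with -msse is resolved through the V4SF_FTYPE_V4SF_V4SF entry
   for "__builtin_ia32_addps" below and expanded via CODE_FOR_addv4sf3.  */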
28191 static const struct builtin_description bdesc_args[] =
28192 {
28193 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28194 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28195 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28196 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28197 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28198 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28199 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28201 /* MMX */
28202 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28203 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28204 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28205 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28206 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28207 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28209 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28210 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28211 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28212 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28213 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28214 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28215 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28216 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28218 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28219 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28221 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28222 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28223 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28224 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28226 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28227 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28228 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28229 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28230 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28231 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28233 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28234 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28235 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28236 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28237 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28238 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28240 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28241 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28242 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
28246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28248 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28249 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28250 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28253 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28255 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28257 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28258 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28260 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28261 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28262 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28263 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
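/* The *_COUNT function types distinguish how the shift count is passed:
   the *i builtins (e.g. __builtin_ia32_psllwi) take a scalar count, while
   the non-i forms take the count in a vector register; both expand through
   the same shift pattern and are handled accordingly at expansion time.  */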
28265 /* 3DNow! */
28266 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28267 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28268 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28269 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28271 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28272 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28273 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28274 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28275 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28276 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28277 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28278 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28279 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28280 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28281 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28282 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28283 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28284 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28285 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28287 /* 3DNow!A */
28288 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28289 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28290 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28291 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28292 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28293 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28295 /* SSE */
28296 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28297 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28298 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28299 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28300 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28301 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28302 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28303 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28304 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28305 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28306 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28307 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28309 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28311 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28312 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28313 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28315 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28316 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28317 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28318 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28320 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28321 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28322 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28323 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28324 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28325 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28326 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28328 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28329 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28330 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28331 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28332 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28333 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28334 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28335 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28336 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28339 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
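/* CMPPS/CMPSS encode only the eq/lt/le/unord/neq/nlt/nle/ord predicates;
   the *_SWAP function types above mark comparisons that are expanded with
   their two operands exchanged, which is how cmpgt/cmpge and their negated
   forms reuse the lt/le/unge/ungt patterns.  */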
28341 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28342 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28346 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28347 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28348 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28349 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28351 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28354 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28356 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28357 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28359 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28360 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28361 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28363 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28366 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28367 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28369 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28370 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28372 /* SSE MMX or 3DNow!A */
28373 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28374 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28375 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28377 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28378 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28379 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28380 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28382 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28383 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28385 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28387 /* SSE2 */
28388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28390 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28394 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28396 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28397 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28400 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28405 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28406 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28407 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28411 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28413 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28414 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28415 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28416 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28418 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28420 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28426 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28430 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28431 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28444 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28448 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28450 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28451 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28453 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28456 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28457 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28459 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28461 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28462 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28463 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28464 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28465 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28466 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28467 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28468 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28470 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28472 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28475 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28476 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28479 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28480 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28482 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28484 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28485 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28487 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28490 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28491 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28493 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28494 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28495 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28497 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28498 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28499 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28500 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28502 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28503 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28504 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28505 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28506 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28507 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28508 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28509 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28515 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28519 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28524 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28526 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28529 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28530 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28531 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28532 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28533 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28534 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28537 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28538 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28539 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28540 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28541 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28542 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28544 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28545 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28546 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28547 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28549 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28550 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28551 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28553 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28555 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28557 /* SSE2 MMX */
28558 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28559 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28561 /* SSE3 */
28562 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28563 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28565 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28566 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28567 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28568 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28569 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28570 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28572 /* SSSE3 */
28573 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28574 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28575 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28576 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28577 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28578 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28580 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28581 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28582 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28583 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28584 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28585 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28586 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28587 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28588 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28589 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28590 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28591 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28592 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28593 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28594 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28595 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28596 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28597 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28598 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28599 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28600 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28601 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28602 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28603 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28605 /* SSSE3. */
28606 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28607 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28609 /* SSE4.1 */
28610 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28611 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28612 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28613 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28614 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28615 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28616 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28617 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28618 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28619 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28621 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28622 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28623 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28624 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28625 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28626 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28627 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28628 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28629 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28630 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28631 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28632 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28633 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28635 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28636 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28637 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28638 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28639 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28640 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28641 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28642 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28643 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28644 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28645 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28646 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28648 /* SSE4.1 */
28649 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28650 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28651 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28652 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28654 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28655 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28656 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28657 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28659 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28660 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28662 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28663 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28665 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28666 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28667 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28668 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28670 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28671 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28673 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28674 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28676 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28677 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28678 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
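/* Note on the SSE4.1 entries above (inferred from the table layout, not
   restated from the expanders): the floor/ceil/trunc/rint variants all
   reuse the same round insn pattern, and the ROUND_FLOOR, ROUND_CEIL,
   ROUND_TRUNC or ROUND_MXCSR value carried in the comparison-code slot
   presumably becomes the rounding-mode immediate at expansion time.
   Likewise the three ptest entries share one pattern and differ only in
   the comparison code (EQ, LTU, GTU), which appears to select which flag
   of the PTEST result (ZF, CF, or neither) is tested.  */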
28680 /* SSE4.2 */
28681 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28682 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28683 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28684 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28685 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
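/* The CRC32 entries above back the SSE4.2 CRC intrinsics.  A minimal
   usage sketch, assuming the <smmintrin.h> wrappers _mm_crc32_u8 and
   friends forward to these builtins:

     #include <smmintrin.h>
     unsigned int crc_step (unsigned int crc, unsigned char byte)
     {
       return _mm_crc32_u8 (crc, byte);   // via __builtin_ia32_crc32qi
     }

   compiled with -msse4.2.  */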
28687 /* SSE4A */
28688 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28689 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28690 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28691 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28693 /* AES */
28694 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28695 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28697 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28698 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28699 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28700 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28702 /* PCLMUL */
28703 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
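/* The AES and PCLMUL entries above carry a null name (0): they seem to be
   listed here only so the generic expansion machinery can map the
   IX86_BUILTIN_* codes to insn patterns and function types, while the
   user-visible __builtin_ia32_aes* / __builtin_ia32_pclmulqdq128 names are
   registered elsewhere with their own ISA checks.  The OPTION_MASK_ISA_SSE2
   mask here therefore looks like a baseline requirement rather than the
   full AES/PCLMUL gate.  */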
28705 /* AVX */
28706 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28707 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28710 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28711 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28713 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28714 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28716 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28717 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28719 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28720 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28721 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28722 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28723 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28724 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28725 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28726 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28727 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28728 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28729 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28730 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28731 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28738 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28739 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28742 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28748 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28753 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28754 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28755 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28757 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28758 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28759 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28760 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28761 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28763 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28764 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28765 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28768 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28769 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28777 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28779 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28781 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28793 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28794 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28800 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28802 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28805 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28807 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28808 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28810 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28812 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28813 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28815 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28816 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28818 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28819 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28820 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28822 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28824 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28825 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28826 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28841 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28842 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28844 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
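/* The si256_si / ps256_ps / pd256_pd entries and the vec_extract_lo_*
   counterparts above implement the 128<->256-bit cast intrinsics.  A usage
   sketch, assuming the avxintrin.h wrappers take this route:

     #include <immintrin.h>
     __m256 widen (__m128 lo)
     {
       return _mm256_castps128_ps256 (lo);   // __builtin_ia32_ps256_ps
     }
     __m128 narrow (__m256 v)
     {
       return _mm256_castps256_ps128 (v);    // __builtin_ia32_ps_ps256
     }

   The upper 128 bits of the widened value are undefined, matching the
   usual cast semantics.  */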
28846 /* AVX2 */
28847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28848 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28849 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28850 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28853 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28855 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28856 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28857 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28858 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28863 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28864 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28866 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28874 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28876 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28877 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28886 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28887 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28888 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28889 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28890 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28891 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28892 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28893 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28894 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28895 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28896 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28897 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28911 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28913 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28914 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28915 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28916 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28917 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28918 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28928 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28929 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28930 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28931 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28932 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28933 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28934 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28935 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28936 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28937 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28939 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28940 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28941 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28942 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28943 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28944 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28945 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28946 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28947 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28948 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28959 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28961 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28973 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28974 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28975 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28976 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28977 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28978 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28979 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28980 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28981 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28982 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28983 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28984 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28985 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28986 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28989 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
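/* Naming convention visible in the AVX2 shift entries above (read off the
   function-type names, not the expander): the *_SI_COUNT types mark the
   scalar/immediate shift-count forms (e.g. __builtin_ia32_psllwi256), the
   *_V8HI_COUNT / *_V4SI_COUNT / *_V2DI_COUNT types mark the forms that take
   the count in the low part of a vector register (e.g.
   __builtin_ia32_psllw256), and the *_INT_CONVERT types (pslldqi256,
   psrldqi256, palignr256) appear to flag operands that are reinterpreted in
   a wider mode (V2TI) before the byte-granular operation is applied.  */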
28994 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28996 /* BMI */
28997 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28998 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28999 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29001 /* TBM */
29002 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29003 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29005 /* F16C */
29006 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
29007 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
29008 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
29009 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
29011 /* BMI2 */
29012 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29013 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29014 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29015 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29016 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29017 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
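/* The BMI2 entries above are normally reached through the bmi2intrin.h
   wrappers.  A minimal sketch, assuming _pdep_u32 and _pext_u32 forward to
   the builtins listed here:

     #include <x86intrin.h>
     unsigned int gather_bits (unsigned int src, unsigned int mask)
     {
       return _pext_u32 (src, mask);   // __builtin_ia32_pext_si
     }
     unsigned int scatter_bits (unsigned int src, unsigned int mask)
     {
       return _pdep_u32 (src, mask);   // __builtin_ia32_pdep_si
     }

   compiled with -mbmi2.  */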
29020 /* FMA4 and XOP. */
29021 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29022 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29023 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29024 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29025 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29026 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29027 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29028 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29029 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29030 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29031 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29032 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29033 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29034 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29035 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29036 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29037 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29038 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29039 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29040 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29041 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29042 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29043 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29044 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29045 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29046 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
29047 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
29048 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
29049 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
29050 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
29051 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
29052 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
29053 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
29054 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
29055 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
29056 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
29057 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
29058 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
29059 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
29060 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
29061 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
29062 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
29063 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
29064 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
29065 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
29066 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
29067 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
29068 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
29069 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
29070 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
29071 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
29072 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
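/* Shorthand used by the MULTI_ARG_* macros above, read off the definitions
   themselves: the leading digit is the number of vector operands, the
   element tag names the element type (SF, DF, DI, SI, HI, QI), a trailing
   "2" selects the 256-bit forms, two tags (e.g. _SI_DI, _QI_HI) indicate
   mixed element widths between the inputs and the result or accumulator,
   _IMM adds a trailing immediate operand, and _CMP and _TF appear to add a
   trailing comparison selector.  */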
29074 static const struct builtin_description bdesc_multi_arg[] =
29075 {
29076 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
29077 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
29078 UNKNOWN, (int)MULTI_ARG_3_SF },
29079 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
29080 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
29081 UNKNOWN, (int)MULTI_ARG_3_DF },
29083 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
29084 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
29085 UNKNOWN, (int)MULTI_ARG_3_SF },
29086 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
29087 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
29088 UNKNOWN, (int)MULTI_ARG_3_DF },
29090 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
29091 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
29092 UNKNOWN, (int)MULTI_ARG_3_SF },
29093 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
29094 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
29095 UNKNOWN, (int)MULTI_ARG_3_DF },
29096 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
29097 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
29098 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29099 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
29100 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
29101 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29103 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
29104 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
29105 UNKNOWN, (int)MULTI_ARG_3_SF },
29106 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
29107 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
29108 UNKNOWN, (int)MULTI_ARG_3_DF },
29109 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
29110 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
29111 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29112 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
29113 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
29114 UNKNOWN, (int)MULTI_ARG_3_DF2 },
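/* The packed vfmadd and vfmaddsub entries above set both
   OPTION_MASK_ISA_FMA and OPTION_MASK_ISA_FMA4, so a single descriptor
   appears to serve either ISA; the scalar forms, by contrast, keep
   separate FMA4 (vfmaddss, vfmaddsd) and FMA3 (vfmaddss3, vfmaddsd3)
   entries with their own masks.  */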
29116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
29117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
29118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
29119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
29120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
29121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
29122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
29124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
29127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
29128 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
29129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
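/* The vpmacs and vpmacss entries above are the XOP integer
   multiply-accumulate builtins; the doubled "ss" forms appear to be the
   signed-saturating variants, and the dql/dqh pairs accumulate products of
   the low or high dword halves into quadword lanes, which is why they use
   the MULTI_ARG_3_SI_DI signature with a V2DI accumulator.  */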
29147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
29165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
29166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29276 /* TM vector builtins. */
29278 /* Reuse the existing x86-specific `struct builtin_description' because
29279 we're lazy. Add casts to make them fit. */
29280 static const struct builtin_description bdesc_tm[] =
29282 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29283 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29284 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29285 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29286 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29287 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29288 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29290 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29291 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29292 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29293 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29294 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29295 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29296 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29298 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29299 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29300 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29301 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29302 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29303 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29304 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29306 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29307 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29308 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29311 /* TM callbacks. */
29313 /* Return the builtin decl needed to load a vector of TYPE. */
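/* For example (illustrative, not taken from a dump): for a 128-bit vector
   type such as V4SF this returns the decl of __builtin__ITM_RM128 registered
   in bdesc_tm above; 64-bit and 256-bit vectors map to the M64 and M256
   variants, and any other type yields NULL_TREE.  */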
29315 static tree
29316 ix86_builtin_tm_load (tree type)
29318 if (TREE_CODE (type) == VECTOR_TYPE)
29320 switch (tree_low_cst (TYPE_SIZE (type), 1))
29322 case 64:
29323 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29324 case 128:
29325 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29326 case 256:
29327 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29330 return NULL_TREE;
29333 /* Return the builtin decl needed to store a vector of TYPE. */
29335 static tree
29336 ix86_builtin_tm_store (tree type)
29338 if (TREE_CODE (type) == VECTOR_TYPE)
29340 switch (tree_low_cst (TYPE_SIZE (type), 1))
29342 case 64:
29343 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29344 case 128:
29345 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29346 case 256:
29347 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29350 return NULL_TREE;
29353 /* Initialize the transactional memory vector load/store builtins. */
29355 static void
29356 ix86_init_tm_builtins (void)
29358 enum ix86_builtin_func_type ftype;
29359 const struct builtin_description *d;
29360 size_t i;
29361 tree decl;
29362 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29363 tree attrs_log, attrs_type_log;
29365 if (!flag_tm)
29366 return;
29368 /* If there are no builtins defined, we must be compiling in a
29369 language without trans-mem support. */
29370 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29371 return;
29373 /* Use whatever attributes a normal TM load has. */
29374 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29375 attrs_load = DECL_ATTRIBUTES (decl);
29376 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29377 /* Use whatever attributes a normal TM store has. */
29378 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29379 attrs_store = DECL_ATTRIBUTES (decl);
29380 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29381 /* Use whatever attributes a normal TM log has. */
29382 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29383 attrs_log = DECL_ATTRIBUTES (decl);
29384 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29386 for (i = 0, d = bdesc_tm;
29387 i < ARRAY_SIZE (bdesc_tm);
29388 i++, d++)
29390 if ((d->mask & ix86_isa_flags) != 0
29391 || (lang_hooks.builtin_function
29392 == lang_hooks.builtin_function_ext_scope))
29394 tree type, attrs, attrs_type;
29395 enum built_in_function code = (enum built_in_function) d->code;
29397 ftype = (enum ix86_builtin_func_type) d->flag;
29398 type = ix86_get_builtin_func_type (ftype);
29400 if (BUILTIN_TM_LOAD_P (code))
29402 attrs = attrs_load;
29403 attrs_type = attrs_type_load;
29405 else if (BUILTIN_TM_STORE_P (code))
29407 attrs = attrs_store;
29408 attrs_type = attrs_type_store;
29410 else
29412 attrs = attrs_log;
29413 attrs_type = attrs_type_log;
29415 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29416 /* The builtin without the prefix for
29417 calling it directly. */
29418 d->name + strlen ("__builtin_"),
29419 attrs);
29420 /* add_builtin_function() will set the DECL_ATTRIBUTES; now
29421 set the TYPE_ATTRIBUTES. */
29422 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29424 set_builtin_decl (code, decl, false);
29429 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
29430 in the current target ISA, so that the user can compile particular modules
29431 with target-specific options that differ from the command-line
29432 options. */
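/* A minimal sketch of what this enables (the function name and types below
   are illustrative assumptions, not part of this file): when compiling with
   only -msse2 on the command line, an AVX2 builtin can still be used inside
   a function carrying the corresponding target attribute, because its decl
   is registered here regardless of the global ISA flags:

     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256i add_epi32_avx2 (__m256i a, __m256i b)
     {
       return (__m256i) __builtin_ia32_paddd256 ((__v8si) a, (__v8si) b);
     }
*/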
29433 static void
29434 ix86_init_mmx_sse_builtins (void)
29436 const struct builtin_description * d;
29437 enum ix86_builtin_func_type ftype;
29438 size_t i;
29440 /* Add all special builtins with variable number of operands. */
29441 for (i = 0, d = bdesc_special_args;
29442 i < ARRAY_SIZE (bdesc_special_args);
29443 i++, d++)
29445 if (d->name == 0)
29446 continue;
29448 ftype = (enum ix86_builtin_func_type) d->flag;
29449 def_builtin (d->mask, d->name, ftype, d->code);
29452 /* Add all builtins with variable number of operands. */
29453 for (i = 0, d = bdesc_args;
29454 i < ARRAY_SIZE (bdesc_args);
29455 i++, d++)
29457 if (d->name == 0)
29458 continue;
29460 ftype = (enum ix86_builtin_func_type) d->flag;
29461 def_builtin_const (d->mask, d->name, ftype, d->code);
29464 /* pcmpestr[im] insns. */
29465 for (i = 0, d = bdesc_pcmpestr;
29466 i < ARRAY_SIZE (bdesc_pcmpestr);
29467 i++, d++)
29469 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29470 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29471 else
29472 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29473 def_builtin_const (d->mask, d->name, ftype, d->code);
29476 /* pcmpistr[im] insns. */
29477 for (i = 0, d = bdesc_pcmpistr;
29478 i < ARRAY_SIZE (bdesc_pcmpistr);
29479 i++, d++)
29481 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29482 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29483 else
29484 ftype = INT_FTYPE_V16QI_V16QI_INT;
29485 def_builtin_const (d->mask, d->name, ftype, d->code);
29488 /* comi/ucomi insns. */
29489 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29491 if (d->mask == OPTION_MASK_ISA_SSE2)
29492 ftype = INT_FTYPE_V2DF_V2DF;
29493 else
29494 ftype = INT_FTYPE_V4SF_V4SF;
29495 def_builtin_const (d->mask, d->name, ftype, d->code);
29498 /* SSE */
29499 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29500 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29501 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29502 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29504 /* SSE or 3DNow!A */
29505 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29506 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29507 IX86_BUILTIN_MASKMOVQ);
29509 /* SSE2 */
29510 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29511 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29513 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29514 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29515 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29516 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29518 /* SSE3. */
29519 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29520 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29521 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29522 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29524 /* AES */
29525 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29526 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29527 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29528 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29529 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29530 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29531 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29532 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29533 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29534 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29535 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29536 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29538 /* PCLMUL */
29539 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29540 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29542 /* RDRND */
29543 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29544 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29545 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29546 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29547 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29548 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29549 IX86_BUILTIN_RDRAND64_STEP);
29551 /* AVX2 */
29552 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29553 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29554 IX86_BUILTIN_GATHERSIV2DF);
29556 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29557 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29558 IX86_BUILTIN_GATHERSIV4DF);
29560 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29561 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29562 IX86_BUILTIN_GATHERDIV2DF);
29564 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29565 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29566 IX86_BUILTIN_GATHERDIV4DF);
29568 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29569 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29570 IX86_BUILTIN_GATHERSIV4SF);
29572 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29573 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29574 IX86_BUILTIN_GATHERSIV8SF);
29576 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29577 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29578 IX86_BUILTIN_GATHERDIV4SF);
29580 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29581 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29582 IX86_BUILTIN_GATHERDIV8SF);
29584 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29585 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29586 IX86_BUILTIN_GATHERSIV2DI);
29588 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29589 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29590 IX86_BUILTIN_GATHERSIV4DI);
29592 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29593 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29594 IX86_BUILTIN_GATHERDIV2DI);
29596 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29597 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29598 IX86_BUILTIN_GATHERDIV4DI);
29600 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29601 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29602 IX86_BUILTIN_GATHERSIV4SI);
29604 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29605 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29606 IX86_BUILTIN_GATHERSIV8SI);
29608 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29609 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29610 IX86_BUILTIN_GATHERDIV4SI);
29612 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29613 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29614 IX86_BUILTIN_GATHERDIV8SI);
29616 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29617 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29618 IX86_BUILTIN_GATHERALTSIV4DF);
29620 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29621 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29622 IX86_BUILTIN_GATHERALTDIV8SF);
29624 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29625 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29626 IX86_BUILTIN_GATHERALTSIV4DI);
29628 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29629 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29630 IX86_BUILTIN_GATHERALTDIV8SI);
29632 /* RTM. */
29633 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29634 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29636 /* MMX access to the vec_init patterns. */
29637 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29638 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29640 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29641 V4HI_FTYPE_HI_HI_HI_HI,
29642 IX86_BUILTIN_VEC_INIT_V4HI);
29644 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29645 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29646 IX86_BUILTIN_VEC_INIT_V8QI);
29648 /* Access to the vec_extract patterns. */
29649 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29650 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29651 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29652 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29653 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29654 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29655 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29656 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29657 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29658 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29660 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29661 "__builtin_ia32_vec_ext_v4hi",
29662 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29664 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29665 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29667 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29668 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29670 /* Access to the vec_set patterns. */
29671 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29672 "__builtin_ia32_vec_set_v2di",
29673 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29675 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29676 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29678 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29679 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29681 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29682 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29684 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29685 "__builtin_ia32_vec_set_v4hi",
29686 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29688 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29689 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29691 /* RDSEED */
29692 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29693 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29694 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29695 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29696 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29697 "__builtin_ia32_rdseed_di_step",
29698 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29700 /* ADCX */
29701 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29702 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29703 def_builtin (OPTION_MASK_ISA_64BIT,
29704 "__builtin_ia32_addcarryx_u64",
29705 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29706 IX86_BUILTIN_ADDCARRYX64);
29708 /* Add FMA4 multi-arg argument instructions */
29709 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29711 if (d->name == 0)
29712 continue;
29714 ftype = (enum ix86_builtin_func_type) d->flag;
29715 def_builtin_const (d->mask, d->name, ftype, d->code);
29719 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29720 to return a pointer to VERSION_DECL if the outcome of the expression
29721 formed by PREDICATE_CHAIN is true. This function will be called during
29722 version dispatch to decide which function version to execute. It returns
29723 the basic block at the end, to which more conditions can be added. */
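/* Roughly, for a predicate chain containing a single check such as
   __builtin_cpu_is ("corei7"), the statements appended to NEW_BB correspond
   to the following pseudo-GIMPLE (an illustrative sketch, not literal dump
   output):

     cond_var = __builtin_cpu_is ("corei7");
     if (cond_var > 0)
       {
         result_var = (void *) version_decl_address;   // placeholder name
         return result_var;
       }
     // otherwise fall through to the block returned to the caller
*/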
29725 static basic_block
29726 add_condition_to_bb (tree function_decl, tree version_decl,
29727 tree predicate_chain, basic_block new_bb)
29729 gimple return_stmt;
29730 tree convert_expr, result_var;
29731 gimple convert_stmt;
29732 gimple call_cond_stmt;
29733 gimple if_else_stmt;
29735 basic_block bb1, bb2, bb3;
29736 edge e12, e23;
29738 tree cond_var, and_expr_var = NULL_TREE;
29739 gimple_seq gseq;
29741 tree predicate_decl, predicate_arg;
29743 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29745 gcc_assert (new_bb != NULL);
29746 gseq = bb_seq (new_bb);
29749 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29750 build_fold_addr_expr (version_decl));
29751 result_var = create_tmp_var (ptr_type_node, NULL);
29752 convert_stmt = gimple_build_assign (result_var, convert_expr);
29753 return_stmt = gimple_build_return (result_var);
29755 if (predicate_chain == NULL_TREE)
29757 gimple_seq_add_stmt (&gseq, convert_stmt);
29758 gimple_seq_add_stmt (&gseq, return_stmt);
29759 set_bb_seq (new_bb, gseq);
29760 gimple_set_bb (convert_stmt, new_bb);
29761 gimple_set_bb (return_stmt, new_bb);
29762 pop_cfun ();
29763 return new_bb;
29766 while (predicate_chain != NULL)
29768 cond_var = create_tmp_var (integer_type_node, NULL);
29769 predicate_decl = TREE_PURPOSE (predicate_chain);
29770 predicate_arg = TREE_VALUE (predicate_chain);
29771 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29772 gimple_call_set_lhs (call_cond_stmt, cond_var);
29774 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29775 gimple_set_bb (call_cond_stmt, new_bb);
29776 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29778 predicate_chain = TREE_CHAIN (predicate_chain);
29780 if (and_expr_var == NULL)
29781 and_expr_var = cond_var;
29782 else
29784 gimple assign_stmt;
29785 /* Use MIN_EXPR to check whether any integer is zero:
29786 and_expr_var = min_expr <cond_var, and_expr_var>. */
29787 assign_stmt = gimple_build_assign (and_expr_var,
29788 build2 (MIN_EXPR, integer_type_node,
29789 cond_var, and_expr_var));
29791 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29792 gimple_set_bb (assign_stmt, new_bb);
29793 gimple_seq_add_stmt (&gseq, assign_stmt);
29797 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29798 integer_zero_node,
29799 NULL_TREE, NULL_TREE);
29800 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29801 gimple_set_bb (if_else_stmt, new_bb);
29802 gimple_seq_add_stmt (&gseq, if_else_stmt);
29804 gimple_seq_add_stmt (&gseq, convert_stmt);
29805 gimple_seq_add_stmt (&gseq, return_stmt);
29806 set_bb_seq (new_bb, gseq);
29808 bb1 = new_bb;
29809 e12 = split_block (bb1, if_else_stmt);
29810 bb2 = e12->dest;
29811 e12->flags &= ~EDGE_FALLTHRU;
29812 e12->flags |= EDGE_TRUE_VALUE;
29814 e23 = split_block (bb2, return_stmt);
29816 gimple_set_bb (convert_stmt, bb2);
29817 gimple_set_bb (return_stmt, bb2);
29819 bb3 = e23->dest;
29820 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29822 remove_edge (e23);
29823 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29825 pop_cfun ();
29827 return bb3;
29830 /* This parses the arguments to the target attribute in DECL and determines
29831 the right builtin to use to match the platform specification.
29832 It returns the priority value for this version decl. If PREDICATE_LIST
29833 is not NULL, it stores the list of cpu features that need to be checked
29834 before dispatching this function. */
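/* For instance (an illustrative example), a version declared as

     __attribute__ ((target ("arch=corei7,popcnt"))) int foo (void);

   gets priority P_PROC_SSE4_2, and when PREDICATE_LIST is non-NULL the
   stored predicate chain is equivalent to

     __builtin_cpu_is ("corei7") && __builtin_cpu_supports ("popcnt").  */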
29836 static unsigned int
29837 get_builtin_code_for_version (tree decl, tree *predicate_list)
29839 tree attrs;
29840 struct cl_target_option cur_target;
29841 tree target_node;
29842 struct cl_target_option *new_target;
29843 const char *arg_str = NULL;
29844 const char *attrs_str = NULL;
29845 char *tok_str = NULL;
29846 char *token;
29848 /* Priority of i386 features; a greater value means a higher priority. This is
29849 used to decide the order in which function dispatch must happen. For
29850 instance, a version specialized for SSE4.2 should be checked for dispatch
29851 before a version for SSE3, as SSE4.2 implies SSE3. */
29852 enum feature_priority
29854 P_ZERO = 0,
29855 P_MMX,
29856 P_SSE,
29857 P_SSE2,
29858 P_SSE3,
29859 P_SSSE3,
29860 P_PROC_SSSE3,
29861 P_SSE4_a,
29862 P_PROC_SSE4_a,
29863 P_SSE4_1,
29864 P_SSE4_2,
29865 P_PROC_SSE4_2,
29866 P_POPCNT,
29867 P_AVX,
29868 P_AVX2,
29869 P_FMA,
29870 P_PROC_FMA
29873 enum feature_priority priority = P_ZERO;
29875 /* These are the target attribute strings for which a dispatcher is
29876 available, from fold_builtin_cpu. */
29878 static struct _feature_list
29880 const char *const name;
29881 const enum feature_priority priority;
29883 const feature_list[] =
29885 {"mmx", P_MMX},
29886 {"sse", P_SSE},
29887 {"sse2", P_SSE2},
29888 {"sse3", P_SSE3},
29889 {"ssse3", P_SSSE3},
29890 {"sse4.1", P_SSE4_1},
29891 {"sse4.2", P_SSE4_2},
29892 {"popcnt", P_POPCNT},
29893 {"avx", P_AVX},
29894 {"avx2", P_AVX2}
29898 static unsigned int NUM_FEATURES
29899 = sizeof (feature_list) / sizeof (struct _feature_list);
29901 unsigned int i;
29903 tree predicate_chain = NULL_TREE;
29904 tree predicate_decl, predicate_arg;
29906 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29907 gcc_assert (attrs != NULL);
29909 attrs = TREE_VALUE (TREE_VALUE (attrs));
29911 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29912 attrs_str = TREE_STRING_POINTER (attrs);
29914 /* Return priority zero for default function. */
29915 if (strcmp (attrs_str, "default") == 0)
29916 return 0;
29918 /* Handle arch= if specified. For priority, set it to be 1 more than
29919 the best instruction set the processor can handle. For instance, if
29920 there is a version for atom and a version for ssse3 (the highest ISA
29921 priority for atom), the atom version must be checked for dispatch
29922 before the ssse3 version. */
29923 if (strstr (attrs_str, "arch=") != NULL)
29925 cl_target_option_save (&cur_target, &global_options);
29926 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29927 &global_options_set);
29929 gcc_assert (target_node);
29930 new_target = TREE_TARGET_OPTION (target_node);
29931 gcc_assert (new_target);
29933 if (new_target->arch_specified && new_target->arch > 0)
29935 switch (new_target->arch)
29937 case PROCESSOR_CORE2:
29938 arg_str = "core2";
29939 priority = P_PROC_SSSE3;
29940 break;
29941 case PROCESSOR_COREI7:
29942 arg_str = "corei7";
29943 priority = P_PROC_SSE4_2;
29944 break;
29945 case PROCESSOR_COREI7_AVX:
29946 arg_str = "corei7-avx";
29947 priority = P_PROC_SSE4_2;
29948 break;
29949 case PROCESSOR_ATOM:
29950 arg_str = "atom";
29951 priority = P_PROC_SSSE3;
29952 break;
29953 case PROCESSOR_AMDFAM10:
29954 arg_str = "amdfam10h";
29955 priority = P_PROC_SSE4_a;
29956 break;
29957 case PROCESSOR_BDVER1:
29958 arg_str = "bdver1";
29959 priority = P_PROC_FMA;
29960 break;
29961 case PROCESSOR_BDVER2:
29962 arg_str = "bdver2";
29963 priority = P_PROC_FMA;
29964 break;
29968 cl_target_option_restore (&global_options, &cur_target);
29970 if (predicate_list && arg_str == NULL)
29972 error_at (DECL_SOURCE_LOCATION (decl),
29973 "No dispatcher found for the versioning attributes");
29974 return 0;
29977 if (predicate_list)
29979 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29980 /* For a C string literal the length includes the trailing NULL. */
29981 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29982 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29983 predicate_chain);
29987 /* Process feature name. */
29988 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29989 strcpy (tok_str, attrs_str);
29990 token = strtok (tok_str, ",");
29991 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29993 while (token != NULL)
29995 /* Do not process "arch=" */
29996 if (strncmp (token, "arch=", 5) == 0)
29998 token = strtok (NULL, ",");
29999 continue;
30001 for (i = 0; i < NUM_FEATURES; ++i)
30003 if (strcmp (token, feature_list[i].name) == 0)
30005 if (predicate_list)
30007 predicate_arg = build_string_literal (
30008 strlen (feature_list[i].name) + 1,
30009 feature_list[i].name);
30010 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30011 predicate_chain);
30013 /* Find the maximum priority feature. */
30014 if (feature_list[i].priority > priority)
30015 priority = feature_list[i].priority;
30017 break;
30020 if (predicate_list && i == NUM_FEATURES)
30022 error_at (DECL_SOURCE_LOCATION (decl),
30023 "No dispatcher found for %s", token);
30024 return 0;
30026 token = strtok (NULL, ",");
30028 free (tok_str);
30030 if (predicate_list && predicate_chain == NULL_TREE)
30032 error_at (DECL_SOURCE_LOCATION (decl),
30033 "No dispatcher found for the versioning attributes : %s",
30034 attrs_str);
30035 return 0;
30037 else if (predicate_list)
30039 predicate_chain = nreverse (predicate_chain);
30040 *predicate_list = predicate_chain;
30043 return priority;
30046 /* This compares the priority of target features in function DECL1
30047 and DECL2. It returns positive value if DECL1 is higher priority,
30048 negative value if DECL2 is higher priority and 0 if they are the
30049 same. */
30051 static int
30052 ix86_compare_version_priority (tree decl1, tree decl2)
30054 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
30055 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
30057 return (int)priority1 - (int)priority2;
30060 /* V1 and V2 point to function versions with different priorities
30061 based on the target ISA. This function compares their priorities. */
30063 static int
30064 feature_compare (const void *v1, const void *v2)
30066 typedef struct _function_version_info
30068 tree version_decl;
30069 tree predicate_chain;
30070 unsigned int dispatch_priority;
30071 } function_version_info;
30073 const function_version_info c1 = *(const function_version_info *)v1;
30074 const function_version_info c2 = *(const function_version_info *)v2;
30075 return (c2.dispatch_priority - c1.dispatch_priority);
30078 /* This function generates the dispatch function for
30079 multi-versioned functions. DISPATCH_DECL is the function which will
30080 contain the dispatch logic. FNDECLS are the function choices for
30081 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
30082 in DISPATCH_DECL in which the dispatch code is generated. */
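/* The resolver ultimately produced from EMPTY_BB behaves roughly like the
   following hand-written C (a sketch; the symbol names are illustrative):

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))
         return (void *) foo_corei7;
       if (__builtin_cpu_supports ("avx"))
         return (void *) foo_avx;
       return (void *) foo_default;
     }

   Higher-priority versions are tested first, and the default version is
   dispatched last (see the calls to add_condition_to_bb below).  */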
30084 static int
30085 dispatch_function_versions (tree dispatch_decl,
30086 void *fndecls_p,
30087 basic_block *empty_bb)
30089 tree default_decl;
30090 gimple ifunc_cpu_init_stmt;
30091 gimple_seq gseq;
30092 int ix;
30093 tree ele;
30094 vec<tree> *fndecls;
30095 unsigned int num_versions = 0;
30096 unsigned int actual_versions = 0;
30097 unsigned int i;
30099 struct _function_version_info
30101 tree version_decl;
30102 tree predicate_chain;
30103 unsigned int dispatch_priority;
30104 } *function_version_info;
30106 gcc_assert (dispatch_decl != NULL
30107 && fndecls_p != NULL
30108 && empty_bb != NULL);
30110 /* fndecls_p is actually a vector. */
30111 fndecls = static_cast<vec<tree> *> (fndecls_p);
30113 /* At least one more version other than the default. */
30114 num_versions = fndecls->length ();
30115 gcc_assert (num_versions >= 2);
30117 function_version_info = (struct _function_version_info *)
30118 XNEWVEC (struct _function_version_info, (num_versions - 1));
30120 /* The first version in the vector is the default decl. */
30121 default_decl = (*fndecls)[0];
30123 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
30125 gseq = bb_seq (*empty_bb);
30126 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
30127 constructors, so explicitly call __builtin_cpu_init here. */
30128 ifunc_cpu_init_stmt = gimple_build_call_vec (
30129 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30130 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30131 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30132 set_bb_seq (*empty_bb, gseq);
30134 pop_cfun ();
30137 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30139 tree version_decl = ele;
30140 tree predicate_chain = NULL_TREE;
30141 unsigned int priority;
30142 /* Get attribute string, parse it and find the right predicate decl.
30143 The predicate function could be a lengthy combination of many
30144 features, like arch-type and various isa-variants. */
30145 priority = get_builtin_code_for_version (version_decl,
30146 &predicate_chain);
30148 if (predicate_chain == NULL_TREE)
30149 continue;
30151 function_version_info [actual_versions].version_decl = version_decl;
30152 function_version_info [actual_versions].predicate_chain
30153 = predicate_chain;
30154 function_version_info [actual_versions].dispatch_priority = priority;
30155 actual_versions++;
30158 /* Sort the versions according to descending order of dispatch priority. The
30159 priority is based on the ISA. This is not a perfect solution. There
30160 could still be ambiguity. If more than one function version is suitable
30161 to execute, which one should be dispatched? In the future, allow the user
30162 to specify a dispatch priority next to the version. */
30163 qsort (function_version_info, actual_versions,
30164 sizeof (struct _function_version_info), feature_compare);
30166 for (i = 0; i < actual_versions; ++i)
30167 *empty_bb = add_condition_to_bb (dispatch_decl,
30168 function_version_info[i].version_decl,
30169 function_version_info[i].predicate_chain,
30170 *empty_bb);
30172 /* Dispatch the default version at the end. */
30173 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30174 NULL, *empty_bb);
30176 free (function_version_info);
30177 return 0;
30180 /* Comparator function used by qsort to sort the attribute
30181 specification strings of "target". */
30183 static int
30184 attr_strcmp (const void *v1, const void *v2)
30186 const char *c1 = *(char *const*)v1;
30187 const char *c2 = *(char *const*)v2;
30188 return strcmp (c1, c2);
30191 /* ARGLIST is the argument to the target attribute. This function tokenizes
30192 the comma-separated arguments, sorts them, and returns a string that
30193 uniquely identifies the sorted argument set. It also replaces the
30194 non-identifier characters "=,-" with "_". */
30196 static char *
30197 sorted_attr_string (tree arglist)
30199 tree arg;
30200 size_t str_len_sum = 0;
30201 char **args = NULL;
30202 char *attr_str, *ret_str;
30203 char *attr = NULL;
30204 unsigned int argnum = 1;
30205 unsigned int i;
30207 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30209 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30210 size_t len = strlen (str);
30211 str_len_sum += len + 1;
30212 if (arg != arglist)
30213 argnum++;
30214 for (i = 0; i < strlen (str); i++)
30215 if (str[i] == ',')
30216 argnum++;
30219 attr_str = XNEWVEC (char, str_len_sum);
30220 str_len_sum = 0;
30221 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30223 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30224 size_t len = strlen (str);
30225 memcpy (attr_str + str_len_sum, str, len);
30226 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30227 str_len_sum += len + 1;
30230 /* Replace "=,-" with "_". */
30231 for (i = 0; i < strlen (attr_str); i++)
30232 if (attr_str[i] == '=' || attr_str[i]== '-')
30233 attr_str[i] = '_';
30235 if (argnum == 1)
30236 return attr_str;
30238 args = XNEWVEC (char *, argnum);
30240 i = 0;
30241 attr = strtok (attr_str, ",");
30242 while (attr != NULL)
30244 args[i] = attr;
30245 i++;
30246 attr = strtok (NULL, ",");
30249 qsort (args, argnum, sizeof (char *), attr_strcmp);
30251 ret_str = XNEWVEC (char, str_len_sum);
30252 str_len_sum = 0;
30253 for (i = 0; i < argnum; i++)
30255 size_t len = strlen (args[i]);
30256 memcpy (ret_str + str_len_sum, args[i], len);
30257 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30258 str_len_sum += len + 1;
30261 XDELETEVEC (args);
30262 XDELETEVEC (attr_str);
30263 return ret_str;
30266 /* This function changes the assembler name for functions that are
30267 versions. If DECL is a function version and has a "target"
30268 attribute, it appends the attribute string to its assembler name. */
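/* For example (illustrative), a version of foo declared with
   __attribute__ ((target ("avx,popcnt"))) is given the assembler name
   "foo.avx_popcnt", while the "default" version keeps its original name.  */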
30270 static tree
30271 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30273 tree version_attr;
30274 const char *orig_name, *version_string;
30275 char *attr_str, *assembler_name;
30277 if (DECL_DECLARED_INLINE_P (decl)
30278 && lookup_attribute ("gnu_inline",
30279 DECL_ATTRIBUTES (decl)))
30280 error_at (DECL_SOURCE_LOCATION (decl),
30281 "Function versions cannot be marked as gnu_inline,"
30282 " bodies have to be generated");
30284 if (DECL_VIRTUAL_P (decl)
30285 || DECL_VINDEX (decl))
30286 sorry ("Virtual function multiversioning not supported");
30288 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30290 /* target attribute string cannot be NULL. */
30291 gcc_assert (version_attr != NULL_TREE);
30293 orig_name = IDENTIFIER_POINTER (id);
30294 version_string
30295 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30297 if (strcmp (version_string, "default") == 0)
30298 return id;
30300 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30301 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30303 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30305 /* Allow assembler name to be modified if already set. */
30306 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30307 SET_DECL_RTL (decl, NULL);
30309 tree ret = get_identifier (assembler_name);
30310 XDELETEVEC (attr_str);
30311 XDELETEVEC (assembler_name);
30312 return ret;
30315 /* This function returns true if FN1 and FN2 are versions of the same function,
30316 that is, the target strings of the function decls are different. This assumes
30317 that FN1 and FN2 have the same signature. */
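/* For example (illustrative): two declarations of foo tagged
   target ("avx") and target ("arch=corei7") are treated as versions, since
   their sorted target strings "avx" and "arch_corei7" differ; two
   declarations both tagged target ("avx") are not.  */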
30319 static bool
30320 ix86_function_versions (tree fn1, tree fn2)
30322 tree attr1, attr2;
30323 char *target1, *target2;
30324 bool result;
30326 if (TREE_CODE (fn1) != FUNCTION_DECL
30327 || TREE_CODE (fn2) != FUNCTION_DECL)
30328 return false;
30330 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30331 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30333 /* At least one function decl should have the target attribute specified. */
30334 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30335 return false;
30337 /* Diagnose missing target attribute if one of the decls is already
30338 multi-versioned. */
30339 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30341 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30343 if (attr2 != NULL_TREE)
30345 tree tem = fn1;
30346 fn1 = fn2;
30347 fn2 = tem;
30348 attr1 = attr2;
30350 error_at (DECL_SOURCE_LOCATION (fn2),
30351 "missing %<target%> attribute for multi-versioned %D",
30352 fn2);
30353 inform (DECL_SOURCE_LOCATION (fn1),
30354 "previous declaration of %D", fn1);
30355 /* Prevent diagnosing of the same error multiple times. */
30356 DECL_ATTRIBUTES (fn2)
30357 = tree_cons (get_identifier ("target"),
30358 copy_node (TREE_VALUE (attr1)),
30359 DECL_ATTRIBUTES (fn2));
30361 return false;
30364 target1 = sorted_attr_string (TREE_VALUE (attr1));
30365 target2 = sorted_attr_string (TREE_VALUE (attr2));
30367 /* The sorted target strings must be different for fn1 and fn2
30368 to be versions. */
30369 if (strcmp (target1, target2) == 0)
30370 result = false;
30371 else
30372 result = true;
30374 XDELETEVEC (target1);
30375 XDELETEVEC (target2);
30377 return result;
30380 static tree
30381 ix86_mangle_decl_assembler_name (tree decl, tree id)
30383 /* For function version, add the target suffix to the assembler name. */
30384 if (TREE_CODE (decl) == FUNCTION_DECL
30385 && DECL_FUNCTION_VERSIONED (decl))
30386 id = ix86_mangle_function_version_assembler_name (decl, id);
30387 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30388 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30389 #endif
30391 return id;
30394 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30395 is true, append the full path name of the source file. */
30397 static char *
30398 make_name (tree decl, const char *suffix, bool make_unique)
30400 char *global_var_name;
30401 int name_len;
30402 const char *name;
30403 const char *unique_name = NULL;
30405 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30407 /* Get a unique name that can be used globally without any chances
30408 of collision at link time. */
30409 if (make_unique)
30410 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30412 name_len = strlen (name) + strlen (suffix) + 2;
30414 if (make_unique)
30415 name_len += strlen (unique_name) + 1;
30416 global_var_name = XNEWVEC (char, name_len);
30418 /* Use '.' to concatenate names as it is demangler friendly. */
30419 if (make_unique)
30420 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30421 suffix);
30422 else
30423 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30425 return global_var_name;
30428 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30430 /* Make a dispatcher declaration for the multi-versioned function DECL.
30431 Calls to the function DECL will be replaced with calls to the dispatcher
30432 by the front-end. Return the decl created. */
30434 static tree
30435 make_dispatcher_decl (const tree decl)
30437 tree func_decl;
30438 char *func_name;
30439 tree fn_type, func_type;
30440 bool is_uniq = false;
30442 if (TREE_PUBLIC (decl) == 0)
30443 is_uniq = true;
30445 func_name = make_name (decl, "ifunc", is_uniq);
30447 fn_type = TREE_TYPE (decl);
30448 func_type = build_function_type (TREE_TYPE (fn_type),
30449 TYPE_ARG_TYPES (fn_type));
30451 func_decl = build_fn_decl (func_name, func_type);
30452 XDELETEVEC (func_name);
30453 TREE_USED (func_decl) = 1;
30454 DECL_CONTEXT (func_decl) = NULL_TREE;
30455 DECL_INITIAL (func_decl) = error_mark_node;
30456 DECL_ARTIFICIAL (func_decl) = 1;
30457 /* Mark this function as external; the resolver will flip it again if
30458 it gets generated. */
30459 DECL_EXTERNAL (func_decl) = 1;
30460 /* IFUNCs have to be externally visible, so make the decl public. */
30461 TREE_PUBLIC (func_decl) = 1;
30463 return func_decl;
30466 #endif
30468 /* Return true if DECL is multi-versioned and is the default function,
30469 that is, it is not tagged with a target-specific optimization. */
30471 static bool
30472 is_function_default_version (const tree decl)
30474 if (TREE_CODE (decl) != FUNCTION_DECL
30475 || !DECL_FUNCTION_VERSIONED (decl))
30476 return false;
30477 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30478 gcc_assert (attr);
30479 attr = TREE_VALUE (TREE_VALUE (attr));
30480 return (TREE_CODE (attr) == STRING_CST
30481 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30484 /* Make a dispatcher declaration for the multi-versioned function DECL.
30485 Calls to the function DECL will be replaced with calls to the dispatcher
30486 by the front-end. Returns the decl of the dispatcher function. */
30488 static tree
30489 ix86_get_function_versions_dispatcher (void *decl)
30491 tree fn = (tree) decl;
30492 struct cgraph_node *node = NULL;
30493 struct cgraph_node *default_node = NULL;
30494 struct cgraph_function_version_info *node_v = NULL;
30495 struct cgraph_function_version_info *first_v = NULL;
30497 tree dispatch_decl = NULL;
30499 struct cgraph_function_version_info *default_version_info = NULL;
30501 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30503 node = cgraph_get_node (fn);
30504 gcc_assert (node != NULL);
30506 node_v = get_cgraph_node_version (node);
30507 gcc_assert (node_v != NULL);
30509 if (node_v->dispatcher_resolver != NULL)
30510 return node_v->dispatcher_resolver;
30512 /* Find the default version and make it the first node. */
30513 first_v = node_v;
30514 /* Go to the beginning of the chain. */
30515 while (first_v->prev != NULL)
30516 first_v = first_v->prev;
30517 default_version_info = first_v;
30518 while (default_version_info != NULL)
30520 if (is_function_default_version
30521 (default_version_info->this_node->decl))
30522 break;
30523 default_version_info = default_version_info->next;
30526 /* If there is no default node, just return NULL. */
30527 if (default_version_info == NULL)
30528 return NULL;
30530 /* Make default info the first node. */
30531 if (first_v != default_version_info)
30533 default_version_info->prev->next = default_version_info->next;
30534 if (default_version_info->next)
30535 default_version_info->next->prev = default_version_info->prev;
30536 first_v->prev = default_version_info;
30537 default_version_info->next = first_v;
30538 default_version_info->prev = NULL;
30541 default_node = default_version_info->this_node;
30543 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30544 if (targetm.has_ifunc_p ())
30546 struct cgraph_function_version_info *it_v = NULL;
30547 struct cgraph_node *dispatcher_node = NULL;
30548 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30550 /* Right now, the dispatching is done via ifunc. */
30551 dispatch_decl = make_dispatcher_decl (default_node->decl);
30553 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30554 gcc_assert (dispatcher_node != NULL);
30555 dispatcher_node->dispatcher_function = 1;
30556 dispatcher_version_info
30557 = insert_new_cgraph_node_version (dispatcher_node);
30558 dispatcher_version_info->next = default_version_info;
30559 dispatcher_node->definition = 1;
30561 /* Set the dispatcher for all the versions. */
30562 it_v = default_version_info;
30563 while (it_v != NULL)
30565 it_v->dispatcher_resolver = dispatch_decl;
30566 it_v = it_v->next;
30569 else
30570 #endif
30572 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30573 "multiversioning needs ifunc which is not supported "
30574 "on this target");
30577 return dispatch_decl;
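/* Illustrative sketch (not part of this file) of the source-level feature the
   dispatcher above implements.  With function multi-versioning (currently
   accepted by the C++ front end), the user writes several versions of the
   same function and tags exactly one of them "default":

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2"))) int foo (void) { return 1; }
     __attribute__ ((target ("avx2")))   int foo (void) { return 2; }

   Calls to foo are redirected to the dispatcher decl returned here, which on
   targets with ifunc support selects the right version at load time.  */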
30580 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30581 it to CHAIN. */
30583 static tree
30584 make_attribute (const char *name, const char *arg_name, tree chain)
30586 tree attr_name;
30587 tree attr_arg_name;
30588 tree attr_args;
30589 tree attr;
30591 attr_name = get_identifier (name);
30592 attr_arg_name = build_string (strlen (arg_name), arg_name);
30593 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30594 attr = tree_cons (attr_name, attr_args, chain);
30595 return attr;
30598 /* Make the resolver function decl to dispatch the versions of
30599 a multi-versioned function, DEFAULT_DECL. Create an
30600 empty basic block in the resolver and store the pointer in
30601 EMPTY_BB. Return the decl of the resolver function. */
30603 static tree
30604 make_resolver_func (const tree default_decl,
30605 const tree dispatch_decl,
30606 basic_block *empty_bb)
30608 char *resolver_name;
30609 tree decl, type, decl_name, t;
30610 bool is_uniq = false;
30612 /* IFUNCs have to be globally visible. So, if the default_decl is
30613 not, then the name of the IFUNC should be made unique. */
30614 if (TREE_PUBLIC (default_decl) == 0)
30615 is_uniq = true;
30617 /* Append the filename to the resolver function if the versions are
30618 not externally visible. This is because the resolver function has
30619 to be externally visible for the loader to find it. So, appending
30620 the filename will prevent conflicts with a resolver function from
30621 another module which is based on the same version name. */
30622 resolver_name = make_name (default_decl, "resolver", is_uniq);
30624 /* The resolver function should return a (void *). */
30625 type = build_function_type_list (ptr_type_node, NULL_TREE);
30627 decl = build_fn_decl (resolver_name, type);
30628 decl_name = get_identifier (resolver_name);
30629 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30631 DECL_NAME (decl) = decl_name;
30632 TREE_USED (decl) = 1;
30633 DECL_ARTIFICIAL (decl) = 1;
30634 DECL_IGNORED_P (decl) = 0;
30635 /* IFUNC resolvers have to be externally visible. */
30636 TREE_PUBLIC (decl) = 1;
30637 DECL_UNINLINABLE (decl) = 1;
30639 /* The resolver is not external; its body is generated. */
30640 DECL_EXTERNAL (decl) = 0;
30641 DECL_EXTERNAL (dispatch_decl) = 0;
30643 DECL_CONTEXT (decl) = NULL_TREE;
30644 DECL_INITIAL (decl) = make_node (BLOCK);
30645 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30647 if (DECL_COMDAT_GROUP (default_decl)
30648 || TREE_PUBLIC (default_decl))
30650 /* In this case, each translation unit with a call to this
30651 versioned function will put out a resolver. Ensure it
30652 is comdat to keep just one copy. */
30653 DECL_COMDAT (decl) = 1;
30654 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30656 /* Build result decl and add to function_decl. */
30657 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30658 DECL_ARTIFICIAL (t) = 1;
30659 DECL_IGNORED_P (t) = 1;
30660 DECL_RESULT (decl) = t;
30662 gimplify_function_tree (decl);
30663 push_cfun (DECL_STRUCT_FUNCTION (decl));
30664 *empty_bb = init_lowered_empty_function (decl, false);
30666 cgraph_add_new_function (decl, true);
30667 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30669 pop_cfun ();
30671 gcc_assert (dispatch_decl != NULL);
30672 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30673 DECL_ATTRIBUTES (dispatch_decl)
30674 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30676 /* Create the alias for dispatch to resolver here. */
30677 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30678 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30679 XDELETEVEC (resolver_name);
30680 return decl;
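/* Roughly, the decls built above correspond to the following GNU ifunc idiom
   (an illustrative sketch, not code emitted verbatim; the names are
   hypothetical):

     void *foo_resolver (void);   -- returns the address of the chosen version
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   i.e. the dispatcher becomes an ifunc whose resolver returns a (void *)
   pointing at the selected function version, matching the (void *) return
   type given to the resolver decl here.  */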
30683 /* Generate the dispatching code body to dispatch multi-versioned function
30684 DECL. The target hook is called to process the "target" attributes and
30685 provide the code to dispatch the right function at run-time. NODE points
30686 to the dispatcher decl whose body will be created. */
30688 static tree
30689 ix86_generate_version_dispatcher_body (void *node_p)
30691 tree resolver_decl;
30692 basic_block empty_bb;
30693 vec<tree> fn_ver_vec = vNULL;
30694 tree default_ver_decl;
30695 struct cgraph_node *versn;
30696 struct cgraph_node *node;
30698 struct cgraph_function_version_info *node_version_info = NULL;
30699 struct cgraph_function_version_info *versn_info = NULL;
30701 node = (cgraph_node *)node_p;
30703 node_version_info = get_cgraph_node_version (node);
30704 gcc_assert (node->dispatcher_function
30705 && node_version_info != NULL);
30707 if (node_version_info->dispatcher_resolver)
30708 return node_version_info->dispatcher_resolver;
30710 /* The first version in the chain corresponds to the default version. */
30711 default_ver_decl = node_version_info->next->this_node->decl;
30713 /* node is going to be an alias, so remove the finalized bit. */
30714 node->definition = false;
30716 resolver_decl = make_resolver_func (default_ver_decl,
30717 node->decl, &empty_bb);
30719 node_version_info->dispatcher_resolver = resolver_decl;
30721 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30723 fn_ver_vec.create (2);
30725 for (versn_info = node_version_info->next; versn_info;
30726 versn_info = versn_info->next)
30728 versn = versn_info->this_node;
30729 /* Check for virtual functions here again, as by this time it should
30730 have been determined if this function needs a vtable index or
30731 not. This happens for methods in derived classes that override
30732 virtual methods in base classes but are not explicitly marked as
30733 virtual. */
30734 if (DECL_VINDEX (versn->decl))
30735 sorry ("Virtual function multiversioning not supported");
30737 fn_ver_vec.safe_push (versn->decl);
30740 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30741 fn_ver_vec.release ();
30742 rebuild_cgraph_edges ();
30743 pop_cfun ();
30744 return resolver_decl;

30746 /* This builds the processor_model struct type defined in
30747 libgcc/config/i386/cpuinfo.c */
30749 static tree
30750 build_processor_model_struct (void)
30752 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30753 "__cpu_features"};
30754 tree field = NULL_TREE, field_chain = NULL_TREE;
30755 int i;
30756 tree type = make_node (RECORD_TYPE);
30758 /* The first 3 fields are unsigned int. */
30759 for (i = 0; i < 3; ++i)
30761 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30762 get_identifier (field_name[i]), unsigned_type_node);
30763 if (field_chain != NULL_TREE)
30764 DECL_CHAIN (field) = field_chain;
30765 field_chain = field;
30768 /* The last field is an array of unsigned integers of size one. */
30769 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30770 get_identifier (field_name[3]),
30771 build_array_type (unsigned_type_node,
30772 build_index_type (size_one_node)));
30773 if (field_chain != NULL_TREE)
30774 DECL_CHAIN (field) = field_chain;
30775 field_chain = field;
30777 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30778 return type;
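/* For reference, the structure this mirrors is declared in
   libgcc/config/i386/cpuinfo.c roughly as (paraphrased; details may differ
   between releases):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   The layout built here must stay in sync with that definition.  */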
30781 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30783 static tree
30784 make_var_decl (tree type, const char *name)
30786 tree new_decl;
30788 new_decl = build_decl (UNKNOWN_LOCATION,
30789 VAR_DECL,
30790 get_identifier(name),
30791 type);
30793 DECL_EXTERNAL (new_decl) = 1;
30794 TREE_STATIC (new_decl) = 1;
30795 TREE_PUBLIC (new_decl) = 1;
30796 DECL_INITIAL (new_decl) = 0;
30797 DECL_ARTIFICIAL (new_decl) = 0;
30798 DECL_PRESERVE_P (new_decl) = 1;
30800 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30801 assemble_variable (new_decl, 0, 0, 0);
30803 return new_decl;
30806 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30807 into an integer defined in libgcc/config/i386/cpuinfo.c */
30809 static tree
30810 fold_builtin_cpu (tree fndecl, tree *args)
30812 unsigned int i;
30813 enum ix86_builtins fn_code = (enum ix86_builtins)
30814 DECL_FUNCTION_CODE (fndecl);
30815 tree param_string_cst = NULL;
30817 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30818 enum processor_features
30820 F_CMOV = 0,
30821 F_MMX,
30822 F_POPCNT,
30823 F_SSE,
30824 F_SSE2,
30825 F_SSE3,
30826 F_SSSE3,
30827 F_SSE4_1,
30828 F_SSE4_2,
30829 F_AVX,
30830 F_AVX2,
30831 F_MAX
30834 /* These are the values for vendor types, CPU types and subtypes
30835 in cpuinfo.c.  CPU types and subtypes should have the corresponding
30836 start value subtracted from them. */
30837 enum processor_model
30839 M_INTEL = 1,
30840 M_AMD,
30841 M_CPU_TYPE_START,
30842 M_INTEL_ATOM,
30843 M_INTEL_CORE2,
30844 M_INTEL_COREI7,
30845 M_AMDFAM10H,
30846 M_AMDFAM15H,
30847 M_INTEL_SLM,
30848 M_CPU_SUBTYPE_START,
30849 M_INTEL_COREI7_NEHALEM,
30850 M_INTEL_COREI7_WESTMERE,
30851 M_INTEL_COREI7_SANDYBRIDGE,
30852 M_AMDFAM10H_BARCELONA,
30853 M_AMDFAM10H_SHANGHAI,
30854 M_AMDFAM10H_ISTANBUL,
30855 M_AMDFAM15H_BDVER1,
30856 M_AMDFAM15H_BDVER2,
30857 M_AMDFAM15H_BDVER3,
30858 M_AMDFAM15H_BDVER4
30861 static struct _arch_names_table
30863 const char *const name;
30864 const enum processor_model model;
30866 const arch_names_table[] =
30868 {"amd", M_AMD},
30869 {"intel", M_INTEL},
30870 {"atom", M_INTEL_ATOM},
30871 {"slm", M_INTEL_SLM},
30872 {"core2", M_INTEL_CORE2},
30873 {"corei7", M_INTEL_COREI7},
30874 {"nehalem", M_INTEL_COREI7_NEHALEM},
30875 {"westmere", M_INTEL_COREI7_WESTMERE},
30876 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30877 {"amdfam10h", M_AMDFAM10H},
30878 {"barcelona", M_AMDFAM10H_BARCELONA},
30879 {"shanghai", M_AMDFAM10H_SHANGHAI},
30880 {"istanbul", M_AMDFAM10H_ISTANBUL},
30881 {"amdfam15h", M_AMDFAM15H},
30882 {"bdver1", M_AMDFAM15H_BDVER1},
30883 {"bdver2", M_AMDFAM15H_BDVER2},
30884 {"bdver3", M_AMDFAM15H_BDVER3},
30885 {"bdver4", M_AMDFAM15H_BDVER4},
30888 static struct _isa_names_table
30890 const char *const name;
30891 const enum processor_features feature;
30893 const isa_names_table[] =
30895 {"cmov", F_CMOV},
30896 {"mmx", F_MMX},
30897 {"popcnt", F_POPCNT},
30898 {"sse", F_SSE},
30899 {"sse2", F_SSE2},
30900 {"sse3", F_SSE3},
30901 {"ssse3", F_SSSE3},
30902 {"sse4.1", F_SSE4_1},
30903 {"sse4.2", F_SSE4_2},
30904 {"avx", F_AVX},
30905 {"avx2", F_AVX2}
30908 tree __processor_model_type = build_processor_model_struct ();
30909 tree __cpu_model_var = make_var_decl (__processor_model_type,
30910 "__cpu_model");
30913 varpool_add_new_variable (__cpu_model_var);
30915 gcc_assert ((args != NULL) && (*args != NULL));
30917 param_string_cst = *args;
30918 while (param_string_cst
30919 && TREE_CODE (param_string_cst) != STRING_CST)
30921 /* *args must be an expr that can contain other EXPRs leading to a
30922 STRING_CST. */
30923 if (!EXPR_P (param_string_cst))
30925 error ("Parameter to builtin must be a string constant or literal");
30926 return integer_zero_node;
30928 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30931 gcc_assert (param_string_cst);
30933 if (fn_code == IX86_BUILTIN_CPU_IS)
30935 tree ref;
30936 tree field;
30937 tree final;
30939 unsigned int field_val = 0;
30940 unsigned int NUM_ARCH_NAMES
30941 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30943 for (i = 0; i < NUM_ARCH_NAMES; i++)
30944 if (strcmp (arch_names_table[i].name,
30945 TREE_STRING_POINTER (param_string_cst)) == 0)
30946 break;
30948 if (i == NUM_ARCH_NAMES)
30950 error ("Parameter to builtin not valid: %s",
30951 TREE_STRING_POINTER (param_string_cst));
30952 return integer_zero_node;
30955 field = TYPE_FIELDS (__processor_model_type);
30956 field_val = arch_names_table[i].model;
30958 /* CPU types are stored in the next field. */
30959 if (field_val > M_CPU_TYPE_START
30960 && field_val < M_CPU_SUBTYPE_START)
30962 field = DECL_CHAIN (field);
30963 field_val -= M_CPU_TYPE_START;
30966 /* CPU subtypes are stored in the next field. */
30967 if (field_val > M_CPU_SUBTYPE_START)
30969 field = DECL_CHAIN (DECL_CHAIN (field));
30970 field_val -= M_CPU_SUBTYPE_START;
30973 /* Get the appropriate field in __cpu_model. */
30974 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30975 field, NULL_TREE);
30977 /* Check the value. */
30978 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30979 build_int_cstu (unsigned_type_node, field_val));
30980 return build1 (CONVERT_EXPR, integer_type_node, final);
30982 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30984 tree ref;
30985 tree array_elt;
30986 tree field;
30987 tree final;
30989 unsigned int field_val = 0;
30990 unsigned int NUM_ISA_NAMES
30991 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30993 for (i = 0; i < NUM_ISA_NAMES; i++)
30994 if (strcmp (isa_names_table[i].name,
30995 TREE_STRING_POINTER (param_string_cst)) == 0)
30996 break;
30998 if (i == NUM_ISA_NAMES)
31000 error ("Parameter to builtin not valid: %s",
31001 TREE_STRING_POINTER (param_string_cst));
31002 return integer_zero_node;
31005 field = TYPE_FIELDS (__processor_model_type);
31006 /* Get the last field, which is __cpu_features. */
31007 while (DECL_CHAIN (field))
31008 field = DECL_CHAIN (field);
31010 /* Get the appropriate field: __cpu_model.__cpu_features */
31011 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31012 field, NULL_TREE);
31014 /* Access the 0th element of __cpu_features array. */
31015 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
31016 integer_zero_node, NULL_TREE, NULL_TREE);
31018 field_val = (1 << isa_names_table[i].feature);
31019 /* Return __cpu_model.__cpu_features[0] & field_val */
31020 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
31021 build_int_cstu (unsigned_type_node, field_val));
31022 return build1 (CONVERT_EXPR, integer_type_node, final);
31024 gcc_unreachable ();
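/* Worked example of the folding above (illustrative only):
   __builtin_cpu_is ("corei7") folds to the equivalent of

     (int) (__cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START))

   and __builtin_cpu_supports ("avx2") folds to

     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

   where __cpu_model is the external variable filled in by libgcc.  */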
31027 static tree
31028 ix86_fold_builtin (tree fndecl, int n_args,
31029 tree *args, bool ignore ATTRIBUTE_UNUSED)
31031 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31033 enum ix86_builtins fn_code = (enum ix86_builtins)
31034 DECL_FUNCTION_CODE (fndecl);
31035 if (fn_code == IX86_BUILTIN_CPU_IS
31036 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31038 gcc_assert (n_args == 1);
31039 return fold_builtin_cpu (fndecl, args);
31043 #ifdef SUBTARGET_FOLD_BUILTIN
31044 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
31045 #endif
31047 return NULL_TREE;
31050 /* Make builtins to detect cpu type and features supported. NAME is
31051 the builtin name, CODE is the builtin code, and FTYPE is the function
31052 type of the builtin. */
31054 static void
31055 make_cpu_type_builtin (const char* name, int code,
31056 enum ix86_builtin_func_type ftype, bool is_const)
31058 tree decl;
31059 tree type;
31061 type = ix86_get_builtin_func_type (ftype);
31062 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31063 NULL, NULL_TREE);
31064 gcc_assert (decl != NULL_TREE);
31065 ix86_builtins[(int) code] = decl;
31066 TREE_READONLY (decl) = is_const;
31069 /* Make builtins to get CPU type and features supported. The created
31070 builtins are:
31072 __builtin_cpu_init (), to detect cpu type and features,
31073 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
31074 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
31077 static void
31078 ix86_init_platform_type_builtins (void)
31080 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
31081 INT_FTYPE_VOID, false);
31082 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
31083 INT_FTYPE_PCCHAR, true);
31084 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
31085 INT_FTYPE_PCCHAR, true);
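/* Typical use of the builtins created above (an illustrative sketch, matching
   how __builtin_cpu_init and friends are documented):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))
         return 1;
       if (__builtin_cpu_supports ("sse4.2"))
         return 2;
       return 0;
     }

   __builtin_cpu_init must run before the other two query the data that
   libgcc fills in.  */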
31088 /* Internal method for ix86_init_builtins. */
31090 static void
31091 ix86_init_builtins_va_builtins_abi (void)
31093 tree ms_va_ref, sysv_va_ref;
31094 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
31095 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
31096 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
31097 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
31099 if (!TARGET_64BIT)
31100 return;
31101 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
31102 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
31103 ms_va_ref = build_reference_type (ms_va_list_type_node);
31104 sysv_va_ref =
31105 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
31107 fnvoid_va_end_ms =
31108 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31109 fnvoid_va_start_ms =
31110 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31111 fnvoid_va_end_sysv =
31112 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
31113 fnvoid_va_start_sysv =
31114 build_varargs_function_type_list (void_type_node, sysv_va_ref,
31115 NULL_TREE);
31116 fnvoid_va_copy_ms =
31117 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
31118 NULL_TREE);
31119 fnvoid_va_copy_sysv =
31120 build_function_type_list (void_type_node, sysv_va_ref,
31121 sysv_va_ref, NULL_TREE);
31123 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
31124 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
31125 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
31126 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
31127 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
31128 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
31129 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
31130 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31131 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31132 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31133 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31134 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
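/* Illustrative use of the ABI-specific va builtins registered above
   (a sketch; only meaningful for 64-bit targets):

     void __attribute__ ((ms_abi))
     ms_vararg (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...                                -- consume arguments here
       __builtin_ms_va_end (ap);
     }

   The sysv_abi counterparts are used the same way in functions declared
   with __attribute__ ((sysv_abi)).  */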
31137 static void
31138 ix86_init_builtin_types (void)
31140 tree float128_type_node, float80_type_node;
31142 /* The __float80 type. */
31143 float80_type_node = long_double_type_node;
31144 if (TYPE_MODE (float80_type_node) != XFmode)
31146 /* The __float80 type. */
31147 float80_type_node = make_node (REAL_TYPE);
31149 TYPE_PRECISION (float80_type_node) = 80;
31150 layout_type (float80_type_node);
31152 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31154 /* The __float128 type. */
31155 float128_type_node = make_node (REAL_TYPE);
31156 TYPE_PRECISION (float128_type_node) = 128;
31157 layout_type (float128_type_node);
31158 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31160 /* This macro is built by i386-builtin-types.awk. */
31161 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31164 static void
31165 ix86_init_builtins (void)
31167 tree t;
31169 ix86_init_builtin_types ();
31171 /* Builtins to get CPU type and features. */
31172 ix86_init_platform_type_builtins ();
31174 /* TFmode support builtins. */
31175 def_builtin_const (0, "__builtin_infq",
31176 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31177 def_builtin_const (0, "__builtin_huge_valq",
31178 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
31180 /* We will expand them to a normal call if SSE isn't available, since
31181 they are used by libgcc. */
31182 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31183 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31184 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31185 TREE_READONLY (t) = 1;
31186 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31188 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31189 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31190 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31191 TREE_READONLY (t) = 1;
31192 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31194 ix86_init_tm_builtins ();
31195 ix86_init_mmx_sse_builtins ();
31197 if (TARGET_LP64)
31198 ix86_init_builtins_va_builtins_abi ();
31200 #ifdef SUBTARGET_INIT_BUILTINS
31201 SUBTARGET_INIT_BUILTINS;
31202 #endif
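/* Illustrative use of the types and TFmode builtins set up above (a sketch;
   the w/q literal suffixes are the documented GCC extensions for __float80
   and __float128):

     __float128 huge = __builtin_huge_valq ();
     __float128 mag  = __builtin_fabsq (-2.0q);
     __float128 s    = __builtin_copysignq (1.0q, -0.0q);
     __float80  x    = 1.5w;

   When SSE is unavailable, the fabsq/copysignq builtins expand to calls to
   __fabstf2 and __copysigntf3 in libgcc.  */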
31205 /* Return the ix86 builtin for CODE. */
31207 static tree
31208 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31210 if (code >= IX86_BUILTIN_MAX)
31211 return error_mark_node;
31213 return ix86_builtins[code];
31216 /* Errors in the source file can cause expand_expr to return const0_rtx
31217 where we expect a vector. To avoid crashing, use one of the vector
31218 clear instructions. */
31219 static rtx
31220 safe_vector_operand (rtx x, enum machine_mode mode)
31222 if (x == const0_rtx)
31223 x = CONST0_RTX (mode);
31224 return x;
31227 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31229 static rtx
31230 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31232 rtx pat;
31233 tree arg0 = CALL_EXPR_ARG (exp, 0);
31234 tree arg1 = CALL_EXPR_ARG (exp, 1);
31235 rtx op0 = expand_normal (arg0);
31236 rtx op1 = expand_normal (arg1);
31237 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31238 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31239 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31241 if (VECTOR_MODE_P (mode0))
31242 op0 = safe_vector_operand (op0, mode0);
31243 if (VECTOR_MODE_P (mode1))
31244 op1 = safe_vector_operand (op1, mode1);
31246 if (optimize || !target
31247 || GET_MODE (target) != tmode
31248 || !insn_data[icode].operand[0].predicate (target, tmode))
31249 target = gen_reg_rtx (tmode);
31251 if (GET_MODE (op1) == SImode && mode1 == TImode)
31253 rtx x = gen_reg_rtx (V4SImode);
31254 emit_insn (gen_sse2_loadd (x, op1));
31255 op1 = gen_lowpart (TImode, x);
31258 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31259 op0 = copy_to_mode_reg (mode0, op0);
31260 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31261 op1 = copy_to_mode_reg (mode1, op1);
31263 pat = GEN_FCN (icode) (target, op0, op1);
31264 if (! pat)
31265 return 0;
31267 emit_insn (pat);
31269 return target;
31272 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31274 static rtx
31275 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31276 enum ix86_builtin_func_type m_type,
31277 enum rtx_code sub_code)
31279 rtx pat;
31280 int i;
31281 int nargs;
31282 bool comparison_p = false;
31283 bool tf_p = false;
31284 bool last_arg_constant = false;
31285 int num_memory = 0;
31286 struct {
31287 rtx op;
31288 enum machine_mode mode;
31289 } args[4];
31291 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31293 switch (m_type)
31295 case MULTI_ARG_4_DF2_DI_I:
31296 case MULTI_ARG_4_DF2_DI_I1:
31297 case MULTI_ARG_4_SF2_SI_I:
31298 case MULTI_ARG_4_SF2_SI_I1:
31299 nargs = 4;
31300 last_arg_constant = true;
31301 break;
31303 case MULTI_ARG_3_SF:
31304 case MULTI_ARG_3_DF:
31305 case MULTI_ARG_3_SF2:
31306 case MULTI_ARG_3_DF2:
31307 case MULTI_ARG_3_DI:
31308 case MULTI_ARG_3_SI:
31309 case MULTI_ARG_3_SI_DI:
31310 case MULTI_ARG_3_HI:
31311 case MULTI_ARG_3_HI_SI:
31312 case MULTI_ARG_3_QI:
31313 case MULTI_ARG_3_DI2:
31314 case MULTI_ARG_3_SI2:
31315 case MULTI_ARG_3_HI2:
31316 case MULTI_ARG_3_QI2:
31317 nargs = 3;
31318 break;
31320 case MULTI_ARG_2_SF:
31321 case MULTI_ARG_2_DF:
31322 case MULTI_ARG_2_DI:
31323 case MULTI_ARG_2_SI:
31324 case MULTI_ARG_2_HI:
31325 case MULTI_ARG_2_QI:
31326 nargs = 2;
31327 break;
31329 case MULTI_ARG_2_DI_IMM:
31330 case MULTI_ARG_2_SI_IMM:
31331 case MULTI_ARG_2_HI_IMM:
31332 case MULTI_ARG_2_QI_IMM:
31333 nargs = 2;
31334 last_arg_constant = true;
31335 break;
31337 case MULTI_ARG_1_SF:
31338 case MULTI_ARG_1_DF:
31339 case MULTI_ARG_1_SF2:
31340 case MULTI_ARG_1_DF2:
31341 case MULTI_ARG_1_DI:
31342 case MULTI_ARG_1_SI:
31343 case MULTI_ARG_1_HI:
31344 case MULTI_ARG_1_QI:
31345 case MULTI_ARG_1_SI_DI:
31346 case MULTI_ARG_1_HI_DI:
31347 case MULTI_ARG_1_HI_SI:
31348 case MULTI_ARG_1_QI_DI:
31349 case MULTI_ARG_1_QI_SI:
31350 case MULTI_ARG_1_QI_HI:
31351 nargs = 1;
31352 break;
31354 case MULTI_ARG_2_DI_CMP:
31355 case MULTI_ARG_2_SI_CMP:
31356 case MULTI_ARG_2_HI_CMP:
31357 case MULTI_ARG_2_QI_CMP:
31358 nargs = 2;
31359 comparison_p = true;
31360 break;
31362 case MULTI_ARG_2_SF_TF:
31363 case MULTI_ARG_2_DF_TF:
31364 case MULTI_ARG_2_DI_TF:
31365 case MULTI_ARG_2_SI_TF:
31366 case MULTI_ARG_2_HI_TF:
31367 case MULTI_ARG_2_QI_TF:
31368 nargs = 2;
31369 tf_p = true;
31370 break;
31372 default:
31373 gcc_unreachable ();
31376 if (optimize || !target
31377 || GET_MODE (target) != tmode
31378 || !insn_data[icode].operand[0].predicate (target, tmode))
31379 target = gen_reg_rtx (tmode);
31381 gcc_assert (nargs <= 4);
31383 for (i = 0; i < nargs; i++)
31385 tree arg = CALL_EXPR_ARG (exp, i);
31386 rtx op = expand_normal (arg);
31387 int adjust = (comparison_p) ? 1 : 0;
31388 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31390 if (last_arg_constant && i == nargs - 1)
31392 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31394 enum insn_code new_icode = icode;
31395 switch (icode)
31397 case CODE_FOR_xop_vpermil2v2df3:
31398 case CODE_FOR_xop_vpermil2v4sf3:
31399 case CODE_FOR_xop_vpermil2v4df3:
31400 case CODE_FOR_xop_vpermil2v8sf3:
31401 error ("the last argument must be a 2-bit immediate");
31402 return gen_reg_rtx (tmode);
31403 case CODE_FOR_xop_rotlv2di3:
31404 new_icode = CODE_FOR_rotlv2di3;
31405 goto xop_rotl;
31406 case CODE_FOR_xop_rotlv4si3:
31407 new_icode = CODE_FOR_rotlv4si3;
31408 goto xop_rotl;
31409 case CODE_FOR_xop_rotlv8hi3:
31410 new_icode = CODE_FOR_rotlv8hi3;
31411 goto xop_rotl;
31412 case CODE_FOR_xop_rotlv16qi3:
31413 new_icode = CODE_FOR_rotlv16qi3;
31414 xop_rotl:
31415 if (CONST_INT_P (op))
31417 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31418 op = GEN_INT (INTVAL (op) & mask);
31419 gcc_checking_assert
31420 (insn_data[icode].operand[i + 1].predicate (op, mode));
31422 else
31424 gcc_checking_assert
31425 (nargs == 2
31426 && insn_data[new_icode].operand[0].mode == tmode
31427 && insn_data[new_icode].operand[1].mode == tmode
31428 && insn_data[new_icode].operand[2].mode == mode
31429 && insn_data[new_icode].operand[0].predicate
31430 == insn_data[icode].operand[0].predicate
31431 && insn_data[new_icode].operand[1].predicate
31432 == insn_data[icode].operand[1].predicate);
31433 icode = new_icode;
31434 goto non_constant;
31436 break;
31437 default:
31438 gcc_unreachable ();
31442 else
31444 non_constant:
31445 if (VECTOR_MODE_P (mode))
31446 op = safe_vector_operand (op, mode);
31448 /* If we aren't optimizing, only allow one memory operand to be
31449 generated. */
31450 if (memory_operand (op, mode))
31451 num_memory++;
31453 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31455 if (optimize
31456 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31457 || num_memory > 1)
31458 op = force_reg (mode, op);
31461 args[i].op = op;
31462 args[i].mode = mode;
31465 switch (nargs)
31467 case 1:
31468 pat = GEN_FCN (icode) (target, args[0].op);
31469 break;
31471 case 2:
31472 if (tf_p)
31473 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31474 GEN_INT ((int)sub_code));
31475 else if (! comparison_p)
31476 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31477 else
31479 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31480 args[0].op,
31481 args[1].op);
31483 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31485 break;
31487 case 3:
31488 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31489 break;
31491 case 4:
31492 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31493 break;
31495 default:
31496 gcc_unreachable ();
31499 if (! pat)
31500 return 0;
31502 emit_insn (pat);
31503 return target;
31506 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31507 insns with vec_merge. */
31509 static rtx
31510 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31511 rtx target)
31513 rtx pat;
31514 tree arg0 = CALL_EXPR_ARG (exp, 0);
31515 rtx op1, op0 = expand_normal (arg0);
31516 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31517 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31519 if (optimize || !target
31520 || GET_MODE (target) != tmode
31521 || !insn_data[icode].operand[0].predicate (target, tmode))
31522 target = gen_reg_rtx (tmode);
31524 if (VECTOR_MODE_P (mode0))
31525 op0 = safe_vector_operand (op0, mode0);
31527 if ((optimize && !register_operand (op0, mode0))
31528 || !insn_data[icode].operand[1].predicate (op0, mode0))
31529 op0 = copy_to_mode_reg (mode0, op0);
31531 op1 = op0;
31532 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31533 op1 = copy_to_mode_reg (mode0, op1);
31535 pat = GEN_FCN (icode) (target, op0, op1);
31536 if (! pat)
31537 return 0;
31538 emit_insn (pat);
31539 return target;
31542 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31544 static rtx
31545 ix86_expand_sse_compare (const struct builtin_description *d,
31546 tree exp, rtx target, bool swap)
31548 rtx pat;
31549 tree arg0 = CALL_EXPR_ARG (exp, 0);
31550 tree arg1 = CALL_EXPR_ARG (exp, 1);
31551 rtx op0 = expand_normal (arg0);
31552 rtx op1 = expand_normal (arg1);
31553 rtx op2;
31554 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31555 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31556 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31557 enum rtx_code comparison = d->comparison;
31559 if (VECTOR_MODE_P (mode0))
31560 op0 = safe_vector_operand (op0, mode0);
31561 if (VECTOR_MODE_P (mode1))
31562 op1 = safe_vector_operand (op1, mode1);
31564 /* Swap operands if we have a comparison that isn't available in
31565 hardware. */
31566 if (swap)
31568 rtx tmp = gen_reg_rtx (mode1);
31569 emit_move_insn (tmp, op1);
31570 op1 = op0;
31571 op0 = tmp;
31574 if (optimize || !target
31575 || GET_MODE (target) != tmode
31576 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31577 target = gen_reg_rtx (tmode);
31579 if ((optimize && !register_operand (op0, mode0))
31580 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31581 op0 = copy_to_mode_reg (mode0, op0);
31582 if ((optimize && !register_operand (op1, mode1))
31583 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31584 op1 = copy_to_mode_reg (mode1, op1);
31586 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31587 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31588 if (! pat)
31589 return 0;
31590 emit_insn (pat);
31591 return target;
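/* Example of the operand swap handled above (illustrative): SSE only encodes
   the "less-than" style compares directly, so a builtin such as
   __builtin_ia32_cmpgtps (a, b) is described with a _SWAP function type and
   is emitted as the corresponding LT comparison with a and b exchanged,
   which computes the same predicate.  */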
31594 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31596 static rtx
31597 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31598 rtx target)
31600 rtx pat;
31601 tree arg0 = CALL_EXPR_ARG (exp, 0);
31602 tree arg1 = CALL_EXPR_ARG (exp, 1);
31603 rtx op0 = expand_normal (arg0);
31604 rtx op1 = expand_normal (arg1);
31605 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31606 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31607 enum rtx_code comparison = d->comparison;
31609 if (VECTOR_MODE_P (mode0))
31610 op0 = safe_vector_operand (op0, mode0);
31611 if (VECTOR_MODE_P (mode1))
31612 op1 = safe_vector_operand (op1, mode1);
31614 /* Swap operands if we have a comparison that isn't available in
31615 hardware. */
31616 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31618 rtx tmp = op1;
31619 op1 = op0;
31620 op0 = tmp;
31623 target = gen_reg_rtx (SImode);
31624 emit_move_insn (target, const0_rtx);
31625 target = gen_rtx_SUBREG (QImode, target, 0);
31627 if ((optimize && !register_operand (op0, mode0))
31628 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31629 op0 = copy_to_mode_reg (mode0, op0);
31630 if ((optimize && !register_operand (op1, mode1))
31631 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31632 op1 = copy_to_mode_reg (mode1, op1);
31634 pat = GEN_FCN (d->icode) (op0, op1);
31635 if (! pat)
31636 return 0;
31637 emit_insn (pat);
31638 emit_insn (gen_rtx_SET (VOIDmode,
31639 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31640 gen_rtx_fmt_ee (comparison, QImode,
31641 SET_DEST (pat),
31642 const0_rtx)));
31644 return SUBREG_REG (target);
31647 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31649 static rtx
31650 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31651 rtx target)
31653 rtx pat;
31654 tree arg0 = CALL_EXPR_ARG (exp, 0);
31655 rtx op1, op0 = expand_normal (arg0);
31656 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31657 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31659 if (optimize || target == 0
31660 || GET_MODE (target) != tmode
31661 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31662 target = gen_reg_rtx (tmode);
31664 if (VECTOR_MODE_P (mode0))
31665 op0 = safe_vector_operand (op0, mode0);
31667 if ((optimize && !register_operand (op0, mode0))
31668 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31669 op0 = copy_to_mode_reg (mode0, op0);
31671 op1 = GEN_INT (d->comparison);
31673 pat = GEN_FCN (d->icode) (target, op0, op1);
31674 if (! pat)
31675 return 0;
31676 emit_insn (pat);
31677 return target;
31680 static rtx
31681 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31682 tree exp, rtx target)
31684 rtx pat;
31685 tree arg0 = CALL_EXPR_ARG (exp, 0);
31686 tree arg1 = CALL_EXPR_ARG (exp, 1);
31687 rtx op0 = expand_normal (arg0);
31688 rtx op1 = expand_normal (arg1);
31689 rtx op2;
31690 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31691 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31692 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31694 if (optimize || target == 0
31695 || GET_MODE (target) != tmode
31696 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31697 target = gen_reg_rtx (tmode);
31699 op0 = safe_vector_operand (op0, mode0);
31700 op1 = safe_vector_operand (op1, mode1);
31702 if ((optimize && !register_operand (op0, mode0))
31703 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31704 op0 = copy_to_mode_reg (mode0, op0);
31705 if ((optimize && !register_operand (op1, mode1))
31706 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31707 op1 = copy_to_mode_reg (mode1, op1);
31709 op2 = GEN_INT (d->comparison);
31711 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31712 if (! pat)
31713 return 0;
31714 emit_insn (pat);
31715 return target;
31718 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31720 static rtx
31721 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31722 rtx target)
31724 rtx pat;
31725 tree arg0 = CALL_EXPR_ARG (exp, 0);
31726 tree arg1 = CALL_EXPR_ARG (exp, 1);
31727 rtx op0 = expand_normal (arg0);
31728 rtx op1 = expand_normal (arg1);
31729 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31730 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31731 enum rtx_code comparison = d->comparison;
31733 if (VECTOR_MODE_P (mode0))
31734 op0 = safe_vector_operand (op0, mode0);
31735 if (VECTOR_MODE_P (mode1))
31736 op1 = safe_vector_operand (op1, mode1);
31738 target = gen_reg_rtx (SImode);
31739 emit_move_insn (target, const0_rtx);
31740 target = gen_rtx_SUBREG (QImode, target, 0);
31742 if ((optimize && !register_operand (op0, mode0))
31743 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31744 op0 = copy_to_mode_reg (mode0, op0);
31745 if ((optimize && !register_operand (op1, mode1))
31746 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31747 op1 = copy_to_mode_reg (mode1, op1);
31749 pat = GEN_FCN (d->icode) (op0, op1);
31750 if (! pat)
31751 return 0;
31752 emit_insn (pat);
31753 emit_insn (gen_rtx_SET (VOIDmode,
31754 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31755 gen_rtx_fmt_ee (comparison, QImode,
31756 SET_DEST (pat),
31757 const0_rtx)));
31759 return SUBREG_REG (target);
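/* Illustrative use of the ptest builtins expanded above (a sketch): the
   SSE4.1 intrinsic _mm_testz_si128 is implemented in terms of
   __builtin_ia32_ptestz128, e.g.

     int all_masked_zero (__m128i a, __m128i mask)
     {
       return __builtin_ia32_ptestz128 ((__v2di) a, (__v2di) mask);
     }

   The builtin emits the ptest insn and then compares the flags register with
   D->COMPARISON (EQ for the "testz" form), producing the 0/1 value returned
   here.  */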
31762 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31764 static rtx
31765 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31766 tree exp, rtx target)
31768 rtx pat;
31769 tree arg0 = CALL_EXPR_ARG (exp, 0);
31770 tree arg1 = CALL_EXPR_ARG (exp, 1);
31771 tree arg2 = CALL_EXPR_ARG (exp, 2);
31772 tree arg3 = CALL_EXPR_ARG (exp, 3);
31773 tree arg4 = CALL_EXPR_ARG (exp, 4);
31774 rtx scratch0, scratch1;
31775 rtx op0 = expand_normal (arg0);
31776 rtx op1 = expand_normal (arg1);
31777 rtx op2 = expand_normal (arg2);
31778 rtx op3 = expand_normal (arg3);
31779 rtx op4 = expand_normal (arg4);
31780 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31782 tmode0 = insn_data[d->icode].operand[0].mode;
31783 tmode1 = insn_data[d->icode].operand[1].mode;
31784 modev2 = insn_data[d->icode].operand[2].mode;
31785 modei3 = insn_data[d->icode].operand[3].mode;
31786 modev4 = insn_data[d->icode].operand[4].mode;
31787 modei5 = insn_data[d->icode].operand[5].mode;
31788 modeimm = insn_data[d->icode].operand[6].mode;
31790 if (VECTOR_MODE_P (modev2))
31791 op0 = safe_vector_operand (op0, modev2);
31792 if (VECTOR_MODE_P (modev4))
31793 op2 = safe_vector_operand (op2, modev4);
31795 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31796 op0 = copy_to_mode_reg (modev2, op0);
31797 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31798 op1 = copy_to_mode_reg (modei3, op1);
31799 if ((optimize && !register_operand (op2, modev4))
31800 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31801 op2 = copy_to_mode_reg (modev4, op2);
31802 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31803 op3 = copy_to_mode_reg (modei5, op3);
31805 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31807 error ("the fifth argument must be an 8-bit immediate");
31808 return const0_rtx;
31811 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31813 if (optimize || !target
31814 || GET_MODE (target) != tmode0
31815 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31816 target = gen_reg_rtx (tmode0);
31818 scratch1 = gen_reg_rtx (tmode1);
31820 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31822 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31824 if (optimize || !target
31825 || GET_MODE (target) != tmode1
31826 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31827 target = gen_reg_rtx (tmode1);
31829 scratch0 = gen_reg_rtx (tmode0);
31831 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31833 else
31835 gcc_assert (d->flag);
31837 scratch0 = gen_reg_rtx (tmode0);
31838 scratch1 = gen_reg_rtx (tmode1);
31840 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31843 if (! pat)
31844 return 0;
31846 emit_insn (pat);
31848 if (d->flag)
31850 target = gen_reg_rtx (SImode);
31851 emit_move_insn (target, const0_rtx);
31852 target = gen_rtx_SUBREG (QImode, target, 0);
31854 emit_insn
31855 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31856 gen_rtx_fmt_ee (EQ, QImode,
31857 gen_rtx_REG ((enum machine_mode) d->flag,
31858 FLAGS_REG),
31859 const0_rtx)));
31860 return SUBREG_REG (target);
31862 else
31863 return target;
31867 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31869 static rtx
31870 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31871 tree exp, rtx target)
31873 rtx pat;
31874 tree arg0 = CALL_EXPR_ARG (exp, 0);
31875 tree arg1 = CALL_EXPR_ARG (exp, 1);
31876 tree arg2 = CALL_EXPR_ARG (exp, 2);
31877 rtx scratch0, scratch1;
31878 rtx op0 = expand_normal (arg0);
31879 rtx op1 = expand_normal (arg1);
31880 rtx op2 = expand_normal (arg2);
31881 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31883 tmode0 = insn_data[d->icode].operand[0].mode;
31884 tmode1 = insn_data[d->icode].operand[1].mode;
31885 modev2 = insn_data[d->icode].operand[2].mode;
31886 modev3 = insn_data[d->icode].operand[3].mode;
31887 modeimm = insn_data[d->icode].operand[4].mode;
31889 if (VECTOR_MODE_P (modev2))
31890 op0 = safe_vector_operand (op0, modev2);
31891 if (VECTOR_MODE_P (modev3))
31892 op1 = safe_vector_operand (op1, modev3);
31894 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31895 op0 = copy_to_mode_reg (modev2, op0);
31896 if ((optimize && !register_operand (op1, modev3))
31897 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31898 op1 = copy_to_mode_reg (modev3, op1);
31900 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31902 error ("the third argument must be an 8-bit immediate");
31903 return const0_rtx;
31906 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31908 if (optimize || !target
31909 || GET_MODE (target) != tmode0
31910 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31911 target = gen_reg_rtx (tmode0);
31913 scratch1 = gen_reg_rtx (tmode1);
31915 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31917 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31919 if (optimize || !target
31920 || GET_MODE (target) != tmode1
31921 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31922 target = gen_reg_rtx (tmode1);
31924 scratch0 = gen_reg_rtx (tmode0);
31926 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31928 else
31930 gcc_assert (d->flag);
31932 scratch0 = gen_reg_rtx (tmode0);
31933 scratch1 = gen_reg_rtx (tmode1);
31935 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31938 if (! pat)
31939 return 0;
31941 emit_insn (pat);
31943 if (d->flag)
31945 target = gen_reg_rtx (SImode);
31946 emit_move_insn (target, const0_rtx);
31947 target = gen_rtx_SUBREG (QImode, target, 0);
31949 emit_insn
31950 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31951 gen_rtx_fmt_ee (EQ, QImode,
31952 gen_rtx_REG ((enum machine_mode) d->flag,
31953 FLAGS_REG),
31954 const0_rtx)));
31955 return SUBREG_REG (target);
31957 else
31958 return target;
31961 /* Subroutine of ix86_expand_builtin to take care of insns with
31962 variable number of operands. */
31964 static rtx
31965 ix86_expand_args_builtin (const struct builtin_description *d,
31966 tree exp, rtx target)
31968 rtx pat, real_target;
31969 unsigned int i, nargs;
31970 unsigned int nargs_constant = 0;
31971 int num_memory = 0;
31972 struct
31974 rtx op;
31975 enum machine_mode mode;
31976 } args[4];
31977 bool last_arg_count = false;
31978 enum insn_code icode = d->icode;
31979 const struct insn_data_d *insn_p = &insn_data[icode];
31980 enum machine_mode tmode = insn_p->operand[0].mode;
31981 enum machine_mode rmode = VOIDmode;
31982 bool swap = false;
31983 enum rtx_code comparison = d->comparison;
31985 switch ((enum ix86_builtin_func_type) d->flag)
31987 case V2DF_FTYPE_V2DF_ROUND:
31988 case V4DF_FTYPE_V4DF_ROUND:
31989 case V4SF_FTYPE_V4SF_ROUND:
31990 case V8SF_FTYPE_V8SF_ROUND:
31991 case V4SI_FTYPE_V4SF_ROUND:
31992 case V8SI_FTYPE_V8SF_ROUND:
31993 return ix86_expand_sse_round (d, exp, target);
31994 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31995 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31996 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31997 case INT_FTYPE_V8SF_V8SF_PTEST:
31998 case INT_FTYPE_V4DI_V4DI_PTEST:
31999 case INT_FTYPE_V4DF_V4DF_PTEST:
32000 case INT_FTYPE_V4SF_V4SF_PTEST:
32001 case INT_FTYPE_V2DI_V2DI_PTEST:
32002 case INT_FTYPE_V2DF_V2DF_PTEST:
32003 return ix86_expand_sse_ptest (d, exp, target);
32004 case FLOAT128_FTYPE_FLOAT128:
32005 case FLOAT_FTYPE_FLOAT:
32006 case INT_FTYPE_INT:
32007 case UINT64_FTYPE_INT:
32008 case UINT16_FTYPE_UINT16:
32009 case INT64_FTYPE_INT64:
32010 case INT64_FTYPE_V4SF:
32011 case INT64_FTYPE_V2DF:
32012 case INT_FTYPE_V16QI:
32013 case INT_FTYPE_V8QI:
32014 case INT_FTYPE_V8SF:
32015 case INT_FTYPE_V4DF:
32016 case INT_FTYPE_V4SF:
32017 case INT_FTYPE_V2DF:
32018 case INT_FTYPE_V32QI:
32019 case V16QI_FTYPE_V16QI:
32020 case V8SI_FTYPE_V8SF:
32021 case V8SI_FTYPE_V4SI:
32022 case V8HI_FTYPE_V8HI:
32023 case V8HI_FTYPE_V16QI:
32024 case V8QI_FTYPE_V8QI:
32025 case V8SF_FTYPE_V8SF:
32026 case V8SF_FTYPE_V8SI:
32027 case V8SF_FTYPE_V4SF:
32028 case V8SF_FTYPE_V8HI:
32029 case V4SI_FTYPE_V4SI:
32030 case V4SI_FTYPE_V16QI:
32031 case V4SI_FTYPE_V4SF:
32032 case V4SI_FTYPE_V8SI:
32033 case V4SI_FTYPE_V8HI:
32034 case V4SI_FTYPE_V4DF:
32035 case V4SI_FTYPE_V2DF:
32036 case V4HI_FTYPE_V4HI:
32037 case V4DF_FTYPE_V4DF:
32038 case V4DF_FTYPE_V4SI:
32039 case V4DF_FTYPE_V4SF:
32040 case V4DF_FTYPE_V2DF:
32041 case V4SF_FTYPE_V4SF:
32042 case V4SF_FTYPE_V4SI:
32043 case V4SF_FTYPE_V8SF:
32044 case V4SF_FTYPE_V4DF:
32045 case V4SF_FTYPE_V8HI:
32046 case V4SF_FTYPE_V2DF:
32047 case V2DI_FTYPE_V2DI:
32048 case V2DI_FTYPE_V16QI:
32049 case V2DI_FTYPE_V8HI:
32050 case V2DI_FTYPE_V4SI:
32051 case V2DF_FTYPE_V2DF:
32052 case V2DF_FTYPE_V4SI:
32053 case V2DF_FTYPE_V4DF:
32054 case V2DF_FTYPE_V4SF:
32055 case V2DF_FTYPE_V2SI:
32056 case V2SI_FTYPE_V2SI:
32057 case V2SI_FTYPE_V4SF:
32058 case V2SI_FTYPE_V2SF:
32059 case V2SI_FTYPE_V2DF:
32060 case V2SF_FTYPE_V2SF:
32061 case V2SF_FTYPE_V2SI:
32062 case V32QI_FTYPE_V32QI:
32063 case V32QI_FTYPE_V16QI:
32064 case V16HI_FTYPE_V16HI:
32065 case V16HI_FTYPE_V8HI:
32066 case V8SI_FTYPE_V8SI:
32067 case V16HI_FTYPE_V16QI:
32068 case V8SI_FTYPE_V16QI:
32069 case V4DI_FTYPE_V16QI:
32070 case V8SI_FTYPE_V8HI:
32071 case V4DI_FTYPE_V8HI:
32072 case V4DI_FTYPE_V4SI:
32073 case V4DI_FTYPE_V2DI:
32074 nargs = 1;
32075 break;
32076 case V4SF_FTYPE_V4SF_VEC_MERGE:
32077 case V2DF_FTYPE_V2DF_VEC_MERGE:
32078 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
32079 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
32080 case V16QI_FTYPE_V16QI_V16QI:
32081 case V16QI_FTYPE_V8HI_V8HI:
32082 case V8QI_FTYPE_V8QI_V8QI:
32083 case V8QI_FTYPE_V4HI_V4HI:
32084 case V8HI_FTYPE_V8HI_V8HI:
32085 case V8HI_FTYPE_V16QI_V16QI:
32086 case V8HI_FTYPE_V4SI_V4SI:
32087 case V8SF_FTYPE_V8SF_V8SF:
32088 case V8SF_FTYPE_V8SF_V8SI:
32089 case V4SI_FTYPE_V4SI_V4SI:
32090 case V4SI_FTYPE_V8HI_V8HI:
32091 case V4SI_FTYPE_V4SF_V4SF:
32092 case V4SI_FTYPE_V2DF_V2DF:
32093 case V4HI_FTYPE_V4HI_V4HI:
32094 case V4HI_FTYPE_V8QI_V8QI:
32095 case V4HI_FTYPE_V2SI_V2SI:
32096 case V4DF_FTYPE_V4DF_V4DF:
32097 case V4DF_FTYPE_V4DF_V4DI:
32098 case V4SF_FTYPE_V4SF_V4SF:
32099 case V4SF_FTYPE_V4SF_V4SI:
32100 case V4SF_FTYPE_V4SF_V2SI:
32101 case V4SF_FTYPE_V4SF_V2DF:
32102 case V4SF_FTYPE_V4SF_DI:
32103 case V4SF_FTYPE_V4SF_SI:
32104 case V2DI_FTYPE_V2DI_V2DI:
32105 case V2DI_FTYPE_V16QI_V16QI:
32106 case V2DI_FTYPE_V4SI_V4SI:
32107 case V2UDI_FTYPE_V4USI_V4USI:
32108 case V2DI_FTYPE_V2DI_V16QI:
32109 case V2DI_FTYPE_V2DF_V2DF:
32110 case V2SI_FTYPE_V2SI_V2SI:
32111 case V2SI_FTYPE_V4HI_V4HI:
32112 case V2SI_FTYPE_V2SF_V2SF:
32113 case V2DF_FTYPE_V2DF_V2DF:
32114 case V2DF_FTYPE_V2DF_V4SF:
32115 case V2DF_FTYPE_V2DF_V2DI:
32116 case V2DF_FTYPE_V2DF_DI:
32117 case V2DF_FTYPE_V2DF_SI:
32118 case V2SF_FTYPE_V2SF_V2SF:
32119 case V1DI_FTYPE_V1DI_V1DI:
32120 case V1DI_FTYPE_V8QI_V8QI:
32121 case V1DI_FTYPE_V2SI_V2SI:
32122 case V32QI_FTYPE_V16HI_V16HI:
32123 case V16HI_FTYPE_V8SI_V8SI:
32124 case V32QI_FTYPE_V32QI_V32QI:
32125 case V16HI_FTYPE_V32QI_V32QI:
32126 case V16HI_FTYPE_V16HI_V16HI:
32127 case V8SI_FTYPE_V4DF_V4DF:
32128 case V8SI_FTYPE_V8SI_V8SI:
32129 case V8SI_FTYPE_V16HI_V16HI:
32130 case V4DI_FTYPE_V4DI_V4DI:
32131 case V4DI_FTYPE_V8SI_V8SI:
32132 case V4UDI_FTYPE_V8USI_V8USI:
32133 if (comparison == UNKNOWN)
32134 return ix86_expand_binop_builtin (icode, exp, target);
32135 nargs = 2;
32136 break;
32137 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32138 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32139 gcc_assert (comparison != UNKNOWN);
32140 nargs = 2;
32141 swap = true;
32142 break;
32143 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32144 case V16HI_FTYPE_V16HI_SI_COUNT:
32145 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32146 case V8SI_FTYPE_V8SI_SI_COUNT:
32147 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32148 case V4DI_FTYPE_V4DI_INT_COUNT:
32149 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32150 case V8HI_FTYPE_V8HI_SI_COUNT:
32151 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32152 case V4SI_FTYPE_V4SI_SI_COUNT:
32153 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32154 case V4HI_FTYPE_V4HI_SI_COUNT:
32155 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32156 case V2DI_FTYPE_V2DI_SI_COUNT:
32157 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32158 case V2SI_FTYPE_V2SI_SI_COUNT:
32159 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32160 case V1DI_FTYPE_V1DI_SI_COUNT:
32161 nargs = 2;
32162 last_arg_count = true;
32163 break;
32164 case UINT64_FTYPE_UINT64_UINT64:
32165 case UINT_FTYPE_UINT_UINT:
32166 case UINT_FTYPE_UINT_USHORT:
32167 case UINT_FTYPE_UINT_UCHAR:
32168 case UINT16_FTYPE_UINT16_INT:
32169 case UINT8_FTYPE_UINT8_INT:
32170 nargs = 2;
32171 break;
32172 case V2DI_FTYPE_V2DI_INT_CONVERT:
32173 nargs = 2;
32174 rmode = V1TImode;
32175 nargs_constant = 1;
32176 break;
32177 case V4DI_FTYPE_V4DI_INT_CONVERT:
32178 nargs = 2;
32179 rmode = V2TImode;
32180 nargs_constant = 1;
32181 break;
32182 case V8HI_FTYPE_V8HI_INT:
32183 case V8HI_FTYPE_V8SF_INT:
32184 case V8HI_FTYPE_V4SF_INT:
32185 case V8SF_FTYPE_V8SF_INT:
32186 case V4SI_FTYPE_V4SI_INT:
32187 case V4SI_FTYPE_V8SI_INT:
32188 case V4HI_FTYPE_V4HI_INT:
32189 case V4DF_FTYPE_V4DF_INT:
32190 case V4SF_FTYPE_V4SF_INT:
32191 case V4SF_FTYPE_V8SF_INT:
32192 case V2DI_FTYPE_V2DI_INT:
32193 case V2DF_FTYPE_V2DF_INT:
32194 case V2DF_FTYPE_V4DF_INT:
32195 case V16HI_FTYPE_V16HI_INT:
32196 case V8SI_FTYPE_V8SI_INT:
32197 case V4DI_FTYPE_V4DI_INT:
32198 case V2DI_FTYPE_V4DI_INT:
32199 nargs = 2;
32200 nargs_constant = 1;
32201 break;
32202 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32203 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32204 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32205 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32206 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32207 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32208 nargs = 3;
32209 break;
32210 case V32QI_FTYPE_V32QI_V32QI_INT:
32211 case V16HI_FTYPE_V16HI_V16HI_INT:
32212 case V16QI_FTYPE_V16QI_V16QI_INT:
32213 case V4DI_FTYPE_V4DI_V4DI_INT:
32214 case V8HI_FTYPE_V8HI_V8HI_INT:
32215 case V8SI_FTYPE_V8SI_V8SI_INT:
32216 case V8SI_FTYPE_V8SI_V4SI_INT:
32217 case V8SF_FTYPE_V8SF_V8SF_INT:
32218 case V8SF_FTYPE_V8SF_V4SF_INT:
32219 case V4SI_FTYPE_V4SI_V4SI_INT:
32220 case V4DF_FTYPE_V4DF_V4DF_INT:
32221 case V4DF_FTYPE_V4DF_V2DF_INT:
32222 case V4SF_FTYPE_V4SF_V4SF_INT:
32223 case V2DI_FTYPE_V2DI_V2DI_INT:
32224 case V4DI_FTYPE_V4DI_V2DI_INT:
32225 case V2DF_FTYPE_V2DF_V2DF_INT:
32226 nargs = 3;
32227 nargs_constant = 1;
32228 break;
32229 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32230 nargs = 3;
32231 rmode = V4DImode;
32232 nargs_constant = 1;
32233 break;
32234 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32235 nargs = 3;
32236 rmode = V2DImode;
32237 nargs_constant = 1;
32238 break;
32239 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32240 nargs = 3;
32241 rmode = DImode;
32242 nargs_constant = 1;
32243 break;
32244 case V2DI_FTYPE_V2DI_UINT_UINT:
32245 nargs = 3;
32246 nargs_constant = 2;
32247 break;
32248 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32249 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32250 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32251 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32252 nargs = 4;
32253 nargs_constant = 1;
32254 break;
32255 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32256 nargs = 4;
32257 nargs_constant = 2;
32258 break;
32259 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32260 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32261 nargs = 4;
32262 break;
32263 default:
32264 gcc_unreachable ();
32267 gcc_assert (nargs <= ARRAY_SIZE (args));
32269 if (comparison != UNKNOWN)
32271 gcc_assert (nargs == 2);
32272 return ix86_expand_sse_compare (d, exp, target, swap);
32275 if (rmode == VOIDmode || rmode == tmode)
32277 if (optimize
32278 || target == 0
32279 || GET_MODE (target) != tmode
32280 || !insn_p->operand[0].predicate (target, tmode))
32281 target = gen_reg_rtx (tmode);
32282 real_target = target;
32284 else
32286 real_target = gen_reg_rtx (tmode);
32287 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32290 for (i = 0; i < nargs; i++)
32292 tree arg = CALL_EXPR_ARG (exp, i);
32293 rtx op = expand_normal (arg);
32294 enum machine_mode mode = insn_p->operand[i + 1].mode;
32295 bool match = insn_p->operand[i + 1].predicate (op, mode);
32297 if (last_arg_count && (i + 1) == nargs)
32299 /* SIMD shift insns take either an 8-bit immediate or
32300 register as count. But builtin functions take int as
32301 count.  If count doesn't match, we put it in a register. */
32302 if (!match)
32304 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32305 if (!insn_p->operand[i + 1].predicate (op, mode))
32306 op = copy_to_reg (op);
32309 else if ((nargs - i) <= nargs_constant)
32311 if (!match)
32312 switch (icode)
32314 case CODE_FOR_avx2_inserti128:
32315 case CODE_FOR_avx2_extracti128:
32316 error ("the last argument must be an 1-bit immediate");
32317 return const0_rtx;
32319 case CODE_FOR_sse4_1_roundsd:
32320 case CODE_FOR_sse4_1_roundss:
32322 case CODE_FOR_sse4_1_roundpd:
32323 case CODE_FOR_sse4_1_roundps:
32324 case CODE_FOR_avx_roundpd256:
32325 case CODE_FOR_avx_roundps256:
32327 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32328 case CODE_FOR_sse4_1_roundps_sfix:
32329 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32330 case CODE_FOR_avx_roundps_sfix256:
32332 case CODE_FOR_sse4_1_blendps:
32333 case CODE_FOR_avx_blendpd256:
32334 case CODE_FOR_avx_vpermilv4df:
32335 error ("the last argument must be a 4-bit immediate");
32336 return const0_rtx;
32338 case CODE_FOR_sse4_1_blendpd:
32339 case CODE_FOR_avx_vpermilv2df:
32340 case CODE_FOR_xop_vpermil2v2df3:
32341 case CODE_FOR_xop_vpermil2v4sf3:
32342 case CODE_FOR_xop_vpermil2v4df3:
32343 case CODE_FOR_xop_vpermil2v8sf3:
32344 error ("the last argument must be a 2-bit immediate");
32345 return const0_rtx;
32347 case CODE_FOR_avx_vextractf128v4df:
32348 case CODE_FOR_avx_vextractf128v8sf:
32349 case CODE_FOR_avx_vextractf128v8si:
32350 case CODE_FOR_avx_vinsertf128v4df:
32351 case CODE_FOR_avx_vinsertf128v8sf:
32352 case CODE_FOR_avx_vinsertf128v8si:
32353 error ("the last argument must be a 1-bit immediate");
32354 return const0_rtx;
32356 case CODE_FOR_avx_vmcmpv2df3:
32357 case CODE_FOR_avx_vmcmpv4sf3:
32358 case CODE_FOR_avx_cmpv2df3:
32359 case CODE_FOR_avx_cmpv4sf3:
32360 case CODE_FOR_avx_cmpv4df3:
32361 case CODE_FOR_avx_cmpv8sf3:
32362 error ("the last argument must be a 5-bit immediate");
32363 return const0_rtx;
32365 default:
32366 switch (nargs_constant)
32368 case 2:
32369 if ((nargs - i) == nargs_constant)
32371 error ("the next to last argument must be an 8-bit immediate");
32372 break;
32374 case 1:
32375 error ("the last argument must be an 8-bit immediate");
32376 break;
32377 default:
32378 gcc_unreachable ();
32380 return const0_rtx;
32383 else
32385 if (VECTOR_MODE_P (mode))
32386 op = safe_vector_operand (op, mode);
32388 /* If we aren't optimizing, only allow one memory operand to
32389 be generated. */
32390 if (memory_operand (op, mode))
32391 num_memory++;
32393 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32395 if (optimize || !match || num_memory > 1)
32396 op = copy_to_mode_reg (mode, op);
32398 else
32400 op = copy_to_reg (op);
32401 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32405 args[i].op = op;
32406 args[i].mode = mode;
32409 switch (nargs)
32411 case 1:
32412 pat = GEN_FCN (icode) (real_target, args[0].op);
32413 break;
32414 case 2:
32415 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32416 break;
32417 case 3:
32418 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32419 args[2].op);
32420 break;
32421 case 4:
32422 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32423 args[2].op, args[3].op);
32424 break;
32425 default:
32426 gcc_unreachable ();
32429 if (! pat)
32430 return 0;
32432 emit_insn (pat);
32433 return target;
32436 /* Subroutine of ix86_expand_builtin to take care of special insns
32437 with a variable number of operands.  */
32439 static rtx
32440 ix86_expand_special_args_builtin (const struct builtin_description *d,
32441 tree exp, rtx target)
32443 tree arg;
32444 rtx pat, op;
32445 unsigned int i, nargs, arg_adjust, memory;
32446 struct
32448 rtx op;
32449 enum machine_mode mode;
32450 } args[3];
32451 enum insn_code icode = d->icode;
32452 bool last_arg_constant = false;
32453 const struct insn_data_d *insn_p = &insn_data[icode];
32454 enum machine_mode tmode = insn_p->operand[0].mode;
32455 enum { load, store } klass;
32457 switch ((enum ix86_builtin_func_type) d->flag)
32459 case VOID_FTYPE_VOID:
32460 emit_insn (GEN_FCN (icode) (target));
32461 return 0;
32462 case VOID_FTYPE_UINT64:
32463 case VOID_FTYPE_UNSIGNED:
32464 nargs = 0;
32465 klass = store;
32466 memory = 0;
32467 break;
32469 case INT_FTYPE_VOID:
32470 case UINT64_FTYPE_VOID:
32471 case UNSIGNED_FTYPE_VOID:
32472 nargs = 0;
32473 klass = load;
32474 memory = 0;
32475 break;
32476 case UINT64_FTYPE_PUNSIGNED:
32477 case V2DI_FTYPE_PV2DI:
32478 case V4DI_FTYPE_PV4DI:
32479 case V32QI_FTYPE_PCCHAR:
32480 case V16QI_FTYPE_PCCHAR:
32481 case V8SF_FTYPE_PCV4SF:
32482 case V8SF_FTYPE_PCFLOAT:
32483 case V4SF_FTYPE_PCFLOAT:
32484 case V4DF_FTYPE_PCV2DF:
32485 case V4DF_FTYPE_PCDOUBLE:
32486 case V2DF_FTYPE_PCDOUBLE:
32487 case VOID_FTYPE_PVOID:
32488 nargs = 1;
32489 klass = load;
32490 memory = 0;
32491 break;
32492 case VOID_FTYPE_PV2SF_V4SF:
32493 case VOID_FTYPE_PV4DI_V4DI:
32494 case VOID_FTYPE_PV2DI_V2DI:
32495 case VOID_FTYPE_PCHAR_V32QI:
32496 case VOID_FTYPE_PCHAR_V16QI:
32497 case VOID_FTYPE_PFLOAT_V8SF:
32498 case VOID_FTYPE_PFLOAT_V4SF:
32499 case VOID_FTYPE_PDOUBLE_V4DF:
32500 case VOID_FTYPE_PDOUBLE_V2DF:
32501 case VOID_FTYPE_PLONGLONG_LONGLONG:
32502 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32503 case VOID_FTYPE_PINT_INT:
32504 nargs = 1;
32505 klass = store;
32506 /* Reserve memory operand for target. */
32507 memory = ARRAY_SIZE (args);
32508 break;
32509 case V4SF_FTYPE_V4SF_PCV2SF:
32510 case V2DF_FTYPE_V2DF_PCDOUBLE:
32511 nargs = 2;
32512 klass = load;
32513 memory = 1;
32514 break;
32515 case V8SF_FTYPE_PCV8SF_V8SI:
32516 case V4DF_FTYPE_PCV4DF_V4DI:
32517 case V4SF_FTYPE_PCV4SF_V4SI:
32518 case V2DF_FTYPE_PCV2DF_V2DI:
32519 case V8SI_FTYPE_PCV8SI_V8SI:
32520 case V4DI_FTYPE_PCV4DI_V4DI:
32521 case V4SI_FTYPE_PCV4SI_V4SI:
32522 case V2DI_FTYPE_PCV2DI_V2DI:
32523 nargs = 2;
32524 klass = load;
32525 memory = 0;
32526 break;
32527 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32528 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32529 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32530 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32531 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32532 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32533 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32534 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32535 nargs = 2;
32536 klass = store;
32537 /* Reserve memory operand for target. */
32538 memory = ARRAY_SIZE (args);
32539 break;
32540 case VOID_FTYPE_UINT_UINT_UINT:
32541 case VOID_FTYPE_UINT64_UINT_UINT:
32542 case UCHAR_FTYPE_UINT_UINT_UINT:
32543 case UCHAR_FTYPE_UINT64_UINT_UINT:
32544 nargs = 3;
32545 klass = load;
32546 memory = ARRAY_SIZE (args);
32547 last_arg_constant = true;
32548 break;
32549 default:
32550 gcc_unreachable ();
32553 gcc_assert (nargs <= ARRAY_SIZE (args));
32555 if (klass == store)
32557 arg = CALL_EXPR_ARG (exp, 0);
32558 op = expand_normal (arg);
32559 gcc_assert (target == 0);
32560 if (memory)
32562 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32563 target = gen_rtx_MEM (tmode, op);
32565 else
32566 target = force_reg (tmode, op);
32567 arg_adjust = 1;
32569 else
32571 arg_adjust = 0;
32572 if (optimize
32573 || target == 0
32574 || !register_operand (target, tmode)
32575 || GET_MODE (target) != tmode)
32576 target = gen_reg_rtx (tmode);
32579 for (i = 0; i < nargs; i++)
32581 enum machine_mode mode = insn_p->operand[i + 1].mode;
32582 bool match;
32584 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32585 op = expand_normal (arg);
32586 match = insn_p->operand[i + 1].predicate (op, mode);
32588 if (last_arg_constant && (i + 1) == nargs)
32590 if (!match)
32592 if (icode == CODE_FOR_lwp_lwpvalsi3
32593 || icode == CODE_FOR_lwp_lwpinssi3
32594 || icode == CODE_FOR_lwp_lwpvaldi3
32595 || icode == CODE_FOR_lwp_lwpinsdi3)
32596 error ("the last argument must be a 32-bit immediate");
32597 else
32598 error ("the last argument must be an 8-bit immediate");
32599 return const0_rtx;
32602 else
32604 if (i == memory)
32606 /* This must be the memory operand. */
32607 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32608 op = gen_rtx_MEM (mode, op);
32609 gcc_assert (GET_MODE (op) == mode
32610 || GET_MODE (op) == VOIDmode);
32612 else
32614 /* This must be a register.  */
32615 if (VECTOR_MODE_P (mode))
32616 op = safe_vector_operand (op, mode);
32618 gcc_assert (GET_MODE (op) == mode
32619 || GET_MODE (op) == VOIDmode);
32620 op = copy_to_mode_reg (mode, op);
32624 args[i].op = op;
32625 args[i].mode = mode;
32628 switch (nargs)
32630 case 0:
32631 pat = GEN_FCN (icode) (target);
32632 break;
32633 case 1:
32634 pat = GEN_FCN (icode) (target, args[0].op);
32635 break;
32636 case 2:
32637 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32638 break;
32639 case 3:
32640 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32641 break;
32642 default:
32643 gcc_unreachable ();
32646 if (! pat)
32647 return 0;
32648 emit_insn (pat);
32649 return klass == store ? 0 : target;
32652 /* Return the integer constant in ARG. Constrain it to be in the range
32653 of the subparts of VEC_TYPE; issue an error if not. */
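/* For example, for a vector type with four subparts the valid selectors
   are 0..3, so passing 5 as the selector is diagnosed below.  */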
32655 static int
32656 get_element_number (tree vec_type, tree arg)
32658 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32660 if (!host_integerp (arg, 1)
32661 || (elt = tree_low_cst (arg, 1), elt > max))
32663 error ("selector must be an integer constant in the range 0..%wi", max);
32664 return 0;
32667 return elt;
32670 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32671 ix86_expand_vector_init. We DO have language-level syntax for this, in
32672 the form of (type){ init-list }. Except that since we can't place emms
32673 instructions from inside the compiler, we can't allow the use of MMX
32674 registers unless the user explicitly asks for it. So we do *not* define
32675 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32676 we have builtins invoked by mmintrin.h that give us license to emit
32677 these sorts of instructions. */
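/* For instance, mmintrin.h typically implements _mm_set_pi32 in terms of
   __builtin_ia32_vec_init_v2si, which is expanded by this routine.  */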
32679 static rtx
32680 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32682 enum machine_mode tmode = TYPE_MODE (type);
32683 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32684 int i, n_elt = GET_MODE_NUNITS (tmode);
32685 rtvec v = rtvec_alloc (n_elt);
32687 gcc_assert (VECTOR_MODE_P (tmode));
32688 gcc_assert (call_expr_nargs (exp) == n_elt);
32690 for (i = 0; i < n_elt; ++i)
32692 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32693 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32696 if (!target || !register_operand (target, tmode))
32697 target = gen_reg_rtx (tmode);
32699 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32700 return target;
32703 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32704 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32705 had a language-level syntax for referencing vector elements. */
32707 static rtx
32708 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32710 enum machine_mode tmode, mode0;
32711 tree arg0, arg1;
32712 int elt;
32713 rtx op0;
32715 arg0 = CALL_EXPR_ARG (exp, 0);
32716 arg1 = CALL_EXPR_ARG (exp, 1);
32718 op0 = expand_normal (arg0);
32719 elt = get_element_number (TREE_TYPE (arg0), arg1);
32721 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32722 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32723 gcc_assert (VECTOR_MODE_P (mode0));
32725 op0 = force_reg (mode0, op0);
32727 if (optimize || !target || !register_operand (target, tmode))
32728 target = gen_reg_rtx (tmode);
32730 ix86_expand_vector_extract (true, target, op0, elt);
32732 return target;
32735 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32736 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32737 a language-level syntax for referencing vector elements. */
32739 static rtx
32740 ix86_expand_vec_set_builtin (tree exp)
32742 enum machine_mode tmode, mode1;
32743 tree arg0, arg1, arg2;
32744 int elt;
32745 rtx op0, op1, target;
32747 arg0 = CALL_EXPR_ARG (exp, 0);
32748 arg1 = CALL_EXPR_ARG (exp, 1);
32749 arg2 = CALL_EXPR_ARG (exp, 2);
32751 tmode = TYPE_MODE (TREE_TYPE (arg0));
32752 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32753 gcc_assert (VECTOR_MODE_P (tmode));
32755 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32756 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32757 elt = get_element_number (TREE_TYPE (arg0), arg2);
32759 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32760 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32762 op0 = force_reg (tmode, op0);
32763 op1 = force_reg (mode1, op1);
32765 /* OP0 is the source of these builtin functions and shouldn't be
32766 modified. Create a copy, use it and return it as target. */
32767 target = gen_reg_rtx (tmode);
32768 emit_move_insn (target, op0);
32769 ix86_expand_vector_set (true, target, op1, elt);
32771 return target;
32774 /* Expand an expression EXP that calls a built-in function,
32775 with result going to TARGET if that's convenient
32776 (and in mode MODE if that's convenient).
32777 SUBTARGET may be used as the target for computing one of EXP's operands.
32778 IGNORE is nonzero if the value is to be ignored. */
32780 static rtx
32781 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32782 enum machine_mode mode, int ignore)
32784 const struct builtin_description *d;
32785 size_t i;
32786 enum insn_code icode;
32787 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32788 tree arg0, arg1, arg2, arg3, arg4;
32789 rtx op0, op1, op2, op3, op4, pat, insn;
32790 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32791 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32793 /* For CPU builtins that can be folded, fold first and expand the fold. */
32794 switch (fcode)
32796 case IX86_BUILTIN_CPU_INIT:
32798 /* Make it call __cpu_indicator_init in libgcc. */
32799 tree call_expr, fndecl, type;
32800 type = build_function_type_list (integer_type_node, NULL_TREE);
32801 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32802 call_expr = build_call_expr (fndecl, 0);
32803 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32805 case IX86_BUILTIN_CPU_IS:
32806 case IX86_BUILTIN_CPU_SUPPORTS:
32808 tree arg0 = CALL_EXPR_ARG (exp, 0);
32809 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32810 gcc_assert (fold_expr != NULL_TREE);
32811 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32815 /* Determine whether the builtin function is available under the current ISA.
32816 Originally the builtin was not created if it wasn't applicable to the
32818 current ISA based on the command-line switches.  With function-specific
32818 options, we need to check in the context of the function making the call
32819 whether it is supported. */
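/* For example, using an AVX2 builtin in a function compiled without AVX2
   support is rejected here with a message along the lines of
   "needs isa option -mavx2".  */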
32820 if (ix86_builtins_isa[fcode].isa
32821 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32823 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32824 NULL, (enum fpmath_unit) 0, false);
32826 if (!opts)
32827 error ("%qE needs unknown isa option", fndecl);
32828 else
32830 gcc_assert (opts != NULL);
32831 error ("%qE needs isa option %s", fndecl, opts);
32832 free (opts);
32834 return const0_rtx;
32837 switch (fcode)
32839 case IX86_BUILTIN_MASKMOVQ:
32840 case IX86_BUILTIN_MASKMOVDQU:
32841 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32842 ? CODE_FOR_mmx_maskmovq
32843 : CODE_FOR_sse2_maskmovdqu);
32844 /* Note the arg order is different from the operand order. */
32845 arg1 = CALL_EXPR_ARG (exp, 0);
32846 arg2 = CALL_EXPR_ARG (exp, 1);
32847 arg0 = CALL_EXPR_ARG (exp, 2);
32848 op0 = expand_normal (arg0);
32849 op1 = expand_normal (arg1);
32850 op2 = expand_normal (arg2);
32851 mode0 = insn_data[icode].operand[0].mode;
32852 mode1 = insn_data[icode].operand[1].mode;
32853 mode2 = insn_data[icode].operand[2].mode;
32855 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32856 op0 = gen_rtx_MEM (mode1, op0);
32858 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32859 op0 = copy_to_mode_reg (mode0, op0);
32860 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32861 op1 = copy_to_mode_reg (mode1, op1);
32862 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32863 op2 = copy_to_mode_reg (mode2, op2);
32864 pat = GEN_FCN (icode) (op0, op1, op2);
32865 if (! pat)
32866 return 0;
32867 emit_insn (pat);
32868 return 0;
32870 case IX86_BUILTIN_LDMXCSR:
32871 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32872 target = assign_386_stack_local (SImode, SLOT_TEMP);
32873 emit_move_insn (target, op0);
32874 emit_insn (gen_sse_ldmxcsr (target));
32875 return 0;
32877 case IX86_BUILTIN_STMXCSR:
32878 target = assign_386_stack_local (SImode, SLOT_TEMP);
32879 emit_insn (gen_sse_stmxcsr (target));
32880 return copy_to_mode_reg (SImode, target);
32882 case IX86_BUILTIN_CLFLUSH:
32883 arg0 = CALL_EXPR_ARG (exp, 0);
32884 op0 = expand_normal (arg0);
32885 icode = CODE_FOR_sse2_clflush;
32886 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32887 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32889 emit_insn (gen_sse2_clflush (op0));
32890 return 0;
32892 case IX86_BUILTIN_MONITOR:
32893 arg0 = CALL_EXPR_ARG (exp, 0);
32894 arg1 = CALL_EXPR_ARG (exp, 1);
32895 arg2 = CALL_EXPR_ARG (exp, 2);
32896 op0 = expand_normal (arg0);
32897 op1 = expand_normal (arg1);
32898 op2 = expand_normal (arg2);
32899 if (!REG_P (op0))
32900 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32901 if (!REG_P (op1))
32902 op1 = copy_to_mode_reg (SImode, op1);
32903 if (!REG_P (op2))
32904 op2 = copy_to_mode_reg (SImode, op2);
32905 emit_insn (ix86_gen_monitor (op0, op1, op2));
32906 return 0;
32908 case IX86_BUILTIN_MWAIT:
32909 arg0 = CALL_EXPR_ARG (exp, 0);
32910 arg1 = CALL_EXPR_ARG (exp, 1);
32911 op0 = expand_normal (arg0);
32912 op1 = expand_normal (arg1);
32913 if (!REG_P (op0))
32914 op0 = copy_to_mode_reg (SImode, op0);
32915 if (!REG_P (op1))
32916 op1 = copy_to_mode_reg (SImode, op1);
32917 emit_insn (gen_sse3_mwait (op0, op1));
32918 return 0;
32920 case IX86_BUILTIN_VEC_INIT_V2SI:
32921 case IX86_BUILTIN_VEC_INIT_V4HI:
32922 case IX86_BUILTIN_VEC_INIT_V8QI:
32923 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32925 case IX86_BUILTIN_VEC_EXT_V2DF:
32926 case IX86_BUILTIN_VEC_EXT_V2DI:
32927 case IX86_BUILTIN_VEC_EXT_V4SF:
32928 case IX86_BUILTIN_VEC_EXT_V4SI:
32929 case IX86_BUILTIN_VEC_EXT_V8HI:
32930 case IX86_BUILTIN_VEC_EXT_V2SI:
32931 case IX86_BUILTIN_VEC_EXT_V4HI:
32932 case IX86_BUILTIN_VEC_EXT_V16QI:
32933 return ix86_expand_vec_ext_builtin (exp, target);
32935 case IX86_BUILTIN_VEC_SET_V2DI:
32936 case IX86_BUILTIN_VEC_SET_V4SF:
32937 case IX86_BUILTIN_VEC_SET_V4SI:
32938 case IX86_BUILTIN_VEC_SET_V8HI:
32939 case IX86_BUILTIN_VEC_SET_V4HI:
32940 case IX86_BUILTIN_VEC_SET_V16QI:
32941 return ix86_expand_vec_set_builtin (exp);
32943 case IX86_BUILTIN_INFQ:
32944 case IX86_BUILTIN_HUGE_VALQ:
32946 REAL_VALUE_TYPE inf;
32947 rtx tmp;
32949 real_inf (&inf);
32950 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32952 tmp = validize_mem (force_const_mem (mode, tmp));
32954 if (target == 0)
32955 target = gen_reg_rtx (mode);
32957 emit_move_insn (target, tmp);
32958 return target;
32961 case IX86_BUILTIN_RDPMC:
32962 case IX86_BUILTIN_RDTSC:
32963 case IX86_BUILTIN_RDTSCP:
32965 op0 = gen_reg_rtx (DImode);
32966 op1 = gen_reg_rtx (DImode);
32968 if (fcode == IX86_BUILTIN_RDPMC)
32970 arg0 = CALL_EXPR_ARG (exp, 0);
32971 op2 = expand_normal (arg0);
32972 if (!register_operand (op2, SImode))
32973 op2 = copy_to_mode_reg (SImode, op2);
32975 insn = (TARGET_64BIT
32976 ? gen_rdpmc_rex64 (op0, op1, op2)
32977 : gen_rdpmc (op0, op2));
32978 emit_insn (insn);
32980 else if (fcode == IX86_BUILTIN_RDTSC)
32982 insn = (TARGET_64BIT
32983 ? gen_rdtsc_rex64 (op0, op1)
32984 : gen_rdtsc (op0));
32985 emit_insn (insn);
32987 else
32989 op2 = gen_reg_rtx (SImode);
32991 insn = (TARGET_64BIT
32992 ? gen_rdtscp_rex64 (op0, op1, op2)
32993 : gen_rdtscp (op0, op2));
32994 emit_insn (insn);
32996 arg0 = CALL_EXPR_ARG (exp, 0);
32997 op4 = expand_normal (arg0);
32998 if (!address_operand (op4, VOIDmode))
33000 op4 = convert_memory_address (Pmode, op4);
33001 op4 = copy_addr_to_reg (op4);
33003 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
33006 if (target == 0)
33008 /* mode is VOIDmode if __builtin_rd* has been called
33009 without an lhs.  */
33010 if (mode == VOIDmode)
33011 return target;
33012 target = gen_reg_rtx (mode);
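/* The result comes back as two 32-bit halves; on 64-bit targets combine
   them into a single DImode value as op0 | (op1 << 32) below.  */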
33015 if (TARGET_64BIT)
33017 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
33018 op1, 1, OPTAB_DIRECT);
33019 op0 = expand_simple_binop (DImode, IOR, op0, op1,
33020 op0, 1, OPTAB_DIRECT);
33023 emit_move_insn (target, op0);
33024 return target;
33026 case IX86_BUILTIN_FXSAVE:
33027 case IX86_BUILTIN_FXRSTOR:
33028 case IX86_BUILTIN_FXSAVE64:
33029 case IX86_BUILTIN_FXRSTOR64:
33030 case IX86_BUILTIN_FNSTENV:
33031 case IX86_BUILTIN_FLDENV:
33032 case IX86_BUILTIN_FNSTSW:
33033 mode0 = BLKmode;
33034 switch (fcode)
33036 case IX86_BUILTIN_FXSAVE:
33037 icode = CODE_FOR_fxsave;
33038 break;
33039 case IX86_BUILTIN_FXRSTOR:
33040 icode = CODE_FOR_fxrstor;
33041 break;
33042 case IX86_BUILTIN_FXSAVE64:
33043 icode = CODE_FOR_fxsave64;
33044 break;
33045 case IX86_BUILTIN_FXRSTOR64:
33046 icode = CODE_FOR_fxrstor64;
33047 break;
33048 case IX86_BUILTIN_FNSTENV:
33049 icode = CODE_FOR_fnstenv;
33050 break;
33051 case IX86_BUILTIN_FLDENV:
33052 icode = CODE_FOR_fldenv;
33053 break;
33054 case IX86_BUILTIN_FNSTSW:
33055 icode = CODE_FOR_fnstsw;
33056 mode0 = HImode;
33057 break;
33058 default:
33059 gcc_unreachable ();
33062 arg0 = CALL_EXPR_ARG (exp, 0);
33063 op0 = expand_normal (arg0);
33065 if (!address_operand (op0, VOIDmode))
33067 op0 = convert_memory_address (Pmode, op0);
33068 op0 = copy_addr_to_reg (op0);
33070 op0 = gen_rtx_MEM (mode0, op0);
33072 pat = GEN_FCN (icode) (op0);
33073 if (pat)
33074 emit_insn (pat);
33075 return 0;
33077 case IX86_BUILTIN_XSAVE:
33078 case IX86_BUILTIN_XRSTOR:
33079 case IX86_BUILTIN_XSAVE64:
33080 case IX86_BUILTIN_XRSTOR64:
33081 case IX86_BUILTIN_XSAVEOPT:
33082 case IX86_BUILTIN_XSAVEOPT64:
33083 arg0 = CALL_EXPR_ARG (exp, 0);
33084 arg1 = CALL_EXPR_ARG (exp, 1);
33085 op0 = expand_normal (arg0);
33086 op1 = expand_normal (arg1);
33088 if (!address_operand (op0, VOIDmode))
33090 op0 = convert_memory_address (Pmode, op0);
33091 op0 = copy_addr_to_reg (op0);
33093 op0 = gen_rtx_MEM (BLKmode, op0);
33095 op1 = force_reg (DImode, op1);
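/* OP1 is the 64-bit xsave feature mask.  On 64-bit targets it is split
   below into low and high SImode halves for the *_rex64 patterns.  */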
33097 if (TARGET_64BIT)
33099 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
33100 NULL, 1, OPTAB_DIRECT);
33101 switch (fcode)
33103 case IX86_BUILTIN_XSAVE:
33104 icode = CODE_FOR_xsave_rex64;
33105 break;
33106 case IX86_BUILTIN_XRSTOR:
33107 icode = CODE_FOR_xrstor_rex64;
33108 break;
33109 case IX86_BUILTIN_XSAVE64:
33110 icode = CODE_FOR_xsave64;
33111 break;
33112 case IX86_BUILTIN_XRSTOR64:
33113 icode = CODE_FOR_xrstor64;
33114 break;
33115 case IX86_BUILTIN_XSAVEOPT:
33116 icode = CODE_FOR_xsaveopt_rex64;
33117 break;
33118 case IX86_BUILTIN_XSAVEOPT64:
33119 icode = CODE_FOR_xsaveopt64;
33120 break;
33121 default:
33122 gcc_unreachable ();
33125 op2 = gen_lowpart (SImode, op2);
33126 op1 = gen_lowpart (SImode, op1);
33127 pat = GEN_FCN (icode) (op0, op1, op2);
33129 else
33131 switch (fcode)
33133 case IX86_BUILTIN_XSAVE:
33134 icode = CODE_FOR_xsave;
33135 break;
33136 case IX86_BUILTIN_XRSTOR:
33137 icode = CODE_FOR_xrstor;
33138 break;
33139 case IX86_BUILTIN_XSAVEOPT:
33140 icode = CODE_FOR_xsaveopt;
33141 break;
33142 default:
33143 gcc_unreachable ();
33145 pat = GEN_FCN (icode) (op0, op1);
33148 if (pat)
33149 emit_insn (pat);
33150 return 0;
33152 case IX86_BUILTIN_LLWPCB:
33153 arg0 = CALL_EXPR_ARG (exp, 0);
33154 op0 = expand_normal (arg0);
33155 icode = CODE_FOR_lwp_llwpcb;
33156 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33157 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
33158 emit_insn (gen_lwp_llwpcb (op0));
33159 return 0;
33161 case IX86_BUILTIN_SLWPCB:
33162 icode = CODE_FOR_lwp_slwpcb;
33163 if (!target
33164 || !insn_data[icode].operand[0].predicate (target, Pmode))
33165 target = gen_reg_rtx (Pmode);
33166 emit_insn (gen_lwp_slwpcb (target));
33167 return target;
33169 case IX86_BUILTIN_BEXTRI32:
33170 case IX86_BUILTIN_BEXTRI64:
33171 arg0 = CALL_EXPR_ARG (exp, 0);
33172 arg1 = CALL_EXPR_ARG (exp, 1);
33173 op0 = expand_normal (arg0);
33174 op1 = expand_normal (arg1);
33175 icode = (fcode == IX86_BUILTIN_BEXTRI32
33176 ? CODE_FOR_tbm_bextri_si
33177 : CODE_FOR_tbm_bextri_di);
33178 if (!CONST_INT_P (op1))
33180 error ("last argument must be an immediate");
33181 return const0_rtx;
33183 else
33185 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33186 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33187 op1 = GEN_INT (length);
33188 op2 = GEN_INT (lsb_index);
33189 pat = GEN_FCN (icode) (target, op0, op1, op2);
33190 if (pat)
33191 emit_insn (pat);
33192 return target;
33195 case IX86_BUILTIN_RDRAND16_STEP:
33196 icode = CODE_FOR_rdrandhi_1;
33197 mode0 = HImode;
33198 goto rdrand_step;
33200 case IX86_BUILTIN_RDRAND32_STEP:
33201 icode = CODE_FOR_rdrandsi_1;
33202 mode0 = SImode;
33203 goto rdrand_step;
33205 case IX86_BUILTIN_RDRAND64_STEP:
33206 icode = CODE_FOR_rdranddi_1;
33207 mode0 = DImode;
33209 rdrand_step:
33210 op0 = gen_reg_rtx (mode0);
33211 emit_insn (GEN_FCN (icode) (op0));
33213 arg0 = CALL_EXPR_ARG (exp, 0);
33214 op1 = expand_normal (arg0);
33215 if (!address_operand (op1, VOIDmode))
33217 op1 = convert_memory_address (Pmode, op1);
33218 op1 = copy_addr_to_reg (op1);
33220 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33222 op1 = gen_reg_rtx (SImode);
33223 emit_move_insn (op1, CONST1_RTX (SImode));
33225 /* Emit SImode conditional move. */
33226 if (mode0 == HImode)
33228 op2 = gen_reg_rtx (SImode);
33229 emit_insn (gen_zero_extendhisi2 (op2, op0));
33231 else if (mode0 == SImode)
33232 op2 = op0;
33233 else
33234 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33236 if (target == 0)
33237 target = gen_reg_rtx (SImode);
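/* rdrand zeroes its destination when no random number is available
   (CF clear), so the conditional move between the hardware result and
   the constant 1, keyed on the carry flag, produces the intrinsic's
   0/1 success indicator.  */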
33239 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33240 const0_rtx);
33241 emit_insn (gen_rtx_SET (VOIDmode, target,
33242 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33243 return target;
33245 case IX86_BUILTIN_RDSEED16_STEP:
33246 icode = CODE_FOR_rdseedhi_1;
33247 mode0 = HImode;
33248 goto rdseed_step;
33250 case IX86_BUILTIN_RDSEED32_STEP:
33251 icode = CODE_FOR_rdseedsi_1;
33252 mode0 = SImode;
33253 goto rdseed_step;
33255 case IX86_BUILTIN_RDSEED64_STEP:
33256 icode = CODE_FOR_rdseeddi_1;
33257 mode0 = DImode;
33259 rdseed_step:
33260 op0 = gen_reg_rtx (mode0);
33261 emit_insn (GEN_FCN (icode) (op0));
33263 arg0 = CALL_EXPR_ARG (exp, 0);
33264 op1 = expand_normal (arg0);
33265 if (!address_operand (op1, VOIDmode))
33267 op1 = convert_memory_address (Pmode, op1);
33268 op1 = copy_addr_to_reg (op1);
33270 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33272 op2 = gen_reg_rtx (QImode);
33274 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33275 const0_rtx);
33276 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33278 if (target == 0)
33279 target = gen_reg_rtx (SImode);
33281 emit_insn (gen_zero_extendqisi2 (target, op2));
33282 return target;
33284 case IX86_BUILTIN_ADDCARRYX32:
33285 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33286 mode0 = SImode;
33287 goto addcarryx;
33289 case IX86_BUILTIN_ADDCARRYX64:
33290 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33291 mode0 = DImode;
33293 addcarryx:
33294 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33295 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33296 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33297 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33299 op0 = gen_reg_rtx (QImode);
33301 /* Generate CF from input operand. */
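/* Adding 0xff (constm1) to the QImode carry-in sets the carry flag
   exactly when the incoming value is nonzero.  */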
33302 op1 = expand_normal (arg0);
33303 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
33304 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33306 /* Generate an ADCX instruction to compute X + Y + CF.  */
33307 op2 = expand_normal (arg1);
33308 op3 = expand_normal (arg2);
33310 if (!REG_P (op2))
33311 op2 = copy_to_mode_reg (mode0, op2);
33312 if (!REG_P (op3))
33313 op3 = copy_to_mode_reg (mode0, op3);
33315 op0 = gen_reg_rtx (mode0);
33317 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33318 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33319 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33321 /* Store the result. */
33322 op4 = expand_normal (arg3);
33323 if (!address_operand (op4, VOIDmode))
33325 op4 = convert_memory_address (Pmode, op4);
33326 op4 = copy_addr_to_reg (op4);
33328 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33330 /* Return current CF value. */
33331 if (target == 0)
33332 target = gen_reg_rtx (QImode);
33334 PUT_MODE (pat, QImode);
33335 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33336 return target;
33338 case IX86_BUILTIN_GATHERSIV2DF:
33339 icode = CODE_FOR_avx2_gathersiv2df;
33340 goto gather_gen;
33341 case IX86_BUILTIN_GATHERSIV4DF:
33342 icode = CODE_FOR_avx2_gathersiv4df;
33343 goto gather_gen;
33344 case IX86_BUILTIN_GATHERDIV2DF:
33345 icode = CODE_FOR_avx2_gatherdiv2df;
33346 goto gather_gen;
33347 case IX86_BUILTIN_GATHERDIV4DF:
33348 icode = CODE_FOR_avx2_gatherdiv4df;
33349 goto gather_gen;
33350 case IX86_BUILTIN_GATHERSIV4SF:
33351 icode = CODE_FOR_avx2_gathersiv4sf;
33352 goto gather_gen;
33353 case IX86_BUILTIN_GATHERSIV8SF:
33354 icode = CODE_FOR_avx2_gathersiv8sf;
33355 goto gather_gen;
33356 case IX86_BUILTIN_GATHERDIV4SF:
33357 icode = CODE_FOR_avx2_gatherdiv4sf;
33358 goto gather_gen;
33359 case IX86_BUILTIN_GATHERDIV8SF:
33360 icode = CODE_FOR_avx2_gatherdiv8sf;
33361 goto gather_gen;
33362 case IX86_BUILTIN_GATHERSIV2DI:
33363 icode = CODE_FOR_avx2_gathersiv2di;
33364 goto gather_gen;
33365 case IX86_BUILTIN_GATHERSIV4DI:
33366 icode = CODE_FOR_avx2_gathersiv4di;
33367 goto gather_gen;
33368 case IX86_BUILTIN_GATHERDIV2DI:
33369 icode = CODE_FOR_avx2_gatherdiv2di;
33370 goto gather_gen;
33371 case IX86_BUILTIN_GATHERDIV4DI:
33372 icode = CODE_FOR_avx2_gatherdiv4di;
33373 goto gather_gen;
33374 case IX86_BUILTIN_GATHERSIV4SI:
33375 icode = CODE_FOR_avx2_gathersiv4si;
33376 goto gather_gen;
33377 case IX86_BUILTIN_GATHERSIV8SI:
33378 icode = CODE_FOR_avx2_gathersiv8si;
33379 goto gather_gen;
33380 case IX86_BUILTIN_GATHERDIV4SI:
33381 icode = CODE_FOR_avx2_gatherdiv4si;
33382 goto gather_gen;
33383 case IX86_BUILTIN_GATHERDIV8SI:
33384 icode = CODE_FOR_avx2_gatherdiv8si;
33385 goto gather_gen;
33386 case IX86_BUILTIN_GATHERALTSIV4DF:
33387 icode = CODE_FOR_avx2_gathersiv4df;
33388 goto gather_gen;
33389 case IX86_BUILTIN_GATHERALTDIV8SF:
33390 icode = CODE_FOR_avx2_gatherdiv8sf;
33391 goto gather_gen;
33392 case IX86_BUILTIN_GATHERALTSIV4DI:
33393 icode = CODE_FOR_avx2_gathersiv4di;
33394 goto gather_gen;
33395 case IX86_BUILTIN_GATHERALTDIV8SI:
33396 icode = CODE_FOR_avx2_gatherdiv8si;
33397 goto gather_gen;
33399 gather_gen:
33400 arg0 = CALL_EXPR_ARG (exp, 0);
33401 arg1 = CALL_EXPR_ARG (exp, 1);
33402 arg2 = CALL_EXPR_ARG (exp, 2);
33403 arg3 = CALL_EXPR_ARG (exp, 3);
33404 arg4 = CALL_EXPR_ARG (exp, 4);
33405 op0 = expand_normal (arg0);
33406 op1 = expand_normal (arg1);
33407 op2 = expand_normal (arg2);
33408 op3 = expand_normal (arg3);
33409 op4 = expand_normal (arg4);
33410 /* Note the arg order is different from the operand order. */
33411 mode0 = insn_data[icode].operand[1].mode;
33412 mode2 = insn_data[icode].operand[3].mode;
33413 mode3 = insn_data[icode].operand[4].mode;
33414 mode4 = insn_data[icode].operand[5].mode;
33416 if (target == NULL_RTX
33417 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33418 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33419 else
33420 subtarget = target;
33422 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33423 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33425 rtx half = gen_reg_rtx (V4SImode);
33426 if (!nonimmediate_operand (op2, V8SImode))
33427 op2 = copy_to_mode_reg (V8SImode, op2);
33428 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33429 op2 = half;
33431 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33432 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33434 rtx (*gen) (rtx, rtx);
33435 rtx half = gen_reg_rtx (mode0);
33436 if (mode0 == V4SFmode)
33437 gen = gen_vec_extract_lo_v8sf;
33438 else
33439 gen = gen_vec_extract_lo_v8si;
33440 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33441 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33442 emit_insn (gen (half, op0));
33443 op0 = half;
33444 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33445 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33446 emit_insn (gen (half, op3));
33447 op3 = half;
33450 /* Force the memory operand to use only a base register here.  But we
33451 don't want to do this for the memory operands of other builtin
33452 functions.  */
33453 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
33455 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33456 op0 = copy_to_mode_reg (mode0, op0);
33457 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33458 op1 = copy_to_mode_reg (Pmode, op1);
33459 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33460 op2 = copy_to_mode_reg (mode2, op2);
33461 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33462 op3 = copy_to_mode_reg (mode3, op3);
33463 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33465 error ("last argument must be scale 1, 2, 4, 8");
33466 return const0_rtx;
33469 /* Optimize. If mask is known to have all high bits set,
33470 replace op0 with pc_rtx to signal that the instruction
33471 overwrites the whole destination and doesn't use its
33472 previous contents. */
33473 if (optimize)
33475 if (TREE_CODE (arg3) == VECTOR_CST)
33477 unsigned int negative = 0;
33478 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33480 tree cst = VECTOR_CST_ELT (arg3, i);
33481 if (TREE_CODE (cst) == INTEGER_CST
33482 && tree_int_cst_sign_bit (cst))
33483 negative++;
33484 else if (TREE_CODE (cst) == REAL_CST
33485 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33486 negative++;
33488 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33489 op0 = pc_rtx;
33491 else if (TREE_CODE (arg3) == SSA_NAME)
33493 /* Also recognize when the mask is like:
33494 __v2df src = _mm_setzero_pd ();
33495 __v2df mask = _mm_cmpeq_pd (src, src);
33497 __v8sf src = _mm256_setzero_ps ();
33498 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33499 as that is a cheaper way to load all ones into
33500 a register than having to load a constant from
33501 memory. */
33502 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33503 if (is_gimple_call (def_stmt))
33505 tree fndecl = gimple_call_fndecl (def_stmt);
33506 if (fndecl
33507 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33508 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33510 case IX86_BUILTIN_CMPPD:
33511 case IX86_BUILTIN_CMPPS:
33512 case IX86_BUILTIN_CMPPD256:
33513 case IX86_BUILTIN_CMPPS256:
33514 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33515 break;
33516 /* FALLTHRU */
33517 case IX86_BUILTIN_CMPEQPD:
33518 case IX86_BUILTIN_CMPEQPS:
33519 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33520 && initializer_zerop (gimple_call_arg (def_stmt,
33521 1)))
33522 op0 = pc_rtx;
33523 break;
33524 default:
33525 break;
33531 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33532 if (! pat)
33533 return const0_rtx;
33534 emit_insn (pat);
33536 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33537 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33539 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33540 ? V4SFmode : V4SImode;
33541 if (target == NULL_RTX)
33542 target = gen_reg_rtx (tmode);
33543 if (tmode == V4SFmode)
33544 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33545 else
33546 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33548 else
33549 target = subtarget;
33551 return target;
33553 case IX86_BUILTIN_XABORT:
33554 icode = CODE_FOR_xabort;
33555 arg0 = CALL_EXPR_ARG (exp, 0);
33556 op0 = expand_normal (arg0);
33557 mode0 = insn_data[icode].operand[0].mode;
33558 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33560 error ("the xabort's argument must be an 8-bit immediate");
33561 return const0_rtx;
33563 emit_insn (gen_xabort (op0));
33564 return 0;
33566 default:
33567 break;
33570 for (i = 0, d = bdesc_special_args;
33571 i < ARRAY_SIZE (bdesc_special_args);
33572 i++, d++)
33573 if (d->code == fcode)
33574 return ix86_expand_special_args_builtin (d, exp, target);
33576 for (i = 0, d = bdesc_args;
33577 i < ARRAY_SIZE (bdesc_args);
33578 i++, d++)
33579 if (d->code == fcode)
33580 switch (fcode)
33582 case IX86_BUILTIN_FABSQ:
33583 case IX86_BUILTIN_COPYSIGNQ:
33584 if (!TARGET_SSE)
33585 /* Emit a normal call if SSE isn't available. */
33586 return expand_call (exp, target, ignore);
33587 default:
33588 return ix86_expand_args_builtin (d, exp, target);
33591 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33592 if (d->code == fcode)
33593 return ix86_expand_sse_comi (d, exp, target);
33595 for (i = 0, d = bdesc_pcmpestr;
33596 i < ARRAY_SIZE (bdesc_pcmpestr);
33597 i++, d++)
33598 if (d->code == fcode)
33599 return ix86_expand_sse_pcmpestr (d, exp, target);
33601 for (i = 0, d = bdesc_pcmpistr;
33602 i < ARRAY_SIZE (bdesc_pcmpistr);
33603 i++, d++)
33604 if (d->code == fcode)
33605 return ix86_expand_sse_pcmpistr (d, exp, target);
33607 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33608 if (d->code == fcode)
33609 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33610 (enum ix86_builtin_func_type)
33611 d->flag, d->comparison);
33613 gcc_unreachable ();
33616 /* Returns a function decl for a vectorized version of the builtin function
33617 FNDECL operating on vectors of type TYPE_OUT with inputs of type TYPE_IN,
33618 or NULL_TREE if it is not available.  */
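/* For example, vectorizing a call to sqrt with V2DF input and output
   types yields IX86_BUILTIN_SQRTPD below.  */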
33620 static tree
33621 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33622 tree type_in)
33624 enum machine_mode in_mode, out_mode;
33625 int in_n, out_n;
33626 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33628 if (TREE_CODE (type_out) != VECTOR_TYPE
33629 || TREE_CODE (type_in) != VECTOR_TYPE
33630 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33631 return NULL_TREE;
33633 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33634 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33635 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33636 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33638 switch (fn)
33640 case BUILT_IN_SQRT:
33641 if (out_mode == DFmode && in_mode == DFmode)
33643 if (out_n == 2 && in_n == 2)
33644 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33645 else if (out_n == 4 && in_n == 4)
33646 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33648 break;
33650 case BUILT_IN_SQRTF:
33651 if (out_mode == SFmode && in_mode == SFmode)
33653 if (out_n == 4 && in_n == 4)
33654 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33655 else if (out_n == 8 && in_n == 8)
33656 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33658 break;
33660 case BUILT_IN_IFLOOR:
33661 case BUILT_IN_LFLOOR:
33662 case BUILT_IN_LLFLOOR:
33663 /* The round insn does not trap on denormals. */
33664 if (flag_trapping_math || !TARGET_ROUND)
33665 break;
33667 if (out_mode == SImode && in_mode == DFmode)
33669 if (out_n == 4 && in_n == 2)
33670 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33671 else if (out_n == 8 && in_n == 4)
33672 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33674 break;
33676 case BUILT_IN_IFLOORF:
33677 case BUILT_IN_LFLOORF:
33678 case BUILT_IN_LLFLOORF:
33679 /* The round insn does not trap on denormals. */
33680 if (flag_trapping_math || !TARGET_ROUND)
33681 break;
33683 if (out_mode == SImode && in_mode == SFmode)
33685 if (out_n == 4 && in_n == 4)
33686 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33687 else if (out_n == 8 && in_n == 8)
33688 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33690 break;
33692 case BUILT_IN_ICEIL:
33693 case BUILT_IN_LCEIL:
33694 case BUILT_IN_LLCEIL:
33695 /* The round insn does not trap on denormals. */
33696 if (flag_trapping_math || !TARGET_ROUND)
33697 break;
33699 if (out_mode == SImode && in_mode == DFmode)
33701 if (out_n == 4 && in_n == 2)
33702 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33703 else if (out_n == 8 && in_n == 4)
33704 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33706 break;
33708 case BUILT_IN_ICEILF:
33709 case BUILT_IN_LCEILF:
33710 case BUILT_IN_LLCEILF:
33711 /* The round insn does not trap on denormals. */
33712 if (flag_trapping_math || !TARGET_ROUND)
33713 break;
33715 if (out_mode == SImode && in_mode == SFmode)
33717 if (out_n == 4 && in_n == 4)
33718 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33719 else if (out_n == 8 && in_n == 8)
33720 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33722 break;
33724 case BUILT_IN_IRINT:
33725 case BUILT_IN_LRINT:
33726 case BUILT_IN_LLRINT:
33727 if (out_mode == SImode && in_mode == DFmode)
33729 if (out_n == 4 && in_n == 2)
33730 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33731 else if (out_n == 8 && in_n == 4)
33732 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33734 break;
33736 case BUILT_IN_IRINTF:
33737 case BUILT_IN_LRINTF:
33738 case BUILT_IN_LLRINTF:
33739 if (out_mode == SImode && in_mode == SFmode)
33741 if (out_n == 4 && in_n == 4)
33742 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33743 else if (out_n == 8 && in_n == 8)
33744 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33746 break;
33748 case BUILT_IN_IROUND:
33749 case BUILT_IN_LROUND:
33750 case BUILT_IN_LLROUND:
33751 /* The round insn does not trap on denormals. */
33752 if (flag_trapping_math || !TARGET_ROUND)
33753 break;
33755 if (out_mode == SImode && in_mode == DFmode)
33757 if (out_n == 4 && in_n == 2)
33758 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33759 else if (out_n == 8 && in_n == 4)
33760 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33762 break;
33764 case BUILT_IN_IROUNDF:
33765 case BUILT_IN_LROUNDF:
33766 case BUILT_IN_LLROUNDF:
33767 /* The round insn does not trap on denormals. */
33768 if (flag_trapping_math || !TARGET_ROUND)
33769 break;
33771 if (out_mode == SImode && in_mode == SFmode)
33773 if (out_n == 4 && in_n == 4)
33774 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33775 else if (out_n == 8 && in_n == 8)
33776 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33778 break;
33780 case BUILT_IN_COPYSIGN:
33781 if (out_mode == DFmode && in_mode == DFmode)
33783 if (out_n == 2 && in_n == 2)
33784 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33785 else if (out_n == 4 && in_n == 4)
33786 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33788 break;
33790 case BUILT_IN_COPYSIGNF:
33791 if (out_mode == SFmode && in_mode == SFmode)
33793 if (out_n == 4 && in_n == 4)
33794 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33795 else if (out_n == 8 && in_n == 8)
33796 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33798 break;
33800 case BUILT_IN_FLOOR:
33801 /* The round insn does not trap on denormals. */
33802 if (flag_trapping_math || !TARGET_ROUND)
33803 break;
33805 if (out_mode == DFmode && in_mode == DFmode)
33807 if (out_n == 2 && in_n == 2)
33808 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33809 else if (out_n == 4 && in_n == 4)
33810 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33812 break;
33814 case BUILT_IN_FLOORF:
33815 /* The round insn does not trap on denormals. */
33816 if (flag_trapping_math || !TARGET_ROUND)
33817 break;
33819 if (out_mode == SFmode && in_mode == SFmode)
33821 if (out_n == 4 && in_n == 4)
33822 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33823 else if (out_n == 8 && in_n == 8)
33824 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33826 break;
33828 case BUILT_IN_CEIL:
33829 /* The round insn does not trap on denormals. */
33830 if (flag_trapping_math || !TARGET_ROUND)
33831 break;
33833 if (out_mode == DFmode && in_mode == DFmode)
33835 if (out_n == 2 && in_n == 2)
33836 return ix86_builtins[IX86_BUILTIN_CEILPD];
33837 else if (out_n == 4 && in_n == 4)
33838 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33840 break;
33842 case BUILT_IN_CEILF:
33843 /* The round insn does not trap on denormals. */
33844 if (flag_trapping_math || !TARGET_ROUND)
33845 break;
33847 if (out_mode == SFmode && in_mode == SFmode)
33849 if (out_n == 4 && in_n == 4)
33850 return ix86_builtins[IX86_BUILTIN_CEILPS];
33851 else if (out_n == 8 && in_n == 8)
33852 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33854 break;
33856 case BUILT_IN_TRUNC:
33857 /* The round insn does not trap on denormals. */
33858 if (flag_trapping_math || !TARGET_ROUND)
33859 break;
33861 if (out_mode == DFmode && in_mode == DFmode)
33863 if (out_n == 2 && in_n == 2)
33864 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33865 else if (out_n == 4 && in_n == 4)
33866 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33868 break;
33870 case BUILT_IN_TRUNCF:
33871 /* The round insn does not trap on denormals. */
33872 if (flag_trapping_math || !TARGET_ROUND)
33873 break;
33875 if (out_mode == SFmode && in_mode == SFmode)
33877 if (out_n == 4 && in_n == 4)
33878 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33879 else if (out_n == 8 && in_n == 8)
33880 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33882 break;
33884 case BUILT_IN_RINT:
33885 /* The round insn does not trap on denormals. */
33886 if (flag_trapping_math || !TARGET_ROUND)
33887 break;
33889 if (out_mode == DFmode && in_mode == DFmode)
33891 if (out_n == 2 && in_n == 2)
33892 return ix86_builtins[IX86_BUILTIN_RINTPD];
33893 else if (out_n == 4 && in_n == 4)
33894 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33896 break;
33898 case BUILT_IN_RINTF:
33899 /* The round insn does not trap on denormals. */
33900 if (flag_trapping_math || !TARGET_ROUND)
33901 break;
33903 if (out_mode == SFmode && in_mode == SFmode)
33905 if (out_n == 4 && in_n == 4)
33906 return ix86_builtins[IX86_BUILTIN_RINTPS];
33907 else if (out_n == 8 && in_n == 8)
33908 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33910 break;
33912 case BUILT_IN_ROUND:
33913 /* The round insn does not trap on denormals. */
33914 if (flag_trapping_math || !TARGET_ROUND)
33915 break;
33917 if (out_mode == DFmode && in_mode == DFmode)
33919 if (out_n == 2 && in_n == 2)
33920 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33921 else if (out_n == 4 && in_n == 4)
33922 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33924 break;
33926 case BUILT_IN_ROUNDF:
33927 /* The round insn does not trap on denormals. */
33928 if (flag_trapping_math || !TARGET_ROUND)
33929 break;
33931 if (out_mode == SFmode && in_mode == SFmode)
33933 if (out_n == 4 && in_n == 4)
33934 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33935 else if (out_n == 8 && in_n == 8)
33936 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33938 break;
33940 case BUILT_IN_FMA:
33941 if (out_mode == DFmode && in_mode == DFmode)
33943 if (out_n == 2 && in_n == 2)
33944 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33945 if (out_n == 4 && in_n == 4)
33946 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33948 break;
33950 case BUILT_IN_FMAF:
33951 if (out_mode == SFmode && in_mode == SFmode)
33953 if (out_n == 4 && in_n == 4)
33954 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33955 if (out_n == 8 && in_n == 8)
33956 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33958 break;
33960 default:
33961 break;
33964 /* Dispatch to a handler for a vectorization library. */
33965 if (ix86_veclib_handler)
33966 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33967 type_in);
33969 return NULL_TREE;
33972 /* Handler for an SVML-style interface to
33973 a library with vectorized intrinsics. */
33975 static tree
33976 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33978 char name[20];
33979 tree fntype, new_fndecl, args;
33980 unsigned arity;
33981 const char *bname;
33982 enum machine_mode el_mode, in_mode;
33983 int n, in_n;
33985 /* SVML is suitable for unsafe math only.  */
33986 if (!flag_unsafe_math_optimizations)
33987 return NULL_TREE;
33989 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33990 n = TYPE_VECTOR_SUBPARTS (type_out);
33991 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33992 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33993 if (el_mode != in_mode
33994 || n != in_n)
33995 return NULL_TREE;
33997 switch (fn)
33999 case BUILT_IN_EXP:
34000 case BUILT_IN_LOG:
34001 case BUILT_IN_LOG10:
34002 case BUILT_IN_POW:
34003 case BUILT_IN_TANH:
34004 case BUILT_IN_TAN:
34005 case BUILT_IN_ATAN:
34006 case BUILT_IN_ATAN2:
34007 case BUILT_IN_ATANH:
34008 case BUILT_IN_CBRT:
34009 case BUILT_IN_SINH:
34010 case BUILT_IN_SIN:
34011 case BUILT_IN_ASINH:
34012 case BUILT_IN_ASIN:
34013 case BUILT_IN_COSH:
34014 case BUILT_IN_COS:
34015 case BUILT_IN_ACOSH:
34016 case BUILT_IN_ACOS:
34017 if (el_mode != DFmode || n != 2)
34018 return NULL_TREE;
34019 break;
34021 case BUILT_IN_EXPF:
34022 case BUILT_IN_LOGF:
34023 case BUILT_IN_LOG10F:
34024 case BUILT_IN_POWF:
34025 case BUILT_IN_TANHF:
34026 case BUILT_IN_TANF:
34027 case BUILT_IN_ATANF:
34028 case BUILT_IN_ATAN2F:
34029 case BUILT_IN_ATANHF:
34030 case BUILT_IN_CBRTF:
34031 case BUILT_IN_SINHF:
34032 case BUILT_IN_SINF:
34033 case BUILT_IN_ASINHF:
34034 case BUILT_IN_ASINF:
34035 case BUILT_IN_COSHF:
34036 case BUILT_IN_COSF:
34037 case BUILT_IN_ACOSHF:
34038 case BUILT_IN_ACOSF:
34039 if (el_mode != SFmode || n != 4)
34040 return NULL_TREE;
34041 break;
34043 default:
34044 return NULL_TREE;
34047 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34049 if (fn == BUILT_IN_LOGF)
34050 strcpy (name, "vmlsLn4");
34051 else if (fn == BUILT_IN_LOG)
34052 strcpy (name, "vmldLn2");
34053 else if (n == 4)
34055 sprintf (name, "vmls%s", bname+10);
34056 name[strlen (name)-1] = '4';
34058 else
34059 sprintf (name, "vmld%s2", bname+10);
34061 /* Convert to uppercase. */
34062 name[4] &= ~0x20;
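/* E.g. for BUILT_IN_SINF with n == 4, bname is "__builtin_sinf", so the
   code above builds "vmlssin4" and the uppercasing yields "vmlsSin4".  */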
34064 arity = 0;
34065 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34066 args;
34067 args = TREE_CHAIN (args))
34068 arity++;
34070 if (arity == 1)
34071 fntype = build_function_type_list (type_out, type_in, NULL);
34072 else
34073 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34075 /* Build a function declaration for the vectorized function. */
34076 new_fndecl = build_decl (BUILTINS_LOCATION,
34077 FUNCTION_DECL, get_identifier (name), fntype);
34078 TREE_PUBLIC (new_fndecl) = 1;
34079 DECL_EXTERNAL (new_fndecl) = 1;
34080 DECL_IS_NOVOPS (new_fndecl) = 1;
34081 TREE_READONLY (new_fndecl) = 1;
34083 return new_fndecl;
34086 /* Handler for an ACML-style interface to
34087 a library with vectorized intrinsics. */
34089 static tree
34090 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
34092 char name[20] = "__vr.._";
34093 tree fntype, new_fndecl, args;
34094 unsigned arity;
34095 const char *bname;
34096 enum machine_mode el_mode, in_mode;
34097 int n, in_n;
34099 /* ACML is 64-bit only and suitable for unsafe math only, as
34100 it does not correctly support parts of IEEE with the required
34101 precision, such as denormals.  */
34102 if (!TARGET_64BIT
34103 || !flag_unsafe_math_optimizations)
34104 return NULL_TREE;
34106 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34107 n = TYPE_VECTOR_SUBPARTS (type_out);
34108 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34109 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34110 if (el_mode != in_mode
34111 || n != in_n)
34112 return NULL_TREE;
34114 switch (fn)
34116 case BUILT_IN_SIN:
34117 case BUILT_IN_COS:
34118 case BUILT_IN_EXP:
34119 case BUILT_IN_LOG:
34120 case BUILT_IN_LOG2:
34121 case BUILT_IN_LOG10:
34122 name[4] = 'd';
34123 name[5] = '2';
34124 if (el_mode != DFmode
34125 || n != 2)
34126 return NULL_TREE;
34127 break;
34129 case BUILT_IN_SINF:
34130 case BUILT_IN_COSF:
34131 case BUILT_IN_EXPF:
34132 case BUILT_IN_POWF:
34133 case BUILT_IN_LOGF:
34134 case BUILT_IN_LOG2F:
34135 case BUILT_IN_LOG10F:
34136 name[4] = 's';
34137 name[5] = '4';
34138 if (el_mode != SFmode
34139 || n != 4)
34140 return NULL_TREE;
34141 break;
34143 default:
34144 return NULL_TREE;
34147 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34148 sprintf (name + 7, "%s", bname+10);
34150 arity = 0;
34151 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34152 args;
34153 args = TREE_CHAIN (args))
34154 arity++;
34156 if (arity == 1)
34157 fntype = build_function_type_list (type_out, type_in, NULL);
34158 else
34159 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34161 /* Build a function declaration for the vectorized function. */
34162 new_fndecl = build_decl (BUILTINS_LOCATION,
34163 FUNCTION_DECL, get_identifier (name), fntype);
34164 TREE_PUBLIC (new_fndecl) = 1;
34165 DECL_EXTERNAL (new_fndecl) = 1;
34166 DECL_IS_NOVOPS (new_fndecl) = 1;
34167 TREE_READONLY (new_fndecl) = 1;
34169 return new_fndecl;
34172 /* Returns a decl of a function that implements a gather load with
34173 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
34174 Return NULL_TREE if it is not available.  */
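/* For example, a V2DF gather with an SImode index maps to
   IX86_BUILTIN_GATHERSIV2DF below, provided SCALE is 1, 2, 4 or 8.  */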
34176 static tree
34177 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34178 const_tree index_type, int scale)
34180 bool si;
34181 enum ix86_builtins code;
34183 if (! TARGET_AVX2)
34184 return NULL_TREE;
34186 if ((TREE_CODE (index_type) != INTEGER_TYPE
34187 && !POINTER_TYPE_P (index_type))
34188 || (TYPE_MODE (index_type) != SImode
34189 && TYPE_MODE (index_type) != DImode))
34190 return NULL_TREE;
34192 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34193 return NULL_TREE;
34195 /* The v*gather* insns sign-extend the index to pointer mode.  */
34196 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34197 && TYPE_UNSIGNED (index_type))
34198 return NULL_TREE;
34200 if (scale <= 0
34201 || scale > 8
34202 || (scale & (scale - 1)) != 0)
34203 return NULL_TREE;
34205 si = TYPE_MODE (index_type) == SImode;
34206 switch (TYPE_MODE (mem_vectype))
34208 case V2DFmode:
34209 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34210 break;
34211 case V4DFmode:
34212 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34213 break;
34214 case V2DImode:
34215 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34216 break;
34217 case V4DImode:
34218 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34219 break;
34220 case V4SFmode:
34221 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34222 break;
34223 case V8SFmode:
34224 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34225 break;
34226 case V4SImode:
34227 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34228 break;
34229 case V8SImode:
34230 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34231 break;
34232 default:
34233 return NULL_TREE;
34236 return ix86_builtins[code];
34239 /* Returns a decl of a target-specific builtin that implements the
34240 reciprocal of the function FN, or NULL_TREE if not available.  */
34242 static tree
34243 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34244 bool sqrt ATTRIBUTE_UNUSED)
34246 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34247 && flag_finite_math_only && !flag_trapping_math
34248 && flag_unsafe_math_optimizations))
34249 return NULL_TREE;
34251 if (md_fn)
34252 /* Machine dependent builtins. */
34253 switch (fn)
34255 /* Vectorized version of sqrt to rsqrt conversion. */
34256 case IX86_BUILTIN_SQRTPS_NR:
34257 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
34259 case IX86_BUILTIN_SQRTPS_NR256:
34260 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
34262 default:
34263 return NULL_TREE;
34265 else
34266 /* Normal builtins. */
34267 switch (fn)
34269 /* Sqrt to rsqrt conversion. */
34270 case BUILT_IN_SQRTF:
34271 return ix86_builtins[IX86_BUILTIN_RSQRTF];
34273 default:
34274 return NULL_TREE;
34278 /* Helper for avx_vpermilps256_operand et al. This is also used by
34279 the expansion functions to turn the parallel back into a mask.
34280 The return value is 0 for no match and imm8+1 for a match.  */
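/* E.g. for V4SFmode, a parallel selecting elements 2, 1, 0, 3 encodes
   two bits per element and yields imm8 0xc6, so the return value is 0xc7.  */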
34282 int
34283 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34285 unsigned i, nelt = GET_MODE_NUNITS (mode);
34286 unsigned mask = 0;
34287 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34289 if (XVECLEN (par, 0) != (int) nelt)
34290 return 0;
34292 /* Validate that all of the elements are constants, and not totally
34293 out of range. Copy the data into an integral array to make the
34294 subsequent checks easier. */
34295 for (i = 0; i < nelt; ++i)
34297 rtx er = XVECEXP (par, 0, i);
34298 unsigned HOST_WIDE_INT ei;
34300 if (!CONST_INT_P (er))
34301 return 0;
34302 ei = INTVAL (er);
34303 if (ei >= nelt)
34304 return 0;
34305 ipar[i] = ei;
34308 switch (mode)
34310 case V4DFmode:
34311 /* In the 256-bit DFmode case, we can only move elements within
34312 a 128-bit lane. */
34313 for (i = 0; i < 2; ++i)
34315 if (ipar[i] >= 2)
34316 return 0;
34317 mask |= ipar[i] << i;
34319 for (i = 2; i < 4; ++i)
34321 if (ipar[i] < 2)
34322 return 0;
34323 mask |= (ipar[i] - 2) << i;
34325 break;
34327 case V8SFmode:
34328 /* In the 256-bit SFmode case, we have full freedom of movement
34329 within the low 128-bit lane, but the high 128-bit lane must
34330 mirror the exact same pattern. */
34331 for (i = 0; i < 4; ++i)
34332 if (ipar[i] + 4 != ipar[i + 4])
34333 return 0;
34334 nelt = 4;
34335 /* FALLTHRU */
34337 case V2DFmode:
34338 case V4SFmode:
34339 /* In the 128-bit case, we've full freedom in the placement of
34340 the elements from the source operand. */
34341 for (i = 0; i < nelt; ++i)
34342 mask |= ipar[i] << (i * (nelt / 2));
34343 break;
34345 default:
34346 gcc_unreachable ();
34349 /* Make sure success has a non-zero value by adding one. */
34350 return mask + 1;
34353 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34354 the expansion functions to turn the parallel back into a mask.
34355 The return value is 0 for no match and imm8+1 for a match.  */
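/* E.g. for V4DFmode, a parallel selecting elements 2, 3, 4, 5 (the high
   half of the first operand and the low half of the second) yields imm8
   0x21, so the return value is 0x22.  */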
34357 int
34358 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34360 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34361 unsigned mask = 0;
34362 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34364 if (XVECLEN (par, 0) != (int) nelt)
34365 return 0;
34367 /* Validate that all of the elements are constants, and not totally
34368 out of range. Copy the data into an integral array to make the
34369 subsequent checks easier. */
34370 for (i = 0; i < nelt; ++i)
34372 rtx er = XVECEXP (par, 0, i);
34373 unsigned HOST_WIDE_INT ei;
34375 if (!CONST_INT_P (er))
34376 return 0;
34377 ei = INTVAL (er);
34378 if (ei >= 2 * nelt)
34379 return 0;
34380 ipar[i] = ei;
34383 /* Validate that each half of the permute selects consecutive elements.  */
34384 for (i = 0; i < nelt2 - 1; ++i)
34385 if (ipar[i] + 1 != ipar[i + 1])
34386 return 0;
34387 for (i = nelt2; i < nelt - 1; ++i)
34388 if (ipar[i] + 1 != ipar[i + 1])
34389 return 0;
34391 /* Reconstruct the mask. */
34392 for (i = 0; i < 2; ++i)
34394 unsigned e = ipar[i * nelt2];
34395 if (e % nelt2)
34396 return 0;
34397 e /= nelt2;
34398 mask |= e << (i * 4);
34401 /* Make sure success has a non-zero value by adding one. */
34402 return mask + 1;
34405 /* Store OPERAND to the memory after reload is completed. This means
34406 that we can't easily use assign_stack_local. */
34408 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34410 rtx result;
34412 gcc_assert (reload_completed);
34413 if (ix86_using_red_zone ())
34415 result = gen_rtx_MEM (mode,
34416 gen_rtx_PLUS (Pmode,
34417 stack_pointer_rtx,
34418 GEN_INT (-RED_ZONE_SIZE)));
34419 emit_move_insn (result, operand);
34421 else if (TARGET_64BIT)
34423 switch (mode)
34425 case HImode:
34426 case SImode:
34427 operand = gen_lowpart (DImode, operand);
34428 /* FALLTHRU */
34429 case DImode:
34430 emit_insn (
34431 gen_rtx_SET (VOIDmode,
34432 gen_rtx_MEM (DImode,
34433 gen_rtx_PRE_DEC (DImode,
34434 stack_pointer_rtx)),
34435 operand));
34436 break;
34437 default:
34438 gcc_unreachable ();
34440 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34442 else
34444 switch (mode)
34446 case DImode:
34448 rtx operands[2];
34449 split_double_mode (mode, &operand, 1, operands, operands + 1);
34450 emit_insn (
34451 gen_rtx_SET (VOIDmode,
34452 gen_rtx_MEM (SImode,
34453 gen_rtx_PRE_DEC (Pmode,
34454 stack_pointer_rtx)),
34455 operands[1]));
34456 emit_insn (
34457 gen_rtx_SET (VOIDmode,
34458 gen_rtx_MEM (SImode,
34459 gen_rtx_PRE_DEC (Pmode,
34460 stack_pointer_rtx)),
34461 operands[0]));
34463 break;
34464 case HImode:
34465 /* Store HImodes as SImodes. */
34466 operand = gen_lowpart (SImode, operand);
34467 /* FALLTHRU */
34468 case SImode:
34469 emit_insn (
34470 gen_rtx_SET (VOIDmode,
34471 gen_rtx_MEM (GET_MODE (operand),
34472 gen_rtx_PRE_DEC (SImode,
34473 stack_pointer_rtx)),
34474 operand));
34475 break;
34476 default:
34477 gcc_unreachable ();
34479 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34481 return result;
34484 /* Free operand from the memory. */
34485 void
34486 ix86_free_from_memory (enum machine_mode mode)
34488 if (!ix86_using_red_zone ())
34490 int size;
34492 if (mode == DImode || TARGET_64BIT)
34493 size = 8;
34494 else
34495 size = 4;
34496 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34497 to a pop or add instruction if registers are available. */
34498 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34499 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34500 GEN_INT (size))));
34504 /* Return a register priority for hard reg REGNO. */
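/* Lower values discourage use of a register and higher values prefer it:
   0 for r12/r13 (as a base they always need an index or displacement),
   1 for ebp, 2 for the REX-only integer and SSE registers (larger
   encodings), 4 for eax (smaller encodings), and 3 for everything else.  */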
34505 static int
34506 ix86_register_priority (int hard_regno)
34508 /* ebp and r13 as the base always want a displacement, and r12 as the
34509 base always wants an index. So discourage their use in an
34510 address. */
34511 if (hard_regno == R12_REG || hard_regno == R13_REG)
34512 return 0;
34513 if (hard_regno == BP_REG)
34514 return 1;
34515 /* New x86-64 int registers result in bigger code size. Discourage
34516 them. */
34517 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34518 return 2;
34519 /* New x86-64 SSE registers result in bigger code size. Discourage
34520 them. */
34521 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34522 return 2;
34523 /* Usage of AX register results in smaller code. Prefer it. */
34524 if (hard_regno == 0)
34525 return 4;
34526 return 3;
34529 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34531 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34532 QImode must go into class Q_REGS.
34533 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
34534 movdf to do mem-to-mem moves through integer regs. */
34536 static reg_class_t
34537 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34539 enum machine_mode mode = GET_MODE (x);
34541 /* We're only allowed to return a subclass of CLASS. Many of the
34542 following checks fail for NO_REGS, so eliminate that early. */
34543 if (regclass == NO_REGS)
34544 return NO_REGS;
34546 /* All classes can load zeros. */
34547 if (x == CONST0_RTX (mode))
34548 return regclass;
34550 /* Force constants into memory if we are loading a (nonzero) constant into
34551 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34552 instructions to load from a constant. */
34553 if (CONSTANT_P (x)
34554 && (MAYBE_MMX_CLASS_P (regclass)
34555 || MAYBE_SSE_CLASS_P (regclass)
34556 || MAYBE_MASK_CLASS_P (regclass)))
34557 return NO_REGS;
34559 /* Prefer SSE regs only, if we can use them for math. */
34560 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34561 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34563 /* Floating-point constants need more complex checks. */
34564 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34566 /* General regs can load everything. */
34567 if (reg_class_subset_p (regclass, GENERAL_REGS))
34568 return regclass;
34570 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34571 zero above. We only want to wind up preferring 80387 registers if
34572 we plan on doing computation with them. */
34573 if (TARGET_80387
34574 && standard_80387_constant_p (x) > 0)
34576 /* Limit class to non-sse. */
34577 if (regclass == FLOAT_SSE_REGS)
34578 return FLOAT_REGS;
34579 if (regclass == FP_TOP_SSE_REGS)
34580 return FP_TOP_REG;
34581 if (regclass == FP_SECOND_SSE_REGS)
34582 return FP_SECOND_REG;
34583 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34584 return regclass;
34587 return NO_REGS;
34590 /* Generally when we see PLUS here, it's the function invariant
34591 (plus soft-fp const_int), which can only be computed into general
34592 regs. */
34593 if (GET_CODE (x) == PLUS)
34594 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34596 /* QImode constants are easy to load, but non-constant QImode data
34597 must go into Q_REGS. */
34598 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34600 if (reg_class_subset_p (regclass, Q_REGS))
34601 return regclass;
34602 if (reg_class_subset_p (Q_REGS, regclass))
34603 return Q_REGS;
34604 return NO_REGS;
34607 return regclass;
34610 /* Discourage putting floating-point values in SSE registers unless
34611 SSE math is being used, and likewise for the 387 registers. */
34612 static reg_class_t
34613 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34615 enum machine_mode mode = GET_MODE (x);
34617 /* Restrict the output reload class to the register bank that we are doing
34618 math on. If we would like not to return a subset of CLASS, reject this
34619 alternative: if reload cannot do this, it will still use its choice. */
34620 mode = GET_MODE (x);
34621 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34622 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34624 if (X87_FLOAT_MODE_P (mode))
34626 if (regclass == FP_TOP_SSE_REGS)
34627 return FP_TOP_REG;
34628 else if (regclass == FP_SECOND_SSE_REGS)
34629 return FP_SECOND_REG;
34630 else
34631 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34634 return regclass;
34637 static reg_class_t
34638 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34639 enum machine_mode mode, secondary_reload_info *sri)
34641 /* Double-word spills from general registers to non-offsettable memory
34642 references (zero-extended addresses) require special handling. */
34643 if (TARGET_64BIT
34644 && MEM_P (x)
34645 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34646 && INTEGER_CLASS_P (rclass)
34647 && !offsettable_memref_p (x))
34649 sri->icode = (in_p
34650 ? CODE_FOR_reload_noff_load
34651 : CODE_FOR_reload_noff_store);
34652 /* Add the cost of moving address to a temporary. */
34653 sri->extra_cost = 1;
34655 return NO_REGS;
34658 /* QImode spills from non-QI registers require an
34659 intermediate register on 32-bit targets. */
34660 if (mode == QImode
34661 && (MAYBE_MASK_CLASS_P (rclass)
34662 || (!TARGET_64BIT && !in_p
34663 && INTEGER_CLASS_P (rclass)
34664 && MAYBE_NON_Q_CLASS_P (rclass))))
34666 int regno;
34668 if (REG_P (x))
34669 regno = REGNO (x);
34670 else
34671 regno = -1;
34673 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34674 regno = true_regnum (x);
34676 /* Return Q_REGS if the operand is in memory. */
34677 if (regno == -1)
34678 return Q_REGS;
34681 /* This condition handles a corner case where an expression involving
34682 pointers gets vectorized. We're trying to use the address of a
34683 stack slot as a vector initializer.
34685 (set (reg:V2DI 74 [ vect_cst_.2 ])
34686 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34688 Eventually frame gets turned into sp+offset like this:
34690 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34691 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34692 (const_int 392 [0x188]))))
34694 That later gets turned into:
34696 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34697 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34698 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34700 We'll have the following reload recorded:
34702 Reload 0: reload_in (DI) =
34703 (plus:DI (reg/f:DI 7 sp)
34704 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34705 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34706 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34707 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34708 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34709 reload_reg_rtx: (reg:V2DI 22 xmm1)
34711 This isn't going to work since SSE instructions can't handle scalar
34712 additions. Returning GENERAL_REGS forces the addition into an integer
34713 register, and reload can handle subsequent reloads without problems. */
34715 if (in_p && GET_CODE (x) == PLUS
34716 && SSE_CLASS_P (rclass)
34717 && SCALAR_INT_MODE_P (mode))
34718 return GENERAL_REGS;
34720 return NO_REGS;
34723 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34725 static bool
34726 ix86_class_likely_spilled_p (reg_class_t rclass)
34728 switch (rclass)
34730 case AREG:
34731 case DREG:
34732 case CREG:
34733 case BREG:
34734 case AD_REGS:
34735 case SIREG:
34736 case DIREG:
34737 case SSE_FIRST_REG:
34738 case FP_TOP_REG:
34739 case FP_SECOND_REG:
34740 case BND_REGS:
34741 return true;
34743 default:
34744 break;
34747 return false;
34750 /* If we are copying between general and FP registers, we need a memory
34751 location. The same is true for SSE and MMX registers.
34753 To optimize register_move_cost performance, allow an inline variant.
34755 The macro can't work reliably when one of the CLASSES is a class containing
34756 registers from multiple units (SSE, MMX, integer). We avoid this by never
34757 combining those units in a single alternative in the machine description.
34758 Ensure that this constraint holds to avoid unexpected surprises.
34760 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34761 enforce these sanity checks. */
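/* For example, copying a DImode value between an SSE register and a
   general register on a 32-bit target always goes through memory, since
   GET_MODE_SIZE (DImode) exceeds UNITS_PER_WORD there; the same holds for
   any SSE<->integer copy when SSE2 is unavailable or the tuning disables
   the corresponding inter-unit moves.  */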
34763 static inline bool
34764 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34765 enum machine_mode mode, int strict)
34767 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34768 return false;
34769 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34770 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34771 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34772 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34773 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34774 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34776 gcc_assert (!strict || lra_in_progress);
34777 return true;
34780 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34781 return true;
34783 /* ??? This is a lie. We do have moves between mmx and general regs, and
34784 between mmx and sse2. But by saying we need secondary memory we discourage the
34785 register allocator from using the mmx registers unless needed. */
34786 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34787 return true;
34789 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34791 /* SSE1 doesn't have any direct moves from other classes. */
34792 if (!TARGET_SSE2)
34793 return true;
34795 /* If the target says that inter-unit moves are more expensive
34796 than moving through memory, then don't generate them. */
34797 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34798 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34799 return true;
34801 /* Between SSE and general, we have moves no larger than word size. */
34802 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34803 return true;
34806 return false;
34809 bool
34810 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34811 enum machine_mode mode, int strict)
34813 return inline_secondary_memory_needed (class1, class2, mode, strict);
34816 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34818 On the 80386, this is the size of MODE in words,
34819 except in the FP regs, where a single reg is always enough. */
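/* For example, DImode needs two GENERAL_REGS on a 32-bit target but one on
   a 64-bit target, and XFmode needs three or two integer registers
   respectively, while in the x87/SSE/MMX classes a scalar mode always fits
   in one register and a complex mode in two.  */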
34821 static unsigned char
34822 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34824 if (MAYBE_INTEGER_CLASS_P (rclass))
34826 if (mode == XFmode)
34827 return (TARGET_64BIT ? 2 : 3);
34828 else if (mode == XCmode)
34829 return (TARGET_64BIT ? 4 : 6);
34830 else
34831 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34833 else
34835 if (COMPLEX_MODE_P (mode))
34836 return 2;
34837 else
34838 return 1;
34842 /* Return true if the registers in CLASS cannot represent the change from
34843 modes FROM to TO. */
34845 bool
34846 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34847 enum reg_class regclass)
34849 if (from == to)
34850 return false;
34852 /* x87 registers can't do subreg at all, as all values are reformatted
34853 to extended precision. */
34854 if (MAYBE_FLOAT_CLASS_P (regclass))
34855 return true;
34857 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34859 /* Vector registers do not support QI or HImode loads. If we don't
34860 disallow a change to these modes, reload will assume it's ok to
34861 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34862 the vec_dupv4hi pattern. */
34863 if (GET_MODE_SIZE (from) < 4)
34864 return true;
34866 /* Vector registers do not support subreg with nonzero offsets, which
34867 are otherwise valid for integer registers. Since we can't see
34868 whether we have a nonzero offset from here, prohibit all
34869 nonparadoxical subregs changing size. */
34870 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34871 return true;
34874 return false;
34877 /* Return the cost of moving data of mode M between a
34878 register and memory. A value of 2 is the default; this cost is
34879 relative to those in `REGISTER_MOVE_COST'.
34881 This function is used extensively by register_move_cost that is used to
34882 build tables at startup. Make it inline in this case.
34883 When IN is 2, return maximum of in and out move cost.
34885 If moving between registers and memory is more expensive than
34886 between two registers, you should define this macro to express the
34887 relative cost.
34889 Also model the increased cost of moving QImode registers in
34890 non-Q_REGS classes. */
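/* For example, a DImode load into GENERAL_REGS on a 32-bit target costs
   int_load[2] for each of its two word-sized pieces, while a QImode load
   into a non-Q_REGS class is charged the movzbl_load cost and a QImode
   store from such a class is charged int_store[0] plus a penalty of 4.  */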
34892 static inline int
34893 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34894 int in)
34896 int cost;
34897 if (FLOAT_CLASS_P (regclass))
34899 int index;
34900 switch (mode)
34902 case SFmode:
34903 index = 0;
34904 break;
34905 case DFmode:
34906 index = 1;
34907 break;
34908 case XFmode:
34909 index = 2;
34910 break;
34911 default:
34912 return 100;
34914 if (in == 2)
34915 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34916 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34918 if (SSE_CLASS_P (regclass))
34920 int index;
34921 switch (GET_MODE_SIZE (mode))
34923 case 4:
34924 index = 0;
34925 break;
34926 case 8:
34927 index = 1;
34928 break;
34929 case 16:
34930 index = 2;
34931 break;
34932 default:
34933 return 100;
34935 if (in == 2)
34936 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34937 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34939 if (MMX_CLASS_P (regclass))
34941 int index;
34942 switch (GET_MODE_SIZE (mode))
34944 case 4:
34945 index = 0;
34946 break;
34947 case 8:
34948 index = 1;
34949 break;
34950 default:
34951 return 100;
34953 if (in == 2)
34954 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34955 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34957 switch (GET_MODE_SIZE (mode))
34959 case 1:
34960 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34962 if (!in)
34963 return ix86_cost->int_store[0];
34964 if (TARGET_PARTIAL_REG_DEPENDENCY
34965 && optimize_function_for_speed_p (cfun))
34966 cost = ix86_cost->movzbl_load;
34967 else
34968 cost = ix86_cost->int_load[0];
34969 if (in == 2)
34970 return MAX (cost, ix86_cost->int_store[0]);
34971 return cost;
34973 else
34975 if (in == 2)
34976 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34977 if (in)
34978 return ix86_cost->movzbl_load;
34979 else
34980 return ix86_cost->int_store[0] + 4;
34982 break;
34983 case 2:
34984 if (in == 2)
34985 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34986 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34987 default:
34988 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34989 if (mode == TFmode)
34990 mode = XFmode;
34991 if (in == 2)
34992 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34993 else if (in)
34994 cost = ix86_cost->int_load[2];
34995 else
34996 cost = ix86_cost->int_store[2];
34997 return (cost * (((int) GET_MODE_SIZE (mode)
34998 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
35002 static int
35003 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
35004 bool in)
35006 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
35010 /* Return the cost of moving data from a register in class CLASS1 to
35011 one in class CLASS2.
35013 It is not required that the cost always equal 2 when FROM is the same as TO;
35014 on some machines it is expensive to move between registers if they are not
35015 general registers. */
35017 static int
35018 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
35019 reg_class_t class2_i)
35021 enum reg_class class1 = (enum reg_class) class1_i;
35022 enum reg_class class2 = (enum reg_class) class2_i;
35024 /* In case we require secondary memory, compute cost of the store followed
35025 by load. In order to avoid bad register allocation choices, we need
35026 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
35028 if (inline_secondary_memory_needed (class1, class2, mode, 0))
35030 int cost = 1;
35032 cost += inline_memory_move_cost (mode, class1, 2);
35033 cost += inline_memory_move_cost (mode, class2, 2);
35035 /* In case of copying from a general purpose register we may emit multiple
35036 stores followed by a single load, causing a memory size mismatch stall.
35037 Count this with an arbitrarily high cost of 20. */
35038 if (targetm.class_max_nregs (class1, mode)
35039 > targetm.class_max_nregs (class2, mode))
35040 cost += 20;
35042 /* In the case of FP/MMX moves, the registers actually overlap, and we
35043 have to switch modes in order to treat them differently. */
35044 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
35045 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
35046 cost += 20;
35048 return cost;
35051 /* Moves between SSE/MMX and integer unit are expensive. */
35052 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
35053 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
35055 /* ??? By keeping the returned value relatively high, we limit the number
35056 of moves between integer and MMX/SSE registers for all targets.
35057 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
35058 where integer modes in MMX/SSE registers are not tieable
35059 because of missing QImode and HImode moves to, from or between
35060 MMX/SSE registers. */
35061 return MAX (8, ix86_cost->mmxsse_to_integer);
35063 if (MAYBE_FLOAT_CLASS_P (class1))
35064 return ix86_cost->fp_move;
35065 if (MAYBE_SSE_CLASS_P (class1))
35066 return ix86_cost->sse_move;
35067 if (MAYBE_MMX_CLASS_P (class1))
35068 return ix86_cost->mmx_move;
35069 return 2;
35072 /* Return TRUE if hard register REGNO can hold a value of machine-mode
35073 MODE. */
35075 bool
35076 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
35078 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
35079 if (CC_REGNO_P (regno))
35080 return GET_MODE_CLASS (mode) == MODE_CC;
35081 if (GET_MODE_CLASS (mode) == MODE_CC
35082 || GET_MODE_CLASS (mode) == MODE_RANDOM
35083 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
35084 return false;
35085 if (STACK_REGNO_P (regno))
35086 return VALID_FP_MODE_P (mode);
35087 if (MASK_REGNO_P (regno))
35088 return VALID_MASK_REG_MODE (mode);
35089 if (BND_REGNO_P (regno))
35090 return VALID_BND_REG_MODE (mode);
35091 if (SSE_REGNO_P (regno))
35093 /* We implement the move patterns for all vector modes into and
35094 out of SSE registers, even when no operation instructions
35095 are available. */
35097 /* For AVX-512 we allow, regardless of regno:
35098 - XI mode
35099 - any 512-bit wide vector mode
35100 - any scalar mode. */
35101 if (TARGET_AVX512F
35102 && (mode == XImode
35103 || VALID_AVX512F_REG_MODE (mode)
35104 || VALID_AVX512F_SCALAR_MODE (mode)))
35105 return true;
35107 /* xmm16-xmm31 are only available for AVX-512. */
35108 if (EXT_REX_SSE_REGNO_P (regno))
35109 return false;
35111 /* OImode move is available only when AVX is enabled. */
35112 return ((TARGET_AVX && mode == OImode)
35113 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35114 || VALID_SSE_REG_MODE (mode)
35115 || VALID_SSE2_REG_MODE (mode)
35116 || VALID_MMX_REG_MODE (mode)
35117 || VALID_MMX_REG_MODE_3DNOW (mode));
35119 if (MMX_REGNO_P (regno))
35121 /* We implement the move patterns for 3DNOW modes even in MMX mode,
35122 so if the register is available at all, then we can move data of
35123 the given mode into or out of it. */
35124 return (VALID_MMX_REG_MODE (mode)
35125 || VALID_MMX_REG_MODE_3DNOW (mode));
35128 if (mode == QImode)
35130 /* Take care for QImode values - they can be in non-QI regs,
35131 but then they do cause partial register stalls. */
35132 if (ANY_QI_REGNO_P (regno))
35133 return true;
35134 if (!TARGET_PARTIAL_REG_STALL)
35135 return true;
35136 /* LRA checks if the hard register is OK for the given mode.
35137 QImode values can live in non-QI regs, so we allow all
35138 registers here. */
35139 if (lra_in_progress)
35140 return true;
35141 return !can_create_pseudo_p ();
35143 /* We handle both integer and floats in the general purpose registers. */
35144 else if (VALID_INT_MODE_P (mode))
35145 return true;
35146 else if (VALID_FP_MODE_P (mode))
35147 return true;
35148 else if (VALID_DFP_MODE_P (mode))
35149 return true;
35150 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35151 on to use that value in smaller contexts, this can easily force a
35152 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35153 supporting DImode, allow it. */
35154 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35155 return true;
35157 return false;
35160 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35161 tieable integer mode. */
35163 static bool
35164 ix86_tieable_integer_mode_p (enum machine_mode mode)
35166 switch (mode)
35168 case HImode:
35169 case SImode:
35170 return true;
35172 case QImode:
35173 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35175 case DImode:
35176 return TARGET_64BIT;
35178 default:
35179 return false;
35183 /* Return true if MODE1 is accessible in a register that can hold MODE2
35184 without copying. That is, all register classes that can hold MODE2
35185 can also hold MODE1. */
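/* For example, SImode and HImode always tie, DImode ties with them only on
   64-bit targets, SFmode ties with DFmode and XFmode, and two 16-byte
   vector modes such as V4SFmode and V2DFmode tie whenever both are valid
   in SSE registers.  */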
35187 bool
35188 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35190 if (mode1 == mode2)
35191 return true;
35193 if (ix86_tieable_integer_mode_p (mode1)
35194 && ix86_tieable_integer_mode_p (mode2))
35195 return true;
35197 /* MODE2 being XFmode implies fp stack or general regs, which means we
35198 can tie any smaller floating point modes to it. Note that we do not
35199 tie this with TFmode. */
35200 if (mode2 == XFmode)
35201 return mode1 == SFmode || mode1 == DFmode;
35203 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35204 that we can tie it with SFmode. */
35205 if (mode2 == DFmode)
35206 return mode1 == SFmode;
35208 /* If MODE2 is only appropriate for an SSE register, then tie with
35209 any other mode acceptable to SSE registers. */
35210 if (GET_MODE_SIZE (mode2) == 32
35211 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35212 return (GET_MODE_SIZE (mode1) == 32
35213 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35214 if (GET_MODE_SIZE (mode2) == 16
35215 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35216 return (GET_MODE_SIZE (mode1) == 16
35217 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35219 /* If MODE2 is appropriate for an MMX register, then tie
35220 with any other mode acceptable to MMX registers. */
35221 if (GET_MODE_SIZE (mode2) == 8
35222 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35223 return (GET_MODE_SIZE (mode1) == 8
35224 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35226 return false;
35229 /* Return the cost of moving between two registers of mode MODE. */
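/* The move is costed in pieces of at most UNITS bytes, where UNITS is the
   word size unless the whole mode fits in a single register of the matching
   unit.  For example, a V8SFmode register copy is COSTS_N_INSNS (1) when
   AVX is enabled, but counts as four word-sized pieces on a 64-bit target
   without AVX.  */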
35231 static int
35232 ix86_set_reg_reg_cost (enum machine_mode mode)
35234 unsigned int units = UNITS_PER_WORD;
35236 switch (GET_MODE_CLASS (mode))
35238 default:
35239 break;
35241 case MODE_CC:
35242 units = GET_MODE_SIZE (CCmode);
35243 break;
35245 case MODE_FLOAT:
35246 if ((TARGET_SSE && mode == TFmode)
35247 || (TARGET_80387 && mode == XFmode)
35248 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35249 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35250 units = GET_MODE_SIZE (mode);
35251 break;
35253 case MODE_COMPLEX_FLOAT:
35254 if ((TARGET_SSE && mode == TCmode)
35255 || (TARGET_80387 && mode == XCmode)
35256 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35257 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35258 units = GET_MODE_SIZE (mode);
35259 break;
35261 case MODE_VECTOR_INT:
35262 case MODE_VECTOR_FLOAT:
35263 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35264 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35265 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35266 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35267 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35268 units = GET_MODE_SIZE (mode);
35271 /* Return the cost of moving between two registers of mode MODE,
35272 assuming that the move will be in pieces of at most UNITS bytes. */
35273 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
35276 /* Compute a (partial) cost for rtx X. Return true if the complete
35277 cost has been computed, and false if subexpressions should be
35278 scanned. In either case, *TOTAL contains the cost result. */
35280 static bool
35281 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35282 bool speed)
35284 enum rtx_code code = (enum rtx_code) code_i;
35285 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35286 enum machine_mode mode = GET_MODE (x);
35287 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35289 switch (code)
35291 case SET:
35292 if (register_operand (SET_DEST (x), VOIDmode)
35293 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35295 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35296 return true;
35298 return false;
35300 case CONST_INT:
35301 case CONST:
35302 case LABEL_REF:
35303 case SYMBOL_REF:
35304 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35305 *total = 3;
35306 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35307 *total = 2;
35308 else if (flag_pic && SYMBOLIC_CONST (x)
35309 && (!TARGET_64BIT
35310 || (GET_CODE (x) != LABEL_REF
35311 && (GET_CODE (x) != SYMBOL_REF
35312 || !SYMBOL_REF_LOCAL_P (x)))))
35313 *total = 1;
35314 else
35315 *total = 0;
35316 return true;
35318 case CONST_DOUBLE:
35319 if (mode == VOIDmode)
35321 *total = 0;
35322 return true;
35324 switch (standard_80387_constant_p (x))
35326 case 1: /* 0.0 */
35327 *total = 1;
35328 return true;
35329 default: /* Other constants */
35330 *total = 2;
35331 return true;
35332 case 0:
35333 case -1:
35334 break;
35336 if (SSE_FLOAT_MODE_P (mode))
35338 case CONST_VECTOR:
35339 switch (standard_sse_constant_p (x))
35341 case 0:
35342 break;
35343 case 1: /* 0: xor eliminates false dependency */
35344 *total = 0;
35345 return true;
35346 default: /* -1: cmp contains false dependency */
35347 *total = 1;
35348 return true;
35351 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35352 it'll probably end up. Add a penalty for size. */
35353 *total = (COSTS_N_INSNS (1)
35354 + (flag_pic != 0 && !TARGET_64BIT)
35355 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35356 return true;
35358 case ZERO_EXTEND:
35359 /* The zero extension is often completely free on x86_64, so make
35360 it as cheap as possible. */
35361 if (TARGET_64BIT && mode == DImode
35362 && GET_MODE (XEXP (x, 0)) == SImode)
35363 *total = 1;
35364 else if (TARGET_ZERO_EXTEND_WITH_AND)
35365 *total = cost->add;
35366 else
35367 *total = cost->movzx;
35368 return false;
35370 case SIGN_EXTEND:
35371 *total = cost->movsx;
35372 return false;
35374 case ASHIFT:
35375 if (SCALAR_INT_MODE_P (mode)
35376 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35377 && CONST_INT_P (XEXP (x, 1)))
35379 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35380 if (value == 1)
35382 *total = cost->add;
35383 return false;
35385 if ((value == 2 || value == 3)
35386 && cost->lea <= cost->shift_const)
35388 *total = cost->lea;
35389 return false;
35392 /* FALLTHRU */
35394 case ROTATE:
35395 case ASHIFTRT:
35396 case LSHIFTRT:
35397 case ROTATERT:
35398 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35400 /* ??? Should be SSE vector operation cost. */
35401 /* At least for published AMD latencies, this really is the same
35402 as the latency for a simple fpu operation like fabs. */
35403 /* V*QImode is emulated with 1-11 insns. */
35404 if (mode == V16QImode || mode == V32QImode)
35406 int count = 11;
35407 if (TARGET_XOP && mode == V16QImode)
35409 /* For XOP we use vpshab, which requires a broadcast of the
35410 value to the variable shift insn. For constants this
35411 means a V16QI const in mem; even when we could perform the
35412 shift with one insn, set the cost to prefer paddb. */
35413 if (CONSTANT_P (XEXP (x, 1)))
35415 *total = (cost->fabs
35416 + rtx_cost (XEXP (x, 0), code, 0, speed)
35417 + (speed ? 2 : COSTS_N_BYTES (16)));
35418 return true;
35420 count = 3;
35422 else if (TARGET_SSSE3)
35423 count = 7;
35424 *total = cost->fabs * count;
35426 else
35427 *total = cost->fabs;
35429 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35431 if (CONST_INT_P (XEXP (x, 1)))
35433 if (INTVAL (XEXP (x, 1)) > 32)
35434 *total = cost->shift_const + COSTS_N_INSNS (2);
35435 else
35436 *total = cost->shift_const * 2;
35438 else
35440 if (GET_CODE (XEXP (x, 1)) == AND)
35441 *total = cost->shift_var * 2;
35442 else
35443 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35446 else
35448 if (CONST_INT_P (XEXP (x, 1)))
35449 *total = cost->shift_const;
35450 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35451 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35453 /* Return the cost after shift-and truncation. */
35454 *total = cost->shift_var;
35455 return true;
35457 else
35458 *total = cost->shift_var;
35460 return false;
35462 case FMA:
35464 rtx sub;
35466 gcc_assert (FLOAT_MODE_P (mode));
35467 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35469 /* ??? SSE scalar/vector cost should be used here. */
35470 /* ??? Bald assumption that fma has the same cost as fmul. */
35471 *total = cost->fmul;
35472 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35474 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35475 sub = XEXP (x, 0);
35476 if (GET_CODE (sub) == NEG)
35477 sub = XEXP (sub, 0);
35478 *total += rtx_cost (sub, FMA, 0, speed);
35480 sub = XEXP (x, 2);
35481 if (GET_CODE (sub) == NEG)
35482 sub = XEXP (sub, 0);
35483 *total += rtx_cost (sub, FMA, 2, speed);
35484 return true;
35487 case MULT:
35488 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35490 /* ??? SSE scalar cost should be used here. */
35491 *total = cost->fmul;
35492 return false;
35494 else if (X87_FLOAT_MODE_P (mode))
35496 *total = cost->fmul;
35497 return false;
35499 else if (FLOAT_MODE_P (mode))
35501 /* ??? SSE vector cost should be used here. */
35502 *total = cost->fmul;
35503 return false;
35505 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35507 /* V*QImode is emulated with 7-13 insns. */
35508 if (mode == V16QImode || mode == V32QImode)
35510 int extra = 11;
35511 if (TARGET_XOP && mode == V16QImode)
35512 extra = 5;
35513 else if (TARGET_SSSE3)
35514 extra = 6;
35515 *total = cost->fmul * 2 + cost->fabs * extra;
35517 /* V*DImode is emulated with 5-8 insns. */
35518 else if (mode == V2DImode || mode == V4DImode)
35520 if (TARGET_XOP && mode == V2DImode)
35521 *total = cost->fmul * 2 + cost->fabs * 3;
35522 else
35523 *total = cost->fmul * 3 + cost->fabs * 5;
35525 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35526 insns, including two PMULUDQ. */
35527 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35528 *total = cost->fmul * 2 + cost->fabs * 5;
35529 else
35530 *total = cost->fmul;
35531 return false;
35533 else
35535 rtx op0 = XEXP (x, 0);
35536 rtx op1 = XEXP (x, 1);
35537 int nbits;
35538 if (CONST_INT_P (XEXP (x, 1)))
35540 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35541 for (nbits = 0; value != 0; value &= value - 1)
35542 nbits++;
35544 else
35545 /* This is arbitrary. */
35546 nbits = 7;
35548 /* Compute costs correctly for widening multiplication. */
35549 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35550 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35551 == GET_MODE_SIZE (mode))
35553 int is_mulwiden = 0;
35554 enum machine_mode inner_mode = GET_MODE (op0);
35556 if (GET_CODE (op0) == GET_CODE (op1))
35557 is_mulwiden = 1, op1 = XEXP (op1, 0);
35558 else if (CONST_INT_P (op1))
35560 if (GET_CODE (op0) == SIGN_EXTEND)
35561 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35562 == INTVAL (op1);
35563 else
35564 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35567 if (is_mulwiden)
35568 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35571 *total = (cost->mult_init[MODE_INDEX (mode)]
35572 + nbits * cost->mult_bit
35573 + rtx_cost (op0, outer_code, opno, speed)
35574 + rtx_cost (op1, outer_code, opno, speed));
35576 return true;
35579 case DIV:
35580 case UDIV:
35581 case MOD:
35582 case UMOD:
35583 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35584 /* ??? SSE cost should be used here. */
35585 *total = cost->fdiv;
35586 else if (X87_FLOAT_MODE_P (mode))
35587 *total = cost->fdiv;
35588 else if (FLOAT_MODE_P (mode))
35589 /* ??? SSE vector cost should be used here. */
35590 *total = cost->fdiv;
35591 else
35592 *total = cost->divide[MODE_INDEX (mode)];
35593 return false;
35595 case PLUS:
35596 if (GET_MODE_CLASS (mode) == MODE_INT
35597 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35599 if (GET_CODE (XEXP (x, 0)) == PLUS
35600 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35601 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35602 && CONSTANT_P (XEXP (x, 1)))
35604 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35605 if (val == 2 || val == 4 || val == 8)
35607 *total = cost->lea;
35608 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35609 outer_code, opno, speed);
35610 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35611 outer_code, opno, speed);
35612 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35613 return true;
35616 else if (GET_CODE (XEXP (x, 0)) == MULT
35617 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35619 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35620 if (val == 2 || val == 4 || val == 8)
35622 *total = cost->lea;
35623 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35624 outer_code, opno, speed);
35625 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35626 return true;
35629 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35631 *total = cost->lea;
35632 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35633 outer_code, opno, speed);
35634 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35635 outer_code, opno, speed);
35636 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35637 return true;
35640 /* FALLTHRU */
35642 case MINUS:
35643 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35645 /* ??? SSE cost should be used here. */
35646 *total = cost->fadd;
35647 return false;
35649 else if (X87_FLOAT_MODE_P (mode))
35651 *total = cost->fadd;
35652 return false;
35654 else if (FLOAT_MODE_P (mode))
35656 /* ??? SSE vector cost should be used here. */
35657 *total = cost->fadd;
35658 return false;
35660 /* FALLTHRU */
35662 case AND:
35663 case IOR:
35664 case XOR:
35665 if (GET_MODE_CLASS (mode) == MODE_INT
35666 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35668 *total = (cost->add * 2
35669 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35670 << (GET_MODE (XEXP (x, 0)) != DImode))
35671 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35672 << (GET_MODE (XEXP (x, 1)) != DImode)));
35673 return true;
35675 /* FALLTHRU */
35677 case NEG:
35678 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35680 /* ??? SSE cost should be used here. */
35681 *total = cost->fchs;
35682 return false;
35684 else if (X87_FLOAT_MODE_P (mode))
35686 *total = cost->fchs;
35687 return false;
35689 else if (FLOAT_MODE_P (mode))
35691 /* ??? SSE vector cost should be used here. */
35692 *total = cost->fchs;
35693 return false;
35695 /* FALLTHRU */
35697 case NOT:
35698 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35700 /* ??? Should be SSE vector operation cost. */
35701 /* At least for published AMD latencies, this really is the same
35702 as the latency for a simple fpu operation like fabs. */
35703 *total = cost->fabs;
35705 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35706 *total = cost->add * 2;
35707 else
35708 *total = cost->add;
35709 return false;
35711 case COMPARE:
35712 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35713 && XEXP (XEXP (x, 0), 1) == const1_rtx
35714 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35715 && XEXP (x, 1) == const0_rtx)
35717 /* This kind of construct is implemented using test[bwl].
35718 Treat it as if we had an AND. */
35719 *total = (cost->add
35720 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35721 + rtx_cost (const1_rtx, outer_code, opno, speed));
35722 return true;
35724 return false;
35726 case FLOAT_EXTEND:
35727 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35728 *total = 0;
35729 return false;
35731 case ABS:
35732 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35733 /* ??? SSE cost should be used here. */
35734 *total = cost->fabs;
35735 else if (X87_FLOAT_MODE_P (mode))
35736 *total = cost->fabs;
35737 else if (FLOAT_MODE_P (mode))
35738 /* ??? SSE vector cost should be used here. */
35739 *total = cost->fabs;
35740 return false;
35742 case SQRT:
35743 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35744 /* ??? SSE cost should be used here. */
35745 *total = cost->fsqrt;
35746 else if (X87_FLOAT_MODE_P (mode))
35747 *total = cost->fsqrt;
35748 else if (FLOAT_MODE_P (mode))
35749 /* ??? SSE vector cost should be used here. */
35750 *total = cost->fsqrt;
35751 return false;
35753 case UNSPEC:
35754 if (XINT (x, 1) == UNSPEC_TP)
35755 *total = 0;
35756 return false;
35758 case VEC_SELECT:
35759 case VEC_CONCAT:
35760 case VEC_MERGE:
35761 case VEC_DUPLICATE:
35762 /* ??? Assume all of these vector manipulation patterns are
35763 recognizable, in which case they all have pretty much the
35764 same cost. */
35765 *total = cost->fabs;
35766 return true;
35768 default:
35769 return false;
35773 #if TARGET_MACHO
35775 static int current_machopic_label_num;
35777 /* Given a symbol name and its associated stub, write out the
35778 definition of the stub. */
35780 void
35781 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35783 unsigned int length;
35784 char *binder_name, *symbol_name, lazy_ptr_name[32];
35785 int label = ++current_machopic_label_num;
35787 /* For 64-bit we shouldn't get here. */
35788 gcc_assert (!TARGET_64BIT);
35790 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35791 symb = targetm.strip_name_encoding (symb);
35793 length = strlen (stub);
35794 binder_name = XALLOCAVEC (char, length + 32);
35795 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35797 length = strlen (symb);
35798 symbol_name = XALLOCAVEC (char, length + 32);
35799 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35801 sprintf (lazy_ptr_name, "L%d$lz", label);
35803 if (MACHOPIC_ATT_STUB)
35804 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35805 else if (MACHOPIC_PURE)
35806 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35807 else
35808 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35810 fprintf (file, "%s:\n", stub);
35811 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35813 if (MACHOPIC_ATT_STUB)
35815 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35817 else if (MACHOPIC_PURE)
35819 /* PIC stub. */
35820 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35821 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35822 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35823 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35824 label, lazy_ptr_name, label);
35825 fprintf (file, "\tjmp\t*%%ecx\n");
35827 else
35828 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35830 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35831 it needs no stub-binding-helper. */
35832 if (MACHOPIC_ATT_STUB)
35833 return;
35835 fprintf (file, "%s:\n", binder_name);
35837 if (MACHOPIC_PURE)
35839 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35840 fprintf (file, "\tpushl\t%%ecx\n");
35842 else
35843 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35845 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35847 /* N.B. Keep the correspondence of these
35848 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35849 old-pic/new-pic/non-pic stubs; altering this will break
35850 compatibility with existing dylibs. */
35851 if (MACHOPIC_PURE)
35853 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35854 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35856 else
35857 /* 16-byte -mdynamic-no-pic stub. */
35858 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35860 fprintf (file, "%s:\n", lazy_ptr_name);
35861 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35862 fprintf (file, ASM_LONG "%s\n", binder_name);
35864 #endif /* TARGET_MACHO */
35866 /* Order the registers for the register allocator. */
35868 void
35869 x86_order_regs_for_local_alloc (void)
35871 int pos = 0;
35872 int i;
35874 /* First allocate the local general purpose registers. */
35875 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35876 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35877 reg_alloc_order [pos++] = i;
35879 /* Global general purpose registers. */
35880 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35881 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35882 reg_alloc_order [pos++] = i;
35884 /* x87 registers come first in case we are doing FP math
35885 using them. */
35886 if (!TARGET_SSE_MATH)
35887 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35888 reg_alloc_order [pos++] = i;
35890 /* SSE registers. */
35891 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35892 reg_alloc_order [pos++] = i;
35893 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35894 reg_alloc_order [pos++] = i;
35896 /* Extended REX SSE registers. */
35897 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35898 reg_alloc_order [pos++] = i;
35900 /* Mask registers. */
35901 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35902 reg_alloc_order [pos++] = i;
35904 /* MPX bound registers. */
35905 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35906 reg_alloc_order [pos++] = i;
35908 /* x87 registers. */
35909 if (TARGET_SSE_MATH)
35910 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35911 reg_alloc_order [pos++] = i;
35913 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35914 reg_alloc_order [pos++] = i;
35916 /* Initialize the rest of the array, as we do not allocate some registers
35917 at all. */
35918 while (pos < FIRST_PSEUDO_REGISTER)
35919 reg_alloc_order [pos++] = 0;
35922 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35923 in struct attribute_spec handler. */
35924 static tree
35925 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35926 tree args,
35927 int flags ATTRIBUTE_UNUSED,
35928 bool *no_add_attrs)
35930 if (TREE_CODE (*node) != FUNCTION_TYPE
35931 && TREE_CODE (*node) != METHOD_TYPE
35932 && TREE_CODE (*node) != FIELD_DECL
35933 && TREE_CODE (*node) != TYPE_DECL)
35935 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35936 name);
35937 *no_add_attrs = true;
35938 return NULL_TREE;
35940 if (TARGET_64BIT)
35942 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35943 name);
35944 *no_add_attrs = true;
35945 return NULL_TREE;
35947 if (is_attribute_p ("callee_pop_aggregate_return", name))
35949 tree cst;
35951 cst = TREE_VALUE (args);
35952 if (TREE_CODE (cst) != INTEGER_CST)
35954 warning (OPT_Wattributes,
35955 "%qE attribute requires an integer constant argument",
35956 name);
35957 *no_add_attrs = true;
35959 else if (compare_tree_int (cst, 0) != 0
35960 && compare_tree_int (cst, 1) != 0)
35962 warning (OPT_Wattributes,
35963 "argument to %qE attribute is neither zero, nor one",
35964 name);
35965 *no_add_attrs = true;
35968 return NULL_TREE;
35971 return NULL_TREE;
35974 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
35975 struct attribute_spec.handler. */
35976 static tree
35977 ix86_handle_abi_attribute (tree *node, tree name,
35978 tree args ATTRIBUTE_UNUSED,
35979 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35981 if (TREE_CODE (*node) != FUNCTION_TYPE
35982 && TREE_CODE (*node) != METHOD_TYPE
35983 && TREE_CODE (*node) != FIELD_DECL
35984 && TREE_CODE (*node) != TYPE_DECL)
35986 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35987 name);
35988 *no_add_attrs = true;
35989 return NULL_TREE;
35992 /* Can combine regparm with all attributes but fastcall. */
35993 if (is_attribute_p ("ms_abi", name))
35995 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35997 error ("ms_abi and sysv_abi attributes are not compatible");
36000 return NULL_TREE;
36002 else if (is_attribute_p ("sysv_abi", name))
36004 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
36006 error ("ms_abi and sysv_abi attributes are not compatible");
36009 return NULL_TREE;
36012 return NULL_TREE;
36015 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
36016 struct attribute_spec.handler. */
36017 static tree
36018 ix86_handle_struct_attribute (tree *node, tree name,
36019 tree args ATTRIBUTE_UNUSED,
36020 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36022 tree *type = NULL;
36023 if (DECL_P (*node))
36025 if (TREE_CODE (*node) == TYPE_DECL)
36026 type = &TREE_TYPE (*node);
36028 else
36029 type = node;
36031 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
36033 warning (OPT_Wattributes, "%qE attribute ignored",
36034 name);
36035 *no_add_attrs = true;
36038 else if ((is_attribute_p ("ms_struct", name)
36039 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
36040 || ((is_attribute_p ("gcc_struct", name)
36041 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
36043 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
36044 name);
36045 *no_add_attrs = true;
36048 return NULL_TREE;
36051 static tree
36052 ix86_handle_fndecl_attribute (tree *node, tree name,
36053 tree args ATTRIBUTE_UNUSED,
36054 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36056 if (TREE_CODE (*node) != FUNCTION_DECL)
36058 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36059 name);
36060 *no_add_attrs = true;
36062 return NULL_TREE;
36065 static bool
36066 ix86_ms_bitfield_layout_p (const_tree record_type)
36068 return ((TARGET_MS_BITFIELD_LAYOUT
36069 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
36070 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
36073 /* Returns an expression indicating where the this parameter is
36074 located on entry to the FUNCTION. */
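/* In short: on 64-bit targets THIS is in the first integer argument
   register, or the second one when the function returns an aggregate in
   memory (the hidden return pointer takes the first).  On 32-bit targets it
   is in %ecx for fastcall/thiscall and %eax for regparm functions, falling
   back to %edx or a stack slot when an aggregate return pointer or a
   register shortage gets in the way; otherwise it lives on the stack right
   after the return address (and the aggregate return pointer, if any).  */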
36076 static rtx
36077 x86_this_parameter (tree function)
36079 tree type = TREE_TYPE (function);
36080 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
36081 int nregs;
36083 if (TARGET_64BIT)
36085 const int *parm_regs;
36087 if (ix86_function_type_abi (type) == MS_ABI)
36088 parm_regs = x86_64_ms_abi_int_parameter_registers;
36089 else
36090 parm_regs = x86_64_int_parameter_registers;
36091 return gen_rtx_REG (Pmode, parm_regs[aggr]);
36094 nregs = ix86_function_regparm (type, function);
36096 if (nregs > 0 && !stdarg_p (type))
36098 int regno;
36099 unsigned int ccvt = ix86_get_callcvt (type);
36101 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36102 regno = aggr ? DX_REG : CX_REG;
36103 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36105 regno = CX_REG;
36106 if (aggr)
36107 return gen_rtx_MEM (SImode,
36108 plus_constant (Pmode, stack_pointer_rtx, 4));
36110 else
36112 regno = AX_REG;
36113 if (aggr)
36115 regno = DX_REG;
36116 if (nregs == 1)
36117 return gen_rtx_MEM (SImode,
36118 plus_constant (Pmode,
36119 stack_pointer_rtx, 4));
36122 return gen_rtx_REG (SImode, regno);
36125 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
36126 aggr ? 8 : 4));
36129 /* Determine whether x86_output_mi_thunk can succeed. */
36131 static bool
36132 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
36133 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
36134 HOST_WIDE_INT vcall_offset, const_tree function)
36136 /* 64-bit can handle anything. */
36137 if (TARGET_64BIT)
36138 return true;
36140 /* For 32-bit, everything's fine if we have one free register. */
36141 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
36142 return true;
36144 /* Need a free register for vcall_offset. */
36145 if (vcall_offset)
36146 return false;
36148 /* Need a free register for GOT references. */
36149 if (flag_pic && !targetm.binds_local_p (function))
36150 return false;
36152 /* Otherwise ok. */
36153 return true;
36156 /* Output the assembler code for a thunk function. THUNK_DECL is the
36157 declaration for the thunk function itself, FUNCTION is the decl for
36158 the target function. DELTA is an immediate constant offset to be
36159 added to THIS. If VCALL_OFFSET is nonzero, the word at
36160 *(*this + vcall_offset) should be added to THIS. */
36162 static void
36163 x86_output_mi_thunk (FILE *file,
36164 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36165 HOST_WIDE_INT vcall_offset, tree function)
36167 rtx this_param = x86_this_parameter (function);
36168 rtx this_reg, tmp, fnaddr;
36169 unsigned int tmp_regno;
36171 if (TARGET_64BIT)
36172 tmp_regno = R10_REG;
36173 else
36175 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36176 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36177 tmp_regno = AX_REG;
36178 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36179 tmp_regno = DX_REG;
36180 else
36181 tmp_regno = CX_REG;
36184 emit_note (NOTE_INSN_PROLOGUE_END);
36186 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36187 pull it in now and let DELTA benefit. */
36188 if (REG_P (this_param))
36189 this_reg = this_param;
36190 else if (vcall_offset)
36192 /* Put the this parameter into %eax. */
36193 this_reg = gen_rtx_REG (Pmode, AX_REG);
36194 emit_move_insn (this_reg, this_param);
36196 else
36197 this_reg = NULL_RTX;
36199 /* Adjust the this parameter by a fixed constant. */
36200 if (delta)
36202 rtx delta_rtx = GEN_INT (delta);
36203 rtx delta_dst = this_reg ? this_reg : this_param;
36205 if (TARGET_64BIT)
36207 if (!x86_64_general_operand (delta_rtx, Pmode))
36209 tmp = gen_rtx_REG (Pmode, tmp_regno);
36210 emit_move_insn (tmp, delta_rtx);
36211 delta_rtx = tmp;
36215 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36218 /* Adjust the this parameter by a value stored in the vtable. */
36219 if (vcall_offset)
36221 rtx vcall_addr, vcall_mem, this_mem;
36223 tmp = gen_rtx_REG (Pmode, tmp_regno);
36225 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36226 if (Pmode != ptr_mode)
36227 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36228 emit_move_insn (tmp, this_mem);
36230 /* Adjust the this parameter. */
36231 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36232 if (TARGET_64BIT
36233 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36235 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36236 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36237 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36240 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
36241 if (Pmode != ptr_mode)
36242 emit_insn (gen_addsi_1_zext (this_reg,
36243 gen_rtx_REG (ptr_mode,
36244 REGNO (this_reg)),
36245 vcall_mem));
36246 else
36247 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36250 /* If necessary, drop THIS back to its stack slot. */
36251 if (this_reg && this_reg != this_param)
36252 emit_move_insn (this_param, this_reg);
36254 fnaddr = XEXP (DECL_RTL (function), 0);
36255 if (TARGET_64BIT)
36257 if (!flag_pic || targetm.binds_local_p (function)
36258 || TARGET_PECOFF)
36260 else
36262 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36263 tmp = gen_rtx_CONST (Pmode, tmp);
36264 fnaddr = gen_rtx_MEM (Pmode, tmp);
36267 else
36269 if (!flag_pic || targetm.binds_local_p (function))
36271 #if TARGET_MACHO
36272 else if (TARGET_MACHO)
36274 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36275 fnaddr = XEXP (fnaddr, 0);
36277 #endif /* TARGET_MACHO */
36278 else
36280 tmp = gen_rtx_REG (Pmode, CX_REG);
36281 output_set_got (tmp, NULL_RTX);
36283 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36284 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36285 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36289 /* Our sibling call patterns do not allow memories, because we have no
36290 predicate that can distinguish between frame and non-frame memory.
36291 For our purposes here, we can get away with (ab)using a jump pattern,
36292 because we're going to do no optimization. */
36293 if (MEM_P (fnaddr))
36294 emit_jump_insn (gen_indirect_jump (fnaddr));
36295 else
36297 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36298 fnaddr = legitimize_pic_address (fnaddr,
36299 gen_rtx_REG (Pmode, tmp_regno));
36301 if (!sibcall_insn_operand (fnaddr, word_mode))
36303 tmp = gen_rtx_REG (word_mode, tmp_regno);
36304 if (GET_MODE (fnaddr) != word_mode)
36305 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36306 emit_move_insn (tmp, fnaddr);
36307 fnaddr = tmp;
36310 tmp = gen_rtx_MEM (QImode, fnaddr);
36311 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36312 tmp = emit_call_insn (tmp);
36313 SIBLING_CALL_P (tmp) = 1;
36315 emit_barrier ();
36317 /* Emit just enough of rest_of_compilation to get the insns emitted.
36318 Note that use_thunk calls assemble_start_function et al. */
36319 tmp = get_insns ();
36320 shorten_branches (tmp);
36321 final_start_function (tmp, file, 1);
36322 final (tmp, file, 1);
36323 final_end_function ();
36326 static void
36327 x86_file_start (void)
36329 default_file_start ();
36330 #if TARGET_MACHO
36331 darwin_file_start ();
36332 #endif
36333 if (X86_FILE_START_VERSION_DIRECTIVE)
36334 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36335 if (X86_FILE_START_FLTUSED)
36336 fputs ("\t.global\t__fltused\n", asm_out_file);
36337 if (ix86_asm_dialect == ASM_INTEL)
36338 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36342 x86_field_alignment (tree field, int computed)
36344 enum machine_mode mode;
36345 tree type = TREE_TYPE (field);
36347 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36348 return computed;
36349 mode = TYPE_MODE (strip_array_types (type));
36350 if (mode == DFmode || mode == DCmode
36351 || GET_MODE_CLASS (mode) == MODE_INT
36352 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36353 return MIN (32, computed);
36354 return computed;
36357 /* Output assembler code to FILE to increment profiler label # LABELNO
36358 for profiling a function entry. */
36359 void
36360 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36362 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36363 : MCOUNT_NAME);
36365 if (TARGET_64BIT)
36367 #ifndef NO_PROFILE_COUNTERS
36368 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36369 #endif
36371 if (!TARGET_PECOFF && flag_pic)
36372 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36373 else
36374 fprintf (file, "\tcall\t%s\n", mcount_name);
36376 else if (flag_pic)
36378 #ifndef NO_PROFILE_COUNTERS
36379 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36380 LPREFIX, labelno);
36381 #endif
36382 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36384 else
36386 #ifndef NO_PROFILE_COUNTERS
36387 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36388 LPREFIX, labelno);
36389 #endif
36390 fprintf (file, "\tcall\t%s\n", mcount_name);
36394 /* We don't have exact information about the insn sizes, but we may assume
36395 quite safely that we are informed about all 1 byte insns and memory
36396 address sizes. This is enough to eliminate unnecessary padding in
36397 99% of cases. */
36399 static int
36400 min_insn_size (rtx insn)
36402 int l = 0, len;
36404 if (!INSN_P (insn) || !active_insn_p (insn))
36405 return 0;
36407 /* Discard alignments we've emitted and jump instructions. */
36408 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36409 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36410 return 0;
36412 /* Important case - calls are always 5 bytes.
36413 It is common to have many calls in a row. */
36414 if (CALL_P (insn)
36415 && symbolic_reference_mentioned_p (PATTERN (insn))
36416 && !SIBLING_CALL_P (insn))
36417 return 5;
36418 len = get_attr_length (insn);
36419 if (len <= 1)
36420 return 1;
36422 /* For normal instructions we rely on get_attr_length being exact,
36423 with a few exceptions. */
36424 if (!JUMP_P (insn))
36426 enum attr_type type = get_attr_type (insn);
36428 switch (type)
36430 case TYPE_MULTI:
36431 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36432 || asm_noperands (PATTERN (insn)) >= 0)
36433 return 0;
36434 break;
36435 case TYPE_OTHER:
36436 case TYPE_FCMP:
36437 break;
36438 default:
36439 /* Otherwise trust get_attr_length. */
36440 return len;
36443 l = get_attr_length_address (insn);
36444 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36445 l = 4;
36447 if (l)
36448 return 1+l;
36449 else
36450 return 2;
36453 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36455 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
36456 window. */
36458 static void
36459 ix86_avoid_jump_mispredicts (void)
36461 rtx insn, start = get_insns ();
36462 int nbytes = 0, njumps = 0;
36463 int isjump = 0;
36465 /* Look for all minimal intervals of instructions containing 4 jumps.
36466 The intervals are bounded by START and INSN. NBYTES is the total
36467 size of the instructions in the interval, including INSN and not including
36468 START. When NBYTES is smaller than 16 bytes, it is possible
36469 that the end of START and the end of INSN land in the same 16-byte window.
36471 The smallest offset at which INSN can start in that window is the case where
36472 START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
36473 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
36475 for (insn = start; insn; insn = NEXT_INSN (insn))
36477 int min_size;
36479 if (LABEL_P (insn))
36481 int align = label_to_alignment (insn);
36482 int max_skip = label_to_max_skip (insn);
36484 if (max_skip > 15)
36485 max_skip = 15;
36486 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36487 already in the current 16 byte page, because otherwise
36488 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36489 bytes to reach 16 byte boundary. */
36490 if (align <= 0
36491 || (align <= 3 && max_skip != (1 << align) - 1))
36492 max_skip = 0;
36493 if (dump_file)
36494 fprintf (dump_file, "Label %i with max_skip %i\n",
36495 INSN_UID (insn), max_skip);
36496 if (max_skip)
36498 while (nbytes + max_skip >= 16)
36500 start = NEXT_INSN (start);
36501 if (JUMP_P (start) || CALL_P (start))
36502 njumps--, isjump = 1;
36503 else
36504 isjump = 0;
36505 nbytes -= min_insn_size (start);
36508 continue;
36511 min_size = min_insn_size (insn);
36512 nbytes += min_size;
36513 if (dump_file)
36514 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36515 INSN_UID (insn), min_size);
36516 if (JUMP_P (insn) || CALL_P (insn))
36517 njumps++;
36518 else
36519 continue;
36521 while (njumps > 3)
36523 start = NEXT_INSN (start);
36524 if (JUMP_P (start) || CALL_P (start))
36525 njumps--, isjump = 1;
36526 else
36527 isjump = 0;
36528 nbytes -= min_insn_size (start);
36530 gcc_assert (njumps >= 0);
36531 if (dump_file)
36532 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36533 INSN_UID (start), INSN_UID (insn), nbytes);
36535 if (njumps == 3 && isjump && nbytes < 16)
36537 int padsize = 15 - nbytes + min_insn_size (insn);
36539 if (dump_file)
36540 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36541 INSN_UID (insn), padsize);
36542 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36546 #endif
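/* A minimal standalone sketch (not part of the pass above; names are
   illustrative) of the same sliding-window idea on plain arrays: keep a
   window with at most three jumps and, when a fourth jump would land in
   fewer than 16 bytes, record the padding that gen_pad would emit.  Label
   alignment handling is omitted.  */
#if 0
static void
pad_four_jump_windows (const int *size, const int *is_jump, int n, int *pad)
{
  int i, start = -1, nbytes = 0, njumps = 0, last_popped_was_jump = 0;

  for (i = 0; i < n; i++)
    {
      nbytes += size[i];
      pad[i] = 0;
      if (!is_jump[i])
        continue;
      njumps++;
      /* Shrink the window from the front until it holds at most 3 jumps.  */
      while (njumps > 3)
        {
          start++;
          last_popped_was_jump = is_jump[start];
          if (last_popped_was_jump)
            njumps--;
          nbytes -= size[start];
        }
      /* Four jumps would share one 16-byte window: pad before this insn.  */
      if (njumps == 3 && last_popped_was_jump && nbytes < 16)
        pad[i] = 15 - nbytes + size[i];
    }
}
#endif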
36548 /* AMD Athlon works faster
36549 when RET is not the destination of a conditional jump or directly preceded
36550 by another jump instruction. We avoid the penalty by inserting a NOP just
36551 before the RET instruction in such cases. */
36552 static void
36553 ix86_pad_returns (void)
36555 edge e;
36556 edge_iterator ei;
36558 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36560 basic_block bb = e->src;
36561 rtx ret = BB_END (bb);
36562 rtx prev;
36563 bool replace = false;
36565 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36566 || optimize_bb_for_size_p (bb))
36567 continue;
36568 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36569 if (active_insn_p (prev) || LABEL_P (prev))
36570 break;
36571 if (prev && LABEL_P (prev))
36573 edge e;
36574 edge_iterator ei;
36576 FOR_EACH_EDGE (e, ei, bb->preds)
36577 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36578 && !(e->flags & EDGE_FALLTHRU))
36580 replace = true;
36581 break;
36584 if (!replace)
36586 prev = prev_active_insn (ret);
36587 if (prev
36588 && ((JUMP_P (prev) && any_condjump_p (prev))
36589 || CALL_P (prev)))
36590 replace = true;
36591 /* Empty functions get a branch mispredict even when
36592 the jump destination is not visible to us. */
36593 if (!prev && !optimize_function_for_size_p (cfun))
36594 replace = true;
36596 if (replace)
36598 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36599 delete_insn (ret);
36604 /* Count the minimum number of instructions in BB. Return 4 if the
36605 number of instructions >= 4. */
36607 static int
36608 ix86_count_insn_bb (basic_block bb)
36610 rtx insn;
36611 int insn_count = 0;
36613 /* Count number of instructions in this block. Return 4 if the number
36614 of instructions >= 4. */
36615 FOR_BB_INSNS (bb, insn)
36617 /* This only happens in exit blocks. */
36618 if (JUMP_P (insn)
36619 && ANY_RETURN_P (PATTERN (insn)))
36620 break;
36622 if (NONDEBUG_INSN_P (insn)
36623 && GET_CODE (PATTERN (insn)) != USE
36624 && GET_CODE (PATTERN (insn)) != CLOBBER)
36626 insn_count++;
36627 if (insn_count >= 4)
36628 return insn_count;
36632 return insn_count;
36636 /* Count the minimum number of instructions in code path in BB.
36637 Return 4 if the number of instructions >= 4. */
36639 static int
36640 ix86_count_insn (basic_block bb)
36642 edge e;
36643 edge_iterator ei;
36644 int min_prev_count;
36646 /* Only bother counting instructions along paths with no
36647 more than 2 basic blocks between entry and exit. Given
36648 that BB has an edge to exit, determine if a predecessor
36649 of BB has an edge from entry. If so, compute the number
36650 of instructions in the predecessor block. If there
36651 happen to be multiple such blocks, compute the minimum. */
36652 min_prev_count = 4;
36653 FOR_EACH_EDGE (e, ei, bb->preds)
36655 edge prev_e;
36656 edge_iterator prev_ei;
36658 if (e->src == ENTRY_BLOCK_PTR)
36660 min_prev_count = 0;
36661 break;
36663 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36665 if (prev_e->src == ENTRY_BLOCK_PTR)
36667 int count = ix86_count_insn_bb (e->src);
36668 if (count < min_prev_count)
36669 min_prev_count = count;
36670 break;
36675 if (min_prev_count < 4)
36676 min_prev_count += ix86_count_insn_bb (bb);
36678 return min_prev_count;
36681 /* Pad short function to 4 instructions. */
36683 static void
36684 ix86_pad_short_function (void)
36686 edge e;
36687 edge_iterator ei;
36689 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36691 rtx ret = BB_END (e->src);
36692 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36694 int insn_count = ix86_count_insn (e->src);
36696 /* Pad short function. */
36697 if (insn_count < 4)
36699 rtx insn = ret;
36701 /* Find epilogue. */
36702 while (insn
36703 && (!NOTE_P (insn)
36704 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36705 insn = PREV_INSN (insn);
36707 if (!insn)
36708 insn = ret;
36710 /* Two NOPs count as one instruction. */
36711 insn_count = 2 * (4 - insn_count);
36712 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36718 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36719 the epilogue, the Windows system unwinder will apply epilogue logic and
36720 produce incorrect offsets. This can be avoided by adding a nop between
36721 the last insn that can throw and the first insn of the epilogue. */
36723 static void
36724 ix86_seh_fixup_eh_fallthru (void)
36726 edge e;
36727 edge_iterator ei;
36729 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36731 rtx insn, next;
36733 /* Find the beginning of the epilogue. */
36734 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36735 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36736 break;
36737 if (insn == NULL)
36738 continue;
36740 /* We only care about preceding insns that can throw. */
36741 insn = prev_active_insn (insn);
36742 if (insn == NULL || !can_throw_internal (insn))
36743 continue;
36745 /* Do not separate calls from their debug information. */
36746 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36747 if (NOTE_P (next)
36748 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36749 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36750 insn = next;
36751 else
36752 break;
36754 emit_insn_after (gen_nops (const1_rtx), insn);
36758 /* Implement machine specific optimizations. We implement padding of returns
36759 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
36760 static void
36761 ix86_reorg (void)
36763 /* We are freeing block_for_insn in the toplev to keep compatibility
36764 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36765 compute_bb_for_insn ();
36767 if (TARGET_SEH && current_function_has_exception_handlers ())
36768 ix86_seh_fixup_eh_fallthru ();
36770 if (optimize && optimize_function_for_speed_p (cfun))
36772 if (TARGET_PAD_SHORT_FUNCTION)
36773 ix86_pad_short_function ();
36774 else if (TARGET_PAD_RETURNS)
36775 ix86_pad_returns ();
36776 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36777 if (TARGET_FOUR_JUMP_LIMIT)
36778 ix86_avoid_jump_mispredicts ();
36779 #endif
36783 /* Return nonzero when a QImode register that must be represented via a REX prefix
36784 is used. */
36785 bool
36786 x86_extended_QIreg_mentioned_p (rtx insn)
36788 int i;
36789 extract_insn_cached (insn);
36790 for (i = 0; i < recog_data.n_operands; i++)
36791 if (GENERAL_REG_P (recog_data.operand[i])
36792 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36793 return true;
36794 return false;
36797 /* Return nonzero when P points to a register encoded via a REX prefix.
36798 Called via for_each_rtx. */
36799 static int
36800 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36802 unsigned int regno;
36803 if (!REG_P (*p))
36804 return 0;
36805 regno = REGNO (*p);
36806 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36809 /* Return true when INSN mentions register that must be encoded using REX
36810 prefix. */
36811 bool
36812 x86_extended_reg_mentioned_p (rtx insn)
36814 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36815 extended_reg_mentioned_1, NULL);
36818 /* If profitable, negate (without causing overflow) integer constant
36819 of mode MODE at location LOC. Return true in this case. */
36820 bool
36821 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36823 HOST_WIDE_INT val;
36825 if (!CONST_INT_P (*loc))
36826 return false;
36828 switch (mode)
36830 case DImode:
36831 /* DImode x86_64 constants must fit in 32 bits. */
36832 gcc_assert (x86_64_immediate_operand (*loc, mode));
36834 mode = SImode;
36835 break;
36837 case SImode:
36838 case HImode:
36839 case QImode:
36840 break;
36842 default:
36843 gcc_unreachable ();
36846 /* Avoid overflows. */
36847 if (mode_signbit_p (mode, *loc))
36848 return false;
36850 val = INTVAL (*loc);
36852 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36853 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
36854 if ((val < 0 && val != -128)
36855 || val == 128)
36857 *loc = GEN_INT (-val);
36858 return true;
36861 return false;
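/* A minimal sketch (illustrative only, not part of the function above) of
   the same profitability test on a plain integer: negating is a win for
   negative values, except that -128 already fits in a sign-extended imm8
   while +128 does not, so that pair goes the other way.  */
#if 0
static int
prefer_negated_immediate (long long val)
{
  return (val < 0 && val != -128) || val == 128;
}
#endif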
36864 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36865 optabs would emit if we didn't have TFmode patterns. */
36867 void
36868 x86_emit_floatuns (rtx operands[2])
36870 rtx neglab, donelab, i0, i1, f0, in, out;
36871 enum machine_mode mode, inmode;
36873 inmode = GET_MODE (operands[1]);
36874 gcc_assert (inmode == SImode || inmode == DImode);
36876 out = operands[0];
36877 in = force_reg (inmode, operands[1]);
36878 mode = GET_MODE (out);
36879 neglab = gen_label_rtx ();
36880 donelab = gen_label_rtx ();
36881 f0 = gen_reg_rtx (mode);
36883 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36885 expand_float (out, in, 0);
36887 emit_jump_insn (gen_jump (donelab));
36888 emit_barrier ();
36890 emit_label (neglab);
36892 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36893 1, OPTAB_DIRECT);
36894 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36895 1, OPTAB_DIRECT);
36896 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36898 expand_float (f0, i0, 0);
36900 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36902 emit_label (donelab);
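/* A scalar model (hedged sketch; the helper name is illustrative) of the RTL
   emitted above for the DImode case: when the value does not fit in the
   signed range, halve it while folding the low bit in as a sticky bit,
   convert, and double the result.  */
#if 0
static double
uns_to_double_model (unsigned long long x)
{
  if ((long long) x >= 0)
    return (double) (long long) x;		/* signed conversion suffices */
  unsigned long long half = (x >> 1) | (x & 1);	/* the LSHIFTRT, AND, IOR above */
  double f = (double) (long long) half;
  return f + f;				/* the final f0 + f0 */
}
#endif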
36905 /* AVX512F supports 64-byte integer vector operations,
36906 so the longest vector we are faced with is V64QImode. */
36907 #define MAX_VECT_LEN 64
36909 struct expand_vec_perm_d
36911 rtx target, op0, op1;
36912 unsigned char perm[MAX_VECT_LEN];
36913 enum machine_mode vmode;
36914 unsigned char nelt;
36915 bool one_operand_p;
36916 bool testing_p;
36919 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36920 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36921 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36923 /* Get a vector mode of the same size as the original but with elements
36924 twice as wide. This is only guaranteed to apply to integral vectors. */
36926 static inline enum machine_mode
36927 get_mode_wider_vector (enum machine_mode o)
36929 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36930 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36931 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36932 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36933 return n;
36936 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36937 with all elements equal to VAR. Return true if successful. */
36939 static bool
36940 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36941 rtx target, rtx val)
36943 bool ok;
36945 switch (mode)
36947 case V2SImode:
36948 case V2SFmode:
36949 if (!mmx_ok)
36950 return false;
36951 /* FALLTHRU */
36953 case V4DFmode:
36954 case V4DImode:
36955 case V8SFmode:
36956 case V8SImode:
36957 case V2DFmode:
36958 case V2DImode:
36959 case V4SFmode:
36960 case V4SImode:
36962 rtx insn, dup;
36964 /* First attempt to recognize VAL as-is. */
36965 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36966 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36967 if (recog_memoized (insn) < 0)
36969 rtx seq;
36970 /* If that fails, force VAL into a register. */
36972 start_sequence ();
36973 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36974 seq = get_insns ();
36975 end_sequence ();
36976 if (seq)
36977 emit_insn_before (seq, insn);
36979 ok = recog_memoized (insn) >= 0;
36980 gcc_assert (ok);
36983 return true;
36985 case V4HImode:
36986 if (!mmx_ok)
36987 return false;
36988 if (TARGET_SSE || TARGET_3DNOW_A)
36990 rtx x;
36992 val = gen_lowpart (SImode, val);
36993 x = gen_rtx_TRUNCATE (HImode, val);
36994 x = gen_rtx_VEC_DUPLICATE (mode, x);
36995 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36996 return true;
36998 goto widen;
37000 case V8QImode:
37001 if (!mmx_ok)
37002 return false;
37003 goto widen;
37005 case V8HImode:
37006 if (TARGET_SSE2)
37008 struct expand_vec_perm_d dperm;
37009 rtx tmp1, tmp2;
37011 permute:
37012 memset (&dperm, 0, sizeof (dperm));
37013 dperm.target = target;
37014 dperm.vmode = mode;
37015 dperm.nelt = GET_MODE_NUNITS (mode);
37016 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
37017 dperm.one_operand_p = true;
37019 /* Extend to SImode using a paradoxical SUBREG. */
37020 tmp1 = gen_reg_rtx (SImode);
37021 emit_move_insn (tmp1, gen_lowpart (SImode, val));
37023 /* Insert the SImode value as low element of a V4SImode vector. */
37024 tmp2 = gen_reg_rtx (V4SImode);
37025 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
37026 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
37028 ok = (expand_vec_perm_1 (&dperm)
37029 || expand_vec_perm_broadcast_1 (&dperm));
37030 gcc_assert (ok);
37031 return ok;
37033 goto widen;
37035 case V16QImode:
37036 if (TARGET_SSE2)
37037 goto permute;
37038 goto widen;
37040 widen:
37041 /* Replicate the value once into the next wider mode and recurse. */
37043 enum machine_mode smode, wsmode, wvmode;
37044 rtx x;
37046 smode = GET_MODE_INNER (mode);
37047 wvmode = get_mode_wider_vector (mode);
37048 wsmode = GET_MODE_INNER (wvmode);
37050 val = convert_modes (wsmode, smode, val, true);
37051 x = expand_simple_binop (wsmode, ASHIFT, val,
37052 GEN_INT (GET_MODE_BITSIZE (smode)),
37053 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37054 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
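 /* For example, broadcasting the QImode value 0xab into V16QImode forms
    the HImode value 0xabab here ((0xab << 8) | 0xab) and then recurses to
    broadcast that into V8HImode; the final result is just a lowpart view
    of the wider vector.  */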
37056 x = gen_reg_rtx (wvmode);
37057 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
37058 gcc_assert (ok);
37059 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
37060 return ok;
37063 case V16HImode:
37064 case V32QImode:
37066 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
37067 rtx x = gen_reg_rtx (hvmode);
37069 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
37070 gcc_assert (ok);
37072 x = gen_rtx_VEC_CONCAT (mode, x, x);
37073 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37075 return true;
37077 default:
37078 return false;
37082 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37083 whose ONE_VAR element is VAR, and other elements are zero. Return true
37084 if successful. */
37086 static bool
37087 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
37088 rtx target, rtx var, int one_var)
37090 enum machine_mode vsimode;
37091 rtx new_target;
37092 rtx x, tmp;
37093 bool use_vector_set = false;
37095 switch (mode)
37097 case V2DImode:
37098 /* For SSE4.1, we normally use vector set. But if the second
37099 element is zero and inter-unit moves are OK, we use movq
37100 instead. */
37101 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
37102 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
37103 && one_var == 0));
37104 break;
37105 case V16QImode:
37106 case V4SImode:
37107 case V4SFmode:
37108 use_vector_set = TARGET_SSE4_1;
37109 break;
37110 case V8HImode:
37111 use_vector_set = TARGET_SSE2;
37112 break;
37113 case V4HImode:
37114 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
37115 break;
37116 case V32QImode:
37117 case V16HImode:
37118 case V8SImode:
37119 case V8SFmode:
37120 case V4DFmode:
37121 use_vector_set = TARGET_AVX;
37122 break;
37123 case V4DImode:
37124 /* Use ix86_expand_vector_set in 64bit mode only. */
37125 use_vector_set = TARGET_AVX && TARGET_64BIT;
37126 break;
37127 default:
37128 break;
37131 if (use_vector_set)
37133 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
37134 var = force_reg (GET_MODE_INNER (mode), var);
37135 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37136 return true;
37139 switch (mode)
37141 case V2SFmode:
37142 case V2SImode:
37143 if (!mmx_ok)
37144 return false;
37145 /* FALLTHRU */
37147 case V2DFmode:
37148 case V2DImode:
37149 if (one_var != 0)
37150 return false;
37151 var = force_reg (GET_MODE_INNER (mode), var);
37152 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37153 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37154 return true;
37156 case V4SFmode:
37157 case V4SImode:
37158 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37159 new_target = gen_reg_rtx (mode);
37160 else
37161 new_target = target;
37162 var = force_reg (GET_MODE_INNER (mode), var);
37163 x = gen_rtx_VEC_DUPLICATE (mode, var);
37164 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37165 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37166 if (one_var != 0)
37168 /* We need to shuffle the value to the correct position, so
37169 create a new pseudo to store the intermediate result. */
37171 /* With SSE2, we can use the integer shuffle insns. */
37172 if (mode != V4SFmode && TARGET_SSE2)
37174 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37175 const1_rtx,
37176 GEN_INT (one_var == 1 ? 0 : 1),
37177 GEN_INT (one_var == 2 ? 0 : 1),
37178 GEN_INT (one_var == 3 ? 0 : 1)));
37179 if (target != new_target)
37180 emit_move_insn (target, new_target);
37181 return true;
37184 /* Otherwise convert the intermediate result to V4SFmode and
37185 use the SSE1 shuffle instructions. */
37186 if (mode != V4SFmode)
37188 tmp = gen_reg_rtx (V4SFmode);
37189 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37191 else
37192 tmp = new_target;
37194 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37195 const1_rtx,
37196 GEN_INT (one_var == 1 ? 0 : 1),
37197 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37198 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37200 if (mode != V4SFmode)
37201 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37202 else if (tmp != target)
37203 emit_move_insn (target, tmp);
37205 else if (target != new_target)
37206 emit_move_insn (target, new_target);
37207 return true;
37209 case V8HImode:
37210 case V16QImode:
37211 vsimode = V4SImode;
37212 goto widen;
37213 case V4HImode:
37214 case V8QImode:
37215 if (!mmx_ok)
37216 return false;
37217 vsimode = V2SImode;
37218 goto widen;
37219 widen:
37220 if (one_var != 0)
37221 return false;
37223 /* Zero extend the variable element to SImode and recurse. */
37224 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37226 x = gen_reg_rtx (vsimode);
37227 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37228 var, one_var))
37229 gcc_unreachable ();
37231 emit_move_insn (target, gen_lowpart (mode, x));
37232 return true;
37234 default:
37235 return false;
37239 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37240 consisting of the values in VALS. It is known that all elements
37241 except ONE_VAR are constants. Return true if successful. */
37243 static bool
37244 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37245 rtx target, rtx vals, int one_var)
37247 rtx var = XVECEXP (vals, 0, one_var);
37248 enum machine_mode wmode;
37249 rtx const_vec, x;
37251 const_vec = copy_rtx (vals);
37252 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37253 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37255 switch (mode)
37257 case V2DFmode:
37258 case V2DImode:
37259 case V2SFmode:
37260 case V2SImode:
37261 /* For the two element vectors, it's just as easy to use
37262 the general case. */
37263 return false;
37265 case V4DImode:
37266 /* Use ix86_expand_vector_set in 64bit mode only. */
37267 if (!TARGET_64BIT)
37268 return false;
37269 case V4DFmode:
37270 case V8SFmode:
37271 case V8SImode:
37272 case V16HImode:
37273 case V32QImode:
37274 case V4SFmode:
37275 case V4SImode:
37276 case V8HImode:
37277 case V4HImode:
37278 break;
37280 case V16QImode:
37281 if (TARGET_SSE4_1)
37282 break;
37283 wmode = V8HImode;
37284 goto widen;
37285 case V8QImode:
37286 wmode = V4HImode;
37287 goto widen;
37288 widen:
37289 /* There's no way to set one QImode entry easily. Combine
37290 the variable value with its adjacent constant value, and
37291 promote to an HImode set. */
37292 x = XVECEXP (vals, 0, one_var ^ 1);
37293 if (one_var & 1)
37295 var = convert_modes (HImode, QImode, var, true);
37296 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37297 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37298 x = GEN_INT (INTVAL (x) & 0xff);
37300 else
37302 var = convert_modes (HImode, QImode, var, true);
37303 x = gen_int_mode (INTVAL (x) << 8, HImode);
37305 if (x != const0_rtx)
37306 var = expand_simple_binop (HImode, IOR, var, x, var,
37307 1, OPTAB_LIB_WIDEN);
37309 x = gen_reg_rtx (wmode);
37310 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37311 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37313 emit_move_insn (target, gen_lowpart (mode, x));
37314 return true;
37316 default:
37317 return false;
37320 emit_move_insn (target, const_vec);
37321 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37322 return true;
37325 /* A subroutine of ix86_expand_vector_init_general. Use vector
37326 concatenate to handle the most general case: all values variable,
37327 and none identical. */
37329 static void
37330 ix86_expand_vector_init_concat (enum machine_mode mode,
37331 rtx target, rtx *ops, int n)
37333 enum machine_mode cmode, hmode = VOIDmode;
37334 rtx first[8], second[4];
37335 rtvec v;
37336 int i, j;
37338 switch (n)
37340 case 2:
37341 switch (mode)
37343 case V8SImode:
37344 cmode = V4SImode;
37345 break;
37346 case V8SFmode:
37347 cmode = V4SFmode;
37348 break;
37349 case V4DImode:
37350 cmode = V2DImode;
37351 break;
37352 case V4DFmode:
37353 cmode = V2DFmode;
37354 break;
37355 case V4SImode:
37356 cmode = V2SImode;
37357 break;
37358 case V4SFmode:
37359 cmode = V2SFmode;
37360 break;
37361 case V2DImode:
37362 cmode = DImode;
37363 break;
37364 case V2SImode:
37365 cmode = SImode;
37366 break;
37367 case V2DFmode:
37368 cmode = DFmode;
37369 break;
37370 case V2SFmode:
37371 cmode = SFmode;
37372 break;
37373 default:
37374 gcc_unreachable ();
37377 if (!register_operand (ops[1], cmode))
37378 ops[1] = force_reg (cmode, ops[1]);
37379 if (!register_operand (ops[0], cmode))
37380 ops[0] = force_reg (cmode, ops[0]);
37381 emit_insn (gen_rtx_SET (VOIDmode, target,
37382 gen_rtx_VEC_CONCAT (mode, ops[0],
37383 ops[1])));
37384 break;
37386 case 4:
37387 switch (mode)
37389 case V4DImode:
37390 cmode = V2DImode;
37391 break;
37392 case V4DFmode:
37393 cmode = V2DFmode;
37394 break;
37395 case V4SImode:
37396 cmode = V2SImode;
37397 break;
37398 case V4SFmode:
37399 cmode = V2SFmode;
37400 break;
37401 default:
37402 gcc_unreachable ();
37404 goto half;
37406 case 8:
37407 switch (mode)
37409 case V8SImode:
37410 cmode = V2SImode;
37411 hmode = V4SImode;
37412 break;
37413 case V8SFmode:
37414 cmode = V2SFmode;
37415 hmode = V4SFmode;
37416 break;
37417 default:
37418 gcc_unreachable ();
37420 goto half;
37422 half:
37423 /* FIXME: We process inputs backward to help RA. PR 36222. */
37424 i = n - 1;
37425 j = (n >> 1) - 1;
37426 for (; i > 0; i -= 2, j--)
37428 first[j] = gen_reg_rtx (cmode);
37429 v = gen_rtvec (2, ops[i - 1], ops[i]);
37430 ix86_expand_vector_init (false, first[j],
37431 gen_rtx_PARALLEL (cmode, v));
37434 n >>= 1;
37435 if (n > 2)
37437 gcc_assert (hmode != VOIDmode);
37438 for (i = j = 0; i < n; i += 2, j++)
37440 second[j] = gen_reg_rtx (hmode);
37441 ix86_expand_vector_init_concat (hmode, second [j],
37442 &first [i], 2);
37444 n >>= 1;
37445 ix86_expand_vector_init_concat (mode, target, second, n);
37447 else
37448 ix86_expand_vector_init_concat (mode, target, first, n);
37449 break;
37451 default:
37452 gcc_unreachable ();
37456 /* A subroutine of ix86_expand_vector_init_general. Use vector
37457 interleave to handle the most general case: all values variable,
37458 and none identical. */
37460 static void
37461 ix86_expand_vector_init_interleave (enum machine_mode mode,
37462 rtx target, rtx *ops, int n)
37464 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37465 int i, j;
37466 rtx op0, op1;
37467 rtx (*gen_load_even) (rtx, rtx, rtx);
37468 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37469 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37471 switch (mode)
37473 case V8HImode:
37474 gen_load_even = gen_vec_setv8hi;
37475 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37476 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37477 inner_mode = HImode;
37478 first_imode = V4SImode;
37479 second_imode = V2DImode;
37480 third_imode = VOIDmode;
37481 break;
37482 case V16QImode:
37483 gen_load_even = gen_vec_setv16qi;
37484 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37485 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37486 inner_mode = QImode;
37487 first_imode = V8HImode;
37488 second_imode = V4SImode;
37489 third_imode = V2DImode;
37490 break;
37491 default:
37492 gcc_unreachable ();
37495 for (i = 0; i < n; i++)
37497 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37498 op0 = gen_reg_rtx (SImode);
37499 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37501 /* Insert the SImode value as low element of V4SImode vector. */
37502 op1 = gen_reg_rtx (V4SImode);
37503 op0 = gen_rtx_VEC_MERGE (V4SImode,
37504 gen_rtx_VEC_DUPLICATE (V4SImode,
37505 op0),
37506 CONST0_RTX (V4SImode),
37507 const1_rtx);
37508 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37510 /* Cast the V4SImode vector back to a vector in the original mode. */
37511 op0 = gen_reg_rtx (mode);
37512 emit_move_insn (op0, gen_lowpart (mode, op1));
37514 /* Load even elements into the second position. */
37515 emit_insn (gen_load_even (op0,
37516 force_reg (inner_mode,
37517 ops [i + i + 1]),
37518 const1_rtx));
37520 /* Cast vector to FIRST_IMODE vector. */
37521 ops[i] = gen_reg_rtx (first_imode);
37522 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37525 /* Interleave low FIRST_IMODE vectors. */
37526 for (i = j = 0; i < n; i += 2, j++)
37528 op0 = gen_reg_rtx (first_imode);
37529 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37531 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37532 ops[j] = gen_reg_rtx (second_imode);
37533 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37536 /* Interleave low SECOND_IMODE vectors. */
37537 switch (second_imode)
37539 case V4SImode:
37540 for (i = j = 0; i < n / 2; i += 2, j++)
37542 op0 = gen_reg_rtx (second_imode);
37543 emit_insn (gen_interleave_second_low (op0, ops[i],
37544 ops[i + 1]));
37546 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37547 vector. */
37548 ops[j] = gen_reg_rtx (third_imode);
37549 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37551 second_imode = V2DImode;
37552 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37553 /* FALLTHRU */
37555 case V2DImode:
37556 op0 = gen_reg_rtx (second_imode);
37557 emit_insn (gen_interleave_second_low (op0, ops[0],
37558 ops[1]));
37560 /* Cast the SECOND_IMODE vector back to a vector in the original
37561 mode. */
37562 emit_insn (gen_rtx_SET (VOIDmode, target,
37563 gen_lowpart (mode, op0)));
37564 break;
37566 default:
37567 gcc_unreachable ();
37571 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37572 all values variable, and none identical. */
37574 static void
37575 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37576 rtx target, rtx vals)
37578 rtx ops[32], op0, op1;
37579 enum machine_mode half_mode = VOIDmode;
37580 int n, i;
37582 switch (mode)
37584 case V2SFmode:
37585 case V2SImode:
37586 if (!mmx_ok && !TARGET_SSE)
37587 break;
37588 /* FALLTHRU */
37590 case V8SFmode:
37591 case V8SImode:
37592 case V4DFmode:
37593 case V4DImode:
37594 case V4SFmode:
37595 case V4SImode:
37596 case V2DFmode:
37597 case V2DImode:
37598 n = GET_MODE_NUNITS (mode);
37599 for (i = 0; i < n; i++)
37600 ops[i] = XVECEXP (vals, 0, i);
37601 ix86_expand_vector_init_concat (mode, target, ops, n);
37602 return;
37604 case V32QImode:
37605 half_mode = V16QImode;
37606 goto half;
37608 case V16HImode:
37609 half_mode = V8HImode;
37610 goto half;
37612 half:
37613 n = GET_MODE_NUNITS (mode);
37614 for (i = 0; i < n; i++)
37615 ops[i] = XVECEXP (vals, 0, i);
37616 op0 = gen_reg_rtx (half_mode);
37617 op1 = gen_reg_rtx (half_mode);
37618 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37619 n >> 2);
37620 ix86_expand_vector_init_interleave (half_mode, op1,
37621 &ops [n >> 1], n >> 2);
37622 emit_insn (gen_rtx_SET (VOIDmode, target,
37623 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37624 return;
37626 case V16QImode:
37627 if (!TARGET_SSE4_1)
37628 break;
37629 /* FALLTHRU */
37631 case V8HImode:
37632 if (!TARGET_SSE2)
37633 break;
37635 /* Don't use ix86_expand_vector_init_interleave if we can't
37636 move from GPR to SSE register directly. */
37637 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37638 break;
37640 n = GET_MODE_NUNITS (mode);
37641 for (i = 0; i < n; i++)
37642 ops[i] = XVECEXP (vals, 0, i);
37643 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37644 return;
37646 case V4HImode:
37647 case V8QImode:
37648 break;
37650 default:
37651 gcc_unreachable ();
37655 int i, j, n_elts, n_words, n_elt_per_word;
37656 enum machine_mode inner_mode;
37657 rtx words[4], shift;
37659 inner_mode = GET_MODE_INNER (mode);
37660 n_elts = GET_MODE_NUNITS (mode);
37661 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37662 n_elt_per_word = n_elts / n_words;
37663 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37665 for (i = 0; i < n_words; ++i)
37667 rtx word = NULL_RTX;
37669 for (j = 0; j < n_elt_per_word; ++j)
37671 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37672 elt = convert_modes (word_mode, inner_mode, elt, true);
37674 if (j == 0)
37675 word = elt;
37676 else
37678 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37679 word, 1, OPTAB_LIB_WIDEN);
37680 word = expand_simple_binop (word_mode, IOR, word, elt,
37681 word, 1, OPTAB_LIB_WIDEN);
37685 words[i] = word;
37688 if (n_words == 1)
37689 emit_move_insn (target, gen_lowpart (mode, words[0]));
37690 else if (n_words == 2)
37692 rtx tmp = gen_reg_rtx (mode);
37693 emit_clobber (tmp);
37694 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37695 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37696 emit_move_insn (target, tmp);
37698 else if (n_words == 4)
37700 rtx tmp = gen_reg_rtx (V4SImode);
37701 gcc_assert (word_mode == SImode);
37702 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37703 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37704 emit_move_insn (target, gen_lowpart (mode, tmp));
37706 else
37707 gcc_unreachable ();
37711 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37712 instructions unless MMX_OK is true. */
37714 void
37715 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37717 enum machine_mode mode = GET_MODE (target);
37718 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37719 int n_elts = GET_MODE_NUNITS (mode);
37720 int n_var = 0, one_var = -1;
37721 bool all_same = true, all_const_zero = true;
37722 int i;
37723 rtx x;
37725 for (i = 0; i < n_elts; ++i)
37727 x = XVECEXP (vals, 0, i);
37728 if (!(CONST_INT_P (x)
37729 || GET_CODE (x) == CONST_DOUBLE
37730 || GET_CODE (x) == CONST_FIXED))
37731 n_var++, one_var = i;
37732 else if (x != CONST0_RTX (inner_mode))
37733 all_const_zero = false;
37734 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37735 all_same = false;
37738 /* Constants are best loaded from the constant pool. */
37739 if (n_var == 0)
37741 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37742 return;
37745 /* If all values are identical, broadcast the value. */
37746 if (all_same
37747 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37748 XVECEXP (vals, 0, 0)))
37749 return;
37751 /* Values where only one field is non-constant are best loaded from
37752 the pool and overwritten via move later. */
37753 if (n_var == 1)
37755 if (all_const_zero
37756 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37757 XVECEXP (vals, 0, one_var),
37758 one_var))
37759 return;
37761 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37762 return;
37765 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37768 void
37769 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37771 enum machine_mode mode = GET_MODE (target);
37772 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37773 enum machine_mode half_mode;
37774 bool use_vec_merge = false;
37775 rtx tmp;
37776 static rtx (*gen_extract[6][2]) (rtx, rtx)
37778 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37779 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37780 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37781 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37782 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37783 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37785 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37787 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37788 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37789 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37790 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37791 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37792 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37794 int i, j, n;
37796 switch (mode)
37798 case V2SFmode:
37799 case V2SImode:
37800 if (mmx_ok)
37802 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37803 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37804 if (elt == 0)
37805 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37806 else
37807 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37808 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37809 return;
37811 break;
37813 case V2DImode:
37814 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37815 if (use_vec_merge)
37816 break;
37818 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37819 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37820 if (elt == 0)
37821 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37822 else
37823 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37824 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37825 return;
37827 case V2DFmode:
37829 rtx op0, op1;
37831 /* For the two element vectors, we implement a VEC_CONCAT with
37832 the extraction of the other element. */
37834 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37835 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37837 if (elt == 0)
37838 op0 = val, op1 = tmp;
37839 else
37840 op0 = tmp, op1 = val;
37842 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37843 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37845 return;
37847 case V4SFmode:
37848 use_vec_merge = TARGET_SSE4_1;
37849 if (use_vec_merge)
37850 break;
37852 switch (elt)
37854 case 0:
37855 use_vec_merge = true;
37856 break;
37858 case 1:
37859 /* tmp = target = A B C D */
37860 tmp = copy_to_reg (target);
37861 /* target = A A B B */
37862 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37863 /* target = X A B B */
37864 ix86_expand_vector_set (false, target, val, 0);
37865 /* target = A X C D */
37866 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37867 const1_rtx, const0_rtx,
37868 GEN_INT (2+4), GEN_INT (3+4)));
37869 return;
37871 case 2:
37872 /* tmp = target = A B C D */
37873 tmp = copy_to_reg (target);
37874 /* tmp = X B C D */
37875 ix86_expand_vector_set (false, tmp, val, 0);
37876 /* target = A B X D */
37877 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37878 const0_rtx, const1_rtx,
37879 GEN_INT (0+4), GEN_INT (3+4)));
37880 return;
37882 case 3:
37883 /* tmp = target = A B C D */
37884 tmp = copy_to_reg (target);
37885 /* tmp = X B C D */
37886 ix86_expand_vector_set (false, tmp, val, 0);
37887 /* target = A B C X */
37888 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37889 const0_rtx, const1_rtx,
37890 GEN_INT (2+4), GEN_INT (0+4)));
37891 return;
37893 default:
37894 gcc_unreachable ();
37896 break;
37898 case V4SImode:
37899 use_vec_merge = TARGET_SSE4_1;
37900 if (use_vec_merge)
37901 break;
37903 /* Element 0 handled by vec_merge below. */
37904 if (elt == 0)
37906 use_vec_merge = true;
37907 break;
37910 if (TARGET_SSE2)
37912 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37913 store into element 0, then shuffle them back. */
37915 rtx order[4];
37917 order[0] = GEN_INT (elt);
37918 order[1] = const1_rtx;
37919 order[2] = const2_rtx;
37920 order[3] = GEN_INT (3);
37921 order[elt] = const0_rtx;
37923 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37924 order[1], order[2], order[3]));
37926 ix86_expand_vector_set (false, target, val, 0);
37928 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37929 order[1], order[2], order[3]));
37931 else
37933 /* For SSE1, we have to reuse the V4SF code. */
37934 rtx t = gen_reg_rtx (V4SFmode);
37935 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37936 emit_move_insn (target, gen_lowpart (mode, t));
37938 return;
37940 case V8HImode:
37941 use_vec_merge = TARGET_SSE2;
37942 break;
37943 case V4HImode:
37944 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37945 break;
37947 case V16QImode:
37948 use_vec_merge = TARGET_SSE4_1;
37949 break;
37951 case V8QImode:
37952 break;
37954 case V32QImode:
37955 half_mode = V16QImode;
37956 j = 0;
37957 n = 16;
37958 goto half;
37960 case V16HImode:
37961 half_mode = V8HImode;
37962 j = 1;
37963 n = 8;
37964 goto half;
37966 case V8SImode:
37967 half_mode = V4SImode;
37968 j = 2;
37969 n = 4;
37970 goto half;
37972 case V4DImode:
37973 half_mode = V2DImode;
37974 j = 3;
37975 n = 2;
37976 goto half;
37978 case V8SFmode:
37979 half_mode = V4SFmode;
37980 j = 4;
37981 n = 4;
37982 goto half;
37984 case V4DFmode:
37985 half_mode = V2DFmode;
37986 j = 5;
37987 n = 2;
37988 goto half;
37990 half:
37991 /* Compute offset. */
37992 i = elt / n;
37993 elt %= n;
37995 gcc_assert (i <= 1);
37997 /* Extract the half. */
37998 tmp = gen_reg_rtx (half_mode);
37999 emit_insn (gen_extract[j][i] (tmp, target));
38001 /* Put val in tmp at elt. */
38002 ix86_expand_vector_set (false, tmp, val, elt);
38004 /* Put it back. */
38005 emit_insn (gen_insert[j][i] (target, target, tmp));
38006 return;
38008 default:
38009 break;
38012 if (use_vec_merge)
38014 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
38015 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
38016 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38018 else
38020 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38022 emit_move_insn (mem, target);
38024 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38025 emit_move_insn (tmp, val);
38027 emit_move_insn (target, mem);
38031 void
38032 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
38034 enum machine_mode mode = GET_MODE (vec);
38035 enum machine_mode inner_mode = GET_MODE_INNER (mode);
38036 bool use_vec_extr = false;
38037 rtx tmp;
38039 switch (mode)
38041 case V2SImode:
38042 case V2SFmode:
38043 if (!mmx_ok)
38044 break;
38045 /* FALLTHRU */
38047 case V2DFmode:
38048 case V2DImode:
38049 use_vec_extr = true;
38050 break;
38052 case V4SFmode:
38053 use_vec_extr = TARGET_SSE4_1;
38054 if (use_vec_extr)
38055 break;
38057 switch (elt)
38059 case 0:
38060 tmp = vec;
38061 break;
38063 case 1:
38064 case 3:
38065 tmp = gen_reg_rtx (mode);
38066 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
38067 GEN_INT (elt), GEN_INT (elt),
38068 GEN_INT (elt+4), GEN_INT (elt+4)));
38069 break;
38071 case 2:
38072 tmp = gen_reg_rtx (mode);
38073 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
38074 break;
38076 default:
38077 gcc_unreachable ();
38079 vec = tmp;
38080 use_vec_extr = true;
38081 elt = 0;
38082 break;
38084 case V4SImode:
38085 use_vec_extr = TARGET_SSE4_1;
38086 if (use_vec_extr)
38087 break;
38089 if (TARGET_SSE2)
38091 switch (elt)
38093 case 0:
38094 tmp = vec;
38095 break;
38097 case 1:
38098 case 3:
38099 tmp = gen_reg_rtx (mode);
38100 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
38101 GEN_INT (elt), GEN_INT (elt),
38102 GEN_INT (elt), GEN_INT (elt)));
38103 break;
38105 case 2:
38106 tmp = gen_reg_rtx (mode);
38107 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
38108 break;
38110 default:
38111 gcc_unreachable ();
38113 vec = tmp;
38114 use_vec_extr = true;
38115 elt = 0;
38117 else
38119 /* For SSE1, we have to reuse the V4SF code. */
38120 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
38121 gen_lowpart (V4SFmode, vec), elt);
38122 return;
38124 break;
38126 case V8HImode:
38127 use_vec_extr = TARGET_SSE2;
38128 break;
38129 case V4HImode:
38130 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38131 break;
38133 case V16QImode:
38134 use_vec_extr = TARGET_SSE4_1;
38135 break;
38137 case V8SFmode:
38138 if (TARGET_AVX)
38140 tmp = gen_reg_rtx (V4SFmode);
38141 if (elt < 4)
38142 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
38143 else
38144 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
38145 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38146 return;
38148 break;
38150 case V4DFmode:
38151 if (TARGET_AVX)
38153 tmp = gen_reg_rtx (V2DFmode);
38154 if (elt < 2)
38155 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38156 else
38157 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38158 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38159 return;
38161 break;
38163 case V32QImode:
38164 if (TARGET_AVX)
38166 tmp = gen_reg_rtx (V16QImode);
38167 if (elt < 16)
38168 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38169 else
38170 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38171 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38172 return;
38174 break;
38176 case V16HImode:
38177 if (TARGET_AVX)
38179 tmp = gen_reg_rtx (V8HImode);
38180 if (elt < 8)
38181 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38182 else
38183 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38184 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38185 return;
38187 break;
38189 case V8SImode:
38190 if (TARGET_AVX)
38192 tmp = gen_reg_rtx (V4SImode);
38193 if (elt < 4)
38194 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38195 else
38196 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38197 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38198 return;
38200 break;
38202 case V4DImode:
38203 if (TARGET_AVX)
38205 tmp = gen_reg_rtx (V2DImode);
38206 if (elt < 2)
38207 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38208 else
38209 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38210 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38211 return;
38213 break;
38215 case V8QImode:
38216 /* ??? Could extract the appropriate HImode element and shift. */
38217 default:
38218 break;
38221 if (use_vec_extr)
38223 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38224 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38226 /* Let the rtl optimizers know about the zero extension performed. */
38227 if (inner_mode == QImode || inner_mode == HImode)
38229 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38230 target = gen_lowpart (SImode, target);
38233 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38235 else
38237 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38239 emit_move_insn (mem, vec);
38241 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38242 emit_move_insn (target, tmp);
38246 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38247 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38248 The upper bits of DEST are undefined, though they shouldn't cause
38249 exceptions (some bits from src or all zeros are ok). */
38251 static void
38252 emit_reduc_half (rtx dest, rtx src, int i)
38254 rtx tem, d = dest;
38255 switch (GET_MODE (src))
38257 case V4SFmode:
38258 if (i == 128)
38259 tem = gen_sse_movhlps (dest, src, src);
38260 else
38261 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38262 GEN_INT (1 + 4), GEN_INT (1 + 4));
38263 break;
38264 case V2DFmode:
38265 tem = gen_vec_interleave_highv2df (dest, src, src);
38266 break;
38267 case V16QImode:
38268 case V8HImode:
38269 case V4SImode:
38270 case V2DImode:
38271 d = gen_reg_rtx (V1TImode);
38272 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38273 GEN_INT (i / 2));
38274 break;
38275 case V8SFmode:
38276 if (i == 256)
38277 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38278 else
38279 tem = gen_avx_shufps256 (dest, src, src,
38280 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38281 break;
38282 case V4DFmode:
38283 if (i == 256)
38284 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38285 else
38286 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38287 break;
38288 case V32QImode:
38289 case V16HImode:
38290 case V8SImode:
38291 case V4DImode:
38292 if (i == 256)
38294 if (GET_MODE (dest) != V4DImode)
38295 d = gen_reg_rtx (V4DImode);
38296 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38297 gen_lowpart (V4DImode, src),
38298 const1_rtx);
38300 else
38302 d = gen_reg_rtx (V2TImode);
38303 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38304 GEN_INT (i / 2));
38306 break;
38307 default:
38308 gcc_unreachable ();
38310 emit_insn (tem);
38311 if (d != dest)
38312 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38315 /* Expand a vector reduction. FN is the binary pattern to reduce;
38316 DEST is the destination; IN is the input vector. */
38318 void
38319 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38321 rtx half, dst, vec = in;
38322 enum machine_mode mode = GET_MODE (in);
38323 int i;
38325 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
38326 if (TARGET_SSE4_1
38327 && mode == V8HImode
38328 && fn == gen_uminv8hi3)
38330 emit_insn (gen_sse4_1_phminposuw (dest, in));
38331 return;
38334 for (i = GET_MODE_BITSIZE (mode);
38335 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38336 i >>= 1)
38338 half = gen_reg_rtx (mode);
38339 emit_reduc_half (half, vec, i);
38340 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38341 dst = dest;
38342 else
38343 dst = gen_reg_rtx (mode);
38344 emit_insn (fn (dst, half, vec));
38345 vec = dst;
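/* A scalar model (hedged sketch, not part of the expander above) of the
   halving loop on a plain array: each step combines the upper half of the
   remaining lanes into the lower half, so only log2 (nelts) applications
   of FN are emitted.  '+' stands in for the FN pattern.  */
#if 0
static int
reduce_model (int *lane, int nelts)
{
  int half, i;
  for (half = nelts / 2; half >= 1; half /= 2)
    for (i = 0; i < half; i++)
      lane[i] = lane[i] + lane[i + half];	/* fn (dst, half, vec) */
  return lane[0];
}
#endif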
38349 /* Target hook for scalar_mode_supported_p. */
38350 static bool
38351 ix86_scalar_mode_supported_p (enum machine_mode mode)
38353 if (DECIMAL_FLOAT_MODE_P (mode))
38354 return default_decimal_float_supported_p ();
38355 else if (mode == TFmode)
38356 return true;
38357 else
38358 return default_scalar_mode_supported_p (mode);
38361 /* Implements target hook vector_mode_supported_p. */
38362 static bool
38363 ix86_vector_mode_supported_p (enum machine_mode mode)
38365 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38366 return true;
38367 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38368 return true;
38369 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38370 return true;
38371 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38372 return true;
38373 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38374 return true;
38375 return false;
38378 /* Target hook for c_mode_for_suffix. */
38379 static enum machine_mode
38380 ix86_c_mode_for_suffix (char suffix)
38382 if (suffix == 'q')
38383 return TFmode;
38384 if (suffix == 'w')
38385 return XFmode;
38387 return VOIDmode;
38390 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38392 We do this in the new i386 backend to maintain source compatibility
38393 with the old cc0-based compiler. */
38395 static tree
38396 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38397 tree inputs ATTRIBUTE_UNUSED,
38398 tree clobbers)
38400 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38401 clobbers);
38402 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38403 clobbers);
38404 return clobbers;
38407 /* Implements the target hook targetm.asm.encode_section_info. */
38409 static void ATTRIBUTE_UNUSED
38410 ix86_encode_section_info (tree decl, rtx rtl, int first)
38412 default_encode_section_info (decl, rtl, first);
38414 if (TREE_CODE (decl) == VAR_DECL
38415 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38416 && ix86_in_large_data_p (decl))
38417 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38420 /* Worker function for REVERSE_CONDITION. */
38422 enum rtx_code
38423 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38425 return (mode != CCFPmode && mode != CCFPUmode
38426 ? reverse_condition (code)
38427 : reverse_condition_maybe_unordered (code));
38430 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38431 to OPERANDS[0]. */
38433 const char *
38434 output_387_reg_move (rtx insn, rtx *operands)
38436 if (REG_P (operands[0]))
38438 if (REG_P (operands[1])
38439 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38441 if (REGNO (operands[0]) == FIRST_STACK_REG)
38442 return output_387_ffreep (operands, 0);
38443 return "fstp\t%y0";
38445 if (STACK_TOP_P (operands[0]))
38446 return "fld%Z1\t%y1";
38447 return "fst\t%y0";
38449 else if (MEM_P (operands[0]))
38451 gcc_assert (REG_P (operands[1]));
38452 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38453 return "fstp%Z0\t%y0";
38454 else
38456 /* There is no non-popping store to memory for XFmode.
38457 So if we need one, follow the store with a load. */
38458 if (GET_MODE (operands[0]) == XFmode)
38459 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38460 else
38461 return "fst%Z0\t%y0";
38464 else
38465 gcc_unreachable();
38468 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
38469 the FP status register is set. */
38471 void
38472 ix86_emit_fp_unordered_jump (rtx label)
38474 rtx reg = gen_reg_rtx (HImode);
38475 rtx temp;
38477 emit_insn (gen_x86_fnstsw_1 (reg));
38479 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38481 emit_insn (gen_x86_sahf_1 (reg));
38483 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38484 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38486 else
38488 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38490 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38491 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38494 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38495 gen_rtx_LABEL_REF (VOIDmode, label),
38496 pc_rtx);
38497 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38499 emit_jump_insn (temp);
38500 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38503 /* Output code to perform a log1p XFmode calculation. */
38505 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38507 rtx label1 = gen_label_rtx ();
38508 rtx label2 = gen_label_rtx ();
38510 rtx tmp = gen_reg_rtx (XFmode);
38511 rtx tmp2 = gen_reg_rtx (XFmode);
38512 rtx test;
38514 emit_insn (gen_absxf2 (tmp, op1));
38515 test = gen_rtx_GE (VOIDmode, tmp,
38516 CONST_DOUBLE_FROM_REAL_VALUE (
38517 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38518 XFmode));
38519 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38521 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38522 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38523 emit_jump (label2);
38525 emit_label (label1);
38526 emit_move_insn (tmp, CONST1_RTX (XFmode));
38527 emit_insn (gen_addxf3 (tmp, op1, tmp));
38528 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38529 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38531 emit_label (label2);
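/* A scalar model (hedged sketch; the helper name is illustrative) of the
   branch above: fyl2xp1 is only accurate for |x| < 1 - sqrt(2)/2 ~= 0.2928932,
   so larger arguments fall back to computing log (1 + x) with fyl2x.  */
#if 0
#include <math.h>
static double
log1p_model (double x)
{
  const double threshold = 0.29289321881345247561810596348408353;
  if (fabs (x) < threshold)
    return log1p (x);		/* fldln2; fyl2xp1 path */
  return log (1.0 + x);		/* fldln2; fyl2x path */
}
#endif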
38534 /* Emit code for a round calculation, storing round (OP1) into OP0. */
38535 void ix86_emit_i387_round (rtx op0, rtx op1)
38537 enum machine_mode inmode = GET_MODE (op1);
38538 enum machine_mode outmode = GET_MODE (op0);
38539 rtx e1, e2, res, tmp, tmp1, half;
38540 rtx scratch = gen_reg_rtx (HImode);
38541 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38542 rtx jump_label = gen_label_rtx ();
38543 rtx insn;
38544 rtx (*gen_abs) (rtx, rtx);
38545 rtx (*gen_neg) (rtx, rtx);
38547 switch (inmode)
38549 case SFmode:
38550 gen_abs = gen_abssf2;
38551 break;
38552 case DFmode:
38553 gen_abs = gen_absdf2;
38554 break;
38555 case XFmode:
38556 gen_abs = gen_absxf2;
38557 break;
38558 default:
38559 gcc_unreachable ();
38562 switch (outmode)
38564 case SFmode:
38565 gen_neg = gen_negsf2;
38566 break;
38567 case DFmode:
38568 gen_neg = gen_negdf2;
38569 break;
38570 case XFmode:
38571 gen_neg = gen_negxf2;
38572 break;
38573 case HImode:
38574 gen_neg = gen_neghi2;
38575 break;
38576 case SImode:
38577 gen_neg = gen_negsi2;
38578 break;
38579 case DImode:
38580 gen_neg = gen_negdi2;
38581 break;
38582 default:
38583 gcc_unreachable ();
38586 e1 = gen_reg_rtx (inmode);
38587 e2 = gen_reg_rtx (inmode);
38588 res = gen_reg_rtx (outmode);
38590 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38592 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38594 /* scratch = fxam(op1) */
38595 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38596 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38597 UNSPEC_FXAM)));
38598 /* e1 = fabs(op1) */
38599 emit_insn (gen_abs (e1, op1));
38601 /* e2 = e1 + 0.5 */
38602 half = force_reg (inmode, half);
38603 emit_insn (gen_rtx_SET (VOIDmode, e2,
38604 gen_rtx_PLUS (inmode, e1, half)));
38606 /* res = floor(e2) */
38607 if (inmode != XFmode)
38609 tmp1 = gen_reg_rtx (XFmode);
38611 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38612 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38614 else
38615 tmp1 = e2;
38617 switch (outmode)
38619 case SFmode:
38620 case DFmode:
38622 rtx tmp0 = gen_reg_rtx (XFmode);
38624 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38626 emit_insn (gen_rtx_SET (VOIDmode, res,
38627 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38628 UNSPEC_TRUNC_NOOP)));
38630 break;
38631 case XFmode:
38632 emit_insn (gen_frndintxf2_floor (res, tmp1));
38633 break;
38634 case HImode:
38635 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38636 break;
38637 case SImode:
38638 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38639 break;
38640 case DImode:
38641 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38642 break;
38643 default:
38644 gcc_unreachable ();
38647 /* flags = signbit(a) */
38648 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38650 /* if (flags) then res = -res */
38651 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38652 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38653 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38654 pc_rtx);
38655 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38656 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38657 JUMP_LABEL (insn) = jump_label;
38659 emit_insn (gen_neg (res, res));
38661 emit_label (jump_label);
38662 LABEL_NUSES (jump_label) = 1;
38664 emit_move_insn (op0, res);
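/* Illustrative sketch (not part of GCC): scalar equivalent of the sequence
   emitted above, round (a) = sgn (a) * floor (fabs (a) + 0.5).  The real
   code takes the sign from the fxam status bits (so -0.25 rounds to -0.0);
   this sketch uses a plain comparison, assumes outmode == inmode, and
   assumes e2 fits in a long long.  */
static inline double
i387_round_sketch (double a)
{
  int negative = a < 0.0;		/* crude stand-in for the fxam test */
  double e1 = negative ? -a : a;	/* e1 = fabs (a)                    */
  double e2 = e1 + 0.5;			/* e2 = e1 + 0.5                    */
  double res = (double) (long long) e2;	/* floor (e2), since e2 >= 0        */
  return negative ? -res : res;		/* restore the sign                 */
}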
38667 /* Output code to perform a Newton-Raphson approximation of a single precision
38668 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38670 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38672 rtx x0, x1, e0, e1;
38674 x0 = gen_reg_rtx (mode);
38675 e0 = gen_reg_rtx (mode);
38676 e1 = gen_reg_rtx (mode);
38677 x1 = gen_reg_rtx (mode);
38679 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
38681 b = force_reg (mode, b);
38683 /* x0 = rcp(b) estimate */
38684 emit_insn (gen_rtx_SET (VOIDmode, x0,
38685 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38686 UNSPEC_RCP)));
38687 /* e0 = x0 * b */
38688 emit_insn (gen_rtx_SET (VOIDmode, e0,
38689 gen_rtx_MULT (mode, x0, b)));
38691 /* e0 = x0 * e0 */
38692 emit_insn (gen_rtx_SET (VOIDmode, e0,
38693 gen_rtx_MULT (mode, x0, e0)));
38695 /* e1 = x0 + x0 */
38696 emit_insn (gen_rtx_SET (VOIDmode, e1,
38697 gen_rtx_PLUS (mode, x0, x0)));
38699 /* x1 = e1 - e0 */
38700 emit_insn (gen_rtx_SET (VOIDmode, x1,
38701 gen_rtx_MINUS (mode, e1, e0)));
38703 /* res = a * x1 */
38704 emit_insn (gen_rtx_SET (VOIDmode, res,
38705 gen_rtx_MULT (mode, a, x1)));
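/* Illustrative sketch (not part of GCC): the scalar arithmetic behind the
   RTL emitted above.  rcp_estimate stands in for the rcpss/rcpps hardware
   estimate (roughly 12 bits of precision) and is an assumption of this
   sketch.  One Newton-Raphson step refines x0 to x1 = x0 * (2 - b * x0),
   written here exactly as the expander writes it.  */
static inline float
swdiv_sketch (float a, float b, float (*rcp_estimate) (float))
{
  float x0 = rcp_estimate (b);	/* x0 ~= 1/b             */
  float e0 = x0 * b;		/* e0 = x0 * b           */
  e0 = x0 * e0;			/* e0 = b * x0 * x0      */
  float e1 = x0 + x0;		/* e1 = 2 * x0           */
  float x1 = e1 - e0;		/* x1 = 2*x0 - b*x0*x0   */
  return a * x1;		/* a / b ~= a * x1       */
}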
38708 /* Output code to perform a Newton-Rhapson approximation of a
38709 single precision floating point [reciprocal] square root. */
38711 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38712 bool recip)
38714 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38715 REAL_VALUE_TYPE r;
38717 x0 = gen_reg_rtx (mode);
38718 e0 = gen_reg_rtx (mode);
38719 e1 = gen_reg_rtx (mode);
38720 e2 = gen_reg_rtx (mode);
38721 e3 = gen_reg_rtx (mode);
38723 real_from_integer (&r, VOIDmode, -3, -1, 0);
38724 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38726 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38727 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38729 if (VECTOR_MODE_P (mode))
38731 mthree = ix86_build_const_vector (mode, true, mthree);
38732 mhalf = ix86_build_const_vector (mode, true, mhalf);
38735 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38736 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
38738 a = force_reg (mode, a);
38740 /* x0 = rsqrt(a) estimate */
38741 emit_insn (gen_rtx_SET (VOIDmode, x0,
38742 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38743 UNSPEC_RSQRT)));
38745 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
38746 if (!recip)
38748 rtx zero, mask;
38750 zero = gen_reg_rtx (mode);
38751 mask = gen_reg_rtx (mode);
38753 zero = force_reg (mode, CONST0_RTX(mode));
38754 emit_insn (gen_rtx_SET (VOIDmode, mask,
38755 gen_rtx_NE (mode, zero, a)));
38757 emit_insn (gen_rtx_SET (VOIDmode, x0,
38758 gen_rtx_AND (mode, x0, mask)));
38761 /* e0 = x0 * a */
38762 emit_insn (gen_rtx_SET (VOIDmode, e0,
38763 gen_rtx_MULT (mode, x0, a)));
38764 /* e1 = e0 * x0 */
38765 emit_insn (gen_rtx_SET (VOIDmode, e1,
38766 gen_rtx_MULT (mode, e0, x0)));
38768 /* e2 = e1 - 3. */
38769 mthree = force_reg (mode, mthree);
38770 emit_insn (gen_rtx_SET (VOIDmode, e2,
38771 gen_rtx_PLUS (mode, e1, mthree)));
38773 mhalf = force_reg (mode, mhalf);
38774 if (recip)
38775 /* e3 = -.5 * x0 */
38776 emit_insn (gen_rtx_SET (VOIDmode, e3,
38777 gen_rtx_MULT (mode, x0, mhalf)));
38778 else
38779 /* e3 = -.5 * e0 */
38780 emit_insn (gen_rtx_SET (VOIDmode, e3,
38781 gen_rtx_MULT (mode, e0, mhalf)));
38782 /* ret = e2 * e3 */
38783 emit_insn (gen_rtx_SET (VOIDmode, res,
38784 gen_rtx_MULT (mode, e2, e3)));
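/* Illustrative sketch (not part of GCC) of the scalar math above.
   rsqrt_estimate stands in for rsqrtss/rsqrtps and is an assumption of
   this sketch.  One Newton-Raphson step gives
     rsqrt (a) ~= -0.5 * x0       * (a * x0 * x0 - 3.0)
     sqrt (a)  ~= -0.5 * (a * x0) * (a * x0 * x0 - 3.0)
   and for sqrt the a == 0.0 case must be filtered because x0 is infinite.  */
static inline float
swsqrt_sketch (float a, int recip, float (*rsqrt_estimate) (float))
{
  float x0 = rsqrt_estimate (a);	/* x0 ~= 1/sqrt(a)          */
  if (!recip && a == 0.0f)
    x0 = 0.0f;				/* the AND-mask filter above */
  float e0 = x0 * a;			/* e0 = a * x0               */
  float e1 = e0 * x0;			/* e1 = a * x0 * x0          */
  float e2 = e1 - 3.0f;			/* e2 = e1 + (-3.0)          */
  float e3 = (recip ? x0 : e0) * -0.5f;	/* e3 = -0.5 * x0 (or e0)    */
  return e2 * e3;
}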
38787 #ifdef TARGET_SOLARIS
38788 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38790 static void
38791 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38792 tree decl)
38794 /* With Binutils 2.15, the "@unwind" marker must be specified on
38795 every occurrence of the ".eh_frame" section, not just the first
38796 one. */
38797 if (TARGET_64BIT
38798 && strcmp (name, ".eh_frame") == 0)
38800 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38801 flags & SECTION_WRITE ? "aw" : "a");
38802 return;
38805 #ifndef USE_GAS
38806 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38808 solaris_elf_asm_comdat_section (name, flags, decl);
38809 return;
38811 #endif
38813 default_elf_asm_named_section (name, flags, decl);
38815 #endif /* TARGET_SOLARIS */
38817 /* Return the mangling of TYPE if it is an extended fundamental type. */
38819 static const char *
38820 ix86_mangle_type (const_tree type)
38822 type = TYPE_MAIN_VARIANT (type);
38824 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38825 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38826 return NULL;
38828 switch (TYPE_MODE (type))
38830 case TFmode:
38831 /* __float128 is "g". */
38832 return "g";
38833 case XFmode:
38834 /* "long double" or __float80 is "e". */
38835 return "e";
38836 default:
38837 return NULL;
38841 /* For 32-bit code we can save PIC register setup by using
38842 __stack_chk_fail_local hidden function instead of calling
38843 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
38844 register, so it is better to call __stack_chk_fail directly. */
38846 static tree ATTRIBUTE_UNUSED
38847 ix86_stack_protect_fail (void)
38849 return TARGET_64BIT
38850 ? default_external_stack_protect_fail ()
38851 : default_hidden_stack_protect_fail ();
38854 /* Select a format to encode pointers in exception handling data. CODE
38855 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38856 true if the symbol may be affected by dynamic relocations.
38858 ??? All x86 object file formats are capable of representing this.
38859 After all, the relocation needed is the same as for the call insn.
38860 Whether or not a particular assembler allows us to enter such, I
38861 guess we'll have to see. */
38862 int
38863 asm_preferred_eh_data_format (int code, int global)
38865 if (flag_pic)
38867 int type = DW_EH_PE_sdata8;
38868 if (!TARGET_64BIT
38869 || ix86_cmodel == CM_SMALL_PIC
38870 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38871 type = DW_EH_PE_sdata4;
38872 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38874 if (ix86_cmodel == CM_SMALL
38875 || (ix86_cmodel == CM_MEDIUM && code))
38876 return DW_EH_PE_udata4;
38877 return DW_EH_PE_absptr;
38880 /* Expand copysign from SIGN to the positive value ABS_VALUE
38881 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
38882 the sign-bit. */
38883 static void
38884 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38886 enum machine_mode mode = GET_MODE (sign);
38887 rtx sgn = gen_reg_rtx (mode);
38888 if (mask == NULL_RTX)
38890 enum machine_mode vmode;
38892 if (mode == SFmode)
38893 vmode = V4SFmode;
38894 else if (mode == DFmode)
38895 vmode = V2DFmode;
38896 else
38897 vmode = mode;
38899 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38900 if (!VECTOR_MODE_P (mode))
38902 /* We need to generate a scalar mode mask in this case. */
38903 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38904 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38905 mask = gen_reg_rtx (mode);
38906 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38909 else
38910 mask = gen_rtx_NOT (mode, mask);
38911 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38912 gen_rtx_AND (mode, mask, sign)));
38913 emit_insn (gen_rtx_SET (VOIDmode, result,
38914 gen_rtx_IOR (mode, abs_value, sgn)));
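/* Illustrative sketch (not part of GCC): the bitwise operation the two
   SETs above perform, shown for scalar DFmode through an integer view of
   the double.  It assumes ABS_VALUE already has its sign bit clear, as the
   function's contract requires; the union punning mirrors the AND/IOR on
   the sign-bit mask built by ix86_build_signbit_mask.  */
static inline double
sse_copysign_sketch (double abs_value, double sign)
{
  union { double d; unsigned long long u; } a, s, r;
  const unsigned long long sign_mask = 0x8000000000000000ULL;

  a.d = abs_value;
  s.d = sign;
  r.u = (s.u & sign_mask) | a.u;	/* sgn = mask & sign; result = abs | sgn */
  return r.d;
}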
38917 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38918 mask for masking out the sign-bit is stored in *SMASK, if that is
38919 non-null. */
38920 static rtx
38921 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38923 enum machine_mode vmode, mode = GET_MODE (op0);
38924 rtx xa, mask;
38926 xa = gen_reg_rtx (mode);
38927 if (mode == SFmode)
38928 vmode = V4SFmode;
38929 else if (mode == DFmode)
38930 vmode = V2DFmode;
38931 else
38932 vmode = mode;
38933 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38934 if (!VECTOR_MODE_P (mode))
38936 /* We need to generate a scalar mode mask in this case. */
38937 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38938 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38939 mask = gen_reg_rtx (mode);
38940 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38942 emit_insn (gen_rtx_SET (VOIDmode, xa,
38943 gen_rtx_AND (mode, op0, mask)));
38945 if (smask)
38946 *smask = mask;
38948 return xa;
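/* Illustrative sketch (not part of GCC): scalar view of the AND emitted
   above.  The mask built with ix86_build_signbit_mask (..., true) has every
   bit set except the sign bit, so the AND clears the sign and yields fabs.
   Shown here for DFmode through an integer view of the double.  */
static inline double
sse_fabs_sketch (double x)
{
  union { double d; unsigned long long u; } v;
  v.d = x;
  v.u &= 0x7fffffffffffffffULL;		/* ~sign_mask for DFmode */
  return v.d;
}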
38951 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38952 swapping the operands if SWAP_OPERANDS is true. The expanded
38953 code is a forward jump to a newly created label in case the
38954 comparison is true. The generated label rtx is returned. */
38955 static rtx
38956 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38957 bool swap_operands)
38959 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38960 rtx label, tmp;
38962 if (swap_operands)
38964 tmp = op0;
38965 op0 = op1;
38966 op1 = tmp;
38969 label = gen_label_rtx ();
38970 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38971 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38972 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38973 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38974 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38975 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38976 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38977 JUMP_LABEL (tmp) = label;
38979 return label;
38982 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38983 using comparison code CODE. Operands are swapped for the comparison if
38984 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38985 static rtx
38986 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38987 bool swap_operands)
38989 rtx (*insn)(rtx, rtx, rtx, rtx);
38990 enum machine_mode mode = GET_MODE (op0);
38991 rtx mask = gen_reg_rtx (mode);
38993 if (swap_operands)
38995 rtx tmp = op0;
38996 op0 = op1;
38997 op1 = tmp;
39000 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
39002 emit_insn (insn (mask, op0, op1,
39003 gen_rtx_fmt_ee (code, mode, op0, op1)));
39004 return mask;
39007 /* Generate and return a rtx of mode MODE for 2**n where n is the number
39008 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
39009 static rtx
39010 ix86_gen_TWO52 (enum machine_mode mode)
39012 REAL_VALUE_TYPE TWO52r;
39013 rtx TWO52;
39015 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
39016 TWO52 = const_double_from_real_value (TWO52r, mode);
39017 TWO52 = force_reg (mode, TWO52);
39019 return TWO52;
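/* Illustrative note (not part of GCC): adding and subtracting 2**52 (2**23
   for SFmode) rounds a nonnegative double to an integer in the current
   rounding mode, because at magnitude 2**52 the spacing between adjacent
   doubles is exactly 1.0.  Worked example in round-to-nearest:
     3.3 + 4503599627370496.0 == 4503599627370499.0
     4503599627370499.0 - 4503599627370496.0 == 3.0
   The expanders below rely on this after first checking |x| < 2**52.  */
static inline double
two52_round_sketch (double nonnegative_x)	/* valid for 0 <= x < 2**52 */
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  return (nonnegative_x + two52) - two52;
}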
39022 /* Expand SSE sequence for computing lround from OP1 storing
39023 into OP0. */
39024 void
39025 ix86_expand_lround (rtx op0, rtx op1)
39027 /* C code for the stuff we're doing below:
39028 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
39029 return (long)tmp;
39031 enum machine_mode mode = GET_MODE (op1);
39032 const struct real_format *fmt;
39033 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39034 rtx adj;
39036 /* load nextafter (0.5, 0.0) */
39037 fmt = REAL_MODE_FORMAT (mode);
39038 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39039 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39041 /* adj = copysign (0.5, op1) */
39042 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
39043 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
39045 /* adj = op1 + adj */
39046 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
39048 /* op0 = (imode)adj */
39049 expand_fix (op0, adj, 0);
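/* Illustrative sketch (not part of GCC): scalar equivalent of the lround
   expansion above for DFmode.  PRED_HALF is nextafter (0.5, 0.0), i.e. 0.5
   minus one ulp: exact halfway cases still round away from zero because the
   addition rounds up, while values just below 0.5 are not pushed to 1.0.  */
static inline long
lround_sketch (double x)
{
  const double pred_half = 0.49999999999999994;	/* nextafter (0.5, 0.0) */
  double adj = x < 0.0 ? -pred_half : pred_half;	/* copysign (pred_half, x) */
  return (long) (x + adj);			/* truncating conversion */
}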
39052 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
39053 into OPERAND0. */
39054 void
39055 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
39057 /* C code for the stuff we're doing below (for do_floor):
39058 xi = (long)op1;
39059 xi -= (double)xi > op1 ? 1 : 0;
39060 return xi;
39062 enum machine_mode fmode = GET_MODE (op1);
39063 enum machine_mode imode = GET_MODE (op0);
39064 rtx ireg, freg, label, tmp;
39066 /* reg = (long)op1 */
39067 ireg = gen_reg_rtx (imode);
39068 expand_fix (ireg, op1, 0);
39070 /* freg = (double)reg */
39071 freg = gen_reg_rtx (fmode);
39072 expand_float (freg, ireg, 0);
39074 /* ireg = (freg > op1) ? ireg - 1 : ireg */
39075 label = ix86_expand_sse_compare_and_jump (UNLE,
39076 freg, op1, !do_floor);
39077 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
39078 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
39079 emit_move_insn (ireg, tmp);
39081 emit_label (label);
39082 LABEL_NUSES (label) = 1;
39084 emit_move_insn (op0, ireg);
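/* Illustrative sketch (not part of GCC): scalar lfloor/lceil as expanded
   above.  The truncating conversion is corrected by comparing its result,
   converted back to floating point, against the original input.  */
static inline long
lfloorceil_sketch (double x, int do_floor)
{
  long xi = (long) x;			/* truncates toward zero        */
  double back = (double) xi;
  if (do_floor)
    return back > x ? xi - 1 : xi;	/* floor: fix up negative inputs */
  return back < x ? xi + 1 : xi;	/* ceil: fix up positive inputs  */
}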
39087 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
39088 result in OPERAND0. */
39089 void
39090 ix86_expand_rint (rtx operand0, rtx operand1)
39092 /* C code for the stuff we're doing below:
39093 xa = fabs (operand1);
39094 if (!isless (xa, 2**52))
39095 return operand1;
39096 xa = xa + 2**52 - 2**52;
39097 return copysign (xa, operand1);
39099 enum machine_mode mode = GET_MODE (operand0);
39100 rtx res, xa, label, TWO52, mask;
39102 res = gen_reg_rtx (mode);
39103 emit_move_insn (res, operand1);
39105 /* xa = abs (operand1) */
39106 xa = ix86_expand_sse_fabs (res, &mask);
39108 /* if (!isless (xa, TWO52)) goto label; */
39109 TWO52 = ix86_gen_TWO52 (mode);
39110 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39112 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39113 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39115 ix86_sse_copysign_to_positive (res, xa, res, mask);
39117 emit_label (label);
39118 LABEL_NUSES (label) = 1;
39120 emit_move_insn (operand0, res);
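/* Illustrative sketch (not part of GCC): scalar rint as expanded above,
   built on the 2**52 trick.  The real expansion restores the sign with the
   copysign mask, which also keeps -0.0 negative; this sketch only restores
   the sign of nonzero inputs.  */
static inline double
rint_sketch (double x)
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  double xa = x < 0.0 ? -x : x;
  if (!(xa < two52))		/* catches large values and NaN */
    return x;
  xa = (xa + two52) - two52;	/* round to integer, current mode */
  return x < 0.0 ? -xa : xa;	/* copysign (xa, x) */
}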
39123 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing into
39124 OPERAND0, without relying on DImode truncation via cvttsd2siq that is only available on 64bit targets. */
39125 void
39126 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
39128 /* C code for the stuff we expand below.
39129 double xa = fabs (x), x2;
39130 if (!isless (xa, TWO52))
39131 return x;
39132 xa = xa + TWO52 - TWO52;
39133 x2 = copysign (xa, x);
39134 Compensate. Floor:
39135 if (x2 > x)
39136 x2 -= 1;
39137 Compensate. Ceil:
39138 if (x2 < x)
39139 x2 -= -1;
39140 return x2;
39142 enum machine_mode mode = GET_MODE (operand0);
39143 rtx xa, TWO52, tmp, label, one, res, mask;
39145 TWO52 = ix86_gen_TWO52 (mode);
39147 /* Temporary for holding the result, initialized to the input
39148 operand to ease control flow. */
39149 res = gen_reg_rtx (mode);
39150 emit_move_insn (res, operand1);
39152 /* xa = abs (operand1) */
39153 xa = ix86_expand_sse_fabs (res, &mask);
39155 /* if (!isless (xa, TWO52)) goto label; */
39156 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39158 /* xa = xa + TWO52 - TWO52; */
39159 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39160 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39162 /* xa = copysign (xa, operand1) */
39163 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39165 /* generate 1.0 or -1.0 */
39166 one = force_reg (mode,
39167 const_double_from_real_value (do_floor
39168 ? dconst1 : dconstm1, mode));
39170 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39171 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39172 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39173 gen_rtx_AND (mode, one, tmp)));
39174 /* We always need to subtract here to preserve signed zero. */
39175 tmp = expand_simple_binop (mode, MINUS,
39176 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39177 emit_move_insn (res, tmp);
39179 emit_label (label);
39180 LABEL_NUSES (label) = 1;
39182 emit_move_insn (operand0, res);
39185 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39186 into OPERAND0. */
39187 void
39188 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39190 /* C code for the stuff we expand below.
39191 double xa = fabs (x), x2;
39192 if (!isless (xa, TWO52))
39193 return x;
39194 x2 = (double)(long)x;
39195 Compensate. Floor:
39196 if (x2 > x)
39197 x2 -= 1;
39198 Compensate. Ceil:
39199 if (x2 < x)
39200 x2 += 1;
39201 if (HONOR_SIGNED_ZEROS (mode))
39202 return copysign (x2, x);
39203 return x2;
39205 enum machine_mode mode = GET_MODE (operand0);
39206 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39208 TWO52 = ix86_gen_TWO52 (mode);
39210 /* Temporary for holding the result, initialized to the input
39211 operand to ease control flow. */
39212 res = gen_reg_rtx (mode);
39213 emit_move_insn (res, operand1);
39215 /* xa = abs (operand1) */
39216 xa = ix86_expand_sse_fabs (res, &mask);
39218 /* if (!isless (xa, TWO52)) goto label; */
39219 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39221 /* xa = (double)(long)x */
39222 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39223 expand_fix (xi, res, 0);
39224 expand_float (xa, xi, 0);
39226 /* generate 1.0 */
39227 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39229 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39230 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39231 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39232 gen_rtx_AND (mode, one, tmp)));
39233 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39234 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39235 emit_move_insn (res, tmp);
39237 if (HONOR_SIGNED_ZEROS (mode))
39238 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39240 emit_label (label);
39241 LABEL_NUSES (label) = 1;
39243 emit_move_insn (operand0, res);
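/* Illustrative sketch (not part of GCC): scalar floor/ceil as expanded
   above via a truncating integer conversion plus compensation, valid only
   because |x| < 2**52 (resp. 2**23) has already been checked.  The real
   expansion optionally restores the sign of zero with copysign.  */
static inline double
floorceil_sketch (double x, int do_floor)
{
  double xa = x < 0.0 ? -x : x;
  if (!(xa < 4503599627370496.0))	/* !isless (xa, TWO52) */
    return x;
  double x2 = (double) (long long) x;	/* truncate toward zero */
  if (do_floor)
    x2 -= (x2 > x);			/* floor: -1 if truncation went up  */
  else
    x2 += (x2 < x);			/* ceil: +1 if truncation went down */
  return x2;
}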
39246 /* Expand SSE sequence for computing round from OPERAND1 storing
39247 into OPERAND0. Sequence that works without relying on DImode truncation
39248 via cvttsd2siq that is only available on 64bit targets. */
39249 void
39250 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39252 /* C code for the stuff we expand below.
39253 double xa = fabs (x), xa2, x2;
39254 if (!isless (xa, TWO52))
39255 return x;
39256 Using the absolute value and copying back sign makes
39257 -0.0 -> -0.0 correct.
39258 xa2 = xa + TWO52 - TWO52;
39259 Compensate.
39260 dxa = xa2 - xa;
39261 if (dxa <= -0.5)
39262 xa2 += 1;
39263 else if (dxa > 0.5)
39264 xa2 -= 1;
39265 x2 = copysign (xa2, x);
39266 return x2;
39268 enum machine_mode mode = GET_MODE (operand0);
39269 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39271 TWO52 = ix86_gen_TWO52 (mode);
39273 /* Temporary for holding the result, initialized to the input
39274 operand to ease control flow. */
39275 res = gen_reg_rtx (mode);
39276 emit_move_insn (res, operand1);
39278 /* xa = abs (operand1) */
39279 xa = ix86_expand_sse_fabs (res, &mask);
39281 /* if (!isless (xa, TWO52)) goto label; */
39282 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39284 /* xa2 = xa + TWO52 - TWO52; */
39285 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39286 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39288 /* dxa = xa2 - xa; */
39289 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39291 /* generate 0.5, 1.0 and -0.5 */
39292 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39293 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39294 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39295 0, OPTAB_DIRECT);
39297 /* Compensate. */
39298 tmp = gen_reg_rtx (mode);
39299 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39300 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39301 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39302 gen_rtx_AND (mode, one, tmp)));
39303 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39304 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39305 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39306 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39307 gen_rtx_AND (mode, one, tmp)));
39308 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39310 /* res = copysign (xa2, operand1) */
39311 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39313 emit_label (label);
39314 LABEL_NUSES (label) = 1;
39316 emit_move_insn (operand0, res);
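/* Illustrative sketch (not part of GCC): scalar equivalent of the 32-bit
   safe round expansion above.  xa2 is |x| rounded by the 2**52 trick (ties
   to even in the default mode); dxa records how far that rounding moved the
   value, and the two compensations turn ties-to-even into
   round-half-away-from-zero.  The real expansion restores the sign with the
   copysign mask so that -0.0 stays -0.0.  */
static inline double
rounddf_32_sketch (double x)
{
  const double two52 = 4503599627370496.0;
  double xa = x < 0.0 ? -x : x;
  if (!(xa < two52))
    return x;
  double xa2 = (xa + two52) - two52;	/* nearest integer, even ties   */
  double dxa = xa2 - xa;		/* rounding error               */
  if (dxa > 0.5)
    xa2 -= 1.0;				/* rounded up by more than half */
  if (dxa <= -0.5)
    xa2 += 1.0;				/* rounded down by half or more */
  return x < 0.0 ? -xa2 : xa2;		/* copysign (xa2, x)            */
}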
39319 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39320 into OPERAND0. */
39321 void
39322 ix86_expand_trunc (rtx operand0, rtx operand1)
39324 /* C code for SSE variant we expand below.
39325 double xa = fabs (x), x2;
39326 if (!isless (xa, TWO52))
39327 return x;
39328 x2 = (double)(long)x;
39329 if (HONOR_SIGNED_ZEROS (mode))
39330 return copysign (x2, x);
39331 return x2;
39333 enum machine_mode mode = GET_MODE (operand0);
39334 rtx xa, xi, TWO52, label, res, mask;
39336 TWO52 = ix86_gen_TWO52 (mode);
39338 /* Temporary for holding the result, initialized to the input
39339 operand to ease control flow. */
39340 res = gen_reg_rtx (mode);
39341 emit_move_insn (res, operand1);
39343 /* xa = abs (operand1) */
39344 xa = ix86_expand_sse_fabs (res, &mask);
39346 /* if (!isless (xa, TWO52)) goto label; */
39347 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39349 /* x = (double)(long)x */
39350 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39351 expand_fix (xi, res, 0);
39352 expand_float (res, xi, 0);
39354 if (HONOR_SIGNED_ZEROS (mode))
39355 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39357 emit_label (label);
39358 LABEL_NUSES (label) = 1;
39360 emit_move_insn (operand0, res);
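/* Illustrative sketch (not part of GCC): scalar trunc as expanded above.
   A truncating conversion already rounds toward zero, so no compensation is
   needed, only the guard against |x| >= 2**52 and, optionally, the copysign
   that keeps -0.0 negative.  */
static inline double
trunc_sketch (double x)
{
  double xa = x < 0.0 ? -x : x;
  if (!(xa < 4503599627370496.0))	/* !isless (xa, TWO52) */
    return x;
  return (double) (long long) x;	/* rounds toward zero */
}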
39363 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
39364 OPERAND0, without relying on DImode truncation via cvttsd2siq that is only available on 64bit targets. */
39365 void
39366 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39368 enum machine_mode mode = GET_MODE (operand0);
39369 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39371 /* C code for SSE variant we expand below.
39372 double xa = fabs (x), x2;
39373 if (!isless (xa, TWO52))
39374 return x;
39375 xa2 = xa + TWO52 - TWO52;
39376 Compensate:
39377 if (xa2 > xa)
39378 xa2 -= 1.0;
39379 x2 = copysign (xa2, x);
39380 return x2;
39383 TWO52 = ix86_gen_TWO52 (mode);
39385 /* Temporary for holding the result, initialized to the input
39386 operand to ease control flow. */
39387 res = gen_reg_rtx (mode);
39388 emit_move_insn (res, operand1);
39390 /* xa = abs (operand1) */
39391 xa = ix86_expand_sse_fabs (res, &smask);
39393 /* if (!isless (xa, TWO52)) goto label; */
39394 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39396 /* res = xa + TWO52 - TWO52; */
39397 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39398 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39399 emit_move_insn (res, tmp);
39401 /* generate 1.0 */
39402 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39404 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39405 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39406 emit_insn (gen_rtx_SET (VOIDmode, mask,
39407 gen_rtx_AND (mode, mask, one)));
39408 tmp = expand_simple_binop (mode, MINUS,
39409 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39410 emit_move_insn (res, tmp);
39412 /* res = copysign (res, operand1) */
39413 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39415 emit_label (label);
39416 LABEL_NUSES (label) = 1;
39418 emit_move_insn (operand0, res);
39421 /* Expand SSE sequence for computing round from OPERAND1 storing
39422 into OPERAND0. */
39423 void
39424 ix86_expand_round (rtx operand0, rtx operand1)
39426 /* C code for the stuff we're doing below:
39427 double xa = fabs (x);
39428 if (!isless (xa, TWO52))
39429 return x;
39430 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39431 return copysign (xa, x);
39433 enum machine_mode mode = GET_MODE (operand0);
39434 rtx res, TWO52, xa, label, xi, half, mask;
39435 const struct real_format *fmt;
39436 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39438 /* Temporary for holding the result, initialized to the input
39439 operand to ease control flow. */
39440 res = gen_reg_rtx (mode);
39441 emit_move_insn (res, operand1);
39443 TWO52 = ix86_gen_TWO52 (mode);
39444 xa = ix86_expand_sse_fabs (res, &mask);
39445 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39447 /* load nextafter (0.5, 0.0) */
39448 fmt = REAL_MODE_FORMAT (mode);
39449 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39450 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39452 /* xa = xa + 0.5 */
39453 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39454 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39456 /* xa = (double)(int64_t)xa */
39457 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39458 expand_fix (xi, xa, 0);
39459 expand_float (xa, xi, 0);
39461 /* res = copysign (xa, operand1) */
39462 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39464 emit_label (label);
39465 LABEL_NUSES (label) = 1;
39467 emit_move_insn (operand0, res);
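/* Illustrative sketch (not part of GCC): scalar round as expanded above.
   Working on |x| lets a single positive constant be added, and using
   nextafter (0.5, 0.0) instead of 0.5 keeps inputs just below 0.5 from
   being dragged up to 1.0 by the rounding of the addition itself.  */
static inline double
round_sketch (double x)
{
  const double pred_half = 0.49999999999999994;	/* nextafter (0.5, 0.0) */
  double xa = x < 0.0 ? -x : x;
  if (!(xa < 4503599627370496.0))	/* !isless (xa, TWO52) */
    return x;
  xa = (double) (long long) (xa + pred_half);	/* truncate after biasing */
  return x < 0.0 ? -xa : xa;		/* copysign (xa, x) */
}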
39470 /* Expand SSE sequence for computing round
39471 from OP1 storing into OP0 using sse4 round insn. */
39472 void
39473 ix86_expand_round_sse4 (rtx op0, rtx op1)
39475 enum machine_mode mode = GET_MODE (op0);
39476 rtx e1, e2, res, half;
39477 const struct real_format *fmt;
39478 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39479 rtx (*gen_copysign) (rtx, rtx, rtx);
39480 rtx (*gen_round) (rtx, rtx, rtx);
39482 switch (mode)
39484 case SFmode:
39485 gen_copysign = gen_copysignsf3;
39486 gen_round = gen_sse4_1_roundsf2;
39487 break;
39488 case DFmode:
39489 gen_copysign = gen_copysigndf3;
39490 gen_round = gen_sse4_1_rounddf2;
39491 break;
39492 default:
39493 gcc_unreachable ();
39496 /* round (a) = trunc (a + copysign (0.5, a)) */
39498 /* load nextafter (0.5, 0.0) */
39499 fmt = REAL_MODE_FORMAT (mode);
39500 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39501 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39502 half = const_double_from_real_value (pred_half, mode);
39504 /* e1 = copysign (0.5, op1) */
39505 e1 = gen_reg_rtx (mode);
39506 emit_insn (gen_copysign (e1, half, op1));
39508 /* e2 = op1 + e1 */
39509 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39511 /* res = trunc (e2) */
39512 res = gen_reg_rtx (mode);
39513 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39515 emit_move_insn (op0, res);
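/* Illustrative note (not part of GCC): the same pred_half bias feeds the
   SSE4.1 ROUND instruction in truncation mode.  Worked example for SFmode,
   where pred_half is nextafterf (0.5f, 0.0f) == 0.49999997f:
     x = 0.49999997f:  x + 0.5f         rounds up to 1.0f -> trunc gives 1 (wrong)
                       x + 0.49999997f  stays below 1.0f  -> trunc gives 0 (right)
     x = 2.5f:         x + 0.49999997f  rounds to 3.0f    -> trunc gives 3,
   so exact halfway cases still round away from zero while values just below
   0.5 are left alone.  */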
39519 /* Table of valid machine attributes. */
39520 static const struct attribute_spec ix86_attribute_table[] =
39522 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39523 affects_type_identity } */
39524 /* Stdcall attribute says callee is responsible for popping arguments
39525 if they are not variable. */
39526 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39527 true },
39528 /* Fastcall attribute says callee is responsible for popping arguments
39529 if they are not variable. */
39530 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39531 true },
39532 /* Thiscall attribute says callee is responsible for popping arguments
39533 if they are not variable. */
39534 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39535 true },
39536 /* Cdecl attribute says the callee is a normal C declaration */
39537 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39538 true },
39539 /* Regparm attribute specifies how many integer arguments are to be
39540 passed in registers. */
39541 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39542 true },
39543 /* Sseregparm attribute says we are using x86_64 calling conventions
39544 for FP arguments. */
39545 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39546 true },
39547 /* The transactional memory builtins are implicitly regparm or fastcall
39548 depending on the ABI. Override the generic do-nothing attribute that
39549 these builtins were declared with. */
39550 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39551 true },
39552 /* force_align_arg_pointer says this function realigns the stack at entry. */
39553 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39554 false, true, true, ix86_handle_cconv_attribute, false },
39555 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39556 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39557 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39558 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39559 false },
39560 #endif
39561 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39562 false },
39563 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39564 false },
39565 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39566 SUBTARGET_ATTRIBUTE_TABLE,
39567 #endif
39568 /* ms_abi and sysv_abi calling convention function attributes. */
39569 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39570 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39571 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39572 false },
39573 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39574 ix86_handle_callee_pop_aggregate_return, true },
39575 /* End element. */
39576 { NULL, 0, 0, false, false, false, NULL, false }
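/* Illustrative examples (not part of GCC, reference only and never
   compiled) of how user code requests the calling-convention and layout
   attributes registered in the table above; these are the documented i386
   function and type attributes.  */
#if 0
int __attribute__ ((fastcall))    f1 (int a, int b);		/* args in ecx/edx      */
int __attribute__ ((regparm (3))) f2 (int a, int b, int c);	/* args in eax/edx/ecx  */
int __attribute__ ((ms_abi))      f3 (void);			/* Win64 ABI on x86_64  */
struct __attribute__ ((ms_struct)) s { char c; int i; };	/* MS struct layout     */
#endif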
39579 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39580 static int
39581 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39582 tree vectype,
39583 int misalign ATTRIBUTE_UNUSED)
39585 unsigned elements;
39587 switch (type_of_cost)
39589 case scalar_stmt:
39590 return ix86_cost->scalar_stmt_cost;
39592 case scalar_load:
39593 return ix86_cost->scalar_load_cost;
39595 case scalar_store:
39596 return ix86_cost->scalar_store_cost;
39598 case vector_stmt:
39599 return ix86_cost->vec_stmt_cost;
39601 case vector_load:
39602 return ix86_cost->vec_align_load_cost;
39604 case vector_store:
39605 return ix86_cost->vec_store_cost;
39607 case vec_to_scalar:
39608 return ix86_cost->vec_to_scalar_cost;
39610 case scalar_to_vec:
39611 return ix86_cost->scalar_to_vec_cost;
39613 case unaligned_load:
39614 case unaligned_store:
39615 return ix86_cost->vec_unalign_load_cost;
39617 case cond_branch_taken:
39618 return ix86_cost->cond_taken_branch_cost;
39620 case cond_branch_not_taken:
39621 return ix86_cost->cond_not_taken_branch_cost;
39623 case vec_perm:
39624 case vec_promote_demote:
39625 return ix86_cost->vec_stmt_cost;
39627 case vec_construct:
39628 elements = TYPE_VECTOR_SUBPARTS (vectype);
39629 return elements / 2 + 1;
39631 default:
39632 gcc_unreachable ();
39636 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39637 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39638 insn every time. */
39640 static GTY(()) rtx vselect_insn;
39642 /* Initialize vselect_insn. */
39644 static void
39645 init_vselect_insn (void)
39647 unsigned i;
39648 rtx x;
39650 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39651 for (i = 0; i < MAX_VECT_LEN; ++i)
39652 XVECEXP (x, 0, i) = const0_rtx;
39653 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39654 const0_rtx), x);
39655 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39656 start_sequence ();
39657 vselect_insn = emit_insn (x);
39658 end_sequence ();
39661 /* Construct (set target (vec_select op0 (parallel perm))) and
39662 return true if that's a valid instruction in the active ISA. */
39664 static bool
39665 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39666 unsigned nelt, bool testing_p)
39668 unsigned int i;
39669 rtx x, save_vconcat;
39670 int icode;
39672 if (vselect_insn == NULL_RTX)
39673 init_vselect_insn ();
39675 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39676 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39677 for (i = 0; i < nelt; ++i)
39678 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39679 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39680 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39681 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39682 SET_DEST (PATTERN (vselect_insn)) = target;
39683 icode = recog_memoized (vselect_insn);
39685 if (icode >= 0 && !testing_p)
39686 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39688 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39689 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39690 INSN_CODE (vselect_insn) = -1;
39692 return icode >= 0;
39695 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39697 static bool
39698 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39699 const unsigned char *perm, unsigned nelt,
39700 bool testing_p)
39702 enum machine_mode v2mode;
39703 rtx x;
39704 bool ok;
39706 if (vselect_insn == NULL_RTX)
39707 init_vselect_insn ();
39709 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39710 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39711 PUT_MODE (x, v2mode);
39712 XEXP (x, 0) = op0;
39713 XEXP (x, 1) = op1;
39714 ok = expand_vselect (target, x, perm, nelt, testing_p);
39715 XEXP (x, 0) = const0_rtx;
39716 XEXP (x, 1) = const0_rtx;
39717 return ok;
39720 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39721 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39723 static bool
39724 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39726 enum machine_mode vmode = d->vmode;
39727 unsigned i, mask, nelt = d->nelt;
39728 rtx target, op0, op1, x;
39729 rtx rperm[32], vperm;
39731 if (d->one_operand_p)
39732 return false;
39733 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39735 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39737 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39739 else
39740 return false;
39742 /* This is a blend, not a permute. Elements must stay in their
39743 respective lanes. */
39744 for (i = 0; i < nelt; ++i)
39746 unsigned e = d->perm[i];
39747 if (!(e == i || e == i + nelt))
39748 return false;
39751 if (d->testing_p)
39752 return true;
39754 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39755 decision should be extracted elsewhere, so that we only try that
39756 sequence once all budget==3 options have been tried. */
39757 target = d->target;
39758 op0 = d->op0;
39759 op1 = d->op1;
39760 mask = 0;
39762 switch (vmode)
39764 case V4DFmode:
39765 case V8SFmode:
39766 case V2DFmode:
39767 case V4SFmode:
39768 case V8HImode:
39769 case V8SImode:
39770 for (i = 0; i < nelt; ++i)
39771 mask |= (d->perm[i] >= nelt) << i;
39772 break;
39774 case V2DImode:
39775 for (i = 0; i < 2; ++i)
39776 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39777 vmode = V8HImode;
39778 goto do_subreg;
39780 case V4SImode:
39781 for (i = 0; i < 4; ++i)
39782 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39783 vmode = V8HImode;
39784 goto do_subreg;
39786 case V16QImode:
39787 /* See if bytes move in pairs so we can use pblendw with
39788 an immediate argument, rather than pblendvb with a vector
39789 argument. */
39790 for (i = 0; i < 16; i += 2)
39791 if (d->perm[i] + 1 != d->perm[i + 1])
39793 use_pblendvb:
39794 for (i = 0; i < nelt; ++i)
39795 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39797 finish_pblendvb:
39798 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39799 vperm = force_reg (vmode, vperm);
39801 if (GET_MODE_SIZE (vmode) == 16)
39802 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39803 else
39804 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39805 if (target != d->target)
39806 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39807 return true;
39810 for (i = 0; i < 8; ++i)
39811 mask |= (d->perm[i * 2] >= 16) << i;
39812 vmode = V8HImode;
39813 /* FALLTHRU */
39815 do_subreg:
39816 target = gen_reg_rtx (vmode);
39817 op0 = gen_lowpart (vmode, op0);
39818 op1 = gen_lowpart (vmode, op1);
39819 break;
39821 case V32QImode:
39822 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39823 for (i = 0; i < 32; i += 2)
39824 if (d->perm[i] + 1 != d->perm[i + 1])
39825 goto use_pblendvb;
39826 /* See if bytes move in quadruplets. If yes, vpblendd
39827 with immediate can be used. */
39828 for (i = 0; i < 32; i += 4)
39829 if (d->perm[i] + 2 != d->perm[i + 2])
39830 break;
39831 if (i < 32)
39833 /* See if bytes move the same in both lanes. If yes,
39834 vpblendw with immediate can be used. */
39835 for (i = 0; i < 16; i += 2)
39836 if (d->perm[i] + 16 != d->perm[i + 16])
39837 goto use_pblendvb;
39839 /* Use vpblendw. */
39840 for (i = 0; i < 16; ++i)
39841 mask |= (d->perm[i * 2] >= 32) << i;
39842 vmode = V16HImode;
39843 goto do_subreg;
39846 /* Use vpblendd. */
39847 for (i = 0; i < 8; ++i)
39848 mask |= (d->perm[i * 4] >= 32) << i;
39849 vmode = V8SImode;
39850 goto do_subreg;
39852 case V16HImode:
39853 /* See if words move in pairs. If yes, vpblendd can be used. */
39854 for (i = 0; i < 16; i += 2)
39855 if (d->perm[i] + 1 != d->perm[i + 1])
39856 break;
39857 if (i < 16)
39859 /* See if words move the same in both lanes. If not,
39860 vpblendvb must be used. */
39861 for (i = 0; i < 8; i++)
39862 if (d->perm[i] + 8 != d->perm[i + 8])
39864 /* Use vpblendvb. */
39865 for (i = 0; i < 32; ++i)
39866 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39868 vmode = V32QImode;
39869 nelt = 32;
39870 target = gen_reg_rtx (vmode);
39871 op0 = gen_lowpart (vmode, op0);
39872 op1 = gen_lowpart (vmode, op1);
39873 goto finish_pblendvb;
39876 /* Use vpblendw. */
39877 for (i = 0; i < 16; ++i)
39878 mask |= (d->perm[i] >= 16) << i;
39879 break;
39882 /* Use vpblendd. */
39883 for (i = 0; i < 8; ++i)
39884 mask |= (d->perm[i * 2] >= 16) << i;
39885 vmode = V8SImode;
39886 goto do_subreg;
39888 case V4DImode:
39889 /* Use vpblendd. */
39890 for (i = 0; i < 4; ++i)
39891 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39892 vmode = V8SImode;
39893 goto do_subreg;
39895 default:
39896 gcc_unreachable ();
39899 /* This matches five different patterns with the different modes. */
39900 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39901 x = gen_rtx_SET (VOIDmode, target, x);
39902 emit_insn (x);
39903 if (target != d->target)
39904 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39906 return true;
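/* Illustrative sketch (not part of GCC): for the element-per-bit modes in
   the first switch case above, the blend immediate simply has bit i set
   when result element i is taken from the second operand.  E.g. for
   V8HImode and perm = {0, 9, 2, 11, 4, 13, 6, 15} the mask is 0xaa.  */
static inline unsigned int
blend_mask_sketch (const unsigned char *perm, unsigned int nelt)
{
  unsigned int i, mask = 0;
  for (i = 0; i < nelt; ++i)
    mask |= (unsigned int) (perm[i] >= nelt) << i;	/* 1 = from op1 */
  return mask;
}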
39909 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39910 in terms of the variable form of vpermilps.
39912 Note that we will have already failed the immediate input vpermilps,
39913 which requires that the high and low part shuffle be identical; the
39914 variable form doesn't require that. */
39916 static bool
39917 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39919 rtx rperm[8], vperm;
39920 unsigned i;
39922 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39923 return false;
39925 /* We can only permute within the 128-bit lane. */
39926 for (i = 0; i < 8; ++i)
39928 unsigned e = d->perm[i];
39929 if (i < 4 ? e >= 4 : e < 4)
39930 return false;
39933 if (d->testing_p)
39934 return true;
39936 for (i = 0; i < 8; ++i)
39938 unsigned e = d->perm[i];
39940 /* Within each 128-bit lane, the elements of op0 are numbered
39941 from 0 and the elements of op1 are numbered from 4. */
39942 if (e >= 8 + 4)
39943 e -= 8;
39944 else if (e >= 4)
39945 e -= 4;
39947 rperm[i] = GEN_INT (e);
39950 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39951 vperm = force_reg (V8SImode, vperm);
39952 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39954 return true;
39957 /* Return true if permutation D can be performed as VMODE permutation
39958 instead. */
39960 static bool
39961 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39963 unsigned int i, j, chunk;
39965 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39966 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39967 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39968 return false;
39970 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39971 return true;
39973 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39974 for (i = 0; i < d->nelt; i += chunk)
39975 if (d->perm[i] & (chunk - 1))
39976 return false;
39977 else
39978 for (j = 1; j < chunk; ++j)
39979 if (d->perm[i] + j != d->perm[i + j])
39980 return false;
39982 return true;
39985 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39986 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39988 static bool
39989 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39991 unsigned i, nelt, eltsz, mask;
39992 unsigned char perm[32];
39993 enum machine_mode vmode = V16QImode;
39994 rtx rperm[32], vperm, target, op0, op1;
39996 nelt = d->nelt;
39998 if (!d->one_operand_p)
40000 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
40002 if (TARGET_AVX2
40003 && valid_perm_using_mode_p (V2TImode, d))
40005 if (d->testing_p)
40006 return true;
40008 /* Use vperm2i128 insn. The pattern uses
40009 V4DImode instead of V2TImode. */
40010 target = d->target;
40011 if (d->vmode != V4DImode)
40012 target = gen_reg_rtx (V4DImode);
40013 op0 = gen_lowpart (V4DImode, d->op0);
40014 op1 = gen_lowpart (V4DImode, d->op1);
40015 rperm[0]
40016 = GEN_INT ((d->perm[0] / (nelt / 2))
40017 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
40018 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
40019 if (target != d->target)
40020 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40021 return true;
40023 return false;
40026 else
40028 if (GET_MODE_SIZE (d->vmode) == 16)
40030 if (!TARGET_SSSE3)
40031 return false;
40033 else if (GET_MODE_SIZE (d->vmode) == 32)
40035 if (!TARGET_AVX2)
40036 return false;
40038 /* V4DImode should be already handled through
40039 expand_vselect by vpermq instruction. */
40040 gcc_assert (d->vmode != V4DImode);
40042 vmode = V32QImode;
40043 if (d->vmode == V8SImode
40044 || d->vmode == V16HImode
40045 || d->vmode == V32QImode)
40047 /* First see if vpermq can be used for
40048 V8SImode/V16HImode/V32QImode. */
40049 if (valid_perm_using_mode_p (V4DImode, d))
40051 for (i = 0; i < 4; i++)
40052 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
40053 if (d->testing_p)
40054 return true;
40055 target = gen_reg_rtx (V4DImode);
40056 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
40057 perm, 4, false))
40059 emit_move_insn (d->target,
40060 gen_lowpart (d->vmode, target));
40061 return true;
40063 return false;
40066 /* Next see if vpermd can be used. */
40067 if (valid_perm_using_mode_p (V8SImode, d))
40068 vmode = V8SImode;
40070 /* Or if vpermps can be used. */
40071 else if (d->vmode == V8SFmode)
40072 vmode = V8SImode;
40074 if (vmode == V32QImode)
40076 /* vpshufb only works intra lanes; it is not
40077 possible to shuffle bytes in between the lanes. */
40078 for (i = 0; i < nelt; ++i)
40079 if ((d->perm[i] ^ i) & (nelt / 2))
40080 return false;
40083 else
40084 return false;
40087 if (d->testing_p)
40088 return true;
40090 if (vmode == V8SImode)
40091 for (i = 0; i < 8; ++i)
40092 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
40093 else
40095 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40096 if (!d->one_operand_p)
40097 mask = 2 * nelt - 1;
40098 else if (vmode == V16QImode)
40099 mask = nelt - 1;
40100 else
40101 mask = nelt / 2 - 1;
40103 for (i = 0; i < nelt; ++i)
40105 unsigned j, e = d->perm[i] & mask;
40106 for (j = 0; j < eltsz; ++j)
40107 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
40111 vperm = gen_rtx_CONST_VECTOR (vmode,
40112 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
40113 vperm = force_reg (vmode, vperm);
40115 target = d->target;
40116 if (d->vmode != vmode)
40117 target = gen_reg_rtx (vmode);
40118 op0 = gen_lowpart (vmode, d->op0);
40119 if (d->one_operand_p)
40121 if (vmode == V16QImode)
40122 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
40123 else if (vmode == V32QImode)
40124 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
40125 else if (vmode == V8SFmode)
40126 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
40127 else
40128 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
40130 else
40132 op1 = gen_lowpart (vmode, d->op1);
40133 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
40135 if (target != d->target)
40136 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40138 return true;
40141 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
40142 in a single instruction. */
40144 static bool
40145 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40147 unsigned i, nelt = d->nelt;
40148 unsigned char perm2[MAX_VECT_LEN];
40150 /* Check plain VEC_SELECT first, because AVX has instructions that could
40151 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40152 input where SEL+CONCAT may not. */
40153 if (d->one_operand_p)
40155 int mask = nelt - 1;
40156 bool identity_perm = true;
40157 bool broadcast_perm = true;
40159 for (i = 0; i < nelt; i++)
40161 perm2[i] = d->perm[i] & mask;
40162 if (perm2[i] != i)
40163 identity_perm = false;
40164 if (perm2[i])
40165 broadcast_perm = false;
40168 if (identity_perm)
40170 if (!d->testing_p)
40171 emit_move_insn (d->target, d->op0);
40172 return true;
40174 else if (broadcast_perm && TARGET_AVX2)
40176 /* Use vpbroadcast{b,w,d}. */
40177 rtx (*gen) (rtx, rtx) = NULL;
40178 switch (d->vmode)
40180 case V32QImode:
40181 gen = gen_avx2_pbroadcastv32qi_1;
40182 break;
40183 case V16HImode:
40184 gen = gen_avx2_pbroadcastv16hi_1;
40185 break;
40186 case V8SImode:
40187 gen = gen_avx2_pbroadcastv8si_1;
40188 break;
40189 case V16QImode:
40190 gen = gen_avx2_pbroadcastv16qi;
40191 break;
40192 case V8HImode:
40193 gen = gen_avx2_pbroadcastv8hi;
40194 break;
40195 case V8SFmode:
40196 gen = gen_avx2_vec_dupv8sf_1;
40197 break;
40198 /* For other modes prefer other shuffles this function creates. */
40199 default: break;
40201 if (gen != NULL)
40203 if (!d->testing_p)
40204 emit_insn (gen (d->target, d->op0));
40205 return true;
40209 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40210 return true;
40212 /* There are plenty of patterns in sse.md that are written for
40213 SEL+CONCAT and are not replicated for a single op. Perhaps
40214 that should be changed, to avoid the nastiness here. */
40216 /* Recognize interleave style patterns, which means incrementing
40217 every other permutation operand. */
40218 for (i = 0; i < nelt; i += 2)
40220 perm2[i] = d->perm[i] & mask;
40221 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40223 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40224 d->testing_p))
40225 return true;
40227 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
40228 if (nelt >= 4)
40230 for (i = 0; i < nelt; i += 4)
40232 perm2[i + 0] = d->perm[i + 0] & mask;
40233 perm2[i + 1] = d->perm[i + 1] & mask;
40234 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40235 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40238 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40239 d->testing_p))
40240 return true;
40244 /* Finally, try the fully general two operand permute. */
40245 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40246 d->testing_p))
40247 return true;
40249 /* Recognize interleave style patterns with reversed operands. */
40250 if (!d->one_operand_p)
40252 for (i = 0; i < nelt; ++i)
40254 unsigned e = d->perm[i];
40255 if (e >= nelt)
40256 e -= nelt;
40257 else
40258 e += nelt;
40259 perm2[i] = e;
40262 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40263 d->testing_p))
40264 return true;
40267 /* Try the SSE4.1 blend variable merge instructions. */
40268 if (expand_vec_perm_blend (d))
40269 return true;
40271 /* Try one of the AVX vpermil variable permutations. */
40272 if (expand_vec_perm_vpermil (d))
40273 return true;
40275 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40276 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40277 if (expand_vec_perm_pshufb (d))
40278 return true;
40280 return false;
40283 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40284 in terms of a pair of pshuflw + pshufhw instructions. */
40286 static bool
40287 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40289 unsigned char perm2[MAX_VECT_LEN];
40290 unsigned i;
40291 bool ok;
40293 if (d->vmode != V8HImode || !d->one_operand_p)
40294 return false;
40296 /* The two permutations only operate in 64-bit lanes. */
40297 for (i = 0; i < 4; ++i)
40298 if (d->perm[i] >= 4)
40299 return false;
40300 for (i = 4; i < 8; ++i)
40301 if (d->perm[i] < 4)
40302 return false;
40304 if (d->testing_p)
40305 return true;
40307 /* Emit the pshuflw. */
40308 memcpy (perm2, d->perm, 4);
40309 for (i = 4; i < 8; ++i)
40310 perm2[i] = i;
40311 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40312 gcc_assert (ok);
40314 /* Emit the pshufhw. */
40315 memcpy (perm2 + 4, d->perm + 4, 4);
40316 for (i = 0; i < 4; ++i)
40317 perm2[i] = i;
40318 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40319 gcc_assert (ok);
40321 return true;
40324 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40325 the permutation using the SSSE3 palignr instruction. This succeeds
40326 when all of the elements in PERM fit within one vector and we merely
40327 need to shift them down so that a single vector permutation has a
40328 chance to succeed. */
40330 static bool
40331 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40333 unsigned i, nelt = d->nelt;
40334 unsigned min, max;
40335 bool in_order, ok;
40336 rtx shift, target;
40337 struct expand_vec_perm_d dcopy;
40339 /* Even with AVX, palignr only operates on 128-bit vectors. */
40340 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40341 return false;
40343 min = nelt, max = 0;
40344 for (i = 0; i < nelt; ++i)
40346 unsigned e = d->perm[i];
40347 if (e < min)
40348 min = e;
40349 if (e > max)
40350 max = e;
40352 if (min == 0 || max - min >= nelt)
40353 return false;
40355 /* Given that we have SSSE3, we know we'll be able to implement the
40356 single operand permutation after the palignr with pshufb. */
40357 if (d->testing_p)
40358 return true;
40360 dcopy = *d;
40361 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40362 target = gen_reg_rtx (TImode);
40363 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40364 gen_lowpart (TImode, d->op0), shift));
40366 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40367 dcopy.one_operand_p = true;
40369 in_order = true;
40370 for (i = 0; i < nelt; ++i)
40372 unsigned e = dcopy.perm[i] - min;
40373 if (e != i)
40374 in_order = false;
40375 dcopy.perm[i] = e;
40378 /* Test for the degenerate case where the alignment by itself
40379 produces the desired permutation. */
40380 if (in_order)
40382 emit_move_insn (d->target, dcopy.op0);
40383 return true;
40386 ok = expand_vec_perm_1 (&dcopy);
40387 gcc_assert (ok);
40389 return ok;
40392 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40394 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40395 a two vector permutation into a single vector permutation by using
40396 an interleave operation to merge the vectors. */
40398 static bool
40399 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40401 struct expand_vec_perm_d dremap, dfinal;
40402 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40403 unsigned HOST_WIDE_INT contents;
40404 unsigned char remap[2 * MAX_VECT_LEN];
40405 rtx seq;
40406 bool ok, same_halves = false;
40408 if (GET_MODE_SIZE (d->vmode) == 16)
40410 if (d->one_operand_p)
40411 return false;
40413 else if (GET_MODE_SIZE (d->vmode) == 32)
40415 if (!TARGET_AVX)
40416 return false;
40417 /* For 32-byte modes allow even d->one_operand_p.
40418 The lack of cross-lane shuffling in some instructions
40419 might prevent a single insn shuffle. */
40420 dfinal = *d;
40421 dfinal.testing_p = true;
40422 /* If expand_vec_perm_interleave3 can expand this into
40423 a 3 insn sequence, give up and let it be expanded as
40424 a 3 insn sequence. While that is one insn longer,
40425 it doesn't need a memory operand and in the common
40426 case that both interleave low and high permutations
40427 with the same operands are adjacent needs 4 insns
40428 for both after CSE. */
40429 if (expand_vec_perm_interleave3 (&dfinal))
40430 return false;
40432 else
40433 return false;
40435 /* Examine from whence the elements come. */
40436 contents = 0;
40437 for (i = 0; i < nelt; ++i)
40438 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40440 memset (remap, 0xff, sizeof (remap));
40441 dremap = *d;
40443 if (GET_MODE_SIZE (d->vmode) == 16)
40445 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40447 /* Split the two input vectors into 4 halves. */
40448 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40449 h2 = h1 << nelt2;
40450 h3 = h2 << nelt2;
40451 h4 = h3 << nelt2;
40453 /* If the elements are from the low halves, use interleave low; similarly
40454 for interleave high. If the elements are from mis-matched halves, we
40455 can use shufps for V4SF/V4SI or do a DImode shuffle. */
40456 if ((contents & (h1 | h3)) == contents)
40458 /* punpckl* */
40459 for (i = 0; i < nelt2; ++i)
40461 remap[i] = i * 2;
40462 remap[i + nelt] = i * 2 + 1;
40463 dremap.perm[i * 2] = i;
40464 dremap.perm[i * 2 + 1] = i + nelt;
40466 if (!TARGET_SSE2 && d->vmode == V4SImode)
40467 dremap.vmode = V4SFmode;
40469 else if ((contents & (h2 | h4)) == contents)
40471 /* punpckh* */
40472 for (i = 0; i < nelt2; ++i)
40474 remap[i + nelt2] = i * 2;
40475 remap[i + nelt + nelt2] = i * 2 + 1;
40476 dremap.perm[i * 2] = i + nelt2;
40477 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40479 if (!TARGET_SSE2 && d->vmode == V4SImode)
40480 dremap.vmode = V4SFmode;
40482 else if ((contents & (h1 | h4)) == contents)
40484 /* shufps */
40485 for (i = 0; i < nelt2; ++i)
40487 remap[i] = i;
40488 remap[i + nelt + nelt2] = i + nelt2;
40489 dremap.perm[i] = i;
40490 dremap.perm[i + nelt2] = i + nelt + nelt2;
40492 if (nelt != 4)
40494 /* shufpd */
40495 dremap.vmode = V2DImode;
40496 dremap.nelt = 2;
40497 dremap.perm[0] = 0;
40498 dremap.perm[1] = 3;
40501 else if ((contents & (h2 | h3)) == contents)
40503 /* shufps */
40504 for (i = 0; i < nelt2; ++i)
40506 remap[i + nelt2] = i;
40507 remap[i + nelt] = i + nelt2;
40508 dremap.perm[i] = i + nelt2;
40509 dremap.perm[i + nelt2] = i + nelt;
40511 if (nelt != 4)
40513 /* shufpd */
40514 dremap.vmode = V2DImode;
40515 dremap.nelt = 2;
40516 dremap.perm[0] = 1;
40517 dremap.perm[1] = 2;
40520 else
40521 return false;
40523 else
40525 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40526 unsigned HOST_WIDE_INT q[8];
40527 unsigned int nonzero_halves[4];
40529 /* Split the two input vectors into 8 quarters. */
40530 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40531 for (i = 1; i < 8; ++i)
40532 q[i] = q[0] << (nelt4 * i);
40533 for (i = 0; i < 4; ++i)
40534 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40536 nonzero_halves[nzcnt] = i;
40537 ++nzcnt;
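/* nonzero_halves[] now records which of the four 128-bit lanes
   (op0 low, op0 high, op1 low, op1 high) the permutation actually
   references; nzcnt is how many of them are used.  */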
40540 if (nzcnt == 1)
40542 gcc_assert (d->one_operand_p);
40543 nonzero_halves[1] = nonzero_halves[0];
40544 same_halves = true;
40546 else if (d->one_operand_p)
40548 gcc_assert (nonzero_halves[0] == 0);
40549 gcc_assert (nonzero_halves[1] == 1);
40552 if (nzcnt <= 2)
40554 if (d->perm[0] / nelt2 == nonzero_halves[1])
40556 /* Attempt to increase the likelihood that dfinal
40557 shuffle will be intra-lane. */
40558 char tmph = nonzero_halves[0];
40559 nonzero_halves[0] = nonzero_halves[1];
40560 nonzero_halves[1] = tmph;
40563 /* vperm2f128 or vperm2i128. */
40564 for (i = 0; i < nelt2; ++i)
40566 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40567 remap[i + nonzero_halves[0] * nelt2] = i;
40568 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40569 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40572 if (d->vmode != V8SFmode
40573 && d->vmode != V4DFmode
40574 && d->vmode != V8SImode)
40576 dremap.vmode = V8SImode;
40577 dremap.nelt = 8;
40578 for (i = 0; i < 4; ++i)
40580 dremap.perm[i] = i + nonzero_halves[0] * 4;
40581 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40585 else if (d->one_operand_p)
40586 return false;
40587 else if (TARGET_AVX2
40588 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40590 /* vpunpckl* */
40591 for (i = 0; i < nelt4; ++i)
40593 remap[i] = i * 2;
40594 remap[i + nelt] = i * 2 + 1;
40595 remap[i + nelt2] = i * 2 + nelt2;
40596 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40597 dremap.perm[i * 2] = i;
40598 dremap.perm[i * 2 + 1] = i + nelt;
40599 dremap.perm[i * 2 + nelt2] = i + nelt2;
40600 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40603 else if (TARGET_AVX2
40604 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40606 /* vpunpckh* */
40607 for (i = 0; i < nelt4; ++i)
40609 remap[i + nelt4] = i * 2;
40610 remap[i + nelt + nelt4] = i * 2 + 1;
40611 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40612 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40613 dremap.perm[i * 2] = i + nelt4;
40614 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40615 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40616 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40619 else
40620 return false;
40623 /* Use the remapping array set up above to move the elements from their
40624 swizzled locations into their final destinations. */
40625 dfinal = *d;
40626 for (i = 0; i < nelt; ++i)
40628 unsigned e = remap[d->perm[i]];
40629 gcc_assert (e < nelt);
40630 /* If same_halves is true, both halves of the remapped vector are the
40631 same. Avoid cross-lane accesses if possible. */
40632 if (same_halves && i >= nelt2)
40634 gcc_assert (e < nelt2);
40635 dfinal.perm[i] = e + nelt2;
40637 else
40638 dfinal.perm[i] = e;
40640 dremap.target = gen_reg_rtx (dremap.vmode);
40641 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40642 dfinal.op1 = dfinal.op0;
40643 dfinal.one_operand_p = true;
40645 /* Test if the final remap can be done with a single insn. For V4SFmode or
40646 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40647 start_sequence ();
40648 ok = expand_vec_perm_1 (&dfinal);
40649 seq = get_insns ();
40650 end_sequence ();
40652 if (!ok)
40653 return false;
40655 if (d->testing_p)
40656 return true;
40658 if (dremap.vmode != dfinal.vmode)
40660 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40661 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40664 ok = expand_vec_perm_1 (&dremap);
40665 gcc_assert (ok);
40667 emit_insn (seq);
40668 return true;
40671 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40672 a single vector cross-lane permutation into vpermq followed
40673 by any of the single insn permutations. */
40675 static bool
40676 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40678 struct expand_vec_perm_d dremap, dfinal;
40679 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40680 unsigned contents[2];
40681 bool ok;
40683 if (!(TARGET_AVX2
40684 && (d->vmode == V32QImode || d->vmode == V16HImode)
40685 && d->one_operand_p))
40686 return false;
40688 contents[0] = 0;
40689 contents[1] = 0;
40690 for (i = 0; i < nelt2; ++i)
40692 contents[0] |= 1u << (d->perm[i] / nelt4);
40693 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40696 for (i = 0; i < 2; ++i)
40698 unsigned int cnt = 0;
40699 for (j = 0; j < 4; ++j)
40700 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40701 return false;
40704 if (d->testing_p)
40705 return true;
40707 dremap = *d;
40708 dremap.vmode = V4DImode;
40709 dremap.nelt = 4;
40710 dremap.target = gen_reg_rtx (V4DImode);
40711 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40712 dremap.op1 = dremap.op0;
40713 dremap.one_operand_p = true;
40714 for (i = 0; i < 2; ++i)
40716 unsigned int cnt = 0;
40717 for (j = 0; j < 4; ++j)
40718 if ((contents[i] & (1u << j)) != 0)
40719 dremap.perm[2 * i + cnt++] = j;
40720 for (; cnt < 2; ++cnt)
40721 dremap.perm[2 * i + cnt] = 0;
40724 dfinal = *d;
40725 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40726 dfinal.op1 = dfinal.op0;
40727 dfinal.one_operand_p = true;
40728 for (i = 0, j = 0; i < nelt; ++i)
40730 if (i == nelt2)
40731 j = 2;
40732 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40733 if ((d->perm[i] / nelt4) == dremap.perm[j])
40735 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40736 dfinal.perm[i] |= nelt4;
40737 else
40738 gcc_unreachable ();
40741 ok = expand_vec_perm_1 (&dremap);
40742 gcc_assert (ok);
40744 ok = expand_vec_perm_1 (&dfinal);
40745 gcc_assert (ok);
40747 return true;
40750 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
40751 a vector permutation using two instructions, vperm2f128 resp.
40752 vperm2i128 followed by any single in-lane permutation. */
40754 static bool
40755 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40757 struct expand_vec_perm_d dfirst, dsecond;
40758 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40759 bool ok;
40761 if (!TARGET_AVX
40762 || GET_MODE_SIZE (d->vmode) != 32
40763 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40764 return false;
40766 dsecond = *d;
40767 dsecond.one_operand_p = false;
40768 dsecond.testing_p = true;
40770 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40771 immediate. For perm < 16 the second permutation uses
40772 d->op0 as first operand, for perm >= 16 it uses d->op1
40773 as first operand. The second operand is the result of
40774 vperm2[fi]128. */
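/* In the loop below, perm & 3 selects which 128-bit lane (0/1 from d->op0,
   2/3 from d->op1) lands in the low half of the vperm2[fi]128 result,
   (perm >> 2) & 3 selects the lane for its high half, and bit 4 of perm
   selects whether the second shuffle takes d->op0 or d->op1 as its
   first operand.  */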
40775 for (perm = 0; perm < 32; perm++)
40777 /* Ignore permutations which do not move anything cross-lane. */
40778 if (perm < 16)
40780 /* The second shuffle for e.g. V4DFmode has
40781 0123 and ABCD operands.
40782 Ignore AB23, as 23 is already in the second lane
40783 of the first operand. */
40784 if ((perm & 0xc) == (1 << 2)) continue;
40785 /* And 01CD, as 01 is in the first lane of the first
40786 operand. */
40787 if ((perm & 3) == 0) continue;
40788 /* And 4567, as then the vperm2[fi]128 doesn't change
40789 anything on the original 4567 second operand. */
40790 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40792 else
40794 /* The second shuffle for e.g. V4DFmode has
40795 4567 and ABCD operands.
40796 Ignore AB67, as 67 is already in the second lane
40797 of the first operand. */
40798 if ((perm & 0xc) == (3 << 2)) continue;
40799 /* And 45CD, as 45 is in the first lane of the first
40800 operand. */
40801 if ((perm & 3) == 2) continue;
40802 /* And 0123, as then the vperm2[fi]128 doesn't change
40803 anything on the original 0123 first operand. */
40804 if ((perm & 0xf) == (1 << 2)) continue;
40807 for (i = 0; i < nelt; i++)
40809 j = d->perm[i] / nelt2;
40810 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40811 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40812 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40813 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40814 else
40815 break;
40818 if (i == nelt)
40820 start_sequence ();
40821 ok = expand_vec_perm_1 (&dsecond);
40822 end_sequence ();
40824 else
40825 ok = false;
40827 if (ok)
40829 if (d->testing_p)
40830 return true;
40832 /* Found a usable second shuffle. dfirst will be
40833 vperm2f128 on d->op0 and d->op1. */
40834 dsecond.testing_p = false;
40835 dfirst = *d;
40836 dfirst.target = gen_reg_rtx (d->vmode);
40837 for (i = 0; i < nelt; i++)
40838 dfirst.perm[i] = (i & (nelt2 - 1))
40839 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40841 ok = expand_vec_perm_1 (&dfirst);
40842 gcc_assert (ok);
40844 /* And dsecond is some single insn shuffle, taking
40845 d->op0 and result of vperm2f128 (if perm < 16) or
40846 d->op1 and result of vperm2f128 (otherwise). */
40847 dsecond.op1 = dfirst.target;
40848 if (perm >= 16)
40849 dsecond.op0 = dfirst.op1;
40851 ok = expand_vec_perm_1 (&dsecond);
40852 gcc_assert (ok);
40854 return true;
40857 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40858 if (d->one_operand_p)
40859 return false;
40862 return false;
40865 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40866 a two vector permutation using 2 intra-lane interleave insns
40867 and cross-lane shuffle for 32-byte vectors. */
40869 static bool
40870 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40872 unsigned i, nelt;
40873 rtx (*gen) (rtx, rtx, rtx);
40875 if (d->one_operand_p)
40876 return false;
40877 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40879 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40881 else
40882 return false;
40884 nelt = d->nelt;
40885 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40886 return false;
40887 for (i = 0; i < nelt; i += 2)
40888 if (d->perm[i] != d->perm[0] + i / 2
40889 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40890 return false;
40892 if (d->testing_p)
40893 return true;
40895 switch (d->vmode)
40897 case V32QImode:
40898 if (d->perm[0])
40899 gen = gen_vec_interleave_highv32qi;
40900 else
40901 gen = gen_vec_interleave_lowv32qi;
40902 break;
40903 case V16HImode:
40904 if (d->perm[0])
40905 gen = gen_vec_interleave_highv16hi;
40906 else
40907 gen = gen_vec_interleave_lowv16hi;
40908 break;
40909 case V8SImode:
40910 if (d->perm[0])
40911 gen = gen_vec_interleave_highv8si;
40912 else
40913 gen = gen_vec_interleave_lowv8si;
40914 break;
40915 case V4DImode:
40916 if (d->perm[0])
40917 gen = gen_vec_interleave_highv4di;
40918 else
40919 gen = gen_vec_interleave_lowv4di;
40920 break;
40921 case V8SFmode:
40922 if (d->perm[0])
40923 gen = gen_vec_interleave_highv8sf;
40924 else
40925 gen = gen_vec_interleave_lowv8sf;
40926 break;
40927 case V4DFmode:
40928 if (d->perm[0])
40929 gen = gen_vec_interleave_highv4df;
40930 else
40931 gen = gen_vec_interleave_lowv4df;
40932 break;
40933 default:
40934 gcc_unreachable ();
40937 emit_insn (gen (d->target, d->op0, d->op1));
40938 return true;
40941 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
40942 a single vector permutation using a single intra-lane vector
40943 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40944 the non-swapped and swapped vectors together. */
40946 static bool
40947 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40949 struct expand_vec_perm_d dfirst, dsecond;
40950 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40951 rtx seq;
40952 bool ok;
40953 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40955 if (!TARGET_AVX
40956 || TARGET_AVX2
40957 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40958 || !d->one_operand_p)
40959 return false;
40961 dfirst = *d;
40962 for (i = 0; i < nelt; i++)
40963 dfirst.perm[i] = 0xff;
40964 for (i = 0, msk = 0; i < nelt; i++)
40966 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40967 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40968 return false;
40969 dfirst.perm[j] = d->perm[i];
40970 if (j != i)
40971 msk |= (1 << i);
40973 for (i = 0; i < nelt; i++)
40974 if (dfirst.perm[i] == 0xff)
40975 dfirst.perm[i] = i;
40977 if (!d->testing_p)
40978 dfirst.target = gen_reg_rtx (dfirst.vmode);
40980 start_sequence ();
40981 ok = expand_vec_perm_1 (&dfirst);
40982 seq = get_insns ();
40983 end_sequence ();
40985 if (!ok)
40986 return false;
40988 if (d->testing_p)
40989 return true;
40991 emit_insn (seq);
40993 dsecond = *d;
40994 dsecond.op0 = dfirst.target;
40995 dsecond.op1 = dfirst.target;
40996 dsecond.one_operand_p = true;
40997 dsecond.target = gen_reg_rtx (dsecond.vmode);
40998 for (i = 0; i < nelt; i++)
40999 dsecond.perm[i] = i ^ nelt2;
41001 ok = expand_vec_perm_1 (&dsecond);
41002 gcc_assert (ok);
41004 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
41005 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
41006 return true;
41009 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
41010 permutation using two vperm2f128, followed by a vshufpd insn blending
41011 the two vectors together. */
41013 static bool
41014 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
41016 struct expand_vec_perm_d dfirst, dsecond, dthird;
41017 bool ok;
41019 if (!TARGET_AVX || (d->vmode != V4DFmode))
41020 return false;
41022 if (d->testing_p)
41023 return true;
41025 dfirst = *d;
41026 dsecond = *d;
41027 dthird = *d;
41029 dfirst.perm[0] = (d->perm[0] & ~1);
41030 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
41031 dfirst.perm[2] = (d->perm[2] & ~1);
41032 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
41033 dsecond.perm[0] = (d->perm[1] & ~1);
41034 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
41035 dsecond.perm[2] = (d->perm[3] & ~1);
41036 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
41037 dthird.perm[0] = (d->perm[0] % 2);
41038 dthird.perm[1] = (d->perm[1] % 2) + 4;
41039 dthird.perm[2] = (d->perm[2] % 2) + 2;
41040 dthird.perm[3] = (d->perm[3] % 2) + 6;
41042 dfirst.target = gen_reg_rtx (dfirst.vmode);
41043 dsecond.target = gen_reg_rtx (dsecond.vmode);
41044 dthird.op0 = dfirst.target;
41045 dthird.op1 = dsecond.target;
41046 dthird.one_operand_p = false;
41048 canonicalize_perm (&dfirst);
41049 canonicalize_perm (&dsecond);
41051 ok = expand_vec_perm_1 (&dfirst)
41052 && expand_vec_perm_1 (&dsecond)
41053 && expand_vec_perm_1 (&dthird);
41055 gcc_assert (ok);
41057 return true;
41060 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
41061 permutation with two pshufb insns and an ior. We should have already
41062 failed all two instruction sequences. */
41064 static bool
41065 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
41067 rtx rperm[2][16], vperm, l, h, op, m128;
41068 unsigned int i, nelt, eltsz;
41070 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
41071 return false;
41072 gcc_assert (!d->one_operand_p);
41074 nelt = d->nelt;
41075 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41077 /* Generate two permutation masks. If the required element is within
41078 the given vector it is shuffled into the proper lane. If the required
41079 element is in the other vector, force a zero into the lane by setting
41080 bit 7 in the permutation mask. */
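/* Each byte position is filled from exactly one of the two pshufb results
   below (the corresponding byte of the other is forced to zero), so a
   single por merges them into the final permutation.  */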
41081 m128 = GEN_INT (-128);
41082 for (i = 0; i < nelt; ++i)
41084 unsigned j, e = d->perm[i];
41085 unsigned which = (e >= nelt);
41086 if (e >= nelt)
41087 e -= nelt;
41089 for (j = 0; j < eltsz; ++j)
41091 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
41092 rperm[1-which][i*eltsz + j] = m128;
41096 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
41097 vperm = force_reg (V16QImode, vperm);
41099 l = gen_reg_rtx (V16QImode);
41100 op = gen_lowpart (V16QImode, d->op0);
41101 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
41103 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
41104 vperm = force_reg (V16QImode, vperm);
41106 h = gen_reg_rtx (V16QImode);
41107 op = gen_lowpart (V16QImode, d->op1);
41108 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
41110 op = d->target;
41111 if (d->vmode != V16QImode)
41112 op = gen_reg_rtx (V16QImode);
41113 emit_insn (gen_iorv16qi3 (op, l, h));
41114 if (op != d->target)
41115 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41117 return true;
41120 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
41121 with two vpshufb insns, vpermq and vpor. We should have already failed
41122 all two or three instruction sequences. */
41124 static bool
41125 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
41127 rtx rperm[2][32], vperm, l, h, hp, op, m128;
41128 unsigned int i, nelt, eltsz;
41130 if (!TARGET_AVX2
41131 || !d->one_operand_p
41132 || (d->vmode != V32QImode && d->vmode != V16HImode))
41133 return false;
41135 if (d->testing_p)
41136 return true;
41138 nelt = d->nelt;
41139 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41141 /* Generate two permutation masks. If the required element is within
41142 the same lane, it is shuffled in. If the required element is from the
41143 other lane, force a zero by setting bit 7 in the permutation mask.
41144 The other mask has a non-negative element wherever an element is
41145 requested from the other lane, but moved to the other lane, so that
41146 the two V2TImode halves of the vpshufb result can then be
41147 swapped. */
41148 m128 = GEN_INT (-128);
41149 for (i = 0; i < nelt; ++i)
41151 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41152 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41154 for (j = 0; j < eltsz; ++j)
41156 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41157 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41161 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41162 vperm = force_reg (V32QImode, vperm);
41164 h = gen_reg_rtx (V32QImode);
41165 op = gen_lowpart (V32QImode, d->op0);
41166 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41168 /* Swap the 128-bit lanes of h into hp. */
41169 hp = gen_reg_rtx (V4DImode);
41170 op = gen_lowpart (V4DImode, h);
41171 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41172 const1_rtx));
41174 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41175 vperm = force_reg (V32QImode, vperm);
41177 l = gen_reg_rtx (V32QImode);
41178 op = gen_lowpart (V32QImode, d->op0);
41179 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41181 op = d->target;
41182 if (d->vmode != V32QImode)
41183 op = gen_reg_rtx (V32QImode);
41184 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41185 if (op != d->target)
41186 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41188 return true;
41191 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41192 and extract-odd permutations of two V32QImode or V16HImode operands
41193 with two vpshufb insns, vpor and vpermq. We should have already
41194 failed all two or three instruction sequences. */
41196 static bool
41197 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41199 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41200 unsigned int i, nelt, eltsz;
41202 if (!TARGET_AVX2
41203 || d->one_operand_p
41204 || (d->vmode != V32QImode && d->vmode != V16HImode))
41205 return false;
41207 for (i = 0; i < d->nelt; ++i)
41208 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41209 return false;
41211 if (d->testing_p)
41212 return true;
41214 nelt = d->nelt;
41215 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41217 /* Generate two permutation masks. In the first permutation mask
41218 the first quarter will contain indexes for the first half
41219 of op0, the second quarter will have bit 7 set, the third quarter
41220 will contain indexes for the second half of op0 and the
41221 last quarter will have bit 7 set. In the second permutation mask
41222 the first quarter will have bit 7 set, the second quarter
41223 indexes for the first half of op1, the third quarter bit 7 set
41224 and the last quarter indexes for the second half of op1.
41225 I.e. the first mask e.g. for V32QImode extract even will be:
41226 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41227 (all values masked with 0xf except for -128) and second mask
41228 for extract even will be
41229 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41230 m128 = GEN_INT (-128);
41231 for (i = 0; i < nelt; ++i)
41233 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41234 unsigned which = d->perm[i] >= nelt;
41235 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41237 for (j = 0; j < eltsz; ++j)
41239 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41240 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41244 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41245 vperm = force_reg (V32QImode, vperm);
41247 l = gen_reg_rtx (V32QImode);
41248 op = gen_lowpart (V32QImode, d->op0);
41249 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41251 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41252 vperm = force_reg (V32QImode, vperm);
41254 h = gen_reg_rtx (V32QImode);
41255 op = gen_lowpart (V32QImode, d->op1);
41256 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41258 ior = gen_reg_rtx (V32QImode);
41259 emit_insn (gen_iorv32qi3 (ior, l, h));
41261 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41262 op = gen_reg_rtx (V4DImode);
41263 ior = gen_lowpart (V4DImode, ior);
41264 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41265 const1_rtx, GEN_INT (3)));
41266 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41268 return true;
41271 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
41272 and extract-odd permutations. */
41274 static bool
41275 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41277 rtx t1, t2, t3, t4, t5;
41279 switch (d->vmode)
41281 case V4DFmode:
41282 t1 = gen_reg_rtx (V4DFmode);
41283 t2 = gen_reg_rtx (V4DFmode);
41285 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41286 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41287 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41289 /* Now an unpck[lh]pd will produce the result required. */
41290 if (odd)
41291 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41292 else
41293 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41294 emit_insn (t3);
41295 break;
41297 case V8SFmode:
41299 int mask = odd ? 0xdd : 0x88;
41301 t1 = gen_reg_rtx (V8SFmode);
41302 t2 = gen_reg_rtx (V8SFmode);
41303 t3 = gen_reg_rtx (V8SFmode);
41305 /* Shuffle within the 128-bit lanes to produce:
41306 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41307 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41308 GEN_INT (mask)));
41310 /* Shuffle the lanes around to produce:
41311 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41312 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41313 GEN_INT (0x3)));
41315 /* Shuffle within the 128-bit lanes to produce:
41316 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41317 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41319 /* Shuffle within the 128-bit lanes to produce:
41320 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41321 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41323 /* Shuffle the lanes around to produce:
41324 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41325 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41326 GEN_INT (0x20)));
41328 break;
41330 case V2DFmode:
41331 case V4SFmode:
41332 case V2DImode:
41333 case V4SImode:
41334 /* These are always directly implementable by expand_vec_perm_1. */
41335 gcc_unreachable ();
41337 case V8HImode:
41338 if (TARGET_SSSE3)
41339 return expand_vec_perm_pshufb2 (d);
41340 else
41342 /* We need 2*log2(N)-1 operations to achieve odd/even
41343 with interleave. */
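/* Trace for operands { 0 1 2 3 4 5 6 7 } and { 8 9 a b c d e f }:
   after the first pair of interleaves the target holds
   { 0 8 1 9 2 a 3 b } and t1 holds { 4 c 5 d 6 e 7 f }; after the
   second pair the target holds { 0 4 8 c 1 5 9 d } and t2 holds
   { 2 6 a e 3 7 b f }; the final interleave then yields
   { 0 2 4 6 8 a c e } (even) or { 1 3 5 7 9 b d f } (odd).  */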
41344 t1 = gen_reg_rtx (V8HImode);
41345 t2 = gen_reg_rtx (V8HImode);
41346 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41347 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41348 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41349 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41350 if (odd)
41351 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41352 else
41353 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41354 emit_insn (t3);
41356 break;
41358 case V16QImode:
41359 if (TARGET_SSSE3)
41360 return expand_vec_perm_pshufb2 (d);
41361 else
41363 t1 = gen_reg_rtx (V16QImode);
41364 t2 = gen_reg_rtx (V16QImode);
41365 t3 = gen_reg_rtx (V16QImode);
41366 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41367 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41368 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41369 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41370 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41371 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41372 if (odd)
41373 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41374 else
41375 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41376 emit_insn (t3);
41378 break;
41380 case V16HImode:
41381 case V32QImode:
41382 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41384 case V4DImode:
41385 if (!TARGET_AVX2)
41387 struct expand_vec_perm_d d_copy = *d;
41388 d_copy.vmode = V4DFmode;
41389 d_copy.target = gen_reg_rtx (V4DFmode);
41390 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41391 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41392 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41394 if (!d->testing_p)
41395 emit_move_insn (d->target,
41396 gen_lowpart (V4DImode, d_copy.target));
41397 return true;
41399 return false;
41402 t1 = gen_reg_rtx (V4DImode);
41403 t2 = gen_reg_rtx (V4DImode);
41405 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41406 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41407 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41409 /* Now an vpunpck[lh]qdq will produce the result required. */
41410 if (odd)
41411 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41412 else
41413 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41414 emit_insn (t3);
41415 break;
41417 case V8SImode:
41418 if (!TARGET_AVX2)
41420 struct expand_vec_perm_d d_copy = *d;
41421 d_copy.vmode = V8SFmode;
41422 d_copy.target = gen_reg_rtx (V8SFmode);
41423 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41424 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41425 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41427 if (!d->testing_p)
41428 emit_move_insn (d->target,
41429 gen_lowpart (V8SImode, d_copy.target));
41430 return true;
41432 return false;
41435 t1 = gen_reg_rtx (V8SImode);
41436 t2 = gen_reg_rtx (V8SImode);
41437 t3 = gen_reg_rtx (V4DImode);
41438 t4 = gen_reg_rtx (V4DImode);
41439 t5 = gen_reg_rtx (V4DImode);
41441 /* Shuffle the lanes around into
41442 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41443 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41444 gen_lowpart (V4DImode, d->op1),
41445 GEN_INT (0x20)));
41446 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41447 gen_lowpart (V4DImode, d->op1),
41448 GEN_INT (0x31)));
41450 /* Swap the 2nd and 3rd position in each lane into
41451 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41452 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41453 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41454 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41455 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41457 /* Now an vpunpck[lh]qdq will produce
41458 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41459 if (odd)
41460 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41461 gen_lowpart (V4DImode, t2));
41462 else
41463 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41464 gen_lowpart (V4DImode, t2));
41465 emit_insn (t3);
41466 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41467 break;
41469 default:
41470 gcc_unreachable ();
41473 return true;
41476 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
41477 extract-even and extract-odd permutations. */
41479 static bool
41480 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41482 unsigned i, odd, nelt = d->nelt;
41484 odd = d->perm[0];
41485 if (odd != 0 && odd != 1)
41486 return false;
41488 for (i = 1; i < nelt; ++i)
41489 if (d->perm[i] != 2 * i + odd)
41490 return false;
41492 return expand_vec_perm_even_odd_1 (d, odd);
41495 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
41496 permutations. We assume that expand_vec_perm_1 has already failed. */
41498 static bool
41499 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41501 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41502 enum machine_mode vmode = d->vmode;
41503 unsigned char perm2[4];
41504 rtx op0 = d->op0, dest;
41505 bool ok;
41507 switch (vmode)
41509 case V4DFmode:
41510 case V8SFmode:
41511 /* These are special-cased in sse.md so that we can optionally
41512 use the vbroadcast instruction. They expand to two insns
41513 if the input happens to be in a register. */
41514 gcc_unreachable ();
41516 case V2DFmode:
41517 case V2DImode:
41518 case V4SFmode:
41519 case V4SImode:
41520 /* These are always implementable using standard shuffle patterns. */
41521 gcc_unreachable ();
41523 case V8HImode:
41524 case V16QImode:
41525 /* These can be implemented via interleave. We save one insn by
41526 stopping once we have promoted to V4SImode and then use pshufd. */
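/* E.g. to broadcast element 5 of a V8HImode vector: punpckhwd of the
   operand with itself yields { 4 4 5 5 6 6 7 7 }; viewed as V4SImode
   the { 5 5 } pair is element 1, which the pshufd below then
   replicates across the whole vector.  */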
41529 rtx dest;
41530 rtx (*gen) (rtx, rtx, rtx)
41531 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41532 : gen_vec_interleave_lowv8hi;
41534 if (elt >= nelt2)
41536 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41537 : gen_vec_interleave_highv8hi;
41538 elt -= nelt2;
41540 nelt2 /= 2;
41542 dest = gen_reg_rtx (vmode);
41543 emit_insn (gen (dest, op0, op0));
41544 vmode = get_mode_wider_vector (vmode);
41545 op0 = gen_lowpart (vmode, dest);
41547 while (vmode != V4SImode);
41549 memset (perm2, elt, 4);
41550 dest = gen_reg_rtx (V4SImode);
41551 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41552 gcc_assert (ok);
41553 if (!d->testing_p)
41554 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41555 return true;
41557 case V32QImode:
41558 case V16HImode:
41559 case V8SImode:
41560 case V4DImode:
41561 /* For AVX2 broadcasts of the first element vpbroadcast* or
41562 vpermq should be used by expand_vec_perm_1. */
41563 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41564 return false;
41566 default:
41567 gcc_unreachable ();
41571 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
41572 broadcast permutations. */
41574 static bool
41575 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41577 unsigned i, elt, nelt = d->nelt;
41579 if (!d->one_operand_p)
41580 return false;
41582 elt = d->perm[0];
41583 for (i = 1; i < nelt; ++i)
41584 if (d->perm[i] != elt)
41585 return false;
41587 return expand_vec_perm_broadcast_1 (d);
41590 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41591 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41592 all the shorter instruction sequences. */
41594 static bool
41595 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41597 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41598 unsigned int i, nelt, eltsz;
41599 bool used[4];
41601 if (!TARGET_AVX2
41602 || d->one_operand_p
41603 || (d->vmode != V32QImode && d->vmode != V16HImode))
41604 return false;
41606 if (d->testing_p)
41607 return true;
41609 nelt = d->nelt;
41610 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41612 /* Generate 4 permutation masks. If the required element is within
41613 the same lane, it is shuffled in. If the required element is from the
41614 other lane, force a zero by setting bit 7 in the permutation mask.
41615 The cross-lane masks have non-negative elements wherever an element is
41616 requested from the other lane, but moved to the other lane, so that
41617 the two V2TImode halves of the vpshufb results can then be
41618 swapped. */
41619 m128 = GEN_INT (-128);
41620 for (i = 0; i < 32; ++i)
41622 rperm[0][i] = m128;
41623 rperm[1][i] = m128;
41624 rperm[2][i] = m128;
41625 rperm[3][i] = m128;
41627 used[0] = false;
41628 used[1] = false;
41629 used[2] = false;
41630 used[3] = false;
41631 for (i = 0; i < nelt; ++i)
41633 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41634 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41635 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41637 for (j = 0; j < eltsz; ++j)
41638 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41639 used[which] = true;
41642 for (i = 0; i < 2; ++i)
41644 if (!used[2 * i + 1])
41646 h[i] = NULL_RTX;
41647 continue;
41649 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41650 gen_rtvec_v (32, rperm[2 * i + 1]));
41651 vperm = force_reg (V32QImode, vperm);
41652 h[i] = gen_reg_rtx (V32QImode);
41653 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41654 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41657 /* Swap the 128-bit lanes of h[X]. */
41658 for (i = 0; i < 2; ++i)
41660 if (h[i] == NULL_RTX)
41661 continue;
41662 op = gen_reg_rtx (V4DImode);
41663 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41664 const2_rtx, GEN_INT (3), const0_rtx,
41665 const1_rtx));
41666 h[i] = gen_lowpart (V32QImode, op);
41669 for (i = 0; i < 2; ++i)
41671 if (!used[2 * i])
41673 l[i] = NULL_RTX;
41674 continue;
41676 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41677 vperm = force_reg (V32QImode, vperm);
41678 l[i] = gen_reg_rtx (V32QImode);
41679 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41680 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41683 for (i = 0; i < 2; ++i)
41685 if (h[i] && l[i])
41687 op = gen_reg_rtx (V32QImode);
41688 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41689 l[i] = op;
41691 else if (h[i])
41692 l[i] = h[i];
41695 gcc_assert (l[0] && l[1]);
41696 op = d->target;
41697 if (d->vmode != V32QImode)
41698 op = gen_reg_rtx (V32QImode);
41699 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41700 if (op != d->target)
41701 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41702 return true;
41705 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41706 With all of the interface bits taken care of, perform the expansion
41707 in D and return true on success. */
41709 static bool
41710 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41712 /* Try a single instruction expansion. */
41713 if (expand_vec_perm_1 (d))
41714 return true;
41716 /* Try sequences of two instructions. */
41718 if (expand_vec_perm_pshuflw_pshufhw (d))
41719 return true;
41721 if (expand_vec_perm_palignr (d))
41722 return true;
41724 if (expand_vec_perm_interleave2 (d))
41725 return true;
41727 if (expand_vec_perm_broadcast (d))
41728 return true;
41730 if (expand_vec_perm_vpermq_perm_1 (d))
41731 return true;
41733 if (expand_vec_perm_vperm2f128 (d))
41734 return true;
41736 /* Try sequences of three instructions. */
41738 if (expand_vec_perm_2vperm2f128_vshuf (d))
41739 return true;
41741 if (expand_vec_perm_pshufb2 (d))
41742 return true;
41744 if (expand_vec_perm_interleave3 (d))
41745 return true;
41747 if (expand_vec_perm_vperm2f128_vblend (d))
41748 return true;
41750 /* Try sequences of four instructions. */
41752 if (expand_vec_perm_vpshufb2_vpermq (d))
41753 return true;
41755 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41756 return true;
41758 /* ??? Look for narrow permutations whose element orderings would
41759 allow the promotion to a wider mode. */
41761 /* ??? Look for sequences of interleave or a wider permute that place
41762 the data into the correct lanes for a half-vector shuffle like
41763 pshuf[lh]w or vpermilps. */
41765 /* ??? Look for sequences of interleave that produce the desired results.
41766 The combinatorics of punpck[lh] get pretty ugly... */
41768 if (expand_vec_perm_even_odd (d))
41769 return true;
41771 /* Even longer sequences. */
41772 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41773 return true;
41775 return false;
41778 /* If a permutation only uses one operand, make it clear. Returns true
41779 if the permutation references both operands. */
41781 static bool
41782 canonicalize_perm (struct expand_vec_perm_d *d)
41784 int i, which, nelt = d->nelt;
41786 for (i = which = 0; i < nelt; ++i)
41787 which |= (d->perm[i] < nelt ? 1 : 2);
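/* which is a bitmask: bit 0 is set if any element is taken from the
   first operand, bit 1 if any element is taken from the second.  */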
41789 d->one_operand_p = true;
41790 switch (which)
41792 default:
41793 gcc_unreachable();
41795 case 3:
41796 if (!rtx_equal_p (d->op0, d->op1))
41798 d->one_operand_p = false;
41799 break;
41801 /* The elements of PERM do not suggest that only the first operand
41802 is used, but both operands are identical. Allow easier matching
41803 of the permutation by folding the permutation into the single
41804 input vector. */
41805 /* FALLTHRU */
41807 case 2:
41808 for (i = 0; i < nelt; ++i)
41809 d->perm[i] &= nelt - 1;
41810 d->op0 = d->op1;
41811 break;
41813 case 1:
41814 d->op1 = d->op0;
41815 break;
41818 return (which == 3);
41821 bool
41822 ix86_expand_vec_perm_const (rtx operands[4])
41824 struct expand_vec_perm_d d;
41825 unsigned char perm[MAX_VECT_LEN];
41826 int i, nelt;
41827 bool two_args;
41828 rtx sel;
41830 d.target = operands[0];
41831 d.op0 = operands[1];
41832 d.op1 = operands[2];
41833 sel = operands[3];
41835 d.vmode = GET_MODE (d.target);
41836 gcc_assert (VECTOR_MODE_P (d.vmode));
41837 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41838 d.testing_p = false;
41840 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41841 gcc_assert (XVECLEN (sel, 0) == nelt);
41842 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41844 for (i = 0; i < nelt; ++i)
41846 rtx e = XVECEXP (sel, 0, i);
41847 int ei = INTVAL (e) & (2 * nelt - 1);
41848 d.perm[i] = ei;
41849 perm[i] = ei;
41852 two_args = canonicalize_perm (&d);
41854 if (ix86_expand_vec_perm_const_1 (&d))
41855 return true;
41857 /* If the selector says both arguments are needed, but the operands are the
41858 same, the above tried to expand with one_operand_p and flattened selector.
41859 If that didn't work, retry without one_operand_p; we succeeded with that
41860 during testing. */
41861 if (two_args && d.one_operand_p)
41863 d.one_operand_p = false;
41864 memcpy (d.perm, perm, sizeof (perm));
41865 return ix86_expand_vec_perm_const_1 (&d);
41868 return false;
41871 /* Implement targetm.vectorize.vec_perm_const_ok. */
41873 static bool
41874 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41875 const unsigned char *sel)
41877 struct expand_vec_perm_d d;
41878 unsigned int i, nelt, which;
41879 bool ret;
41881 d.vmode = vmode;
41882 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41883 d.testing_p = true;
41885 /* Given sufficient ISA support we can just return true here
41886 for selected vector modes. */
41887 if (GET_MODE_SIZE (d.vmode) == 16)
41889 /* All implementable with a single vpperm insn. */
41890 if (TARGET_XOP)
41891 return true;
41892 /* All implementable with 2 pshufb + 1 ior. */
41893 if (TARGET_SSSE3)
41894 return true;
41895 /* All implementable with shufpd or unpck[lh]pd. */
41896 if (d.nelt == 2)
41897 return true;
41900 /* Extract the values from the vector CST into the permutation
41901 array in D. */
41902 memcpy (d.perm, sel, nelt);
41903 for (i = which = 0; i < nelt; ++i)
41905 unsigned char e = d.perm[i];
41906 gcc_assert (e < 2 * nelt);
41907 which |= (e < nelt ? 1 : 2);
41910 /* If all elements are from the second vector, fold them onto the first. */
41911 if (which == 2)
41912 for (i = 0; i < nelt; ++i)
41913 d.perm[i] -= nelt;
41915 /* Check whether the mask can be applied to the vector type. */
41916 d.one_operand_p = (which != 3);
41918 /* Implementable with shufps or pshufd. */
41919 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41920 return true;
41922 /* Otherwise we have to go through the motions and see if we can
41923 figure out how to generate the requested permutation. */
41924 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41925 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41926 if (!d.one_operand_p)
41927 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41929 start_sequence ();
41930 ret = ix86_expand_vec_perm_const_1 (&d);
41931 end_sequence ();
41933 return ret;
41936 void
41937 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41939 struct expand_vec_perm_d d;
41940 unsigned i, nelt;
41942 d.target = targ;
41943 d.op0 = op0;
41944 d.op1 = op1;
41945 d.vmode = GET_MODE (targ);
41946 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41947 d.one_operand_p = false;
41948 d.testing_p = false;
41950 for (i = 0; i < nelt; ++i)
41951 d.perm[i] = i * 2 + odd;
41953 /* We'll either be able to implement the permutation directly... */
41954 if (expand_vec_perm_1 (&d))
41955 return;
41957 /* ... or we use the special-case patterns. */
41958 expand_vec_perm_even_odd_1 (&d, odd);
41961 static void
41962 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41964 struct expand_vec_perm_d d;
41965 unsigned i, nelt, base;
41966 bool ok;
41968 d.target = targ;
41969 d.op0 = op0;
41970 d.op1 = op1;
41971 d.vmode = GET_MODE (targ);
41972 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41973 d.one_operand_p = false;
41974 d.testing_p = false;
41976 base = high_p ? nelt / 2 : 0;
41977 for (i = 0; i < nelt / 2; ++i)
41979 d.perm[i * 2] = i + base;
41980 d.perm[i * 2 + 1] = i + base + nelt;
41983 /* Note that for AVX this isn't one instruction. */
41984 ok = ix86_expand_vec_perm_const_1 (&d);
41985 gcc_assert (ok);
41989 /* Expand a vector operation CODE for a V*QImode in terms of the
41990 same operation on V*HImode. */
41992 void
41993 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41995 enum machine_mode qimode = GET_MODE (dest);
41996 enum machine_mode himode;
41997 rtx (*gen_il) (rtx, rtx, rtx);
41998 rtx (*gen_ih) (rtx, rtx, rtx);
41999 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
42000 struct expand_vec_perm_d d;
42001 bool ok, full_interleave;
42002 bool uns_p = false;
42003 int i;
42005 switch (qimode)
42007 case V16QImode:
42008 himode = V8HImode;
42009 gen_il = gen_vec_interleave_lowv16qi;
42010 gen_ih = gen_vec_interleave_highv16qi;
42011 break;
42012 case V32QImode:
42013 himode = V16HImode;
42014 gen_il = gen_avx2_interleave_lowv32qi;
42015 gen_ih = gen_avx2_interleave_highv32qi;
42016 break;
42017 default:
42018 gcc_unreachable ();
42021 op2_l = op2_h = op2;
42022 switch (code)
42024 case MULT:
42025 /* Unpack data such that we've got a source byte in each low byte of
42026 each word. We don't care what goes into the high byte of each word.
42027 Rather than trying to get zero in there, it is most convenient to let
42028 it be a copy of the low byte. */
42029 op2_l = gen_reg_rtx (qimode);
42030 op2_h = gen_reg_rtx (qimode);
42031 emit_insn (gen_il (op2_l, op2, op2));
42032 emit_insn (gen_ih (op2_h, op2, op2));
42033 /* FALLTHRU */
42035 op1_l = gen_reg_rtx (qimode);
42036 op1_h = gen_reg_rtx (qimode);
42037 emit_insn (gen_il (op1_l, op1, op1));
42038 emit_insn (gen_ih (op1_h, op1, op1));
42039 full_interleave = qimode == V16QImode;
42040 break;
42042 case ASHIFT:
42043 case LSHIFTRT:
42044 uns_p = true;
42045 /* FALLTHRU */
42046 case ASHIFTRT:
42047 op1_l = gen_reg_rtx (himode);
42048 op1_h = gen_reg_rtx (himode);
42049 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
42050 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
42051 full_interleave = true;
42052 break;
42053 default:
42054 gcc_unreachable ();
42057 /* Perform the operation. */
42058 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
42059 1, OPTAB_DIRECT);
42060 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
42061 1, OPTAB_DIRECT);
42062 gcc_assert (res_l && res_h);
42064 /* Merge the data back into the right place. */
42065 d.target = dest;
42066 d.op0 = gen_lowpart (qimode, res_l);
42067 d.op1 = gen_lowpart (qimode, res_h);
42068 d.vmode = qimode;
42069 d.nelt = GET_MODE_NUNITS (qimode);
42070 d.one_operand_p = false;
42071 d.testing_p = false;
42073 if (full_interleave)
42075 /* For SSE2, we used a full interleave, so the desired
42076 results are in the even elements. */
42077 for (i = 0; i < 32; ++i)
42078 d.perm[i] = i * 2;
42080 else
42082 /* For AVX, the interleave used above was not cross-lane, so the
42083 extraction is of the even elements, but with the second and third
42084 quarters swapped. Happily, that is even one insn shorter than a plain even extraction. */
42085 for (i = 0; i < 32; ++i)
42086 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
42089 ok = ix86_expand_vec_perm_const_1 (&d);
42090 gcc_assert (ok);
42092 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42093 gen_rtx_fmt_ee (code, qimode, op1, op2));
42096 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
42097 if op is CONST_VECTOR with all odd elements equal to their
42098 preceding element. */
42100 static bool
42101 const_vector_equal_evenodd_p (rtx op)
42103 enum machine_mode mode = GET_MODE (op);
42104 int i, nunits = GET_MODE_NUNITS (mode);
42105 if (GET_CODE (op) != CONST_VECTOR
42106 || nunits != CONST_VECTOR_NUNITS (op))
42107 return false;
42108 for (i = 0; i < nunits; i += 2)
42109 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
42110 return false;
42111 return true;
42114 void
42115 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
42116 bool uns_p, bool odd_p)
42118 enum machine_mode mode = GET_MODE (op1);
42119 enum machine_mode wmode = GET_MODE (dest);
42120 rtx x;
42121 rtx orig_op1 = op1, orig_op2 = op2;
42123 if (!nonimmediate_operand (op1, mode))
42124 op1 = force_reg (mode, op1);
42125 if (!nonimmediate_operand (op2, mode))
42126 op2 = force_reg (mode, op2);
42128 /* We only play even/odd games with vectors of SImode. */
42129 gcc_assert (mode == V4SImode || mode == V8SImode);
42131 /* If we're looking for the odd results, shift those members down to
42132 the even slots. For some cpus this is faster than a PSHUFD. */
42133 if (odd_p)
42135 /* For XOP use vpmacsdqh, but only for smult, as it is only
42136 signed. */
42137 if (TARGET_XOP && mode == V4SImode && !uns_p)
42139 x = force_reg (wmode, CONST0_RTX (wmode));
42140 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
42141 return;
42144 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
42145 if (!const_vector_equal_evenodd_p (orig_op1))
42146 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42147 x, NULL, 1, OPTAB_DIRECT);
42148 if (!const_vector_equal_evenodd_p (orig_op2))
42149 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42150 x, NULL, 1, OPTAB_DIRECT);
42151 op1 = gen_lowpart (mode, op1);
42152 op2 = gen_lowpart (mode, op2);
42155 if (mode == V8SImode)
42157 if (uns_p)
42158 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42159 else
42160 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42162 else if (uns_p)
42163 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42164 else if (TARGET_SSE4_1)
42165 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42166 else
42168 rtx s1, s2, t0, t1, t2;
42170 /* The easiest way to implement this without PMULDQ is to go through
42171 the motions as if we are performing a full 64-bit multiply, except
42172 that we need to do less shuffling of the elements. */
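/* With HI(X) denoting the sign-extension word of X, the signed 64-bit
   product is LO(A)*LO(B) + ((HI(A)*LO(B) + HI(B)*LO(A)) << 32),
   evaluated modulo 2^64; that is what the three widening unsigned
   multiplies below compute.  */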
42174 /* Compute the sign-extension, aka highparts, of the two operands. */
42175 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42176 op1, pc_rtx, pc_rtx);
42177 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42178 op2, pc_rtx, pc_rtx);
42180 /* Multiply LO(A) * HI(B), and vice-versa. */
42181 t1 = gen_reg_rtx (wmode);
42182 t2 = gen_reg_rtx (wmode);
42183 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42184 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42186 /* Multiply LO(A) * LO(B). */
42187 t0 = gen_reg_rtx (wmode);
42188 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42190 /* Combine and shift the highparts into place. */
42191 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42192 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42193 1, OPTAB_DIRECT);
42195 /* Combine high and low parts. */
42196 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42197 return;
42199 emit_insn (x);
42202 void
42203 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42204 bool uns_p, bool high_p)
42206 enum machine_mode wmode = GET_MODE (dest);
42207 enum machine_mode mode = GET_MODE (op1);
42208 rtx t1, t2, t3, t4, mask;
42210 switch (mode)
42212 case V4SImode:
42213 t1 = gen_reg_rtx (mode);
42214 t2 = gen_reg_rtx (mode);
42215 if (TARGET_XOP && !uns_p)
42217 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42218 shuffle the elements once so that all elements are in the right
42219 place for immediate use: { A C B D }. */
42220 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42221 const1_rtx, GEN_INT (3)));
42222 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42223 const1_rtx, GEN_INT (3)));
42225 else
42227 /* Put the elements into place for the multiply. */
42228 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42229 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42230 high_p = false;
42232 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42233 break;
42235 case V8SImode:
42236 /* Shuffle the elements between the lanes. After this we
42237 have { A B E F | C D G H } for each operand. */
42238 t1 = gen_reg_rtx (V4DImode);
42239 t2 = gen_reg_rtx (V4DImode);
42240 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42241 const0_rtx, const2_rtx,
42242 const1_rtx, GEN_INT (3)));
42243 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42244 const0_rtx, const2_rtx,
42245 const1_rtx, GEN_INT (3)));
42247 /* Shuffle the elements within the lanes. After this we
42248 have { A A B B | C C D D } or { E E F F | G G H H }. */
42249 t3 = gen_reg_rtx (V8SImode);
42250 t4 = gen_reg_rtx (V8SImode);
42251 mask = GEN_INT (high_p
42252 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42253 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
42254 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42255 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42257 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42258 break;
42260 case V8HImode:
42261 case V16HImode:
42262 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42263 uns_p, OPTAB_DIRECT);
42264 t2 = expand_binop (mode,
42265 uns_p ? umul_highpart_optab : smul_highpart_optab,
42266 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42267 gcc_assert (t1 && t2);
42269 t3 = gen_reg_rtx (mode);
42270 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42271 emit_move_insn (dest, gen_lowpart (wmode, t3));
42272 break;
42274 case V16QImode:
42275 case V32QImode:
42276 t1 = gen_reg_rtx (wmode);
42277 t2 = gen_reg_rtx (wmode);
42278 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42279 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42281 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42282 break;
42284 default:
42285 gcc_unreachable ();
42289 void
42290 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42292 rtx res_1, res_2, res_3, res_4;
42294 res_1 = gen_reg_rtx (V4SImode);
42295 res_2 = gen_reg_rtx (V4SImode);
42296 res_3 = gen_reg_rtx (V2DImode);
42297 res_4 = gen_reg_rtx (V2DImode);
42298 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42299 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
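/* res_3 now holds the 64-bit products of the even elements { 0, 2 },
   res_4 those of the odd elements { 1, 3 }; only the low 32 bits of
   each product are needed for the V4SImode result.  */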
42301 /* Move the results in element 2 down to element 1; we don't care
42302 what goes in elements 2 and 3. Then we can merge the parts
42303 back together with an interleave.
42305 Note that two other sequences were tried:
42306 (1) Use interleaves at the start instead of psrldq, which allows
42307 us to use a single shufps to merge things back at the end.
42308 (2) Use shufps here to combine the two vectors, then pshufd to
42309 put the elements in the correct order.
42310 In both cases the cost of the reformatting stall was too high
42311 and the overall sequence slower. */
42313 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42314 const0_rtx, const2_rtx,
42315 const0_rtx, const0_rtx));
42316 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42317 const0_rtx, const2_rtx,
42318 const0_rtx, const0_rtx));
42319 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42321 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42324 void
42325 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42327 enum machine_mode mode = GET_MODE (op0);
42328 rtx t1, t2, t3, t4, t5, t6;
42330 if (TARGET_XOP && mode == V2DImode)
42332 /* op1: A,B,C,D, op2: E,F,G,H */
42333 op1 = gen_lowpart (V4SImode, op1);
42334 op2 = gen_lowpart (V4SImode, op2);
42336 t1 = gen_reg_rtx (V4SImode);
42337 t2 = gen_reg_rtx (V4SImode);
42338 t3 = gen_reg_rtx (V2DImode);
42339 t4 = gen_reg_rtx (V2DImode);
42341 /* t1: B,A,D,C */
42342 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42343 GEN_INT (1),
42344 GEN_INT (0),
42345 GEN_INT (3),
42346 GEN_INT (2)));
42348 /* t2: (B*E),(A*F),(D*G),(C*H) */
42349 emit_insn (gen_mulv4si3 (t2, t1, op2));
42351 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42352 emit_insn (gen_xop_phadddq (t3, t2));
42354 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42355 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42357 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42358 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42360 else
42362 enum machine_mode nmode;
42363 rtx (*umul) (rtx, rtx, rtx);
42365 if (mode == V2DImode)
42367 umul = gen_vec_widen_umult_even_v4si;
42368 nmode = V4SImode;
42370 else if (mode == V4DImode)
42372 umul = gen_vec_widen_umult_even_v8si;
42373 nmode = V8SImode;
42375 else
42376 gcc_unreachable ();
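/* Compute op1 * op2 modulo 2^64 per element as
   LO1*LO2 + ((LO1*HI2 + HI1*LO2) << 32), using the widening unsigned
   multiply of the even 32-bit elements for each 32x32->64 product.  */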
42379 /* Multiply low parts. */
42380 t1 = gen_reg_rtx (mode);
42381 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42383 /* Shift input vectors right 32 bits so we can multiply high parts. */
42384 t6 = GEN_INT (32);
42385 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42386 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42388 /* Multiply high parts by low parts. */
42389 t4 = gen_reg_rtx (mode);
42390 t5 = gen_reg_rtx (mode);
42391 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42392 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42394 /* Combine and shift the highparts back. */
42395 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42396 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42398 /* Combine high and low parts. */
42399 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42402 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42403 gen_rtx_MULT (mode, op1, op2));
42406 /* Return 1 if control transfer instruction INSN
42407 should be encoded with the bnd prefix.
42408 If insn is NULL then return 1 when control
42409 transfer instructions should be prefixed with
42410 bnd by default for the current function. */
42412 bool
42413 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42415 return false;
42418 /* Calculate integer abs() using only SSE2 instructions. */
42420 void
42421 ix86_expand_sse2_abs (rtx target, rtx input)
42423 enum machine_mode mode = GET_MODE (target);
42424 rtx tmp0, tmp1, x;
42426 switch (mode)
42428 /* For 32-bit signed integer X, the best way to calculate the absolute
42429 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
42430 case V4SImode:
42431 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42432 GEN_INT (GET_MODE_BITSIZE
42433 (GET_MODE_INNER (mode)) - 1),
42434 NULL, 0, OPTAB_DIRECT);
42435 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42436 NULL, 0, OPTAB_DIRECT);
42437 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42438 target, 0, OPTAB_DIRECT);
42439 break;
42441 /* For 16-bit signed integer X, the best way to calculate the absolute
42442 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42443 case V8HImode:
42444 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42446 x = expand_simple_binop (mode, SMAX, tmp0, input,
42447 target, 0, OPTAB_DIRECT);
42448 break;
42450 /* For 8-bit signed integer X, the best way to calculate the absolute
42451 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42452 as SSE2 provides the PMINUB insn. */
42453 case V16QImode:
42454 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42456 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42457 target, 0, OPTAB_DIRECT);
42458 break;
42460 default:
42461 gcc_unreachable ();
42464 if (x != target)
42465 emit_move_insn (target, x);
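/* Minimal scalar sketches of the three formulas used above, assuming
   <stdint.h> types and an arithmetic right shift for signed values;
   the helper names are illustrative only.  */
#if 0
static int32_t
abs_v4si_lane (int32_t x)               /* V4SImode case.  */
{
  int32_t s = x >> 31;                  /* 0 or -1.  */
  return (s ^ x) - s;
}

static int16_t
abs_v8hi_lane (int16_t x)               /* V8HImode case: max (x, -x).  */
{
  int16_t nx = -x;
  return x > nx ? x : nx;
}

static uint8_t
abs_v16qi_lane (int8_t x)               /* V16QImode case: unsigned min.  */
{
  uint8_t a = (uint8_t) x, b = (uint8_t) -x;
  return a < b ? a : b;
}
#endif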
42468 /* Expand an insert into a vector register through pinsr insn.
42469 Return true if successful. */
42471 bool
42472 ix86_expand_pinsr (rtx *operands)
42474 rtx dst = operands[0];
42475 rtx src = operands[3];
42477 unsigned int size = INTVAL (operands[1]);
42478 unsigned int pos = INTVAL (operands[2]);
42480 if (GET_CODE (dst) == SUBREG)
42482 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42483 dst = SUBREG_REG (dst);
42486 if (GET_CODE (src) == SUBREG)
42487 src = SUBREG_REG (src);
42489 switch (GET_MODE (dst))
42491 case V16QImode:
42492 case V8HImode:
42493 case V4SImode:
42494 case V2DImode:
42496 enum machine_mode srcmode, dstmode;
42497 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42499 srcmode = mode_for_size (size, MODE_INT, 0);
42501 switch (srcmode)
42503 case QImode:
42504 if (!TARGET_SSE4_1)
42505 return false;
42506 dstmode = V16QImode;
42507 pinsr = gen_sse4_1_pinsrb;
42508 break;
42510 case HImode:
42511 if (!TARGET_SSE2)
42512 return false;
42513 dstmode = V8HImode;
42514 pinsr = gen_sse2_pinsrw;
42515 break;
42517 case SImode:
42518 if (!TARGET_SSE4_1)
42519 return false;
42520 dstmode = V4SImode;
42521 pinsr = gen_sse4_1_pinsrd;
42522 break;
42524 case DImode:
42525 gcc_assert (TARGET_64BIT);
42526 if (!TARGET_SSE4_1)
42527 return false;
42528 dstmode = V2DImode;
42529 pinsr = gen_sse4_1_pinsrq;
42530 break;
42532 default:
42533 return false;
42536 rtx d = dst;
42537 if (GET_MODE (dst) != dstmode)
42538 d = gen_reg_rtx (dstmode);
42539 src = gen_lowpart (srcmode, src);
42541 pos /= size;
42543 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42544 GEN_INT (1 << pos)));
42545 if (d != dst)
42546 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42547 return true;
42550 default:
42551 return false;
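/* At the source level the patterns emitted above correspond to the SSE2 /
   SSE4.1 element-insert intrinsics; a sketch assuming SSE4.1 and
   <smmintrin.h> (the wrapper name is illustrative only):  */
#if 0
#include <smmintrin.h>

static __m128i
insert_third_dword (__m128i v, int x)
{
  return _mm_insert_epi32 (v, x, 2);    /* pinsrd: replace element 2.  */
}
#endif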
42555 /* This function returns the calling-ABI-specific va_list type node,
42556 i.e. the va_list type appropriate for FNDECL. */
42558 static tree
42559 ix86_fn_abi_va_list (tree fndecl)
42561 if (!TARGET_64BIT)
42562 return va_list_type_node;
42563 gcc_assert (fndecl != NULL_TREE);
42565 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42566 return ms_va_list_type_node;
42567 else
42568 return sysv_va_list_type_node;
42571 /* Returns the canonical va_list type specified by TYPE. If there
42572 is no valid TYPE provided, it returns NULL_TREE. */
42574 static tree
42575 ix86_canonical_va_list_type (tree type)
42577 tree wtype, htype;
42579 /* Resolve references and pointers to va_list type. */
42580 if (TREE_CODE (type) == MEM_REF)
42581 type = TREE_TYPE (type);
42582 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42583 type = TREE_TYPE (type);
42584 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42585 type = TREE_TYPE (type);
42587 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42589 wtype = va_list_type_node;
42590 gcc_assert (wtype != NULL_TREE);
42591 htype = type;
42592 if (TREE_CODE (wtype) == ARRAY_TYPE)
42594 /* If va_list is an array type, the argument may have decayed
42595 to a pointer type, e.g. by being passed to another function.
42596 In that case, unwrap both types so that we can compare the
42597 underlying records. */
42598 if (TREE_CODE (htype) == ARRAY_TYPE
42599 || POINTER_TYPE_P (htype))
42601 wtype = TREE_TYPE (wtype);
42602 htype = TREE_TYPE (htype);
42605 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42606 return va_list_type_node;
42607 wtype = sysv_va_list_type_node;
42608 gcc_assert (wtype != NULL_TREE);
42609 htype = type;
42610 if (TREE_CODE (wtype) == ARRAY_TYPE)
42612 /* If va_list is an array type, the argument may have decayed
42613 to a pointer type, e.g. by being passed to another function.
42614 In that case, unwrap both types so that we can compare the
42615 underlying records. */
42616 if (TREE_CODE (htype) == ARRAY_TYPE
42617 || POINTER_TYPE_P (htype))
42619 wtype = TREE_TYPE (wtype);
42620 htype = TREE_TYPE (htype);
42623 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42624 return sysv_va_list_type_node;
42625 wtype = ms_va_list_type_node;
42626 gcc_assert (wtype != NULL_TREE);
42627 htype = type;
42628 if (TREE_CODE (wtype) == ARRAY_TYPE)
42630 /* If va_list is an array type, the argument may have decayed
42631 to a pointer type, e.g. by being passed to another function.
42632 In that case, unwrap both types so that we can compare the
42633 underlying records. */
42634 if (TREE_CODE (htype) == ARRAY_TYPE
42635 || POINTER_TYPE_P (htype))
42637 wtype = TREE_TYPE (wtype);
42638 htype = TREE_TYPE (htype);
42641 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42642 return ms_va_list_type_node;
42643 return NULL_TREE;
42645 return std_canonical_va_list_type (type);
42648 /* Iterate through the target-specific builtin types for va_list.
42649 IDX denotes the iterator, *PTREE is set to the type of
42650 the va_list builtin, and *PNAME to its internal name.
42651 Returns zero if there is no element for this index, otherwise
42652 IDX should be increased upon the next call.
42653 Note, do not iterate a base builtin's name like __builtin_va_list.
42654 Used from c_common_nodes_and_builtins. */
42656 static int
42657 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42659 if (TARGET_64BIT)
42661 switch (idx)
42663 default:
42664 break;
42666 case 0:
42667 *ptree = ms_va_list_type_node;
42668 *pname = "__builtin_ms_va_list";
42669 return 1;
42671 case 1:
42672 *ptree = sysv_va_list_type_node;
42673 *pname = "__builtin_sysv_va_list";
42674 return 1;
42678 return 0;
42681 #undef TARGET_SCHED_DISPATCH
42682 #define TARGET_SCHED_DISPATCH has_dispatch
42683 #undef TARGET_SCHED_DISPATCH_DO
42684 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42685 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42686 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42687 #undef TARGET_SCHED_REORDER
42688 #define TARGET_SCHED_REORDER ix86_sched_reorder
42689 #undef TARGET_SCHED_ADJUST_PRIORITY
42690 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42691 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42692 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42693 ix86_dependencies_evaluation_hook
42695 /* The size of the dispatch window is the total number of bytes of
42696 object code allowed in a window. */
42697 #define DISPATCH_WINDOW_SIZE 16
42699 /* Number of dispatch windows considered for scheduling. */
42700 #define MAX_DISPATCH_WINDOWS 3
42702 /* Maximum number of instructions in a window. */
42703 #define MAX_INSN 4
42705 /* Maximum number of immediate operands in a window. */
42706 #define MAX_IMM 4
42708 /* Maximum number of immediate bits allowed in a window. */
42709 #define MAX_IMM_SIZE 128
42711 /* Maximum number of 32 bit immediates allowed in a window. */
42712 #define MAX_IMM_32 4
42714 /* Maximum number of 64 bit immediates allowed in a window. */
42715 #define MAX_IMM_64 2
42717 /* Maximum total of loads or prefetches allowed in a window. */
42718 #define MAX_LOAD 2
42720 /* Maximum total of stores allowed in a window. */
42721 #define MAX_STORE 1
42723 #undef BIG
42724 #define BIG 100
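/* The 48-byte limit checked below when two windows are chained matches
   MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE, i.e. 3 * 16 bytes.  */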
42727 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42728 enum dispatch_group {
42729 disp_no_group = 0,
42730 disp_load,
42731 disp_store,
42732 disp_load_store,
42733 disp_prefetch,
42734 disp_imm,
42735 disp_imm_32,
42736 disp_imm_64,
42737 disp_branch,
42738 disp_cmp,
42739 disp_jcc,
42740 disp_last
42743 /* Number of allowable groups in a dispatch window. It is an array
42744 indexed by dispatch_group enum. 100 is used as a big number,
42745 because the number of these kinds of operations has no
42746 effect on the dispatch window, but we need entries for them
42747 in the table for other reasons. */
42748 static unsigned int num_allowable_groups[disp_last] = {
42749 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42752 char group_name[disp_last + 1][16] = {
42753 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42754 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42755 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42758 /* Instruction path. */
42759 enum insn_path {
42760 no_path = 0,
42761 path_single, /* Single micro op. */
42762 path_double, /* Double micro op. */
42763 path_multi, /* Instructions with more than 2 micro ops. */
42764 last_path
42767 /* sched_insn_info describes one instruction scheduled into a dispatch
42768 window of the basic block: the insn itself together with its
42769 dispatch group, path, and size information.
42771 Windows are allocated for each basic block and are linked
42772 together. */
42773 typedef struct sched_insn_info_s {
42774 rtx insn;
42775 enum dispatch_group group;
42776 enum insn_path path;
42777 int byte_len;
42778 int imm_bytes;
42779 } sched_insn_info;
42781 /* Linked list of dispatch windows. This is a two way list of
42782 dispatch windows of a basic block. It contains information about
42783 the number of uops in the window and the total number of
42784 instructions and of bytes in the object code for this dispatch
42785 window. */
42786 typedef struct dispatch_windows_s {
42787 int num_insn; /* Number of insn in the window. */
42788 int num_uops; /* Number of uops in the window. */
42789 int window_size; /* Number of bytes in the window. */
42790 int window_num; /* Window number, 0 or 1. */
42791 int num_imm; /* Number of immediates in an insn. */
42792 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42793 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42794 int imm_size; /* Total size in bytes of immediates in the window. */
42795 int num_loads; /* Total memory loads in the window. */
42796 int num_stores; /* Total memory stores in the window. */
42797 int violation; /* Violation exists in window. */
42798 sched_insn_info *window; /* Pointer to the window. */
42799 struct dispatch_windows_s *next;
42800 struct dispatch_windows_s *prev;
42801 } dispatch_windows;
42803 /* Immediate values used in an insn. */
42804 typedef struct imm_info_s
42806 int imm;
42807 int imm32;
42808 int imm64;
42809 } imm_info;
42811 static dispatch_windows *dispatch_window_list;
42812 static dispatch_windows *dispatch_window_list1;
42814 /* Get dispatch group of insn. */
42816 static enum dispatch_group
42817 get_mem_group (rtx insn)
42819 enum attr_memory memory;
42821 if (INSN_CODE (insn) < 0)
42822 return disp_no_group;
42823 memory = get_attr_memory (insn);
42824 if (memory == MEMORY_STORE)
42825 return disp_store;
42827 if (memory == MEMORY_LOAD)
42828 return disp_load;
42830 if (memory == MEMORY_BOTH)
42831 return disp_load_store;
42833 return disp_no_group;
42836 /* Return true if insn is a compare instruction. */
42838 static bool
42839 is_cmp (rtx insn)
42841 enum attr_type type;
42843 type = get_attr_type (insn);
42844 return (type == TYPE_TEST
42845 || type == TYPE_ICMP
42846 || type == TYPE_FCMP
42847 || GET_CODE (PATTERN (insn)) == COMPARE);
42850 /* Return true if a dispatch violation was encountered. */
42852 static bool
42853 dispatch_violation (void)
42855 if (dispatch_window_list->next)
42856 return dispatch_window_list->next->violation;
42857 return dispatch_window_list->violation;
42860 /* Return true if insn is a branch instruction. */
42862 static bool
42863 is_branch (rtx insn)
42865 return (CALL_P (insn) || JUMP_P (insn));
42868 /* Return true if insn is a prefetch instruction. */
42870 static bool
42871 is_prefetch (rtx insn)
42873 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42876 /* This function initializes a dispatch window and the list container holding a
42877 pointer to the window. */
42879 static void
42880 init_window (int window_num)
42882 int i;
42883 dispatch_windows *new_list;
42885 if (window_num == 0)
42886 new_list = dispatch_window_list;
42887 else
42888 new_list = dispatch_window_list1;
42890 new_list->num_insn = 0;
42891 new_list->num_uops = 0;
42892 new_list->window_size = 0;
42893 new_list->next = NULL;
42894 new_list->prev = NULL;
42895 new_list->window_num = window_num;
42896 new_list->num_imm = 0;
42897 new_list->num_imm_32 = 0;
42898 new_list->num_imm_64 = 0;
42899 new_list->imm_size = 0;
42900 new_list->num_loads = 0;
42901 new_list->num_stores = 0;
42902 new_list->violation = false;
42904 for (i = 0; i < MAX_INSN; i++)
42906 new_list->window[i].insn = NULL;
42907 new_list->window[i].group = disp_no_group;
42908 new_list->window[i].path = no_path;
42909 new_list->window[i].byte_len = 0;
42910 new_list->window[i].imm_bytes = 0;
42912 return;
42915 /* This function allocates and initializes a dispatch window and the
42916 list container holding a pointer to the window. */
42918 static dispatch_windows *
42919 allocate_window (void)
42921 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42922 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42924 return new_list;
42927 /* This routine initializes the dispatch scheduling information. It
42928 initiates building dispatch scheduler tables and constructs the
42929 first dispatch window. */
42931 static void
42932 init_dispatch_sched (void)
42934 /* Allocate a dispatch list and a window. */
42935 dispatch_window_list = allocate_window ();
42936 dispatch_window_list1 = allocate_window ();
42937 init_window (0);
42938 init_window (1);
42941 /* This function returns true if a branch is detected. End of a basic block
42942 does not have to be a branch, but here we assume only branches end a
42943 window. */
42945 static bool
42946 is_end_basic_block (enum dispatch_group group)
42948 return group == disp_branch;
42951 /* This function is called when the end of window processing is reached. */
42953 static void
42954 process_end_window (void)
42956 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42957 if (dispatch_window_list->next)
42959 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42960 gcc_assert (dispatch_window_list->window_size
42961 + dispatch_window_list1->window_size <= 48);
42962 init_window (1);
42964 init_window (0);
42967 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42968 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42969 for 48 bytes of instructions. Note that these windows are not
42970 dispatch windows of size DISPATCH_WINDOW_SIZE. */
42972 static dispatch_windows *
42973 allocate_next_window (int window_num)
42975 if (window_num == 0)
42977 if (dispatch_window_list->next)
42978 init_window (1);
42979 init_window (0);
42980 return dispatch_window_list;
42983 dispatch_window_list->next = dispatch_window_list1;
42984 dispatch_window_list1->prev = dispatch_window_list;
42986 return dispatch_window_list1;
42989 /* Count immediate operands of an instruction (for_each_rtx callback). */
42991 static int
42992 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42994 if (*in_rtx == 0)
42995 return 0;
42997 switch ( GET_CODE (*in_rtx))
42999 case CONST:
43000 case SYMBOL_REF:
43001 case CONST_INT:
43002 (imm_values->imm)++;
43003 if (x86_64_immediate_operand (*in_rtx, SImode))
43004 (imm_values->imm32)++;
43005 else
43006 (imm_values->imm64)++;
43007 break;
43009 case CONST_DOUBLE:
43010 (imm_values->imm)++;
43011 (imm_values->imm64)++;
43012 break;
43014 case CODE_LABEL:
43015 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
43017 (imm_values->imm)++;
43018 (imm_values->imm32)++;
43020 break;
43022 default:
43023 break;
43026 return 0;
43029 /* Compute number of immediate operands of an instruction. */
43031 static void
43032 find_constant (rtx in_rtx, imm_info *imm_values)
43034 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
43035 (rtx_function) find_constant_1, (void *) imm_values);
43038 /* Return the total size in bytes of the immediate operands of an
43039 instruction, along with the number of immediate operands. It
43040 initializes its parameters to zero before calling FIND_CONSTANT.
43041 INSN is the input instruction. IMM is the total number of immediates.
43042 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
43043 bit immediates. */
43045 static int
43046 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
43048 imm_info imm_values = {0, 0, 0};
43050 find_constant (insn, &imm_values);
43051 *imm = imm_values.imm;
43052 *imm32 = imm_values.imm32;
43053 *imm64 = imm_values.imm64;
43054 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
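/* For example, an insn with one SImode and one DImode immediate yields
   *IMM = 2, *IMM32 = 1, *IMM64 = 1 and a return value of 1*4 + 1*8 = 12
   bytes.  */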
43057 /* This function indicates whether an instruction has any
43058 immediate operands. */
43060 static bool
43061 has_immediate (rtx insn)
43063 int num_imm_operand;
43064 int num_imm32_operand;
43065 int num_imm64_operand;
43067 if (insn)
43068 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43069 &num_imm64_operand);
43070 return false;
43073 /* Return single or double path for instructions. */
43075 static enum insn_path
43076 get_insn_path (rtx insn)
43078 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
43080 if ((int)path == 0)
43081 return path_single;
43083 if ((int)path == 1)
43084 return path_double;
43086 return path_multi;
43089 /* Return insn dispatch group. */
43091 static enum dispatch_group
43092 get_insn_group (rtx insn)
43094 enum dispatch_group group = get_mem_group (insn);
43095 if (group)
43096 return group;
43098 if (is_branch (insn))
43099 return disp_branch;
43101 if (is_cmp (insn))
43102 return disp_cmp;
43104 if (has_immediate (insn))
43105 return disp_imm;
43107 if (is_prefetch (insn))
43108 return disp_prefetch;
43110 return disp_no_group;
43113 /* Count number of GROUP restricted instructions in a dispatch
43114 window WINDOW_LIST. */
43116 static int
43117 count_num_restricted (rtx insn, dispatch_windows *window_list)
43119 enum dispatch_group group = get_insn_group (insn);
43120 int imm_size;
43121 int num_imm_operand;
43122 int num_imm32_operand;
43123 int num_imm64_operand;
43125 if (group == disp_no_group)
43126 return 0;
43128 if (group == disp_imm)
43130 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43131 &num_imm64_operand);
43132 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
43133 || num_imm_operand + window_list->num_imm > MAX_IMM
43134 || (num_imm32_operand > 0
43135 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
43136 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
43137 || (num_imm64_operand > 0
43138 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
43139 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
43140 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
43141 && num_imm64_operand > 0
43142 && ((window_list->num_imm_64 > 0
43143 && window_list->num_insn >= 2)
43144 || window_list->num_insn >= 3)))
43145 return BIG;
43147 return 1;
43150 if ((group == disp_load_store
43151 && (window_list->num_loads >= MAX_LOAD
43152 || window_list->num_stores >= MAX_STORE))
43153 || ((group == disp_load
43154 || group == disp_prefetch)
43155 && window_list->num_loads >= MAX_LOAD)
43156 || (group == disp_store
43157 && window_list->num_stores >= MAX_STORE))
43158 return BIG;
43160 return 1;
43163 /* This function returns true if INSN satisfies the dispatch rules for
43164 the last window scheduled. */
43166 static bool
43167 fits_dispatch_window (rtx insn)
43169 dispatch_windows *window_list = dispatch_window_list;
43170 dispatch_windows *window_list_next = dispatch_window_list->next;
43171 unsigned int num_restrict;
43172 enum dispatch_group group = get_insn_group (insn);
43173 enum insn_path path = get_insn_path (insn);
43174 int sum;
43176 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43177 instructions should be given the lowest priority in the
43178 scheduling process in Haifa scheduler to make sure they will be
43179 scheduled in the same dispatch window as the reference to them. */
43180 if (group == disp_jcc || group == disp_cmp)
43181 return false;
43183 /* Check nonrestricted. */
43184 if (group == disp_no_group || group == disp_branch)
43185 return true;
43187 /* Get last dispatch window. */
43188 if (window_list_next)
43189 window_list = window_list_next;
43191 if (window_list->window_num == 1)
43193 sum = window_list->prev->window_size + window_list->window_size;
43195 if (sum == 32
43196 || (min_insn_size (insn) + sum) >= 48)
43197 /* Window 1 is full. Go for next window. */
43198 return true;
43201 num_restrict = count_num_restricted (insn, window_list);
43203 if (num_restrict > num_allowable_groups[group])
43204 return false;
43206 /* See if it fits in the first window. */
43207 if (window_list->window_num == 0)
43209 /* The first window should have only single- and double-path
43210 uops. */
43211 if (path == path_double
43212 && (window_list->num_uops + 2) > MAX_INSN)
43213 return false;
43214 else if (path != path_single)
43215 return false;
43217 return true;
43220 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43221 dispatch window WINDOW_LIST. */
43223 static void
43224 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43226 int byte_len = min_insn_size (insn);
43227 int num_insn = window_list->num_insn;
43228 int imm_size;
43229 sched_insn_info *window = window_list->window;
43230 enum dispatch_group group = get_insn_group (insn);
43231 enum insn_path path = get_insn_path (insn);
43232 int num_imm_operand;
43233 int num_imm32_operand;
43234 int num_imm64_operand;
43236 if (!window_list->violation && group != disp_cmp
43237 && !fits_dispatch_window (insn))
43238 window_list->violation = true;
43240 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43241 &num_imm64_operand);
43243 /* Initialize window with new instruction. */
43244 window[num_insn].insn = insn;
43245 window[num_insn].byte_len = byte_len;
43246 window[num_insn].group = group;
43247 window[num_insn].path = path;
43248 window[num_insn].imm_bytes = imm_size;
43250 window_list->window_size += byte_len;
43251 window_list->num_insn = num_insn + 1;
43252 window_list->num_uops = window_list->num_uops + num_uops;
43253 window_list->imm_size += imm_size;
43254 window_list->num_imm += num_imm_operand;
43255 window_list->num_imm_32 += num_imm32_operand;
43256 window_list->num_imm_64 += num_imm64_operand;
43258 if (group == disp_store)
43259 window_list->num_stores += 1;
43260 else if (group == disp_load
43261 || group == disp_prefetch)
43262 window_list->num_loads += 1;
43263 else if (group == disp_load_store)
43265 window_list->num_stores += 1;
43266 window_list->num_loads += 1;
43270 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43271 If the total bytes of instructions or the number of instructions in
43272 the window exceed the allowable limits, it allocates a new window. */
43274 static void
43275 add_to_dispatch_window (rtx insn)
43277 int byte_len;
43278 dispatch_windows *window_list;
43279 dispatch_windows *next_list;
43280 dispatch_windows *window0_list;
43281 enum insn_path path;
43282 enum dispatch_group insn_group;
43283 bool insn_fits;
43284 int num_insn;
43285 int num_uops;
43286 int window_num;
43287 int insn_num_uops;
43288 int sum;
43290 if (INSN_CODE (insn) < 0)
43291 return;
43293 byte_len = min_insn_size (insn);
43294 window_list = dispatch_window_list;
43295 next_list = window_list->next;
43296 path = get_insn_path (insn);
43297 insn_group = get_insn_group (insn);
43299 /* Get the last dispatch window. */
43300 if (next_list)
43301 window_list = dispatch_window_list->next;
43303 if (path == path_single)
43304 insn_num_uops = 1;
43305 else if (path == path_double)
43306 insn_num_uops = 2;
43307 else
43308 insn_num_uops = (int) path;
43310 /* If the current window is full, get a new window.
43311 Window zero is full if MAX_INSN uops are scheduled in it.
43312 Window one is full if the combined size of windows zero and
43313 one is 32 bytes, if adding the new instruction's bytes makes
43314 the total reach 48 or more, or if it already has MAX_INSN
43315 instructions in it. */
43316 num_insn = window_list->num_insn;
43317 num_uops = window_list->num_uops;
43318 window_num = window_list->window_num;
43319 insn_fits = fits_dispatch_window (insn);
43321 if (num_insn >= MAX_INSN
43322 || num_uops + insn_num_uops > MAX_INSN
43323 || !(insn_fits))
43325 window_num = ~window_num & 1;
43326 window_list = allocate_next_window (window_num);
43329 if (window_num == 0)
43331 add_insn_window (insn, window_list, insn_num_uops);
43332 if (window_list->num_insn >= MAX_INSN
43333 && insn_group == disp_branch)
43335 process_end_window ();
43336 return;
43339 else if (window_num == 1)
43341 window0_list = window_list->prev;
43342 sum = window0_list->window_size + window_list->window_size;
43343 if (sum == 32
43344 || (byte_len + sum) >= 48)
43346 process_end_window ();
43347 window_list = dispatch_window_list;
43350 add_insn_window (insn, window_list, insn_num_uops);
43352 else
43353 gcc_unreachable ();
43355 if (is_end_basic_block (insn_group))
43357 /* End of basic block reached; do the end-of-basic-block processing. */
43358 process_end_window ();
43359 return;
43363 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43365 DEBUG_FUNCTION static void
43366 debug_dispatch_window_file (FILE *file, int window_num)
43368 dispatch_windows *list;
43369 int i;
43371 if (window_num == 0)
43372 list = dispatch_window_list;
43373 else
43374 list = dispatch_window_list1;
43376 fprintf (file, "Window #%d:\n", list->window_num);
43377 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43378 list->num_insn, list->num_uops, list->window_size);
43379 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43380 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43382 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43383 list->num_stores);
43384 fprintf (file, " insn info:\n");
43386 for (i = 0; i < MAX_INSN; i++)
43388 if (!list->window[i].insn)
43389 break;
43390 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43391 i, group_name[list->window[i].group],
43392 i, (void *)list->window[i].insn,
43393 i, list->window[i].path,
43394 i, list->window[i].byte_len,
43395 i, list->window[i].imm_bytes);
43399 /* Print to stdout a dispatch window. */
43401 DEBUG_FUNCTION void
43402 debug_dispatch_window (int window_num)
43404 debug_dispatch_window_file (stdout, window_num);
43407 /* Print INSN dispatch information to FILE. */
43409 DEBUG_FUNCTION static void
43410 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43412 int byte_len;
43413 enum insn_path path;
43414 enum dispatch_group group;
43415 int imm_size;
43416 int num_imm_operand;
43417 int num_imm32_operand;
43418 int num_imm64_operand;
43420 if (INSN_CODE (insn) < 0)
43421 return;
43423 byte_len = min_insn_size (insn);
43424 path = get_insn_path (insn);
43425 group = get_insn_group (insn);
43426 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43427 &num_imm64_operand);
43429 fprintf (file, " insn info:\n");
43430 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43431 group_name[group], path, byte_len);
43432 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43433 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43436 /* Print to stdout the status of the ready list with respect to
43437 dispatch windows. */
43439 DEBUG_FUNCTION void
43440 debug_ready_dispatch (void)
43442 int i;
43443 int no_ready = number_in_ready ();
43445 fprintf (stdout, "Number of ready: %d\n", no_ready);
43447 for (i = 0; i < no_ready; i++)
43448 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43451 /* This routine is the driver of the dispatch scheduler. */
43453 static void
43454 do_dispatch (rtx insn, int mode)
43456 if (mode == DISPATCH_INIT)
43457 init_dispatch_sched ();
43458 else if (mode == ADD_TO_DISPATCH_WINDOW)
43459 add_to_dispatch_window (insn);
43462 /* Return TRUE if Dispatch Scheduling is supported. */
43464 static bool
43465 has_dispatch (rtx insn, int action)
43467 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 || TARGET_BDVER4)
43468 && flag_dispatch_scheduler)
43469 switch (action)
43471 default:
43472 return false;
43474 case IS_DISPATCH_ON:
43475 return true;
43476 break;
43478 case IS_CMP:
43479 return is_cmp (insn);
43481 case DISPATCH_VIOLATION:
43482 return dispatch_violation ();
43484 case FITS_DISPATCH_WINDOW:
43485 return fits_dispatch_window (insn);
43488 return false;
43491 /* Implementation of reassociation_width target hook used by
43492 reassoc phase to identify parallelism level in reassociated
43493 tree. The statement's tree_code is passed in OPC. The arguments'
43494 machine mode is passed in MODE.
43496 Currently parallel reassociation is enabled for Atom
43497 processors only and we set reassociation width to be 2
43498 because Atom may issue up to 2 instructions per cycle.
43500 Return value should be fixed if parallel reassociation is
43501 enabled for other processors. */
43503 static int
43504 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43505 enum machine_mode mode)
43507 int res = 1;
43509 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43510 res = 2;
43511 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43512 res = 2;
43514 return res;
43517 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43518 place emms and femms instructions. */
43520 static enum machine_mode
43521 ix86_preferred_simd_mode (enum machine_mode mode)
43523 if (!TARGET_SSE)
43524 return word_mode;
43526 switch (mode)
43528 case QImode:
43529 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43530 case HImode:
43531 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43532 case SImode:
43533 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43534 case DImode:
43535 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43537 case SFmode:
43538 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43539 return V8SFmode;
43540 else
43541 return V4SFmode;
43543 case DFmode:
43544 if (!TARGET_VECTORIZE_DOUBLE)
43545 return word_mode;
43546 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43547 return V4DFmode;
43548 else if (TARGET_SSE2)
43549 return V2DFmode;
43550 /* FALLTHRU */
43552 default:
43553 return word_mode;
43557 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
43558 vectors. */
43560 static unsigned int
43561 ix86_autovectorize_vector_sizes (void)
43563 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
43568 /* Return class of registers which could be used for pseudo of MODE
43569 and of class RCLASS for spilling instead of memory. Return NO_REGS
43570 if it is not possible or not profitable. */
43571 static reg_class_t
43572 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43574 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43575 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43576 && INTEGER_CLASS_P (rclass))
43577 return ALL_SSE_REGS;
43578 return NO_REGS;
43581 /* Implement targetm.vectorize.init_cost. */
43583 static void *
43584 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43586 unsigned *cost = XNEWVEC (unsigned, 3);
43587 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43588 return cost;
43591 /* Implement targetm.vectorize.add_stmt_cost. */
43593 static unsigned
43594 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43595 struct _stmt_vec_info *stmt_info, int misalign,
43596 enum vect_cost_model_location where)
43598 unsigned *cost = (unsigned *) data;
43599 unsigned retval = 0;
43601 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43602 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43604 /* Statements in an inner loop relative to the loop being
43605 vectorized are weighted more heavily. The value here is
43606 arbitrary and could potentially be improved with analysis. */
43607 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43608 count *= 50; /* FIXME. */
43610 retval = (unsigned) (count * stmt_cost);
43611 cost[where] += retval;
43613 return retval;
43616 /* Implement targetm.vectorize.finish_cost. */
43618 static void
43619 ix86_finish_cost (void *data, unsigned *prologue_cost,
43620 unsigned *body_cost, unsigned *epilogue_cost)
43622 unsigned *cost = (unsigned *) data;
43623 *prologue_cost = cost[vect_prologue];
43624 *body_cost = cost[vect_body];
43625 *epilogue_cost = cost[vect_epilogue];
43628 /* Implement targetm.vectorize.destroy_cost_data. */
43630 static void
43631 ix86_destroy_cost_data (void *data)
43633 free (data);
43636 /* Validate target specific memory model bits in VAL. */
43638 static unsigned HOST_WIDE_INT
43639 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43641 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43642 bool strong;
43644 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43645 |MEMMODEL_MASK)
43646 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43648 warning (OPT_Winvalid_memory_model,
43649 "Unknown architecture specific memory model");
43650 return MEMMODEL_SEQ_CST;
43652 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43653 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43655 warning (OPT_Winvalid_memory_model,
43656 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43657 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43659 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43661 warning (OPT_Winvalid_memory_model,
43662 "HLE_RELEASE not used with RELEASE or stronger memory model");
43663 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43665 return val;
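/* A sketch of how these bits typically reach this hook from user code,
   assuming HLE support is enabled; the lock variable is illustrative:  */
#if 0
/* Acquire: the HLE bit must be combined with ACQUIRE or a stronger model,
   otherwise the check above falls back to SEQ_CST.  */
while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
  ;
/* ... critical section ...  */
/* Release: paired with RELEASE or stronger.  */
__atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
#endif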
43668 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
43670 static bool
43671 ix86_float_exceptions_rounding_supported_p (void)
43673 /* For x87 floating point with standard excess precision handling,
43674 there is no adddf3 pattern (since x87 floating point only has
43675 XFmode operations) so the default hook implementation gets this
43676 wrong. */
43677 return TARGET_80387 || TARGET_SSE_MATH;
43680 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
43682 static void
43683 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
43685 if (!TARGET_80387 && !TARGET_SSE_MATH)
43686 return;
43687 tree exceptions_var = create_tmp_var (integer_type_node, NULL);
43688 if (TARGET_80387)
43690 tree fenv_index_type = build_index_type (size_int (6));
43691 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
43692 tree fenv_var = create_tmp_var (fenv_type, NULL);
43693 mark_addressable (fenv_var);
43694 tree fenv_ptr = build_pointer_type (fenv_type);
43695 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
43696 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
43697 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
43698 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
43699 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
43700 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
43701 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
43702 tree hold_fnclex = build_call_expr (fnclex, 0);
43703 *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
43704 hold_fnclex);
43705 *clear = build_call_expr (fnclex, 0);
43706 tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
43707 mark_addressable (sw_var);
43708 tree su_ptr = build_pointer_type (short_unsigned_type_node);
43709 tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
43710 tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
43711 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
43712 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
43713 exceptions_var, exceptions_x87);
43714 *update = build2 (COMPOUND_EXPR, integer_type_node,
43715 fnstsw_call, update_mod);
43716 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
43717 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
43719 if (TARGET_SSE_MATH)
43721 tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
43722 tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
43723 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
43724 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
43725 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
43726 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
43727 mxcsr_orig_var, stmxcsr_hold_call);
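/* ORing in 0x1f80 sets the six MXCSR exception mask bits (bits 7-12),
   masking all SSE exceptions; ANDing with 0xffffffc0 then clears the
   exception status flags (bits 0-5) in the held environment.  */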
43728 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
43729 mxcsr_orig_var,
43730 build_int_cst (unsigned_type_node, 0x1f80));
43731 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
43732 build_int_cst (unsigned_type_node, 0xffffffc0));
43733 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
43734 mxcsr_mod_var, hold_mod_val);
43735 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43736 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
43737 hold_assign_orig, hold_assign_mod);
43738 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
43739 ldmxcsr_hold_call);
43740 if (*hold)
43741 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
43742 else
43743 *hold = hold_all;
43744 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43745 if (*clear)
43746 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
43747 ldmxcsr_clear_call);
43748 else
43749 *clear = ldmxcsr_clear_call;
43750 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
43751 tree exceptions_sse = fold_convert (integer_type_node,
43752 stxmcsr_update_call);
43753 if (*update)
43755 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
43756 exceptions_var, exceptions_sse);
43757 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
43758 exceptions_var, exceptions_mod);
43759 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
43760 exceptions_assign);
43762 else
43763 *update = build2 (MODIFY_EXPR, integer_type_node,
43764 exceptions_var, exceptions_sse);
43765 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
43766 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43767 ldmxcsr_update_call);
43769 tree atomic_feraiseexcept
43770 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
43771 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
43772 1, exceptions_var);
43773 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43774 atomic_feraiseexcept_call);
43777 /* Initialize the GCC target structure. */
43778 #undef TARGET_RETURN_IN_MEMORY
43779 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43781 #undef TARGET_LEGITIMIZE_ADDRESS
43782 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43784 #undef TARGET_ATTRIBUTE_TABLE
43785 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43786 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43787 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43788 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43789 # undef TARGET_MERGE_DECL_ATTRIBUTES
43790 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43791 #endif
43793 #undef TARGET_COMP_TYPE_ATTRIBUTES
43794 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43796 #undef TARGET_INIT_BUILTINS
43797 #define TARGET_INIT_BUILTINS ix86_init_builtins
43798 #undef TARGET_BUILTIN_DECL
43799 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43800 #undef TARGET_EXPAND_BUILTIN
43801 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43803 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43804 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43805 ix86_builtin_vectorized_function
43807 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43808 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43810 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43811 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43813 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43814 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43816 #undef TARGET_BUILTIN_RECIPROCAL
43817 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43819 #undef TARGET_ASM_FUNCTION_EPILOGUE
43820 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43822 #undef TARGET_ENCODE_SECTION_INFO
43823 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43824 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43825 #else
43826 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43827 #endif
43829 #undef TARGET_ASM_OPEN_PAREN
43830 #define TARGET_ASM_OPEN_PAREN ""
43831 #undef TARGET_ASM_CLOSE_PAREN
43832 #define TARGET_ASM_CLOSE_PAREN ""
43834 #undef TARGET_ASM_BYTE_OP
43835 #define TARGET_ASM_BYTE_OP ASM_BYTE
43837 #undef TARGET_ASM_ALIGNED_HI_OP
43838 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43839 #undef TARGET_ASM_ALIGNED_SI_OP
43840 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43841 #ifdef ASM_QUAD
43842 #undef TARGET_ASM_ALIGNED_DI_OP
43843 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43844 #endif
43846 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43847 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43849 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43850 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43852 #undef TARGET_ASM_UNALIGNED_HI_OP
43853 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43854 #undef TARGET_ASM_UNALIGNED_SI_OP
43855 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43856 #undef TARGET_ASM_UNALIGNED_DI_OP
43857 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43859 #undef TARGET_PRINT_OPERAND
43860 #define TARGET_PRINT_OPERAND ix86_print_operand
43861 #undef TARGET_PRINT_OPERAND_ADDRESS
43862 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43863 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43864 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43865 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43866 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43868 #undef TARGET_SCHED_INIT_GLOBAL
43869 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43870 #undef TARGET_SCHED_ADJUST_COST
43871 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43872 #undef TARGET_SCHED_ISSUE_RATE
43873 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43874 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43875 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43876 ia32_multipass_dfa_lookahead
43877 #undef TARGET_SCHED_MACRO_FUSION_P
43878 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
43879 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
43880 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
43882 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43883 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43885 #undef TARGET_MEMMODEL_CHECK
43886 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43888 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
43889 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
43891 #ifdef HAVE_AS_TLS
43892 #undef TARGET_HAVE_TLS
43893 #define TARGET_HAVE_TLS true
43894 #endif
43895 #undef TARGET_CANNOT_FORCE_CONST_MEM
43896 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43897 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43898 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43900 #undef TARGET_DELEGITIMIZE_ADDRESS
43901 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43903 #undef TARGET_MS_BITFIELD_LAYOUT_P
43904 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43906 #if TARGET_MACHO
43907 #undef TARGET_BINDS_LOCAL_P
43908 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43909 #endif
43910 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43911 #undef TARGET_BINDS_LOCAL_P
43912 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43913 #endif
43915 #undef TARGET_ASM_OUTPUT_MI_THUNK
43916 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43917 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43918 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43920 #undef TARGET_ASM_FILE_START
43921 #define TARGET_ASM_FILE_START x86_file_start
43923 #undef TARGET_OPTION_OVERRIDE
43924 #define TARGET_OPTION_OVERRIDE ix86_option_override
43926 #undef TARGET_REGISTER_MOVE_COST
43927 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43928 #undef TARGET_MEMORY_MOVE_COST
43929 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43930 #undef TARGET_RTX_COSTS
43931 #define TARGET_RTX_COSTS ix86_rtx_costs
43932 #undef TARGET_ADDRESS_COST
43933 #define TARGET_ADDRESS_COST ix86_address_cost
43935 #undef TARGET_FIXED_CONDITION_CODE_REGS
43936 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43937 #undef TARGET_CC_MODES_COMPATIBLE
43938 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43940 #undef TARGET_MACHINE_DEPENDENT_REORG
43941 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43943 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43944 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43946 #undef TARGET_BUILD_BUILTIN_VA_LIST
43947 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43949 #undef TARGET_FOLD_BUILTIN
43950 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43952 #undef TARGET_COMPARE_VERSION_PRIORITY
43953 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43955 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43956 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43957 ix86_generate_version_dispatcher_body
43959 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43960 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43961 ix86_get_function_versions_dispatcher
43963 #undef TARGET_ENUM_VA_LIST_P
43964 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43966 #undef TARGET_FN_ABI_VA_LIST
43967 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43969 #undef TARGET_CANONICAL_VA_LIST_TYPE
43970 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43972 #undef TARGET_EXPAND_BUILTIN_VA_START
43973 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43975 #undef TARGET_MD_ASM_CLOBBERS
43976 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43978 #undef TARGET_PROMOTE_PROTOTYPES
43979 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43980 #undef TARGET_STRUCT_VALUE_RTX
43981 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
43982 #undef TARGET_SETUP_INCOMING_VARARGS
43983 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
43984 #undef TARGET_MUST_PASS_IN_STACK
43985 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
43986 #undef TARGET_FUNCTION_ARG_ADVANCE
43987 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
43988 #undef TARGET_FUNCTION_ARG
43989 #define TARGET_FUNCTION_ARG ix86_function_arg
43990 #undef TARGET_FUNCTION_ARG_BOUNDARY
43991 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
43992 #undef TARGET_PASS_BY_REFERENCE
43993 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
43994 #undef TARGET_INTERNAL_ARG_POINTER
43995 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
43996 #undef TARGET_UPDATE_STACK_BOUNDARY
43997 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
43998 #undef TARGET_GET_DRAP_RTX
43999 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
44000 #undef TARGET_STRICT_ARGUMENT_NAMING
44001 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
44002 #undef TARGET_STATIC_CHAIN
44003 #define TARGET_STATIC_CHAIN ix86_static_chain
44004 #undef TARGET_TRAMPOLINE_INIT
44005 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
44006 #undef TARGET_RETURN_POPS_ARGS
44007 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
44009 #undef TARGET_LEGITIMATE_COMBINED_INSN
44010 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
44012 #undef TARGET_ASAN_SHADOW_OFFSET
44013 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
44015 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
44016 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
44018 #undef TARGET_SCALAR_MODE_SUPPORTED_P
44019 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
44021 #undef TARGET_VECTOR_MODE_SUPPORTED_P
44022 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
44024 #undef TARGET_C_MODE_FOR_SUFFIX
44025 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
44027 #ifdef HAVE_AS_TLS
44028 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
44029 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
44030 #endif
44032 #ifdef SUBTARGET_INSERT_ATTRIBUTES
44033 #undef TARGET_INSERT_ATTRIBUTES
44034 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
44035 #endif
44037 #undef TARGET_MANGLE_TYPE
44038 #define TARGET_MANGLE_TYPE ix86_mangle_type
44040 #if !TARGET_MACHO
44041 #undef TARGET_STACK_PROTECT_FAIL
44042 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
44043 #endif
44045 #undef TARGET_FUNCTION_VALUE
44046 #define TARGET_FUNCTION_VALUE ix86_function_value
44048 #undef TARGET_FUNCTION_VALUE_REGNO_P
44049 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
44051 #undef TARGET_PROMOTE_FUNCTION_MODE
44052 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
44054 #undef TARGET_MEMBER_TYPE_FORCES_BLK
44055 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
44057 #undef TARGET_INSTANTIATE_DECLS
44058 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
44060 #undef TARGET_SECONDARY_RELOAD
44061 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
44063 #undef TARGET_CLASS_MAX_NREGS
44064 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
44066 #undef TARGET_PREFERRED_RELOAD_CLASS
44067 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
44068 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
44069 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
44070 #undef TARGET_CLASS_LIKELY_SPILLED_P
44071 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
44073 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
44074 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
44075 ix86_builtin_vectorization_cost
44076 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
44077 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
44078 ix86_vectorize_vec_perm_const_ok
44079 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
44080 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
44081 ix86_preferred_simd_mode
44082 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
44083 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
44084 ix86_autovectorize_vector_sizes
44085 #undef TARGET_VECTORIZE_INIT_COST
44086 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
44087 #undef TARGET_VECTORIZE_ADD_STMT_COST
44088 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
44089 #undef TARGET_VECTORIZE_FINISH_COST
44090 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
44091 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
44092 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
44094 #undef TARGET_SET_CURRENT_FUNCTION
44095 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
44097 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
44098 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
44100 #undef TARGET_OPTION_SAVE
44101 #define TARGET_OPTION_SAVE ix86_function_specific_save
44103 #undef TARGET_OPTION_RESTORE
44104 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
44106 #undef TARGET_OPTION_PRINT
44107 #define TARGET_OPTION_PRINT ix86_function_specific_print
44109 #undef TARGET_OPTION_FUNCTION_VERSIONS
44110 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
44112 #undef TARGET_CAN_INLINE_P
44113 #define TARGET_CAN_INLINE_P ix86_can_inline_p
44115 #undef TARGET_EXPAND_TO_RTL_HOOK
44116 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
44118 #undef TARGET_LEGITIMATE_ADDRESS_P
44119 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
44121 #undef TARGET_LRA_P
44122 #define TARGET_LRA_P hook_bool_void_true
44124 #undef TARGET_REGISTER_PRIORITY
44125 #define TARGET_REGISTER_PRIORITY ix86_register_priority
44127 #undef TARGET_REGISTER_USAGE_LEVELING_P
44128 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
44130 #undef TARGET_LEGITIMATE_CONSTANT_P
44131 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
44133 #undef TARGET_FRAME_POINTER_REQUIRED
44134 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
44136 #undef TARGET_CAN_ELIMINATE
44137 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
44139 #undef TARGET_EXTRA_LIVE_ON_ENTRY
44140 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
44142 #undef TARGET_ASM_CODE_END
44143 #define TARGET_ASM_CODE_END ix86_code_end
44145 #undef TARGET_CONDITIONAL_REGISTER_USAGE
44146 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
44148 #if TARGET_MACHO
44149 #undef TARGET_INIT_LIBFUNCS
44150 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
44151 #endif
44153 #undef TARGET_SPILL_CLASS
44154 #define TARGET_SPILL_CLASS ix86_spill_class
44156 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
44157 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
44158 ix86_float_exceptions_rounding_supported_p
44160 struct gcc_target targetm = TARGET_INITIALIZER;
44162 #include "gt-i386.h"