gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65 #include "tree-pass.h"
66 #include "tree-flow.h"
68 static rtx legitimize_dllimport_symbol (rtx, bool);
70 #ifndef CHECK_STACK_LIMIT
71 #define CHECK_STACK_LIMIT (-1)
72 #endif
74 /* Return index of given mode in mult and division cost tables. */
75 #define MODE_INDEX(mode) \
76 ((mode) == QImode ? 0 \
77 : (mode) == HImode ? 1 \
78 : (mode) == SImode ? 2 \
79 : (mode) == DImode ? 3 \
80 : 4)
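/* A minimal usage sketch (assuming the ix86_cost pointer and the mult_init
   table used elsewhere in this file): the SImode multiply start-up cost of
   the active tuning would be read as ix86_cost->mult_init[MODE_INDEX (SImode)],
   which selects index 2, while any mode wider than DImode falls into the
   catch-all index 4 ("other").  */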
82 /* Processor costs (relative to an add) */
83 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
84 #define COSTS_N_BYTES(N) ((N) * 2)
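/* Worked example, assuming COSTS_N_INSNS (N) really expands to (N) * 4 as
   stated above: COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so when tuning
   for size a two-byte instruction is costed like one average insn, while a
   three-byte instruction (COSTS_N_BYTES (3) == 6) is half again as
   expensive.  */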
86 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
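/* Reading guide for the stringop tables below (a sketch based on the
   stringop_algs layout declared in i386.h): the first member is the
   algorithm used when the block size is unknown at compile time, followed
   by {max_size, algorithm} pairs for known sizes, where a max_size of -1
   covers all larger blocks.  Each cost table carries a pair of these, the
   first apparently used for 32-bit code and the second for 64-bit code, and
   DUMMY_STRINGOP_ALGS is just a libcall-only placeholder for whichever of
   the two a given tuning never uses.  */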
88 const
89 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
90 COSTS_N_BYTES (2), /* cost of an add instruction */
91 COSTS_N_BYTES (3), /* cost of a lea instruction */
92 COSTS_N_BYTES (2), /* variable shift costs */
93 COSTS_N_BYTES (3), /* constant shift costs */
94 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
95 COSTS_N_BYTES (3), /* HI */
96 COSTS_N_BYTES (3), /* SI */
97 COSTS_N_BYTES (3), /* DI */
98 COSTS_N_BYTES (5)}, /* other */
99 0, /* cost of multiply per each bit set */
100 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
101 COSTS_N_BYTES (3), /* HI */
102 COSTS_N_BYTES (3), /* SI */
103 COSTS_N_BYTES (3), /* DI */
104 COSTS_N_BYTES (5)}, /* other */
105 COSTS_N_BYTES (3), /* cost of movsx */
106 COSTS_N_BYTES (3), /* cost of movzx */
107 0, /* "large" insn */
108 2, /* MOVE_RATIO */
109 2, /* cost for loading QImode using movzbl */
110 {2, 2, 2}, /* cost of loading integer registers
111 in QImode, HImode and SImode.
112 Relative to reg-reg move (2). */
113 {2, 2, 2}, /* cost of storing integer registers */
114 2, /* cost of reg,reg fld/fst */
115 {2, 2, 2}, /* cost of loading fp registers
116 in SFmode, DFmode and XFmode */
117 {2, 2, 2}, /* cost of storing fp registers
118 in SFmode, DFmode and XFmode */
119 3, /* cost of moving MMX register */
120 {3, 3}, /* cost of loading MMX registers
121 in SImode and DImode */
122 {3, 3}, /* cost of storing MMX registers
123 in SImode and DImode */
124 3, /* cost of moving SSE register */
125 {3, 3, 3}, /* cost of loading SSE registers
126 in SImode, DImode and TImode */
127 {3, 3, 3}, /* cost of storing SSE registers
128 in SImode, DImode and TImode */
129 3, /* MMX or SSE register to integer */
130 0, /* size of l1 cache */
131 0, /* size of l2 cache */
132 0, /* size of prefetch block */
133 0, /* number of parallel prefetches */
134 2, /* Branch cost */
135 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
136 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
137 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
138 COSTS_N_BYTES (2), /* cost of FABS instruction. */
139 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
140 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
143 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
145 1, /* scalar_stmt_cost. */
146 1, /* scalar load_cost. */
147 1, /* scalar_store_cost. */
148 1, /* vec_stmt_cost. */
149 1, /* vec_to_scalar_cost. */
150 1, /* scalar_to_vec_cost. */
151 1, /* vec_align_load_cost. */
152 1, /* vec_unalign_load_cost. */
153 1, /* vec_store_cost. */
154 1, /* cond_taken_branch_cost. */
155 1, /* cond_not_taken_branch_cost. */
156 };
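/* A note on the tail of each table (a sketch of how these entries appear to
   be consumed by the vectorizer cost hook later in this file): the
   scalar_*_cost, vec_*_cost and cond_*_branch_cost values are not
   COSTS_N_INSNS amounts but small relative weights the tree vectorizer uses
   to decide whether vectorizing a loop pays off for the selected tuning.  */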
158 /* Processor costs (relative to an add) */
159 static const
160 struct processor_costs i386_cost = { /* 386 specific costs */
161 COSTS_N_INSNS (1), /* cost of an add instruction */
162 COSTS_N_INSNS (1), /* cost of a lea instruction */
163 COSTS_N_INSNS (3), /* variable shift costs */
164 COSTS_N_INSNS (2), /* constant shift costs */
165 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
166 COSTS_N_INSNS (6), /* HI */
167 COSTS_N_INSNS (6), /* SI */
168 COSTS_N_INSNS (6), /* DI */
169 COSTS_N_INSNS (6)}, /* other */
170 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
171 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
172 COSTS_N_INSNS (23), /* HI */
173 COSTS_N_INSNS (23), /* SI */
174 COSTS_N_INSNS (23), /* DI */
175 COSTS_N_INSNS (23)}, /* other */
176 COSTS_N_INSNS (3), /* cost of movsx */
177 COSTS_N_INSNS (2), /* cost of movzx */
178 15, /* "large" insn */
179 3, /* MOVE_RATIO */
180 4, /* cost for loading QImode using movzbl */
181 {2, 4, 2}, /* cost of loading integer registers
182 in QImode, HImode and SImode.
183 Relative to reg-reg move (2). */
184 {2, 4, 2}, /* cost of storing integer registers */
185 2, /* cost of reg,reg fld/fst */
186 {8, 8, 8}, /* cost of loading fp registers
187 in SFmode, DFmode and XFmode */
188 {8, 8, 8}, /* cost of storing fp registers
189 in SFmode, DFmode and XFmode */
190 2, /* cost of moving MMX register */
191 {4, 8}, /* cost of loading MMX registers
192 in SImode and DImode */
193 {4, 8}, /* cost of storing MMX registers
194 in SImode and DImode */
195 2, /* cost of moving SSE register */
196 {4, 8, 16}, /* cost of loading SSE registers
197 in SImode, DImode and TImode */
198 {4, 8, 16}, /* cost of storing SSE registers
199 in SImode, DImode and TImode */
200 3, /* MMX or SSE register to integer */
201 0, /* size of l1 cache */
202 0, /* size of l2 cache */
203 0, /* size of prefetch block */
204 0, /* number of parallel prefetches */
205 1, /* Branch cost */
206 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
207 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
208 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
209 COSTS_N_INSNS (22), /* cost of FABS instruction. */
210 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
211 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
213 DUMMY_STRINGOP_ALGS},
214 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
215 DUMMY_STRINGOP_ALGS},
216 1, /* scalar_stmt_cost. */
217 1, /* scalar load_cost. */
218 1, /* scalar_store_cost. */
219 1, /* vec_stmt_cost. */
220 1, /* vec_to_scalar_cost. */
221 1, /* scalar_to_vec_cost. */
222 1, /* vec_align_load_cost. */
223 2, /* vec_unalign_load_cost. */
224 1, /* vec_store_cost. */
225 3, /* cond_taken_branch_cost. */
226 1, /* cond_not_taken_branch_cost. */
227 };
229 static const
230 struct processor_costs i486_cost = { /* 486 specific costs */
231 COSTS_N_INSNS (1), /* cost of an add instruction */
232 COSTS_N_INSNS (1), /* cost of a lea instruction */
233 COSTS_N_INSNS (3), /* variable shift costs */
234 COSTS_N_INSNS (2), /* constant shift costs */
235 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
236 COSTS_N_INSNS (12), /* HI */
237 COSTS_N_INSNS (12), /* SI */
238 COSTS_N_INSNS (12), /* DI */
239 COSTS_N_INSNS (12)}, /* other */
240 1, /* cost of multiply per each bit set */
241 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
242 COSTS_N_INSNS (40), /* HI */
243 COSTS_N_INSNS (40), /* SI */
244 COSTS_N_INSNS (40), /* DI */
245 COSTS_N_INSNS (40)}, /* other */
246 COSTS_N_INSNS (3), /* cost of movsx */
247 COSTS_N_INSNS (2), /* cost of movzx */
248 15, /* "large" insn */
249 3, /* MOVE_RATIO */
250 4, /* cost for loading QImode using movzbl */
251 {2, 4, 2}, /* cost of loading integer registers
252 in QImode, HImode and SImode.
253 Relative to reg-reg move (2). */
254 {2, 4, 2}, /* cost of storing integer registers */
255 2, /* cost of reg,reg fld/fst */
256 {8, 8, 8}, /* cost of loading fp registers
257 in SFmode, DFmode and XFmode */
258 {8, 8, 8}, /* cost of storing fp registers
259 in SFmode, DFmode and XFmode */
260 2, /* cost of moving MMX register */
261 {4, 8}, /* cost of loading MMX registers
262 in SImode and DImode */
263 {4, 8}, /* cost of storing MMX registers
264 in SImode and DImode */
265 2, /* cost of moving SSE register */
266 {4, 8, 16}, /* cost of loading SSE registers
267 in SImode, DImode and TImode */
268 {4, 8, 16}, /* cost of storing SSE registers
269 in SImode, DImode and TImode */
270 3, /* MMX or SSE register to integer */
271 4, /* size of l1 cache. The 486 has an 8kB cache
272 shared between code and data, so 4kB is
273 not really precise. */
274 4, /* size of l2 cache */
275 0, /* size of prefetch block */
276 0, /* number of parallel prefetches */
277 1, /* Branch cost */
278 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
279 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
280 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
281 COSTS_N_INSNS (3), /* cost of FABS instruction. */
282 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
283 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
285 DUMMY_STRINGOP_ALGS},
286 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
287 DUMMY_STRINGOP_ALGS},
288 1, /* scalar_stmt_cost. */
289 1, /* scalar load_cost. */
290 1, /* scalar_store_cost. */
291 1, /* vec_stmt_cost. */
292 1, /* vec_to_scalar_cost. */
293 1, /* scalar_to_vec_cost. */
294 1, /* vec_align_load_cost. */
295 2, /* vec_unalign_load_cost. */
296 1, /* vec_store_cost. */
297 3, /* cond_taken_branch_cost. */
298 1, /* cond_not_taken_branch_cost. */
299 };
301 static const
302 struct processor_costs pentium_cost = {
303 COSTS_N_INSNS (1), /* cost of an add instruction */
304 COSTS_N_INSNS (1), /* cost of a lea instruction */
305 COSTS_N_INSNS (4), /* variable shift costs */
306 COSTS_N_INSNS (1), /* constant shift costs */
307 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
308 COSTS_N_INSNS (11), /* HI */
309 COSTS_N_INSNS (11), /* SI */
310 COSTS_N_INSNS (11), /* DI */
311 COSTS_N_INSNS (11)}, /* other */
312 0, /* cost of multiply per each bit set */
313 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
314 COSTS_N_INSNS (25), /* HI */
315 COSTS_N_INSNS (25), /* SI */
316 COSTS_N_INSNS (25), /* DI */
317 COSTS_N_INSNS (25)}, /* other */
318 COSTS_N_INSNS (3), /* cost of movsx */
319 COSTS_N_INSNS (2), /* cost of movzx */
320 8, /* "large" insn */
321 6, /* MOVE_RATIO */
322 6, /* cost for loading QImode using movzbl */
323 {2, 4, 2}, /* cost of loading integer registers
324 in QImode, HImode and SImode.
325 Relative to reg-reg move (2). */
326 {2, 4, 2}, /* cost of storing integer registers */
327 2, /* cost of reg,reg fld/fst */
328 {2, 2, 6}, /* cost of loading fp registers
329 in SFmode, DFmode and XFmode */
330 {4, 4, 6}, /* cost of storing fp registers
331 in SFmode, DFmode and XFmode */
332 8, /* cost of moving MMX register */
333 {8, 8}, /* cost of loading MMX registers
334 in SImode and DImode */
335 {8, 8}, /* cost of storing MMX registers
336 in SImode and DImode */
337 2, /* cost of moving SSE register */
338 {4, 8, 16}, /* cost of loading SSE registers
339 in SImode, DImode and TImode */
340 {4, 8, 16}, /* cost of storing SSE registers
341 in SImode, DImode and TImode */
342 3, /* MMX or SSE register to integer */
343 8, /* size of l1 cache. */
344 8, /* size of l2 cache */
345 0, /* size of prefetch block */
346 0, /* number of parallel prefetches */
347 2, /* Branch cost */
348 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
349 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
350 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
351 COSTS_N_INSNS (1), /* cost of FABS instruction. */
352 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
353 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
354 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
355 DUMMY_STRINGOP_ALGS},
356 {{libcall, {{-1, rep_prefix_4_byte}}},
357 DUMMY_STRINGOP_ALGS},
358 1, /* scalar_stmt_cost. */
359 1, /* scalar load_cost. */
360 1, /* scalar_store_cost. */
361 1, /* vec_stmt_cost. */
362 1, /* vec_to_scalar_cost. */
363 1, /* scalar_to_vec_cost. */
364 1, /* vec_align_load_cost. */
365 2, /* vec_unalign_load_cost. */
366 1, /* vec_store_cost. */
367 3, /* cond_taken_branch_cost. */
368 1, /* cond_not_taken_branch_cost. */
369 };
371 static const
372 struct processor_costs pentiumpro_cost = {
373 COSTS_N_INSNS (1), /* cost of an add instruction */
374 COSTS_N_INSNS (1), /* cost of a lea instruction */
375 COSTS_N_INSNS (1), /* variable shift costs */
376 COSTS_N_INSNS (1), /* constant shift costs */
377 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
378 COSTS_N_INSNS (4), /* HI */
379 COSTS_N_INSNS (4), /* SI */
380 COSTS_N_INSNS (4), /* DI */
381 COSTS_N_INSNS (4)}, /* other */
382 0, /* cost of multiply per each bit set */
383 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
384 COSTS_N_INSNS (17), /* HI */
385 COSTS_N_INSNS (17), /* SI */
386 COSTS_N_INSNS (17), /* DI */
387 COSTS_N_INSNS (17)}, /* other */
388 COSTS_N_INSNS (1), /* cost of movsx */
389 COSTS_N_INSNS (1), /* cost of movzx */
390 8, /* "large" insn */
391 6, /* MOVE_RATIO */
392 2, /* cost for loading QImode using movzbl */
393 {4, 4, 4}, /* cost of loading integer registers
394 in QImode, HImode and SImode.
395 Relative to reg-reg move (2). */
396 {2, 2, 2}, /* cost of storing integer registers */
397 2, /* cost of reg,reg fld/fst */
398 {2, 2, 6}, /* cost of loading fp registers
399 in SFmode, DFmode and XFmode */
400 {4, 4, 6}, /* cost of storing fp registers
401 in SFmode, DFmode and XFmode */
402 2, /* cost of moving MMX register */
403 {2, 2}, /* cost of loading MMX registers
404 in SImode and DImode */
405 {2, 2}, /* cost of storing MMX registers
406 in SImode and DImode */
407 2, /* cost of moving SSE register */
408 {2, 2, 8}, /* cost of loading SSE registers
409 in SImode, DImode and TImode */
410 {2, 2, 8}, /* cost of storing SSE registers
411 in SImode, DImode and TImode */
412 3, /* MMX or SSE register to integer */
413 8, /* size of l1 cache. */
414 256, /* size of l2 cache */
415 32, /* size of prefetch block */
416 6, /* number of parallel prefetches */
417 2, /* Branch cost */
418 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
419 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
420 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
423 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
424 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
425 (we ensure the alignment). For small blocks an inline loop is still a
426 noticeable win; for bigger blocks either rep movsl or rep movsb is the
427 way to go. Rep movsb apparently has a more expensive startup time in the
428 CPU, but after 4K the difference is down in the noise (a worked reading of the table follows the struct). */
429 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
430 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
431 DUMMY_STRINGOP_ALGS},
432 {{rep_prefix_4_byte, {{1024, unrolled_loop},
433 {8192, rep_prefix_4_byte}, {-1, libcall}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
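/* Worked reading of the PentiumPro memcpy entry above, under the stringop
   layout sketched near DUMMY_STRINGOP_ALGS: for 32-bit code a known-size
   copy uses an inline loop up to 128 bytes, an unrolled loop up to 1024
   bytes, rep movsl up to 8192 bytes and rep movsb beyond that, while a copy
   whose size is unknown at compile time goes straight to rep movsl; the
   64-bit slot is just the dummy placeholder, since this tuning targets
   32-bit-only chips.  */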
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with the REP prefix (relative to
646 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
647 and 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set the number of simultaneous prefetches
712 to a large constant to reflect this (it is probably not a good idea to
713 leave the number of prefetches completely unlimited, as their execution
714 also takes some time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat the inline code considerably (a worked reading of the memcpy entry follows the struct). */
726 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
727 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
728 {{libcall, {{8, loop}, {24, unrolled_loop},
729 {2048, rep_prefix_4_byte}, {-1, libcall}}},
730 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
731 4, /* scalar_stmt_cost. */
732 2, /* scalar load_cost. */
733 2, /* scalar_store_cost. */
734 5, /* vec_stmt_cost. */
735 0, /* vec_to_scalar_cost. */
736 2, /* scalar_to_vec_cost. */
737 2, /* vec_align_load_cost. */
738 3, /* vec_unalign_load_cost. */
739 3, /* vec_store_cost. */
740 3, /* cond_taken_branch_cost. */
741 2, /* cond_not_taken_branch_cost. */
742 };
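/* Worked reading of the K8 memcpy entry above, under the same stringop
   layout: in 64-bit code a known-size copy uses an inline loop up to 16
   bytes, rep movsq up to 8192 bytes, and falls back to the memcpy libcall
   (which can use nontemporal stores) for larger blocks or for sizes not
   known at compile time.  */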
744 struct processor_costs amdfam10_cost = {
745 COSTS_N_INSNS (1), /* cost of an add instruction */
746 COSTS_N_INSNS (2), /* cost of a lea instruction */
747 COSTS_N_INSNS (1), /* variable shift costs */
748 COSTS_N_INSNS (1), /* constant shift costs */
749 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
750 COSTS_N_INSNS (4), /* HI */
751 COSTS_N_INSNS (3), /* SI */
752 COSTS_N_INSNS (4), /* DI */
753 COSTS_N_INSNS (5)}, /* other */
754 0, /* cost of multiply per each bit set */
755 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
756 COSTS_N_INSNS (35), /* HI */
757 COSTS_N_INSNS (51), /* SI */
758 COSTS_N_INSNS (83), /* DI */
759 COSTS_N_INSNS (83)}, /* other */
760 COSTS_N_INSNS (1), /* cost of movsx */
761 COSTS_N_INSNS (1), /* cost of movzx */
762 8, /* "large" insn */
763 9, /* MOVE_RATIO */
764 4, /* cost for loading QImode using movzbl */
765 {3, 4, 3}, /* cost of loading integer registers
766 in QImode, HImode and SImode.
767 Relative to reg-reg move (2). */
768 {3, 4, 3}, /* cost of storing integer registers */
769 4, /* cost of reg,reg fld/fst */
770 {4, 4, 12}, /* cost of loading fp registers
771 in SFmode, DFmode and XFmode */
772 {6, 6, 8}, /* cost of storing fp registers
773 in SFmode, DFmode and XFmode */
774 2, /* cost of moving MMX register */
775 {3, 3}, /* cost of loading MMX registers
776 in SImode and DImode */
777 {4, 4}, /* cost of storing MMX registers
778 in SImode and DImode */
779 2, /* cost of moving SSE register */
780 {4, 4, 3}, /* cost of loading SSE registers
781 in SImode, DImode and TImode */
782 {4, 4, 5}, /* cost of storing SSE registers
783 in SImode, DImode and TImode */
784 3, /* MMX or SSE register to integer */
785 /* On K8:
786 MOVD reg64, xmmreg Double FSTORE 4
787 MOVD reg32, xmmreg Double FSTORE 4
788 On AMDFAM10:
789 MOVD reg64, xmmreg Double FADD 3
790 1/1 1/1
791 MOVD reg32, xmmreg Double FADD 3
792 1/1 1/1 */
793 64, /* size of l1 cache. */
794 512, /* size of l2 cache. */
795 64, /* size of prefetch block */
796 /* New AMD processors never drop prefetches; if they cannot be performed
797 immediately, they are queued. We set the number of simultaneous prefetches
798 to a large constant to reflect this (it is probably not a good idea to
799 leave the number of prefetches completely unlimited, as their execution
800 also takes some time). */
801 100, /* number of parallel prefetches */
802 2, /* Branch cost */
803 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
804 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
805 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
806 COSTS_N_INSNS (2), /* cost of FABS instruction. */
807 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
808 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
810 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
811 very small blocks it is better to use a loop. For large blocks, a libcall
812 can do nontemporal accesses and beat the inline code considerably. */
813 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
814 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
815 {{libcall, {{8, loop}, {24, unrolled_loop},
816 {2048, rep_prefix_4_byte}, {-1, libcall}}},
817 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
818 4, /* scalar_stmt_cost. */
819 2, /* scalar load_cost. */
820 2, /* scalar_store_cost. */
821 6, /* vec_stmt_cost. */
822 0, /* vec_to_scalar_cost. */
823 2, /* scalar_to_vec_cost. */
824 2, /* vec_align_load_cost. */
825 2, /* vec_unalign_load_cost. */
826 2, /* vec_store_cost. */
827 2, /* cond_taken_branch_cost. */
828 1, /* cond_not_taken_branch_cost. */
829 };
831 struct processor_costs bdver1_cost = {
832 COSTS_N_INSNS (1), /* cost of an add instruction */
833 COSTS_N_INSNS (1), /* cost of a lea instruction */
834 COSTS_N_INSNS (1), /* variable shift costs */
835 COSTS_N_INSNS (1), /* constant shift costs */
836 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
837 COSTS_N_INSNS (4), /* HI */
838 COSTS_N_INSNS (4), /* SI */
839 COSTS_N_INSNS (6), /* DI */
840 COSTS_N_INSNS (6)}, /* other */
841 0, /* cost of multiply per each bit set */
842 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
843 COSTS_N_INSNS (35), /* HI */
844 COSTS_N_INSNS (51), /* SI */
845 COSTS_N_INSNS (83), /* DI */
846 COSTS_N_INSNS (83)}, /* other */
847 COSTS_N_INSNS (1), /* cost of movsx */
848 COSTS_N_INSNS (1), /* cost of movzx */
849 8, /* "large" insn */
850 9, /* MOVE_RATIO */
851 4, /* cost for loading QImode using movzbl */
852 {5, 5, 4}, /* cost of loading integer registers
853 in QImode, HImode and SImode.
854 Relative to reg-reg move (2). */
855 {4, 4, 4}, /* cost of storing integer registers */
856 2, /* cost of reg,reg fld/fst */
857 {5, 5, 12}, /* cost of loading fp registers
858 in SFmode, DFmode and XFmode */
859 {4, 4, 8}, /* cost of storing fp registers
860 in SFmode, DFmode and XFmode */
861 2, /* cost of moving MMX register */
862 {4, 4}, /* cost of loading MMX registers
863 in SImode and DImode */
864 {4, 4}, /* cost of storing MMX registers
865 in SImode and DImode */
866 2, /* cost of moving SSE register */
867 {4, 4, 4}, /* cost of loading SSE registers
868 in SImode, DImode and TImode */
869 {4, 4, 4}, /* cost of storing SSE registers
870 in SImode, DImode and TImode */
871 2, /* MMX or SSE register to integer */
872 /* On K8:
873 MOVD reg64, xmmreg Double FSTORE 4
874 MOVD reg32, xmmreg Double FSTORE 4
875 On AMDFAM10:
876 MOVD reg64, xmmreg Double FADD 3
877 1/1 1/1
878 MOVD reg32, xmmreg Double FADD 3
879 1/1 1/1 */
880 16, /* size of l1 cache. */
881 2048, /* size of l2 cache. */
882 64, /* size of prefetch block */
883 /* New AMD processors never drop prefetches; if they cannot be performed
884 immediately, they are queued. We set the number of simultaneous prefetches
885 to a large constant to reflect this (it is probably not a good idea to
886 leave the number of prefetches completely unlimited, as their execution
887 also takes some time). */
888 100, /* number of parallel prefetches */
889 2, /* Branch cost */
890 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
891 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
892 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
893 COSTS_N_INSNS (2), /* cost of FABS instruction. */
894 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
895 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
897 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
898 very small blocks it is better to use a loop. For large blocks, a libcall
899 can do nontemporal accesses and beat the inline code considerably. */
900 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
901 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
902 {{libcall, {{8, loop}, {24, unrolled_loop},
903 {2048, rep_prefix_4_byte}, {-1, libcall}}},
904 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
905 6, /* scalar_stmt_cost. */
906 4, /* scalar load_cost. */
907 4, /* scalar_store_cost. */
908 6, /* vec_stmt_cost. */
909 0, /* vec_to_scalar_cost. */
910 2, /* scalar_to_vec_cost. */
911 4, /* vec_align_load_cost. */
912 4, /* vec_unalign_load_cost. */
913 4, /* vec_store_cost. */
914 2, /* cond_taken_branch_cost. */
915 1, /* cond_not_taken_branch_cost. */
916 };
918 struct processor_costs bdver2_cost = {
919 COSTS_N_INSNS (1), /* cost of an add instruction */
920 COSTS_N_INSNS (1), /* cost of a lea instruction */
921 COSTS_N_INSNS (1), /* variable shift costs */
922 COSTS_N_INSNS (1), /* constant shift costs */
923 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
924 COSTS_N_INSNS (4), /* HI */
925 COSTS_N_INSNS (4), /* SI */
926 COSTS_N_INSNS (6), /* DI */
927 COSTS_N_INSNS (6)}, /* other */
928 0, /* cost of multiply per each bit set */
929 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
930 COSTS_N_INSNS (35), /* HI */
931 COSTS_N_INSNS (51), /* SI */
932 COSTS_N_INSNS (83), /* DI */
933 COSTS_N_INSNS (83)}, /* other */
934 COSTS_N_INSNS (1), /* cost of movsx */
935 COSTS_N_INSNS (1), /* cost of movzx */
936 8, /* "large" insn */
937 9, /* MOVE_RATIO */
938 4, /* cost for loading QImode using movzbl */
939 {5, 5, 4}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {4, 4, 4}, /* cost of storing integer registers */
943 2, /* cost of reg,reg fld/fst */
944 {5, 5, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {4, 4, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {4, 4}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, /* cost of moving SSE register */
954 {4, 4, 4}, /* cost of loading SSE registers
955 in SImode, DImode and TImode */
956 {4, 4, 4}, /* cost of storing SSE registers
957 in SImode, DImode and TImode */
958 2, /* MMX or SSE register to integer */
959 /* On K8:
960 MOVD reg64, xmmreg Double FSTORE 4
961 MOVD reg32, xmmreg Double FSTORE 4
962 On AMDFAM10:
963 MOVD reg64, xmmreg Double FADD 3
964 1/1 1/1
965 MOVD reg32, xmmreg Double FADD 3
966 1/1 1/1 */
967 16, /* size of l1 cache. */
968 2048, /* size of l2 cache. */
969 64, /* size of prefetch block */
970 /* New AMD processors never drop prefetches; if they cannot be performed
971 immediately, they are queued. We set the number of simultaneous prefetches
972 to a large constant to reflect this (it is probably not a good idea to
973 leave the number of prefetches completely unlimited, as their execution
974 also takes some time). */
975 100, /* number of parallel prefetches */
976 2, /* Branch cost */
977 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
978 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
979 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
980 COSTS_N_INSNS (2), /* cost of FABS instruction. */
981 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
982 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
984 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
985 very small blocks it is better to use a loop. For large blocks, a libcall
986 can do nontemporal accesses and beat the inline code considerably. */
987 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
988 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
989 {{libcall, {{8, loop}, {24, unrolled_loop},
990 {2048, rep_prefix_4_byte}, {-1, libcall}}},
991 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
992 6, /* scalar_stmt_cost. */
993 4, /* scalar load_cost. */
994 4, /* scalar_store_cost. */
995 6, /* vec_stmt_cost. */
996 0, /* vec_to_scalar_cost. */
997 2, /* scalar_to_vec_cost. */
998 4, /* vec_align_load_cost. */
999 4, /* vec_unalign_load_cost. */
1000 4, /* vec_store_cost. */
1001 2, /* cond_taken_branch_cost. */
1002 1, /* cond_not_taken_branch_cost. */
1003 };
1005 struct processor_costs btver1_cost = {
1006 COSTS_N_INSNS (1), /* cost of an add instruction */
1007 COSTS_N_INSNS (2), /* cost of a lea instruction */
1008 COSTS_N_INSNS (1), /* variable shift costs */
1009 COSTS_N_INSNS (1), /* constant shift costs */
1010 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1011 COSTS_N_INSNS (4), /* HI */
1012 COSTS_N_INSNS (3), /* SI */
1013 COSTS_N_INSNS (4), /* DI */
1014 COSTS_N_INSNS (5)}, /* other */
1015 0, /* cost of multiply per each bit set */
1016 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1017 COSTS_N_INSNS (35), /* HI */
1018 COSTS_N_INSNS (51), /* SI */
1019 COSTS_N_INSNS (83), /* DI */
1020 COSTS_N_INSNS (83)}, /* other */
1021 COSTS_N_INSNS (1), /* cost of movsx */
1022 COSTS_N_INSNS (1), /* cost of movzx */
1023 8, /* "large" insn */
1024 9, /* MOVE_RATIO */
1025 4, /* cost for loading QImode using movzbl */
1026 {3, 4, 3}, /* cost of loading integer registers
1027 in QImode, HImode and SImode.
1028 Relative to reg-reg move (2). */
1029 {3, 4, 3}, /* cost of storing integer registers */
1030 4, /* cost of reg,reg fld/fst */
1031 {4, 4, 12}, /* cost of loading fp registers
1032 in SFmode, DFmode and XFmode */
1033 {6, 6, 8}, /* cost of storing fp registers
1034 in SFmode, DFmode and XFmode */
1035 2, /* cost of moving MMX register */
1036 {3, 3}, /* cost of loading MMX registers
1037 in SImode and DImode */
1038 {4, 4}, /* cost of storing MMX registers
1039 in SImode and DImode */
1040 2, /* cost of moving SSE register */
1041 {4, 4, 3}, /* cost of loading SSE registers
1042 in SImode, DImode and TImode */
1043 {4, 4, 5}, /* cost of storing SSE registers
1044 in SImode, DImode and TImode */
1045 3, /* MMX or SSE register to integer */
1046 /* On K8:
1047 MOVD reg64, xmmreg Double FSTORE 4
1048 MOVD reg32, xmmreg Double FSTORE 4
1049 On AMDFAM10:
1050 MOVD reg64, xmmreg Double FADD 3
1051 1/1 1/1
1052 MOVD reg32, xmmreg Double FADD 3
1053 1/1 1/1 */
1054 32, /* size of l1 cache. */
1055 512, /* size of l2 cache. */
1056 64, /* size of prefetch block */
1057 100, /* number of parallel prefetches */
1058 2, /* Branch cost */
1059 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1060 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1061 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1062 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1063 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1064 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1066 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1067 very small blocks it is better to use a loop. For large blocks, a libcall
1068 can do nontemporal accesses and beat the inline code considerably. */
1069 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1070 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1071 {{libcall, {{8, loop}, {24, unrolled_loop},
1072 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1074 4, /* scalar_stmt_cost. */
1075 2, /* scalar load_cost. */
1076 2, /* scalar_store_cost. */
1077 6, /* vec_stmt_cost. */
1078 0, /* vec_to_scalar_cost. */
1079 2, /* scalar_to_vec_cost. */
1080 2, /* vec_align_load_cost. */
1081 2, /* vec_unalign_load_cost. */
1082 2, /* vec_store_cost. */
1083 2, /* cond_taken_branch_cost. */
1084 1, /* cond_not_taken_branch_cost. */
1085 };
1087 struct processor_costs btver2_cost = {
1088 COSTS_N_INSNS (1), /* cost of an add instruction */
1089 COSTS_N_INSNS (2), /* cost of a lea instruction */
1090 COSTS_N_INSNS (1), /* variable shift costs */
1091 COSTS_N_INSNS (1), /* constant shift costs */
1092 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1093 COSTS_N_INSNS (4), /* HI */
1094 COSTS_N_INSNS (3), /* SI */
1095 COSTS_N_INSNS (4), /* DI */
1096 COSTS_N_INSNS (5)}, /* other */
1097 0, /* cost of multiply per each bit set */
1098 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1099 COSTS_N_INSNS (35), /* HI */
1100 COSTS_N_INSNS (51), /* SI */
1101 COSTS_N_INSNS (83), /* DI */
1102 COSTS_N_INSNS (83)}, /* other */
1103 COSTS_N_INSNS (1), /* cost of movsx */
1104 COSTS_N_INSNS (1), /* cost of movzx */
1105 8, /* "large" insn */
1106 9, /* MOVE_RATIO */
1107 4, /* cost for loading QImode using movzbl */
1108 {3, 4, 3}, /* cost of loading integer registers
1109 in QImode, HImode and SImode.
1110 Relative to reg-reg move (2). */
1111 {3, 4, 3}, /* cost of storing integer registers */
1112 4, /* cost of reg,reg fld/fst */
1113 {4, 4, 12}, /* cost of loading fp registers
1114 in SFmode, DFmode and XFmode */
1115 {6, 6, 8}, /* cost of storing fp registers
1116 in SFmode, DFmode and XFmode */
1117 2, /* cost of moving MMX register */
1118 {3, 3}, /* cost of loading MMX registers
1119 in SImode and DImode */
1120 {4, 4}, /* cost of storing MMX registers
1121 in SImode and DImode */
1122 2, /* cost of moving SSE register */
1123 {4, 4, 3}, /* cost of loading SSE registers
1124 in SImode, DImode and TImode */
1125 {4, 4, 5}, /* cost of storing SSE registers
1126 in SImode, DImode and TImode */
1127 3, /* MMX or SSE register to integer */
1128 /* On K8:
1129 MOVD reg64, xmmreg Double FSTORE 4
1130 MOVD reg32, xmmreg Double FSTORE 4
1131 On AMDFAM10:
1132 MOVD reg64, xmmreg Double FADD 3
1133 1/1 1/1
1134 MOVD reg32, xmmreg Double FADD 3
1135 1/1 1/1 */
1136 32, /* size of l1 cache. */
1137 2048, /* size of l2 cache. */
1138 64, /* size of prefetch block */
1139 100, /* number of parallel prefetches */
1140 2, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1149 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1150 {{libcall, {{8, loop}, {24, unrolled_loop},
1151 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1152 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 4, /* scalar_stmt_cost. */
1154 2, /* scalar load_cost. */
1155 2, /* scalar_store_cost. */
1156 6, /* vec_stmt_cost. */
1157 0, /* vec_to_scalar_cost. */
1158 2, /* scalar_to_vec_cost. */
1159 2, /* vec_align_load_cost. */
1160 2, /* vec_unalign_load_cost. */
1161 2, /* vec_store_cost. */
1162 2, /* cond_taken_branch_cost. */
1163 1, /* cond_not_taken_branch_cost. */
1164 };
1166 static const
1167 struct processor_costs pentium4_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (3), /* cost of a lea instruction */
1170 COSTS_N_INSNS (4), /* variable shift costs */
1171 COSTS_N_INSNS (4), /* constant shift costs */
1172 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (15), /* HI */
1174 COSTS_N_INSNS (15), /* SI */
1175 COSTS_N_INSNS (15), /* DI */
1176 COSTS_N_INSNS (15)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (56), /* HI */
1180 COSTS_N_INSNS (56), /* SI */
1181 COSTS_N_INSNS (56), /* DI */
1182 COSTS_N_INSNS (56)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 16, /* "large" insn */
1186 6, /* MOVE_RATIO */
1187 2, /* cost for loading QImode using movzbl */
1188 {4, 5, 4}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {2, 3, 2}, /* cost of storing integer registers */
1192 2, /* cost of reg,reg fld/fst */
1193 {2, 2, 6}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {4, 4, 6}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {2, 2}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {2, 2}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 12, /* cost of moving SSE register */
1203 {12, 12, 12}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {2, 2, 8}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 10, /* MMX or SSE register to integer */
1208 8, /* size of l1 cache. */
1209 256, /* size of l2 cache. */
1210 64, /* size of prefetch block */
1211 6, /* number of parallel prefetches */
1212 2, /* Branch cost */
1213 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1214 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1215 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1216 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1217 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1218 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1219 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1220 DUMMY_STRINGOP_ALGS},
1221 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1222 {-1, libcall}}},
1223 DUMMY_STRINGOP_ALGS},
1224 1, /* scalar_stmt_cost. */
1225 1, /* scalar load_cost. */
1226 1, /* scalar_store_cost. */
1227 1, /* vec_stmt_cost. */
1228 1, /* vec_to_scalar_cost. */
1229 1, /* scalar_to_vec_cost. */
1230 1, /* vec_align_load_cost. */
1231 2, /* vec_unalign_load_cost. */
1232 1, /* vec_store_cost. */
1233 3, /* cond_taken_branch_cost. */
1234 1, /* cond_not_taken_branch_cost. */
1235 };
1237 static const
1238 struct processor_costs nocona_cost = {
1239 COSTS_N_INSNS (1), /* cost of an add instruction */
1240 COSTS_N_INSNS (1), /* cost of a lea instruction */
1241 COSTS_N_INSNS (1), /* variable shift costs */
1242 COSTS_N_INSNS (1), /* constant shift costs */
1243 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1244 COSTS_N_INSNS (10), /* HI */
1245 COSTS_N_INSNS (10), /* SI */
1246 COSTS_N_INSNS (10), /* DI */
1247 COSTS_N_INSNS (10)}, /* other */
1248 0, /* cost of multiply per each bit set */
1249 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1250 COSTS_N_INSNS (66), /* HI */
1251 COSTS_N_INSNS (66), /* SI */
1252 COSTS_N_INSNS (66), /* DI */
1253 COSTS_N_INSNS (66)}, /* other */
1254 COSTS_N_INSNS (1), /* cost of movsx */
1255 COSTS_N_INSNS (1), /* cost of movzx */
1256 16, /* "large" insn */
1257 17, /* MOVE_RATIO */
1258 4, /* cost for loading QImode using movzbl */
1259 {4, 4, 4}, /* cost of loading integer registers
1260 in QImode, HImode and SImode.
1261 Relative to reg-reg move (2). */
1262 {4, 4, 4}, /* cost of storing integer registers */
1263 3, /* cost of reg,reg fld/fst */
1264 {12, 12, 12}, /* cost of loading fp registers
1265 in SFmode, DFmode and XFmode */
1266 {4, 4, 4}, /* cost of storing fp registers
1267 in SFmode, DFmode and XFmode */
1268 6, /* cost of moving MMX register */
1269 {12, 12}, /* cost of loading MMX registers
1270 in SImode and DImode */
1271 {12, 12}, /* cost of storing MMX registers
1272 in SImode and DImode */
1273 6, /* cost of moving SSE register */
1274 {12, 12, 12}, /* cost of loading SSE registers
1275 in SImode, DImode and TImode */
1276 {12, 12, 12}, /* cost of storing SSE registers
1277 in SImode, DImode and TImode */
1278 8, /* MMX or SSE register to integer */
1279 8, /* size of l1 cache. */
1280 1024, /* size of l2 cache. */
1281 128, /* size of prefetch block */
1282 8, /* number of parallel prefetches */
1283 1, /* Branch cost */
1284 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1285 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1286 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1287 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1288 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1289 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1290 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1291 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1292 {100000, unrolled_loop}, {-1, libcall}}}},
1293 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1294 {-1, libcall}}},
1295 {libcall, {{24, loop}, {64, unrolled_loop},
1296 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1297 1, /* scalar_stmt_cost. */
1298 1, /* scalar load_cost. */
1299 1, /* scalar_store_cost. */
1300 1, /* vec_stmt_cost. */
1301 1, /* vec_to_scalar_cost. */
1302 1, /* scalar_to_vec_cost. */
1303 1, /* vec_align_load_cost. */
1304 2, /* vec_unalign_load_cost. */
1305 1, /* vec_store_cost. */
1306 3, /* cond_taken_branch_cost. */
1307 1, /* cond_not_taken_branch_cost. */
1308 };
1310 static const
1311 struct processor_costs atom_cost = {
1312 COSTS_N_INSNS (1), /* cost of an add instruction */
1313 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1314 COSTS_N_INSNS (1), /* variable shift costs */
1315 COSTS_N_INSNS (1), /* constant shift costs */
1316 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1317 COSTS_N_INSNS (4), /* HI */
1318 COSTS_N_INSNS (3), /* SI */
1319 COSTS_N_INSNS (4), /* DI */
1320 COSTS_N_INSNS (2)}, /* other */
1321 0, /* cost of multiply per each bit set */
1322 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1323 COSTS_N_INSNS (26), /* HI */
1324 COSTS_N_INSNS (42), /* SI */
1325 COSTS_N_INSNS (74), /* DI */
1326 COSTS_N_INSNS (74)}, /* other */
1327 COSTS_N_INSNS (1), /* cost of movsx */
1328 COSTS_N_INSNS (1), /* cost of movzx */
1329 8, /* "large" insn */
1330 17, /* MOVE_RATIO */
1331 4, /* cost for loading QImode using movzbl */
1332 {4, 4, 4}, /* cost of loading integer registers
1333 in QImode, HImode and SImode.
1334 Relative to reg-reg move (2). */
1335 {4, 4, 4}, /* cost of storing integer registers */
1336 4, /* cost of reg,reg fld/fst */
1337 {12, 12, 12}, /* cost of loading fp registers
1338 in SFmode, DFmode and XFmode */
1339 {6, 6, 8}, /* cost of storing fp registers
1340 in SFmode, DFmode and XFmode */
1341 2, /* cost of moving MMX register */
1342 {8, 8}, /* cost of loading MMX registers
1343 in SImode and DImode */
1344 {8, 8}, /* cost of storing MMX registers
1345 in SImode and DImode */
1346 2, /* cost of moving SSE register */
1347 {8, 8, 8}, /* cost of loading SSE registers
1348 in SImode, DImode and TImode */
1349 {8, 8, 8}, /* cost of storing SSE registers
1350 in SImode, DImode and TImode */
1351 5, /* MMX or SSE register to integer */
1352 32, /* size of l1 cache. */
1353 256, /* size of l2 cache. */
1354 64, /* size of prefetch block */
1355 6, /* number of parallel prefetches */
1356 3, /* Branch cost */
1357 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1358 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1359 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1360 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1361 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1362 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1363 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1364 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1365 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1366 {{libcall, {{8, loop}, {15, unrolled_loop},
1367 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1368 {libcall, {{24, loop}, {32, unrolled_loop},
1369 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1370 1, /* scalar_stmt_cost. */
1371 1, /* scalar load_cost. */
1372 1, /* scalar_store_cost. */
1373 1, /* vec_stmt_cost. */
1374 1, /* vec_to_scalar_cost. */
1375 1, /* scalar_to_vec_cost. */
1376 1, /* vec_align_load_cost. */
1377 2, /* vec_unalign_load_cost. */
1378 1, /* vec_store_cost. */
1379 3, /* cond_taken_branch_cost. */
1380 1, /* cond_not_taken_branch_cost. */
1381 };
1383 /* Generic64 should produce code tuned for Nocona and K8. */
1384 static const
1385 struct processor_costs generic64_cost = {
1386 COSTS_N_INSNS (1), /* cost of an add instruction */
1387 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1388 that cost, however, our current implementation of synth_mult results in
1389 the use of unnecessary temporary registers, causing regressions on several
1390 SPECfp benchmarks (the worked arithmetic after this struct shows the compromise). */
1391 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1392 COSTS_N_INSNS (1), /* variable shift costs */
1393 COSTS_N_INSNS (1), /* constant shift costs */
1394 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1395 COSTS_N_INSNS (4), /* HI */
1396 COSTS_N_INSNS (3), /* SI */
1397 COSTS_N_INSNS (4), /* DI */
1398 COSTS_N_INSNS (2)}, /* other */
1399 0, /* cost of multiply per each bit set */
1400 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1401 COSTS_N_INSNS (26), /* HI */
1402 COSTS_N_INSNS (42), /* SI */
1403 COSTS_N_INSNS (74), /* DI */
1404 COSTS_N_INSNS (74)}, /* other */
1405 COSTS_N_INSNS (1), /* cost of movsx */
1406 COSTS_N_INSNS (1), /* cost of movzx */
1407 8, /* "large" insn */
1408 17, /* MOVE_RATIO */
1409 4, /* cost for loading QImode using movzbl */
1410 {4, 4, 4}, /* cost of loading integer registers
1411 in QImode, HImode and SImode.
1412 Relative to reg-reg move (2). */
1413 {4, 4, 4}, /* cost of storing integer registers */
1414 4, /* cost of reg,reg fld/fst */
1415 {12, 12, 12}, /* cost of loading fp registers
1416 in SFmode, DFmode and XFmode */
1417 {6, 6, 8}, /* cost of storing fp registers
1418 in SFmode, DFmode and XFmode */
1419 2, /* cost of moving MMX register */
1420 {8, 8}, /* cost of loading MMX registers
1421 in SImode and DImode */
1422 {8, 8}, /* cost of storing MMX registers
1423 in SImode and DImode */
1424 2, /* cost of moving SSE register */
1425 {8, 8, 8}, /* cost of loading SSE registers
1426 in SImode, DImode and TImode */
1427 {8, 8, 8}, /* cost of storing SSE registers
1428 in SImode, DImode and TImode */
1429 5, /* MMX or SSE register to integer */
1430 32, /* size of l1 cache. */
1431 512, /* size of l2 cache. */
1432 64, /* size of prefetch block */
1433 6, /* number of parallel prefetches */
1434   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1435      value is increased to the perhaps more appropriate value of 5.  */
1436 3, /* Branch cost */
1437 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1438 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1439 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1440 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1441 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1442 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1443 {DUMMY_STRINGOP_ALGS,
1444 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1445 {DUMMY_STRINGOP_ALGS,
1446 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1447 1, /* scalar_stmt_cost. */
1448 1, /* scalar load_cost. */
1449 1, /* scalar_store_cost. */
1450 1, /* vec_stmt_cost. */
1451 1, /* vec_to_scalar_cost. */
1452 1, /* scalar_to_vec_cost. */
1453 1, /* vec_align_load_cost. */
1454 2, /* vec_unalign_load_cost. */
1455 1, /* vec_store_cost. */
1456 3, /* cond_taken_branch_cost. */
1457 1, /* cond_not_taken_branch_cost. */
1460 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1461 Athlon and K8. */
1462 static const
1463 struct processor_costs generic32_cost = {
1464 COSTS_N_INSNS (1), /* cost of an add instruction */
1465 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1466 COSTS_N_INSNS (1), /* variable shift costs */
1467 COSTS_N_INSNS (1), /* constant shift costs */
1468 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1469 COSTS_N_INSNS (4), /* HI */
1470 COSTS_N_INSNS (3), /* SI */
1471 COSTS_N_INSNS (4), /* DI */
1472 COSTS_N_INSNS (2)}, /* other */
1473 0, /* cost of multiply per each bit set */
1474 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1475 COSTS_N_INSNS (26), /* HI */
1476 COSTS_N_INSNS (42), /* SI */
1477 COSTS_N_INSNS (74), /* DI */
1478 COSTS_N_INSNS (74)}, /* other */
1479 COSTS_N_INSNS (1), /* cost of movsx */
1480 COSTS_N_INSNS (1), /* cost of movzx */
1481 8, /* "large" insn */
1482 17, /* MOVE_RATIO */
1483 4, /* cost for loading QImode using movzbl */
1484 {4, 4, 4}, /* cost of loading integer registers
1485 in QImode, HImode and SImode.
1486 Relative to reg-reg move (2). */
1487 {4, 4, 4}, /* cost of storing integer registers */
1488 4, /* cost of reg,reg fld/fst */
1489 {12, 12, 12}, /* cost of loading fp registers
1490 in SFmode, DFmode and XFmode */
1491 {6, 6, 8}, /* cost of storing fp registers
1492 in SFmode, DFmode and XFmode */
1493 2, /* cost of moving MMX register */
1494 {8, 8}, /* cost of loading MMX registers
1495 in SImode and DImode */
1496 {8, 8}, /* cost of storing MMX registers
1497 in SImode and DImode */
1498 2, /* cost of moving SSE register */
1499 {8, 8, 8}, /* cost of loading SSE registers
1500 in SImode, DImode and TImode */
1501 {8, 8, 8}, /* cost of storing SSE registers
1502 in SImode, DImode and TImode */
1503 5, /* MMX or SSE register to integer */
1504 32, /* size of l1 cache. */
1505 256, /* size of l2 cache. */
1506 64, /* size of prefetch block */
1507 6, /* number of parallel prefetches */
1508 3, /* Branch cost */
1509 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1510 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1511 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1512 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1513 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1514 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1515 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1516 DUMMY_STRINGOP_ALGS},
1517 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1518 DUMMY_STRINGOP_ALGS},
1519 1, /* scalar_stmt_cost. */
1520 1, /* scalar load_cost. */
1521 1, /* scalar_store_cost. */
1522 1, /* vec_stmt_cost. */
1523 1, /* vec_to_scalar_cost. */
1524 1, /* scalar_to_vec_cost. */
1525 1, /* vec_align_load_cost. */
1526 2, /* vec_unalign_load_cost. */
1527 1, /* vec_store_cost. */
1528 3, /* cond_taken_branch_cost. */
1529 1, /* cond_not_taken_branch_cost. */
1532 /* Set by -mtune. */
1533 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1535 /* Set by -mtune or -Os. */
1536 const struct processor_costs *ix86_cost = &pentium_cost;
1538 /* Processor feature/optimization bitmasks. */
1539 #define m_386 (1<<PROCESSOR_I386)
1540 #define m_486 (1<<PROCESSOR_I486)
1541 #define m_PENT (1<<PROCESSOR_PENTIUM)
1542 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1543 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1544 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1545 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1546 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1547 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1548 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1549 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1550 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1551 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1552 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1553 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1554 #define m_ATOM (1<<PROCESSOR_ATOM)
1556 #define m_GEODE (1<<PROCESSOR_GEODE)
1557 #define m_K6 (1<<PROCESSOR_K6)
1558 #define m_K6_GEODE (m_K6 | m_GEODE)
1559 #define m_K8 (1<<PROCESSOR_K8)
1560 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1561 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1562 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1563 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1564 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1565 #define m_BDVER (m_BDVER1 | m_BDVER2)
1566 #define m_BTVER (m_BTVER1 | m_BTVER2)
1567 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1568 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1569 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1571 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1572 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1574 /* Generic instruction choice should be common subset of supported CPUs
1575 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1576 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
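/* Each m_* macro above is a one-bit mask keyed by a PROCESSOR_* value, so a
   tuning-table entry below is simply the set of processors a feature applies
   to.  The per-feature tables are later folded down for the CPU selected by
   -mtune, roughly along these lines (a sketch of the idea, not the exact
   code):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
*/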
1578 /* Feature tests against the various tunings. */
1579 unsigned char ix86_tune_features[X86_TUNE_LAST];
1581 /* Feature tests against the various tunings used to create ix86_tune_features
1582 based on the processor mask. */
1583 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1584 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1585      negatively, so enabling it for Generic64 seems like a good code size
1586 tradeoff. We can't enable it for 32bit generic because it does not
1587 work well with PPro base chips. */
1588 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1590 /* X86_TUNE_PUSH_MEMORY */
1591 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1593 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1594 m_486 | m_PENT,
1596 /* X86_TUNE_UNROLL_STRLEN */
1597 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1599 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1600 on simulation result. But after P4 was made, no performance benefit
1601 was observed with branch hints. It also increases the code size.
1602      As a result, icc never generates branch hints.  */
1603   0,
1605 /* X86_TUNE_DOUBLE_WITH_ADD */
1606 ~m_386,
1608 /* X86_TUNE_USE_SAHF */
1609 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1611 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1612 partial dependencies. */
1613 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1615 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1616 register stalls on Generic32 compilation setting as well. However
1617 in current implementation the partial register stalls are not eliminated
1618 very well - they can be introduced via subregs synthesized by combine
1619 and can happen in caller/callee saving sequences. Because this option
1620 pays back little on PPro based chips and is in conflict with partial reg
1621 dependencies used by Athlon/P4 based chips, it is better to leave it off
1622 for generic32 for now. */
1623 m_PPRO,
1625 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1626 m_CORE2I7 | m_GENERIC,
1628 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1629      on 16-bit immediate moves into memory on Core2 and Corei7.  */
1630 m_CORE2I7 | m_GENERIC,
1632 /* X86_TUNE_USE_HIMODE_FIOP */
1633 m_386 | m_486 | m_K6_GEODE,
1635 /* X86_TUNE_USE_SIMODE_FIOP */
1636 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1638 /* X86_TUNE_USE_MOV0 */
1639 m_K6,
1641 /* X86_TUNE_USE_CLTD */
1642 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1644 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1645 m_PENT4,
1647 /* X86_TUNE_SPLIT_LONG_MOVES */
1648 m_PPRO,
1650 /* X86_TUNE_READ_MODIFY_WRITE */
1651 ~m_PENT,
1653 /* X86_TUNE_READ_MODIFY */
1654 ~(m_PENT | m_PPRO),
1656 /* X86_TUNE_PROMOTE_QIMODE */
1657 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1659 /* X86_TUNE_FAST_PREFIX */
1660 ~(m_386 | m_486 | m_PENT),
1662 /* X86_TUNE_SINGLE_STRINGOP */
1663 m_386 | m_P4_NOCONA,
1665   /* X86_TUNE_QIMODE_MATH */
1666   ~0,
1668 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1669 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1670 might be considered for Generic32 if our scheme for avoiding partial
1671 stalls was more effective. */
1672 ~m_PPRO,
1674   /* X86_TUNE_PROMOTE_QI_REGS */
1675   0,
1677 /* X86_TUNE_PROMOTE_HI_REGS */
1678 m_PPRO,
1680 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1681 over esp addition. */
1682 m_386 | m_486 | m_PENT | m_PPRO,
1684 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1685 over esp addition. */
1686 m_PENT,
1688 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1689 over esp subtraction. */
1690 m_386 | m_486 | m_PENT | m_K6_GEODE,
1692 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1693 over esp subtraction. */
1694 m_PENT | m_K6_GEODE,
1696 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1697 for DFmode copies */
1698   ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1700 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1701 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1703   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1704      conflict between PPro/Pentium4 based chips that treat 128bit
1705      SSE registers as single units and K8 based chips that divide SSE
1706      registers into two 64bit halves.  This knob promotes all store
1707      destinations to be 128bit to allow register renaming on 128bit SSE units,
1708      but usually results in one extra microop on 64bit SSE units.
1709      Experimental results show that disabling this option on P4 brings over 20%
1710      SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
1711      that can be partly masked by careful scheduling of moves.  */
1712 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1714 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1715 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1717 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1718 m_COREI7 | m_BDVER,
1720 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1721   m_BDVER,
1723 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1724 are resolved on SSE register parts instead of whole registers, so we may
1725 maintain just lower part of scalar values in proper format leaving the
1726 upper part undefined. */
1727 m_ATHLON_K8,
1729 /* X86_TUNE_SSE_TYPELESS_STORES */
1730 m_AMD_MULTIPLE,
1732 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1733 m_PPRO | m_P4_NOCONA,
1735 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1736 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1738 /* X86_TUNE_PROLOGUE_USING_MOVE */
1739 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
1741 /* X86_TUNE_EPILOGUE_USING_MOVE */
1742 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
1744 /* X86_TUNE_SHIFT1 */
1745 ~m_486,
1747 /* X86_TUNE_USE_FFREEP */
1748 m_AMD_MULTIPLE,
1750 /* X86_TUNE_INTER_UNIT_MOVES */
1751 ~(m_AMD_MULTIPLE | m_GENERIC),
1753 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1754   ~(m_AMDFAM10 | m_BDVER),
1756 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1757 than 4 branch instructions in the 16 byte window. */
1758 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1760 /* X86_TUNE_SCHEDULE */
1761 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1763 /* X86_TUNE_USE_BT */
1764 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1766 /* X86_TUNE_USE_INCDEC */
1767 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
1769 /* X86_TUNE_PAD_RETURNS */
1770 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
1772   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
1773 m_ATOM,
1775 /* X86_TUNE_EXT_80387_CONSTANTS */
1776 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1778 /* X86_TUNE_SHORTEN_X87_SSE */
1779 ~m_K8,
1781 /* X86_TUNE_AVOID_VECTOR_DECODE */
1782 m_CORE2I7_64 | m_K8 | m_GENERIC64,
1784   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1785      and SImode multiply, but the 386 and 486 do HImode multiply faster.  */
1786 ~(m_386 | m_486),
1788 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1789 vector path on AMD machines. */
1790 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1792 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1793 machines. */
1794 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1796 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1797 than a MOV. */
1798 m_PENT,
1800 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1801 but one byte longer. */
1802 m_PENT,
1804 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1805 operand that cannot be represented using a modRM byte. The XOR
1806 replacement is long decoded, so this split helps here as well. */
1807 m_K6,
1809 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1810 from FP to FP. */
1811 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
1813 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1814 from integer to FP. */
1815 m_AMDFAM10,
1817 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1818 with a subsequent conditional jump instruction into a single
1819 compare-and-branch uop. */
1820 m_BDVER,
1822 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
1823 will impact LEA instruction selection. */
1824 m_ATOM,
1826 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
1827 instructions. */
1828 ~m_ATOM,
1830   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
1831 at -O3. For the moment, the prefetching seems badly tuned for Intel
1832 chips. */
1833 m_K6_GEODE | m_AMD_MULTIPLE,
1835 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
1836 the auto-vectorizer. */
1837 m_BDVER,
1839 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
1840 during reassociation of integer computation. */
1841 m_ATOM,
1843 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
1844 during reassociation of fp computation. */
1845 m_ATOM,
1847 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
1848 regs instead of memory. */
1849 m_COREI7 | m_CORE2I7
1852 /* Feature tests against the various architecture variations. */
1853 unsigned char ix86_arch_features[X86_ARCH_LAST];
1855 /* Feature tests against the various architecture variations, used to create
1856 ix86_arch_features based on the processor mask. */
1857 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1858 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1859 ~(m_386 | m_486 | m_PENT | m_K6),
1861 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1862 ~m_386,
1864 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1865 ~(m_386 | m_486),
1867 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1868 ~m_386,
1870 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1871 ~m_386,
1874 static const unsigned int x86_accumulate_outgoing_args
1875 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
1877 static const unsigned int x86_arch_always_fancy_math_387
1878 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
1880 static const unsigned int x86_avx256_split_unaligned_load
1881 = m_COREI7 | m_GENERIC;
1883 static const unsigned int x86_avx256_split_unaligned_store
1884 = m_COREI7 | m_BDVER | m_GENERIC;
1886 /* If the average insn count for a single function invocation is
1887 lower than this constant, emit fast (but longer) prologue and
1888 epilogue code. */
1889 #define FAST_PROLOGUE_INSN_COUNT 20
1891 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
1892 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1893 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1894 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1896 /* Array of the smallest class containing reg number REGNO, indexed by
1897 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1899 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1901 /* ax, dx, cx, bx */
1902 AREG, DREG, CREG, BREG,
1903 /* si, di, bp, sp */
1904 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1905 /* FP registers */
1906 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1907 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1908 /* arg pointer */
1909 NON_Q_REGS,
1910 /* flags, fpsr, fpcr, frame */
1911 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1912 /* SSE registers */
1913 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1914 SSE_REGS, SSE_REGS,
1915 /* MMX registers */
1916 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1917 MMX_REGS, MMX_REGS,
1918 /* REX registers */
1919 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1920 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1921 /* SSE REX registers */
1922 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1923 SSE_REGS, SSE_REGS,
1926 /* The "default" register map used in 32bit mode. */
1928 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1930 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1931 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1932 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1933 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1934 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1935 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1936 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1939 /* The "default" register map used in 64bit mode. */
1941 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1943 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1944 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1945 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1946 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1947 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1948 8,9,10,11,12,13,14,15, /* extended integer registers */
1949 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1952 /* Define the register numbers to be used in Dwarf debugging information.
1953 The SVR4 reference port C compiler uses the following register numbers
1954 in its Dwarf output code:
1955 0 for %eax (gcc regno = 0)
1956 1 for %ecx (gcc regno = 2)
1957 2 for %edx (gcc regno = 1)
1958 3 for %ebx (gcc regno = 3)
1959 4 for %esp (gcc regno = 7)
1960 5 for %ebp (gcc regno = 6)
1961 6 for %esi (gcc regno = 4)
1962 7 for %edi (gcc regno = 5)
1963 The following three DWARF register numbers are never generated by
1964 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1965 believes these numbers have these meanings.
1966 8 for %eip (no gcc equivalent)
1967 9 for %eflags (gcc regno = 17)
1968 10 for %trapno (no gcc equivalent)
1969 It is not at all clear how we should number the FP stack registers
1970 for the x86 architecture. If the version of SDB on x86/svr4 were
1971 a bit less brain dead with respect to floating-point then we would
1972 have a precedent to follow with respect to DWARF register numbers
1973 for x86 FP registers, but the SDB on x86/svr4 is so completely
1974 broken with respect to FP registers that it is hardly worth thinking
1975 of it as something to strive for compatibility with.
1976 The version of x86/svr4 SDB I have at the moment does (partially)
1977 seem to believe that DWARF register number 11 is associated with
1978 the x86 register %st(0), but that's about all. Higher DWARF
1979 register numbers don't seem to be associated with anything in
1980 particular, and even for DWARF regno 11, SDB only seems to under-
1981 stand that it should say that a variable lives in %st(0) (when
1982 asked via an `=' command) if we said it was in DWARF regno 11,
1983 but SDB still prints garbage when asked for the value of the
1984 variable in question (via a `/' command).
1985 (Also note that the labels SDB prints for various FP stack regs
1986 when doing an `x' command are all wrong.)
1987 Note that these problems generally don't affect the native SVR4
1988 C compiler because it doesn't allow the use of -O with -g and
1989 because when it is *not* optimizing, it allocates a memory
1990 location for each floating-point variable, and the memory
1991 location is what gets described in the DWARF AT_location
1992 attribute for the variable in question.
1993 Regardless of the severe mental illness of the x86/svr4 SDB, we
1994 do something sensible here and we use the following DWARF
1995 register numbers. Note that these are all stack-top-relative
1996 numbers.
1997 11 for %st(0) (gcc regno = 8)
1998 12 for %st(1) (gcc regno = 9)
1999 13 for %st(2) (gcc regno = 10)
2000 14 for %st(3) (gcc regno = 11)
2001 15 for %st(4) (gcc regno = 12)
2002 16 for %st(5) (gcc regno = 13)
2003 17 for %st(6) (gcc regno = 14)
2004 	18 for %st(7)		(gcc regno = 15)
2005 */
2006 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2008 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2009 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2010 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2011 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2012 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2013 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2014 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
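/* Cross-checking the table above against the numbering described in the
   comment: svr4_dbx_register_map[2] is 1 because gcc regno 2 is %ecx and its
   SVR4 DWARF number is 1, and the entry for gcc regno 17 (%eflags) is 9, as
   listed there.  */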
2017 /* Define parameter passing and return registers. */
2019 static int const x86_64_int_parameter_registers[6] =
2021 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2024 static int const x86_64_ms_abi_int_parameter_registers[4] =
2026 CX_REG, DX_REG, R8_REG, R9_REG
2029 static int const x86_64_int_return_registers[4] =
2031 AX_REG, DX_REG, DI_REG, SI_REG
2034 /* Define the structure for the machine field in struct function. */
2036 struct GTY(()) stack_local_entry {
2037 unsigned short mode;
2038 unsigned short n;
2039 rtx rtl;
2040 struct stack_local_entry *next;
2043 /* Structure describing stack frame layout.
2044 Stack grows downward:
2046 [arguments]
2047 <- ARG_POINTER
2048 saved pc
2050 saved static chain if ix86_static_chain_on_stack
2052 saved frame pointer if frame_pointer_needed
2053 <- HARD_FRAME_POINTER
2054 [saved regs]
2055 <- regs_save_offset
2056 [padding0]
2058 [saved SSE regs]
2059 <- sse_regs_save_offset
2060 [padding1] |
2061 | <- FRAME_POINTER
2062 [va_arg registers] |
2064 [frame] |
2066 [padding2] | = to_allocate
2067 						<- STACK_POINTER
2068   */
2069 struct ix86_frame
2071 int nsseregs;
2072 int nregs;
2073 int va_arg_size;
2074 int red_zone_size;
2075 int outgoing_arguments_size;
2077 /* The offsets relative to ARG_POINTER. */
2078 HOST_WIDE_INT frame_pointer_offset;
2079 HOST_WIDE_INT hard_frame_pointer_offset;
2080 HOST_WIDE_INT stack_pointer_offset;
2081 HOST_WIDE_INT hfp_save_offset;
2082 HOST_WIDE_INT reg_save_offset;
2083 HOST_WIDE_INT sse_reg_save_offset;
2085 /* When save_regs_using_mov is set, emit prologue using
2086 move instead of push instructions. */
2087 bool save_regs_using_mov;
2090 /* Which CPU we are scheduling for.  */
2091 enum attr_cpu ix86_schedule;
2093 /* Which CPU we are optimizing for.  */
2094 enum processor_type ix86_tune;
2096 /* Which instruction set architecture to use. */
2097 enum processor_type ix86_arch;
2099 /* True if processor has SSE prefetch instruction. */
2100 unsigned char x86_prefetch_sse;
2102 /* -mstackrealign option */
2103 static const char ix86_force_align_arg_pointer_string[]
2104 = "force_align_arg_pointer";
2106 static rtx (*ix86_gen_leave) (void);
2107 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2108 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2109 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2110 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2111 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2112 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2113 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2114 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2115 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2116 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2117 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2119 /* Preferred alignment for stack boundary in bits. */
2120 unsigned int ix86_preferred_stack_boundary;
2122 /* Alignment for incoming stack boundary in bits specified at
2123 command line. */
2124 static unsigned int ix86_user_incoming_stack_boundary;
2126 /* Default alignment for incoming stack boundary in bits. */
2127 static unsigned int ix86_default_incoming_stack_boundary;
2129 /* Alignment for incoming stack boundary in bits. */
2130 unsigned int ix86_incoming_stack_boundary;
2132 /* Calling abi specific va_list type nodes. */
2133 static GTY(()) tree sysv_va_list_type_node;
2134 static GTY(()) tree ms_va_list_type_node;
2136 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2137 char internal_label_prefix[16];
2138 int internal_label_prefix_len;
2140 /* Fence to use after loop using movnt. */
2141 tree x86_mfence;
2143 /* Register class used for passing a given 64bit part of the argument.
2144    These represent classes as documented by the psABI, with the exception
2145    of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2146    uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2148    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2149    whenever possible (the upper half does contain padding). */
2150 enum x86_64_reg_class
2152 X86_64_NO_CLASS,
2153 X86_64_INTEGER_CLASS,
2154 X86_64_INTEGERSI_CLASS,
2155 X86_64_SSE_CLASS,
2156 X86_64_SSESF_CLASS,
2157 X86_64_SSEDF_CLASS,
2158 X86_64_SSEUP_CLASS,
2159 X86_64_X87_CLASS,
2160 X86_64_X87UP_CLASS,
2161 X86_64_COMPLEX_X87_CLASS,
2162 X86_64_MEMORY_CLASS
2165 #define MAX_CLASSES 4
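/* For illustration: an argument is classified in 64bit chunks ("eightbytes"),
   at most MAX_CLASSES of them, each receiving one class from the enum above.
   A type such as

     struct s { long l; double d; };

   would typically classify as { X86_64_INTEGER_CLASS, X86_64_SSEDF_CLASS },
   i.e. the first half travels in a general-purpose register and the second
   in an SSE register, using a DFmode move per the SSEDF refinement mentioned
   above.  */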
2167 /* Table of constants used by fldpi, fldln2, etc.... */
2168 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2169 static bool ext_80387_constants_init = 0;
2172 static struct machine_function * ix86_init_machine_status (void);
2173 static rtx ix86_function_value (const_tree, const_tree, bool);
2174 static bool ix86_function_value_regno_p (const unsigned int);
2175 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2176 const_tree);
2177 static rtx ix86_static_chain (const_tree, bool);
2178 static int ix86_function_regparm (const_tree, const_tree);
2179 static void ix86_compute_frame_layout (struct ix86_frame *);
2180 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2181 rtx, rtx, int);
2182 static void ix86_add_new_builtins (HOST_WIDE_INT);
2183 static tree ix86_canonical_va_list_type (tree);
2184 static void predict_jump (int);
2185 static unsigned int split_stack_prologue_scratch_regno (void);
2186 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2188 enum ix86_function_specific_strings
2190 IX86_FUNCTION_SPECIFIC_ARCH,
2191 IX86_FUNCTION_SPECIFIC_TUNE,
2192 IX86_FUNCTION_SPECIFIC_MAX
2195 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2196 const char *, enum fpmath_unit, bool);
2197 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2198 static void ix86_function_specific_save (struct cl_target_option *);
2199 static void ix86_function_specific_restore (struct cl_target_option *);
2200 static void ix86_function_specific_print (FILE *, int,
2201 struct cl_target_option *);
2202 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2203 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2204 struct gcc_options *);
2205 static bool ix86_can_inline_p (tree, tree);
2206 static void ix86_set_current_function (tree);
2207 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2209 static enum calling_abi ix86_function_abi (const_tree);
2212 #ifndef SUBTARGET32_DEFAULT_CPU
2213 #define SUBTARGET32_DEFAULT_CPU "i386"
2214 #endif
2216 /* The svr4 ABI for the i386 says that records and unions are returned
2217 in memory. */
2218 #ifndef DEFAULT_PCC_STRUCT_RETURN
2219 #define DEFAULT_PCC_STRUCT_RETURN 1
2220 #endif
2222 /* Whether -mtune= or -march= were specified */
2223 static int ix86_tune_defaulted;
2224 static int ix86_arch_specified;
2226 /* Vectorization library interface and handlers. */
2227 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2229 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2230 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2232 /* Processor target table, indexed by processor number */
2233 struct ptt
2235 const struct processor_costs *cost; /* Processor costs */
2236 const int align_loop; /* Default alignments. */
2237 const int align_loop_max_skip;
2238 const int align_jump;
2239 const int align_jump_max_skip;
2240 const int align_func;
2243 static const struct ptt processor_target_table[PROCESSOR_max] =
2245 {&i386_cost, 4, 3, 4, 3, 4},
2246 {&i486_cost, 16, 15, 16, 15, 16},
2247 {&pentium_cost, 16, 7, 16, 7, 16},
2248 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2249 {&geode_cost, 0, 0, 0, 0, 0},
2250 {&k6_cost, 32, 7, 32, 7, 32},
2251 {&athlon_cost, 16, 7, 16, 7, 16},
2252 {&pentium4_cost, 0, 0, 0, 0, 0},
2253 {&k8_cost, 16, 7, 16, 7, 16},
2254 {&nocona_cost, 0, 0, 0, 0, 0},
2255 /* Core 2 32-bit. */
2256 {&generic32_cost, 16, 10, 16, 10, 16},
2257 /* Core 2 64-bit. */
2258 {&generic64_cost, 16, 10, 16, 10, 16},
2259 /* Core i7 32-bit. */
2260 {&generic32_cost, 16, 10, 16, 10, 16},
2261 /* Core i7 64-bit. */
2262 {&generic64_cost, 16, 10, 16, 10, 16},
2263 {&generic32_cost, 16, 7, 16, 7, 16},
2264 {&generic64_cost, 16, 10, 16, 10, 16},
2265 {&amdfam10_cost, 32, 24, 32, 7, 32},
2266 {&bdver1_cost, 32, 24, 32, 7, 32},
2267 {&bdver2_cost, 32, 24, 32, 7, 32},
2268 {&btver1_cost, 32, 24, 32, 7, 32},
2269 {&btver2_cost, 32, 24, 32, 7, 32},
2270 {&atom_cost, 16, 15, 16, 7, 16}
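/* Reading an entry above against struct ptt: for example

     {&k8_cost, 16, 7, 16, 7, 16}

   selects the K8 cost table and asks for 16-byte alignment of loops and
   jumps (padding with at most 7 bytes of skip) and 16-byte alignment of
   function entry points.  */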
2273 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2275 "generic",
2276 "i386",
2277 "i486",
2278 "pentium",
2279 "pentium-mmx",
2280 "pentiumpro",
2281 "pentium2",
2282 "pentium3",
2283 "pentium4",
2284 "pentium-m",
2285 "prescott",
2286 "nocona",
2287 "core2",
2288 "corei7",
2289 "atom",
2290 "geode",
2291 "k6",
2292 "k6-2",
2293 "k6-3",
2294 "athlon",
2295 "athlon-4",
2296 "k8",
2297 "amdfam10",
2298 "bdver1",
2299 "bdver2",
2300 "btver1",
2301 "btver2"
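/* This list is indexed by the TARGET_CPU_DEFAULT_* values; it is consulted
   further down (cpu_names[TARGET_CPU_DEFAULT]) when no -mtune or -march
   string was given on the command line.  */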
2304 /* Return true if a red-zone is in use. */
2306 static inline bool
2307 ix86_using_red_zone (void)
2309 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2312 /* Return a string that documents the current -m options. The caller is
2313 responsible for freeing the string. */
2315 static char *
2316 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2317 const char *tune, enum fpmath_unit fpmath,
2318 bool add_nl_p)
2320 struct ix86_target_opts
2322 const char *option; /* option string */
2323 HOST_WIDE_INT mask; /* isa mask options */
2326   /* This table is ordered so that options like -msse4.2 that imply
2327      preceding options are matched first.  */
2328 static struct ix86_target_opts isa_opts[] =
2330 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2331 { "-mfma", OPTION_MASK_ISA_FMA },
2332 { "-mxop", OPTION_MASK_ISA_XOP },
2333 { "-mlwp", OPTION_MASK_ISA_LWP },
2334 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2335 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2336 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2337 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2338 { "-msse3", OPTION_MASK_ISA_SSE3 },
2339 { "-msse2", OPTION_MASK_ISA_SSE2 },
2340 { "-msse", OPTION_MASK_ISA_SSE },
2341 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2342 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2343 { "-mmmx", OPTION_MASK_ISA_MMX },
2344 { "-mabm", OPTION_MASK_ISA_ABM },
2345 { "-mbmi", OPTION_MASK_ISA_BMI },
2346 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2347 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2348 { "-mhle", OPTION_MASK_ISA_HLE },
2349 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2350 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2351 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2352 { "-madx", OPTION_MASK_ISA_ADX },
2353 { "-mtbm", OPTION_MASK_ISA_TBM },
2354 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2355 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2356 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2357 { "-maes", OPTION_MASK_ISA_AES },
2358 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2359 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2360 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2361 { "-mf16c", OPTION_MASK_ISA_F16C },
2362 { "-mrtm", OPTION_MASK_ISA_RTM },
2363 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2364 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2367 /* Flag options. */
2368 static struct ix86_target_opts flag_opts[] =
2370 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2371 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2372 { "-m80387", MASK_80387 },
2373 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2374 { "-malign-double", MASK_ALIGN_DOUBLE },
2375 { "-mcld", MASK_CLD },
2376 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2377 { "-mieee-fp", MASK_IEEE_FP },
2378 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2379 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2380 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2381 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2382 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2383 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2384 { "-mno-red-zone", MASK_NO_RED_ZONE },
2385 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2386 { "-mrecip", MASK_RECIP },
2387 { "-mrtd", MASK_RTD },
2388 { "-msseregparm", MASK_SSEREGPARM },
2389 { "-mstack-arg-probe", MASK_STACK_PROBE },
2390 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2391 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2392 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2393 { "-mvzeroupper", MASK_VZEROUPPER },
2394 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2395 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2396 { "-mprefer-avx128", MASK_PREFER_AVX128},
2399 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2401 char isa_other[40];
2402 char target_other[40];
2403 unsigned num = 0;
2404 unsigned i, j;
2405 char *ret;
2406 char *ptr;
2407 size_t len;
2408 size_t line_len;
2409 size_t sep_len;
2410 const char *abi;
2412 memset (opts, '\0', sizeof (opts));
2414 /* Add -march= option. */
2415 if (arch)
2417 opts[num][0] = "-march=";
2418 opts[num++][1] = arch;
2421 /* Add -mtune= option. */
2422 if (tune)
2424 opts[num][0] = "-mtune=";
2425 opts[num++][1] = tune;
2428 /* Add -m32/-m64/-mx32. */
2429 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2431 if ((isa & OPTION_MASK_ABI_64) != 0)
2432 abi = "-m64";
2433 else
2434 abi = "-mx32";
2435 isa &= ~ (OPTION_MASK_ISA_64BIT
2436 | OPTION_MASK_ABI_64
2437 | OPTION_MASK_ABI_X32);
2439 else
2440 abi = "-m32";
2441 opts[num++][0] = abi;
2443 /* Pick out the options in isa options. */
2444 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2446 if ((isa & isa_opts[i].mask) != 0)
2448 opts[num++][0] = isa_opts[i].option;
2449 isa &= ~ isa_opts[i].mask;
2453 if (isa && add_nl_p)
2455 opts[num++][0] = isa_other;
2456 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2457 isa);
2460 /* Add flag options. */
2461 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2463 if ((flags & flag_opts[i].mask) != 0)
2465 opts[num++][0] = flag_opts[i].option;
2466 flags &= ~ flag_opts[i].mask;
2470 if (flags && add_nl_p)
2472 opts[num++][0] = target_other;
2473 sprintf (target_other, "(other flags: %#x)", flags);
2476 /* Add -fpmath= option. */
2477 if (fpmath)
2479 opts[num][0] = "-mfpmath=";
2480 switch ((int) fpmath)
2482 case FPMATH_387:
2483 opts[num++][1] = "387";
2484 break;
2486 case FPMATH_SSE:
2487 opts[num++][1] = "sse";
2488 break;
2490 case FPMATH_387 | FPMATH_SSE:
2491 opts[num++][1] = "sse+387";
2492 break;
2494 default:
2495 gcc_unreachable ();
2499 /* Any options? */
2500 if (num == 0)
2501 return NULL;
2503 gcc_assert (num < ARRAY_SIZE (opts));
2505 /* Size the string. */
2506 len = 0;
2507 sep_len = (add_nl_p) ? 3 : 1;
2508 for (i = 0; i < num; i++)
2510 len += sep_len;
2511 for (j = 0; j < 2; j++)
2512 if (opts[i][j])
2513 len += strlen (opts[i][j]);
2516 /* Build the string. */
2517 ret = ptr = (char *) xmalloc (len);
2518 line_len = 0;
2520 for (i = 0; i < num; i++)
2522 size_t len2[2];
2524 for (j = 0; j < 2; j++)
2525 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2527 if (i != 0)
2529 *ptr++ = ' ';
2530 line_len++;
2532 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2534 *ptr++ = '\\';
2535 *ptr++ = '\n';
2536 line_len = 0;
2540 for (j = 0; j < 2; j++)
2541 if (opts[i][j])
2543 memcpy (ptr, opts[i][j], len2[j]);
2544 ptr += len2[j];
2545 line_len += len2[j];
2549 *ptr = '\0';
2550 gcc_assert (ret + len >= ptr);
2552 return ret;
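/* As an illustration of the output format (the exact set of switches depends
   on which ISA and flag bits are set), a call such as

     ix86_target_string (ix86_isa_flags, target_flags,
			 "corei7", "generic", FPMATH_SSE, false);

   might produce a string along the lines of
   "-march=corei7 -mtune=generic -m64 -msse4.2 ... -mfpmath=sse".  */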
2555 /* Return true if profiling code should be emitted before the
2556    prologue; otherwise return false.
2557    Note: For x86 with "hotfix" it is sorried.  */
2558 static bool
2559 ix86_profile_before_prologue (void)
2561 return flag_fentry != 0;
2564 /* Function that is callable from the debugger to print the current
2565 options. */
2566 void
2567 ix86_debug_options (void)
2569 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2570 ix86_arch_string, ix86_tune_string,
2571 ix86_fpmath, true);
2573 if (opts)
2575 fprintf (stderr, "%s\n\n", opts);
2576 free (opts);
2578 else
2579 fputs ("<no options>\n\n", stderr);
2581 return;
2584 /* Override various settings based on options. If MAIN_ARGS_P, the
2585 options are from the command line, otherwise they are from
2586 attributes. */
2588 static void
2589 ix86_option_override_internal (bool main_args_p)
2591 int i;
2592 unsigned int ix86_arch_mask, ix86_tune_mask;
2593 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2594 const char *prefix;
2595 const char *suffix;
2596 const char *sw;
2598 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2599 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2600 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2601 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2602 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2603 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2604 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2605 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2606 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2607 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2608 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2609 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2610 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2611 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2612 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2613 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2614 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2615 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2616 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2617 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2618 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2619 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2620 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2621 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2622 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2623 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2624 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2625 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2626 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2627 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2628 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2629 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2630 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2631 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2632 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2633 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2634 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2635 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2636 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2637 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2639 /* If this reaches 64, we need to widen the struct pta flags below.  */
2641 static struct pta
2643 const char *const name; /* processor name or nickname. */
2644 const enum processor_type processor;
2645 const enum attr_cpu schedule;
2646 const unsigned HOST_WIDE_INT flags;
2648 const processor_alias_table[] =
2650 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2651 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2652 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2653 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2654 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2655 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2656 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2657 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2658 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2659 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2660 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2661 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2662 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2663 PTA_MMX | PTA_SSE | PTA_FXSR},
2664 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2665 PTA_MMX | PTA_SSE | PTA_FXSR},
2666 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2667 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2668 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2669       PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2670 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2671 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2672 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2673 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2674 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2675 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2676 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2677 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2678 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2679 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2680 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2681 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2682 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2683 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2684 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2685 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2686 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2687 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2688 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2689 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2690 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2691 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2692 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2693 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2694 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2695 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2696 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2697 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2698 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2699 | PTA_XSAVEOPT},
2700 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2701 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2702 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2703 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2704 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2705 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2706 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2707 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2708 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2709 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2710 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2711 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2712 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2713 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2714 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2715 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2716 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2717 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2718 {"x86-64", PROCESSOR_K8, CPU_K8,
2719 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2720 {"k8", PROCESSOR_K8, CPU_K8,
2721 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2722 | PTA_SSE2 | PTA_NO_SAHF},
2723 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2724 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2725 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2726 {"opteron", PROCESSOR_K8, CPU_K8,
2727 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2728 | PTA_SSE2 | PTA_NO_SAHF},
2729 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2730 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2731 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2732 {"athlon64", PROCESSOR_K8, CPU_K8,
2733 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2734 | PTA_SSE2 | PTA_NO_SAHF},
2735 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2736 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2737 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2738 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2739 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2740 | PTA_SSE2 | PTA_NO_SAHF},
2741 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2742 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2743 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2744 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2745 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2746 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2747 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2748 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2749 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2750 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2751 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2752 | PTA_XSAVEOPT},
2753 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2754 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2755 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2756 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2757 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2758 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2759 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2760 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2761       | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2762 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2763 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
2764 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2765       | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2766 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2767 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2768 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2770 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2771 PTA_HLE /* flags are only used for -march switch. */ },
2772 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2773 PTA_64BIT
2774 | PTA_HLE /* flags are only used for -march switch. */ },
2777 /* -mrecip options. */
2778 static struct
2780 const char *string; /* option name */
2781 unsigned int mask; /* mask bits to set */
2783 const recip_options[] =
2785 { "all", RECIP_MASK_ALL },
2786 { "none", RECIP_MASK_NONE },
2787 { "div", RECIP_MASK_DIV },
2788 { "sqrt", RECIP_MASK_SQRT },
2789 { "vec-div", RECIP_MASK_VEC_DIV },
2790 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
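/* These entries map the comma-separated sub-options of -mrecip= onto
   RECIP_MASK_* bits; e.g. -mrecip=vec-div,vec-sqrt would be expected to
   enable the vector divide and vector square-root reciprocal approximations
   (the parsing that applies them appears later in this function).  */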
2793 int const pta_size = ARRAY_SIZE (processor_alias_table);
2795 /* Set up prefix/suffix so the error messages refer to either the command
2796 line argument, or the attribute(target). */
2797 if (main_args_p)
2799 prefix = "-m";
2800 suffix = "";
2801 sw = "switch";
2803 else
2805 prefix = "option(\"";
2806 suffix = "\")";
2807 sw = "attribute";
2810 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
2811 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
2812 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
2813 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
2814 #ifdef TARGET_BI_ARCH
2815 else
2817 #if TARGET_BI_ARCH == 1
2818 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
2819 is on and OPTION_MASK_ABI_X32 is off. We turn off
2820 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
2821 -mx32. */
2822 if (TARGET_X32)
2823 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
2824 #else
2825 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
2826 on and OPTION_MASK_ABI_64 is off. We turn off
2827 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
2828 -m64. */
2829 if (TARGET_LP64)
2830 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
2831 #endif
2833 #endif
2835 if (TARGET_X32)
2837 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
2838 OPTION_MASK_ABI_64 for TARGET_X32. */
2839 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
2840 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
2842 else if (TARGET_LP64)
2844 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
2845 OPTION_MASK_ABI_X32 for TARGET_LP64. */
2846 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
2847 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
2850 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2851 SUBTARGET_OVERRIDE_OPTIONS;
2852 #endif
2854 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2855 SUBSUBTARGET_OVERRIDE_OPTIONS;
2856 #endif
2858 /* -fPIC is the default for x86_64. */
2859 if (TARGET_MACHO && TARGET_64BIT)
2860 flag_pic = 2;
2862 /* Need to check -mtune=generic first. */
2863 if (ix86_tune_string)
2865 if (!strcmp (ix86_tune_string, "generic")
2866 || !strcmp (ix86_tune_string, "i686")
2867 /* As special support for cross compilers we read -mtune=native
2868 as -mtune=generic. With native compilers we won't see the
2869 -mtune=native, as it was changed by the driver. */
2870 || !strcmp (ix86_tune_string, "native"))
2872 if (TARGET_64BIT)
2873 ix86_tune_string = "generic64";
2874 else
2875 ix86_tune_string = "generic32";
2877 /* If this call is for setting the option attribute, allow the
2878 generic32/generic64 that was previously set. */
2879 else if (!main_args_p
2880 && (!strcmp (ix86_tune_string, "generic32")
2881 || !strcmp (ix86_tune_string, "generic64")))
2883 else if (!strncmp (ix86_tune_string, "generic", 7))
2884 error ("bad value (%s) for %stune=%s %s",
2885 ix86_tune_string, prefix, suffix, sw);
2886 else if (!strcmp (ix86_tune_string, "x86-64"))
2887 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
2888 "%stune=k8%s or %stune=generic%s instead as appropriate",
2889 prefix, suffix, prefix, suffix, prefix, suffix);
2891 else
2893 if (ix86_arch_string)
2894 ix86_tune_string = ix86_arch_string;
2895 if (!ix86_tune_string)
2897 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
2898 ix86_tune_defaulted = 1;
2901 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
2902 need to use a sensible tune option. */
2903 if (!strcmp (ix86_tune_string, "generic")
2904 || !strcmp (ix86_tune_string, "x86-64")
2905 || !strcmp (ix86_tune_string, "i686"))
2907 if (TARGET_64BIT)
2908 ix86_tune_string = "generic64";
2909 else
2910 ix86_tune_string = "generic32";
2914 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
2916 /* rep; movq isn't available in 32-bit code. */
2917 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
2918 ix86_stringop_alg = no_stringop;
2921 if (!ix86_arch_string)
2922 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
2923 else
2924 ix86_arch_specified = 1;
2926 if (global_options_set.x_ix86_pmode)
2928 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
2929 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
2930 error ("address mode %qs not supported in the %s bit mode",
2931 TARGET_64BIT ? "short" : "long",
2932 TARGET_64BIT ? "64" : "32");
2934 else
2935 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
2937 if (!global_options_set.x_ix86_abi)
2938 ix86_abi = DEFAULT_ABI;
2940 if (global_options_set.x_ix86_cmodel)
2942 switch (ix86_cmodel)
2944 case CM_SMALL:
2945 case CM_SMALL_PIC:
2946 if (flag_pic)
2947 ix86_cmodel = CM_SMALL_PIC;
2948 if (!TARGET_64BIT)
2949 error ("code model %qs not supported in the %s bit mode",
2950 "small", "32");
2951 break;
2953 case CM_MEDIUM:
2954 case CM_MEDIUM_PIC:
2955 if (flag_pic)
2956 ix86_cmodel = CM_MEDIUM_PIC;
2957 if (!TARGET_64BIT)
2958 error ("code model %qs not supported in the %s bit mode",
2959 "medium", "32");
2960 else if (TARGET_X32)
2961 error ("code model %qs not supported in x32 mode",
2962 "medium");
2963 break;
2965 case CM_LARGE:
2966 case CM_LARGE_PIC:
2967 if (flag_pic)
2968 ix86_cmodel = CM_LARGE_PIC;
2969 if (!TARGET_64BIT)
2970 error ("code model %qs not supported in the %s bit mode",
2971 "large", "32");
2972 else if (TARGET_X32)
2973 error ("code model %qs not supported in x32 mode",
2974 "large");
2975 break;
2977 case CM_32:
2978 if (flag_pic)
2979 error ("code model %s does not support PIC mode", "32");
2980 if (TARGET_64BIT)
2981 error ("code model %qs not supported in the %s bit mode",
2982 "32", "64");
2983 break;
2985 case CM_KERNEL:
2986 if (flag_pic)
2988 error ("code model %s does not support PIC mode", "kernel");
2989 ix86_cmodel = CM_32;
2991 if (!TARGET_64BIT)
2992 error ("code model %qs not supported in the %s bit mode",
2993 "kernel", "32");
2994 break;
2996 default:
2997 gcc_unreachable ();
3000 else
3002 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3003 use of rip-relative addressing. This eliminates fixups that
3004 would otherwise be needed if this object is to be placed in a
3005 DLL, and is essentially just as efficient as direct addressing. */
3006 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3007 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3008 else if (TARGET_64BIT)
3009 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3010 else
3011 ix86_cmodel = CM_32;
3013 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3015 error ("-masm=intel not supported in this configuration");
3016 ix86_asm_dialect = ASM_ATT;
3018 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3019 sorry ("%i-bit mode not compiled in",
3020 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3022 for (i = 0; i < pta_size; i++)
3023 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3025 ix86_schedule = processor_alias_table[i].schedule;
3026 ix86_arch = processor_alias_table[i].processor;
3027 /* Default cpu tuning to the architecture. */
3028 ix86_tune = ix86_arch;
3030 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3031 error ("CPU you selected does not support x86-64 "
3032 "instruction set");
3034 if (processor_alias_table[i].flags & PTA_MMX
3035 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3036 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3037 if (processor_alias_table[i].flags & PTA_3DNOW
3038 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3039 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3040 if (processor_alias_table[i].flags & PTA_3DNOW_A
3041 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3042 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3043 if (processor_alias_table[i].flags & PTA_SSE
3044 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3045 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3046 if (processor_alias_table[i].flags & PTA_SSE2
3047 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3048 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3049 if (processor_alias_table[i].flags & PTA_SSE3
3050 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3051 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3052 if (processor_alias_table[i].flags & PTA_SSSE3
3053 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3054 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3055 if (processor_alias_table[i].flags & PTA_SSE4_1
3056 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3057 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3058 if (processor_alias_table[i].flags & PTA_SSE4_2
3059 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3060 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3061 if (processor_alias_table[i].flags & PTA_AVX
3062 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3063 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3064 if (processor_alias_table[i].flags & PTA_AVX2
3065 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3066 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3067 if (processor_alias_table[i].flags & PTA_FMA
3068 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3069 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3070 if (processor_alias_table[i].flags & PTA_SSE4A
3071 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3072 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3073 if (processor_alias_table[i].flags & PTA_FMA4
3074 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3075 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3076 if (processor_alias_table[i].flags & PTA_XOP
3077 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3078 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3079 if (processor_alias_table[i].flags & PTA_LWP
3080 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3081 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3082 if (processor_alias_table[i].flags & PTA_ABM
3083 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3084 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3085 if (processor_alias_table[i].flags & PTA_BMI
3086 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3087 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3088 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3089 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3090 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3091 if (processor_alias_table[i].flags & PTA_TBM
3092 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3093 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3094 if (processor_alias_table[i].flags & PTA_BMI2
3095 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3096 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3097 if (processor_alias_table[i].flags & PTA_CX16
3098 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3099 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3100 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3101 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3102 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3103 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3104 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3105 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3106 if (processor_alias_table[i].flags & PTA_MOVBE
3107 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3108 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3109 if (processor_alias_table[i].flags & PTA_AES
3110 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3111 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3112 if (processor_alias_table[i].flags & PTA_PCLMUL
3113 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3114 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3115 if (processor_alias_table[i].flags & PTA_FSGSBASE
3116 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3117 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3118 if (processor_alias_table[i].flags & PTA_RDRND
3119 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3120 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3121 if (processor_alias_table[i].flags & PTA_F16C
3122 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3123 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3124 if (processor_alias_table[i].flags & PTA_RTM
3125 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3126 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3127 if (processor_alias_table[i].flags & PTA_HLE
3128 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3129 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3130 if (processor_alias_table[i].flags & PTA_PRFCHW
3131 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3132 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3133 if (processor_alias_table[i].flags & PTA_RDSEED
3134 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3135 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3136 if (processor_alias_table[i].flags & PTA_ADX
3137 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3138 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3139 if (processor_alias_table[i].flags & PTA_FXSR
3140 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3141 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3142 if (processor_alias_table[i].flags & PTA_XSAVE
3143 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3144 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3145 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3146 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3147 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3148 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3149 x86_prefetch_sse = true;
3151 break;
3154 if (!strcmp (ix86_arch_string, "generic"))
3155 error ("generic CPU can be used only for %stune=%s %s",
3156 prefix, suffix, sw);
3157 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3158 error ("bad value (%s) for %sarch=%s %s",
3159 ix86_arch_string, prefix, suffix, sw);
3161 ix86_arch_mask = 1u << ix86_arch;
3162 for (i = 0; i < X86_ARCH_LAST; ++i)
3163 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3165 for (i = 0; i < pta_size; i++)
3166 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3168 ix86_schedule = processor_alias_table[i].schedule;
3169 ix86_tune = processor_alias_table[i].processor;
3170 if (TARGET_64BIT)
3172 if (!(processor_alias_table[i].flags & PTA_64BIT))
3174 if (ix86_tune_defaulted)
3176 ix86_tune_string = "x86-64";
3177 for (i = 0; i < pta_size; i++)
3178 if (! strcmp (ix86_tune_string,
3179 processor_alias_table[i].name))
3180 break;
3181 ix86_schedule = processor_alias_table[i].schedule;
3182 ix86_tune = processor_alias_table[i].processor;
3184 else
3185 error ("CPU you selected does not support x86-64 "
3186 "instruction set");
3189 else
3191 /* Adjust tuning when compiling for 32-bit ABI. */
3192 switch (ix86_tune)
3194 case PROCESSOR_GENERIC64:
3195 ix86_tune = PROCESSOR_GENERIC32;
3196 ix86_schedule = CPU_PENTIUMPRO;
3197 break;
3199 case PROCESSOR_CORE2_64:
3200 ix86_tune = PROCESSOR_CORE2_32;
3201 break;
3203 case PROCESSOR_COREI7_64:
3204 ix86_tune = PROCESSOR_COREI7_32;
3205 break;
3207 default:
3208 break;
3211 /* Intel CPUs have always interpreted SSE prefetch instructions as
3212 NOPs; so, we can enable SSE prefetch instructions even when
3213 -mtune (rather than -march) points us to a processor that has them.
3214 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3215 higher processors. */
3216 if (TARGET_CMOV
3217 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3218 x86_prefetch_sse = true;
3219 break;
3222 if (ix86_tune_specified && i == pta_size)
3223 error ("bad value (%s) for %stune=%s %s",
3224 ix86_tune_string, prefix, suffix, sw);
3226 ix86_tune_mask = 1u << ix86_tune;
3227 for (i = 0; i < X86_TUNE_LAST; ++i)
3228 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
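   /* The ix86_tune_features[] bits computed above presumably back the
      per-tuning TARGET_* convenience macros in i386.h, e.g. something like
      TARGET_USE_LEAVE expanding to ix86_tune_features[X86_TUNE_USE_LEAVE],
      so later code can simply test the macro instead of the table.  */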
3230 #ifndef USE_IX86_FRAME_POINTER
3231 #define USE_IX86_FRAME_POINTER 0
3232 #endif
3234 #ifndef USE_X86_64_FRAME_POINTER
3235 #define USE_X86_64_FRAME_POINTER 0
3236 #endif
3238 /* Set the default values for switches whose default depends on TARGET_64BIT
3239 in case they weren't overwritten by command line options. */
3240 if (TARGET_64BIT)
3242 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3243 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3244 if (flag_asynchronous_unwind_tables == 2)
3245 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3246 if (flag_pcc_struct_return == 2)
3247 flag_pcc_struct_return = 0;
3249 else
3251 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3252 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3253 if (flag_asynchronous_unwind_tables == 2)
3254 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3255 if (flag_pcc_struct_return == 2)
3256 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3259 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3260 if (optimize_size)
3261 ix86_cost = &ix86_size_cost;
3262 else
3263 ix86_cost = ix86_tune_cost;
3265 /* Arrange to set up i386_stack_locals for all functions. */
3266 init_machine_status = ix86_init_machine_status;
3268 /* Validate -mregparm= value. */
3269 if (global_options_set.x_ix86_regparm)
3271 if (TARGET_64BIT)
3272 warning (0, "-mregparm is ignored in 64-bit mode");
3273 if (ix86_regparm > REGPARM_MAX)
3275 error ("-mregparm=%d is not between 0 and %d",
3276 ix86_regparm, REGPARM_MAX);
3277 ix86_regparm = 0;
3280 if (TARGET_64BIT)
3281 ix86_regparm = REGPARM_MAX;
3283 /* Default align_* from the processor table. */
3284 if (align_loops == 0)
3286 align_loops = processor_target_table[ix86_tune].align_loop;
3287 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3289 if (align_jumps == 0)
3291 align_jumps = processor_target_table[ix86_tune].align_jump;
3292 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3294 if (align_functions == 0)
3296 align_functions = processor_target_table[ix86_tune].align_func;
3299 /* Provide default for -mbranch-cost= value. */
3300 if (!global_options_set.x_ix86_branch_cost)
3301 ix86_branch_cost = ix86_cost->branch_cost;
3303 if (TARGET_64BIT)
3305 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3307 /* Enable by default the SSE and MMX builtins. Do allow the user to
3308 explicitly disable any of these. In particular, disabling SSE and
3309 MMX for kernel code is extremely useful. */
3310 if (!ix86_arch_specified)
3311 ix86_isa_flags
3312 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3313 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3315 if (TARGET_RTD)
3316 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3318 else
3320 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3322 if (!ix86_arch_specified)
3323 ix86_isa_flags
3324 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3326 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3327 when the programmer takes care to keep the stack from being destroyed. */
3328 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3329 target_flags |= MASK_NO_RED_ZONE;
3332 /* Keep nonleaf frame pointers. */
3333 if (flag_omit_frame_pointer)
3334 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3335 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3336 flag_omit_frame_pointer = 1;
3338 /* If we're doing fast math, we don't care about comparison order
3339 wrt NaNs. This lets us use a shorter comparison sequence. */
3340 if (flag_finite_math_only)
3341 target_flags &= ~MASK_IEEE_FP;
3343 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3344 since the insns won't need emulation. */
3345 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3346 target_flags &= ~MASK_NO_FANCY_MATH_387;
3348 /* Likewise, if the target doesn't have a 387, or we've specified
3349 software floating point, don't use 387 inline intrinsics. */
3350 if (!TARGET_80387)
3351 target_flags |= MASK_NO_FANCY_MATH_387;
3353 /* Turn on MMX builtins for -msse. */
3354 if (TARGET_SSE)
3355 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3357 /* Enable SSE prefetch. */
3358 if (TARGET_SSE || TARGET_PRFCHW)
3359 x86_prefetch_sse = true;
3361 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3362 if (TARGET_SSE4_2 || TARGET_ABM)
3363 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3365 /* Turn on lzcnt instruction for -mabm. */
3366 if (TARGET_ABM)
3367 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3369 /* Validate -mpreferred-stack-boundary= value or default it to
3370 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3371 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3372 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3374 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3375 int max = (TARGET_SEH ? 4 : 12);
3377 if (ix86_preferred_stack_boundary_arg < min
3378 || ix86_preferred_stack_boundary_arg > max)
3380 if (min == max)
3381 error ("-mpreferred-stack-boundary is not supported "
3382 "for this target");
3383 else
3384 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3385 ix86_preferred_stack_boundary_arg, min, max);
3387 else
3388 ix86_preferred_stack_boundary
3389 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
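   /* The argument is the log2 of the boundary in bytes, so e.g.
      -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
      = 16 * 8 = 128 bits, i.e. a 16-byte aligned stack.  */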
3392 /* Set the default value for -mstackrealign. */
3393 if (ix86_force_align_arg_pointer == -1)
3394 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3396 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3398 /* Validate -mincoming-stack-boundary= value or default it to
3399 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3400 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3401 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3403 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3404 || ix86_incoming_stack_boundary_arg > 12)
3405 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3406 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3407 else
3409 ix86_user_incoming_stack_boundary
3410 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3411 ix86_incoming_stack_boundary
3412 = ix86_user_incoming_stack_boundary;
3416 /* Accept -msseregparm only if at least SSE support is enabled. */
3417 if (TARGET_SSEREGPARM
3418 && ! TARGET_SSE)
3419 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3421 if (global_options_set.x_ix86_fpmath)
3423 if (ix86_fpmath & FPMATH_SSE)
3425 if (!TARGET_SSE)
3427 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3428 ix86_fpmath = FPMATH_387;
3430 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3432 warning (0, "387 instruction set disabled, using SSE arithmetics");
3433 ix86_fpmath = FPMATH_SSE;
3437 else
3438 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3440 /* If the i387 is disabled, then do not return values in it. */
3441 if (!TARGET_80387)
3442 target_flags &= ~MASK_FLOAT_RETURNS;
3444 /* Use external vectorized library in vectorizing intrinsics. */
3445 if (global_options_set.x_ix86_veclibabi_type)
3446 switch (ix86_veclibabi_type)
3448 case ix86_veclibabi_type_svml:
3449 ix86_veclib_handler = ix86_veclibabi_svml;
3450 break;
3452 case ix86_veclibabi_type_acml:
3453 ix86_veclib_handler = ix86_veclibabi_acml;
3454 break;
3456 default:
3457 gcc_unreachable ();
3460 if ((!USE_IX86_FRAME_POINTER
3461 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3462 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3463 && !optimize_size)
3464 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3466 /* ??? Unwind info is not correct around the CFG unless either a frame
3467 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3468 unwind info generation to be aware of the CFG and propagating states
3469 around edges. */
3470 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3471 || flag_exceptions || flag_non_call_exceptions)
3472 && flag_omit_frame_pointer
3473 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3475 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3476 warning (0, "unwind tables currently require either a frame pointer "
3477 "or %saccumulate-outgoing-args%s for correctness",
3478 prefix, suffix);
3479 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3482 /* If stack probes are required, the space used for large function
3483 arguments on the stack must also be probed, so enable
3484 -maccumulate-outgoing-args so this happens in the prologue. */
3485 if (TARGET_STACK_PROBE
3486 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3488 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3489 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3490 "for correctness", prefix, suffix);
3491 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3494 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3496 char *p;
3497 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3498 p = strchr (internal_label_prefix, 'X');
3499 internal_label_prefix_len = p - internal_label_prefix;
3500 *p = '\0';
3503 /* When the scheduling description is not available, disable the scheduler
3504 pass so it won't slow down the compilation and make x87 code slower. */
3505 if (!TARGET_SCHEDULE)
3506 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3508 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3509 ix86_tune_cost->simultaneous_prefetches,
3510 global_options.x_param_values,
3511 global_options_set.x_param_values);
3512 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3513 ix86_tune_cost->prefetch_block,
3514 global_options.x_param_values,
3515 global_options_set.x_param_values);
3516 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3517 ix86_tune_cost->l1_cache_size,
3518 global_options.x_param_values,
3519 global_options_set.x_param_values);
3520 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3521 ix86_tune_cost->l2_cache_size,
3522 global_options.x_param_values,
3523 global_options_set.x_param_values);
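   /* maybe_set_param_value installs these tuning-derived defaults only when
      the user has not already set the corresponding --param on the command
      line (that is what global_options_set conveys), so e.g. an explicit
      --param l2-cache-size=512 presumably still wins over the CPU table.  */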
3525 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3526 if (flag_prefetch_loop_arrays < 0
3527 && HAVE_prefetch
3528 && (optimize >= 3 || flag_profile_use)
3529 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3530 flag_prefetch_loop_arrays = 1;
3532 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3533 can be optimized to ap = __builtin_next_arg (0). */
3534 if (!TARGET_64BIT && !flag_split_stack)
3535 targetm.expand_builtin_va_start = NULL;
3537 if (TARGET_64BIT)
3539 ix86_gen_leave = gen_leave_rex64;
3540 if (Pmode == DImode)
3542 ix86_gen_monitor = gen_sse3_monitor64_di;
3543 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3544 ix86_gen_tls_local_dynamic_base_64
3545 = gen_tls_local_dynamic_base_64_di;
3547 else
3549 ix86_gen_monitor = gen_sse3_monitor64_si;
3550 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3551 ix86_gen_tls_local_dynamic_base_64
3552 = gen_tls_local_dynamic_base_64_si;
3555 else
3557 ix86_gen_leave = gen_leave;
3558 ix86_gen_monitor = gen_sse3_monitor;
3561 if (Pmode == DImode)
3563 ix86_gen_add3 = gen_adddi3;
3564 ix86_gen_sub3 = gen_subdi3;
3565 ix86_gen_sub3_carry = gen_subdi3_carry;
3566 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3567 ix86_gen_andsp = gen_anddi3;
3568 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3569 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3570 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3572 else
3574 ix86_gen_add3 = gen_addsi3;
3575 ix86_gen_sub3 = gen_subsi3;
3576 ix86_gen_sub3_carry = gen_subsi3_carry;
3577 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3578 ix86_gen_andsp = gen_andsi3;
3579 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3580 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3581 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3584 #ifdef USE_IX86_CLD
3585 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3586 if (!TARGET_64BIT)
3587 target_flags |= MASK_CLD & ~target_flags_explicit;
3588 #endif
3590 if (!TARGET_64BIT && flag_pic)
3592 if (flag_fentry > 0)
3593 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3594 "with -fpic");
3595 flag_fentry = 0;
3597 else if (TARGET_SEH)
3599 if (flag_fentry == 0)
3600 sorry ("-mno-fentry isn%'t compatible with SEH");
3601 flag_fentry = 1;
3603 else if (flag_fentry < 0)
3605 #if defined(PROFILE_BEFORE_PROLOGUE)
3606 flag_fentry = 1;
3607 #else
3608 flag_fentry = 0;
3609 #endif
3612 if (TARGET_AVX)
3614 /* When not optimizing for size, enable vzeroupper optimization for
3615 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3616 AVX unaligned load/store. */
3617 if (!optimize_size)
3619 if (flag_expensive_optimizations
3620 && !(target_flags_explicit & MASK_VZEROUPPER))
3621 target_flags |= MASK_VZEROUPPER;
3622 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3623 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3624 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3625 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3626 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3627 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3628 /* Enable 128-bit AVX instruction generation
3629 for the auto-vectorizer. */
3630 if (TARGET_AVX128_OPTIMAL
3631 && !(target_flags_explicit & MASK_PREFER_AVX128))
3632 target_flags |= MASK_PREFER_AVX128;
3635 else
3637 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3638 target_flags &= ~MASK_VZEROUPPER;
3641 if (ix86_recip_name)
3643 char *p = ASTRDUP (ix86_recip_name);
3644 char *q;
3645 unsigned int mask, i;
3646 bool invert;
3648 while ((q = strtok (p, ",")) != NULL)
3650 p = NULL;
3651 if (*q == '!')
3653 invert = true;
3654 q++;
3656 else
3657 invert = false;
3659 if (!strcmp (q, "default"))
3660 mask = RECIP_MASK_ALL;
3661 else
3663 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3664 if (!strcmp (q, recip_options[i].string))
3666 mask = recip_options[i].mask;
3667 break;
3670 if (i == ARRAY_SIZE (recip_options))
3672 error ("unknown option for -mrecip=%s", q);
3673 invert = false;
3674 mask = RECIP_MASK_NONE;
3678 recip_mask_explicit |= mask;
3679 if (invert)
3680 recip_mask &= ~mask;
3681 else
3682 recip_mask |= mask;
3686 if (TARGET_RECIP)
3687 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3688 else if (target_flags_explicit & MASK_RECIP)
3689 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
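   /* Example of the parsing above (the keyword "div" is an assumption about
      the recip_options[] table defined earlier in this file):
      -mrecip=default,!div first ORs in RECIP_MASK_ALL for "default" and then
      clears the bit(s) for "div" because of the '!' prefix; an unrecognized
      keyword is reported with "unknown option for -mrecip=".  */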
3691 /* Default long double to 64-bit for Bionic. */
3692 if (TARGET_HAS_BIONIC
3693 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3694 target_flags |= MASK_LONG_DOUBLE_64;
3696 /* Save the initial options in case the user uses function-specific
3697 options. */
3698 if (main_args_p)
3699 target_option_default_node = target_option_current_node
3700 = build_target_option_node ();
3703 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3705 static void
3706 ix86_option_override (void)
3708 ix86_option_override_internal (true);
3711 /* Update register usage after having seen the compiler flags. */
3713 static void
3714 ix86_conditional_register_usage (void)
3716 int i, c_mask;
3717 unsigned int j;
3719 /* The PIC register, if it exists, is fixed. */
3720 j = PIC_OFFSET_TABLE_REGNUM;
3721 if (j != INVALID_REGNUM)
3722 fixed_regs[j] = call_used_regs[j] = 1;
3724 /* For 32-bit targets, squash the REX registers. */
3725 if (! TARGET_64BIT)
3727 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3728 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3729 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3730 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3733 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3734 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3735 : TARGET_64BIT ? (1 << 2)
3736 : (1 << 1));
3738 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3740 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3742 /* Set/reset conditionally defined registers from
3743 CALL_USED_REGISTERS initializer. */
3744 if (call_used_regs[i] > 1)
3745 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3747 /* Build the CLOBBERED_REGS register set from the call-used
3748 registers of the GENERAL_REGS register set. */
3749 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3750 && call_used_regs[i])
3751 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3754 /* If MMX is disabled, squash the registers. */
3755 if (! TARGET_MMX)
3756 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3757 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3758 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3760 /* If SSE is disabled, squash the registers. */
3761 if (! TARGET_SSE)
3762 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3763 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3764 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3766 /* If the FPU is disabled, squash the registers. */
3767 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3768 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3769 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3770 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
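   /* Net effect: in 32-bit mode the REX integer registers (r8-r15) and
      xmm8-xmm15 are hidden above, and any register whose CALL_USED_REGISTERS
      entry is greater than 1 is treated as call-clobbered only when the ABI
      bit selected by c_mask says so.  */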
3774 /* Save the current options */
3776 static void
3777 ix86_function_specific_save (struct cl_target_option *ptr)
3779 ptr->arch = ix86_arch;
3780 ptr->schedule = ix86_schedule;
3781 ptr->tune = ix86_tune;
3782 ptr->branch_cost = ix86_branch_cost;
3783 ptr->tune_defaulted = ix86_tune_defaulted;
3784 ptr->arch_specified = ix86_arch_specified;
3785 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
3786 ptr->ix86_target_flags_explicit = target_flags_explicit;
3787 ptr->x_recip_mask_explicit = recip_mask_explicit;
3789 /* The fields are char but the variables are not; make sure the
3790 values fit in the fields. */
3791 gcc_assert (ptr->arch == ix86_arch);
3792 gcc_assert (ptr->schedule == ix86_schedule);
3793 gcc_assert (ptr->tune == ix86_tune);
3794 gcc_assert (ptr->branch_cost == ix86_branch_cost);
3797 /* Restore the current options */
3799 static void
3800 ix86_function_specific_restore (struct cl_target_option *ptr)
3802 enum processor_type old_tune = ix86_tune;
3803 enum processor_type old_arch = ix86_arch;
3804 unsigned int ix86_arch_mask, ix86_tune_mask;
3805 int i;
3807 ix86_arch = (enum processor_type) ptr->arch;
3808 ix86_schedule = (enum attr_cpu) ptr->schedule;
3809 ix86_tune = (enum processor_type) ptr->tune;
3810 ix86_branch_cost = ptr->branch_cost;
3811 ix86_tune_defaulted = ptr->tune_defaulted;
3812 ix86_arch_specified = ptr->arch_specified;
3813 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
3814 target_flags_explicit = ptr->ix86_target_flags_explicit;
3815 recip_mask_explicit = ptr->x_recip_mask_explicit;
3817 /* Recreate the arch feature tests if the arch changed */
3818 if (old_arch != ix86_arch)
3820 ix86_arch_mask = 1u << ix86_arch;
3821 for (i = 0; i < X86_ARCH_LAST; ++i)
3822 ix86_arch_features[i]
3823 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3826 /* Recreate the tune optimization tests */
3827 if (old_tune != ix86_tune)
3829 ix86_tune_mask = 1u << ix86_tune;
3830 for (i = 0; i < X86_TUNE_LAST; ++i)
3831 ix86_tune_features[i]
3832 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3836 /* Print the current options */
3838 static void
3839 ix86_function_specific_print (FILE *file, int indent,
3840 struct cl_target_option *ptr)
3842 char *target_string
3843 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
3844 NULL, NULL, ptr->x_ix86_fpmath, false);
3846 fprintf (file, "%*sarch = %d (%s)\n",
3847 indent, "",
3848 ptr->arch,
3849 ((ptr->arch < TARGET_CPU_DEFAULT_max)
3850 ? cpu_names[ptr->arch]
3851 : "<unknown>"));
3853 fprintf (file, "%*stune = %d (%s)\n",
3854 indent, "",
3855 ptr->tune,
3856 ((ptr->tune < TARGET_CPU_DEFAULT_max)
3857 ? cpu_names[ptr->tune]
3858 : "<unknown>"));
3860 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
3862 if (target_string)
3864 fprintf (file, "%*s%s\n", indent, "", target_string);
3865 free (target_string);
3870 /* Inner function to process the attribute((target(...))): take an argument and
3871 set the current options from it. If we have a list, recursively go
3872 over the list. */
3874 static bool
3875 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
3876 struct gcc_options *enum_opts_set)
3878 char *next_optstr;
3879 bool ret = true;
3881 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
3882 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
3883 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
3884 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
3885 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
3887 enum ix86_opt_type
3889 ix86_opt_unknown,
3890 ix86_opt_yes,
3891 ix86_opt_no,
3892 ix86_opt_str,
3893 ix86_opt_enum,
3894 ix86_opt_isa
3897 static const struct
3899 const char *string;
3900 size_t len;
3901 enum ix86_opt_type type;
3902 int opt;
3903 int mask;
3904 } attrs[] = {
3905 /* isa options */
3906 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
3907 IX86_ATTR_ISA ("abm", OPT_mabm),
3908 IX86_ATTR_ISA ("bmi", OPT_mbmi),
3909 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
3910 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
3911 IX86_ATTR_ISA ("tbm", OPT_mtbm),
3912 IX86_ATTR_ISA ("aes", OPT_maes),
3913 IX86_ATTR_ISA ("avx", OPT_mavx),
3914 IX86_ATTR_ISA ("avx2", OPT_mavx2),
3915 IX86_ATTR_ISA ("mmx", OPT_mmmx),
3916 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
3917 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
3918 IX86_ATTR_ISA ("sse", OPT_msse),
3919 IX86_ATTR_ISA ("sse2", OPT_msse2),
3920 IX86_ATTR_ISA ("sse3", OPT_msse3),
3921 IX86_ATTR_ISA ("sse4", OPT_msse4),
3922 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
3923 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
3924 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
3925 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
3926 IX86_ATTR_ISA ("fma4", OPT_mfma4),
3927 IX86_ATTR_ISA ("fma", OPT_mfma),
3928 IX86_ATTR_ISA ("xop", OPT_mxop),
3929 IX86_ATTR_ISA ("lwp", OPT_mlwp),
3930 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
3931 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
3932 IX86_ATTR_ISA ("f16c", OPT_mf16c),
3933 IX86_ATTR_ISA ("rtm", OPT_mrtm),
3934 IX86_ATTR_ISA ("hle", OPT_mhle),
3935 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
3936 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
3937 IX86_ATTR_ISA ("adx", OPT_madx),
3938 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
3939 IX86_ATTR_ISA ("xsave", OPT_mxsave),
3940 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
3942 /* enum options */
3943 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
3945 /* string options */
3946 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
3947 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
3949 /* flag options */
3950 IX86_ATTR_YES ("cld",
3951 OPT_mcld,
3952 MASK_CLD),
3954 IX86_ATTR_NO ("fancy-math-387",
3955 OPT_mfancy_math_387,
3956 MASK_NO_FANCY_MATH_387),
3958 IX86_ATTR_YES ("ieee-fp",
3959 OPT_mieee_fp,
3960 MASK_IEEE_FP),
3962 IX86_ATTR_YES ("inline-all-stringops",
3963 OPT_minline_all_stringops,
3964 MASK_INLINE_ALL_STRINGOPS),
3966 IX86_ATTR_YES ("inline-stringops-dynamically",
3967 OPT_minline_stringops_dynamically,
3968 MASK_INLINE_STRINGOPS_DYNAMICALLY),
3970 IX86_ATTR_NO ("align-stringops",
3971 OPT_mno_align_stringops,
3972 MASK_NO_ALIGN_STRINGOPS),
3974 IX86_ATTR_YES ("recip",
3975 OPT_mrecip,
3976 MASK_RECIP),
3980 /* If this is a list, recurse to get the options. */
3981 if (TREE_CODE (args) == TREE_LIST)
3983 bool ret = true;
3985 for (; args; args = TREE_CHAIN (args))
3986 if (TREE_VALUE (args)
3987 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
3988 p_strings, enum_opts_set))
3989 ret = false;
3991 return ret;
3994 else if (TREE_CODE (args) != STRING_CST)
3995 gcc_unreachable ();
3997 /* Handle multiple arguments separated by commas. */
3998 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4000 while (next_optstr && *next_optstr != '\0')
4002 char *p = next_optstr;
4003 char *orig_p = p;
4004 char *comma = strchr (next_optstr, ',');
4005 const char *opt_string;
4006 size_t len, opt_len;
4007 int opt;
4008 bool opt_set_p;
4009 char ch;
4010 unsigned i;
4011 enum ix86_opt_type type = ix86_opt_unknown;
4012 int mask = 0;
4014 if (comma)
4016 *comma = '\0';
4017 len = comma - next_optstr;
4018 next_optstr = comma + 1;
4020 else
4022 len = strlen (p);
4023 next_optstr = NULL;
4026 /* Recognize no-xxx. */
4027 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4029 opt_set_p = false;
4030 p += 3;
4031 len -= 3;
4033 else
4034 opt_set_p = true;
4036 /* Find the option. */
4037 ch = *p;
4038 opt = N_OPTS;
4039 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4041 type = attrs[i].type;
4042 opt_len = attrs[i].len;
4043 if (ch == attrs[i].string[0]
4044 && ((type != ix86_opt_str && type != ix86_opt_enum)
4045 ? len == opt_len
4046 : len > opt_len)
4047 && memcmp (p, attrs[i].string, opt_len) == 0)
4049 opt = attrs[i].opt;
4050 mask = attrs[i].mask;
4051 opt_string = attrs[i].string;
4052 break;
4056 /* Process the option. */
4057 if (opt == N_OPTS)
4059 error ("attribute(target(\"%s\")) is unknown", orig_p);
4060 ret = false;
4063 else if (type == ix86_opt_isa)
4065 struct cl_decoded_option decoded;
4067 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4068 ix86_handle_option (&global_options, &global_options_set,
4069 &decoded, input_location);
4072 else if (type == ix86_opt_yes || type == ix86_opt_no)
4074 if (type == ix86_opt_no)
4075 opt_set_p = !opt_set_p;
4077 if (opt_set_p)
4078 target_flags |= mask;
4079 else
4080 target_flags &= ~mask;
4083 else if (type == ix86_opt_str)
4085 if (p_strings[opt])
4087 error ("option(\"%s\") was already specified", opt_string);
4088 ret = false;
4090 else
4091 p_strings[opt] = xstrdup (p + opt_len);
4094 else if (type == ix86_opt_enum)
4096 bool arg_ok;
4097 int value;
4099 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4100 if (arg_ok)
4101 set_option (&global_options, enum_opts_set, opt, value,
4102 p + opt_len, DK_UNSPECIFIED, input_location,
4103 global_dc);
4104 else
4106 error ("attribute(target(\"%s\")) is unknown", orig_p);
4107 ret = false;
4111 else
4112 gcc_unreachable ();
4115 return ret;
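/* A usage sketch of what the parser above accepts (the concrete values
   "core2" and "sse" are assumptions; the keywords come from the table above):

     __attribute__((target("sse4.2,arch=core2,fpmath=sse")))
     int dot (const int *a, const int *b, int n);

   "sse4.2" goes through the ix86_opt_isa path and ix86_handle_option,
   "arch=core2" is stored as a string option for a later re-run of the
   override code, and "fpmath=sse" is handled as an enum option.  A "no-"
   prefix (e.g. "no-sse4.2") flips opt_set_p as recognized above.  */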
4118 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4120 tree
4121 ix86_valid_target_attribute_tree (tree args)
4123 const char *orig_arch_string = ix86_arch_string;
4124 const char *orig_tune_string = ix86_tune_string;
4125 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4126 int orig_tune_defaulted = ix86_tune_defaulted;
4127 int orig_arch_specified = ix86_arch_specified;
4128 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4129 tree t = NULL_TREE;
4130 int i;
4131 struct cl_target_option *def
4132 = TREE_TARGET_OPTION (target_option_default_node);
4133 struct gcc_options enum_opts_set;
4135 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4137 /* Process each of the options on the chain. */
4138 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4139 &enum_opts_set))
4140 return NULL_TREE;
4142 /* If the changed options are different from the default, rerun
4143 ix86_option_override_internal, and then save the options away.
4144 The string options are attribute options, and will be undone
4145 when we copy the save structure. */
4146 if (ix86_isa_flags != def->x_ix86_isa_flags
4147 || target_flags != def->x_target_flags
4148 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4149 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4150 || enum_opts_set.x_ix86_fpmath)
4152 /* If we are using the default tune= or arch=, undo the string assigned,
4153 and use the default. */
4154 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4155 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4156 else if (!orig_arch_specified)
4157 ix86_arch_string = NULL;
4159 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4160 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4161 else if (orig_tune_defaulted)
4162 ix86_tune_string = NULL;
4164 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4165 if (enum_opts_set.x_ix86_fpmath)
4166 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4167 else if (!TARGET_64BIT && TARGET_SSE)
4169 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4170 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4173 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4174 ix86_option_override_internal (false);
4176 /* Add any builtin functions with the new isa if any. */
4177 ix86_add_new_builtins (ix86_isa_flags);
4179 /* Save the current options unless we are validating options for
4180 #pragma. */
4181 t = build_target_option_node ();
4183 ix86_arch_string = orig_arch_string;
4184 ix86_tune_string = orig_tune_string;
4185 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4187 /* Free up memory allocated to hold the strings */
4188 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4189 free (option_strings[i]);
4192 return t;
4195 /* Hook to validate attribute((target("string"))). */
4197 static bool
4198 ix86_valid_target_attribute_p (tree fndecl,
4199 tree ARG_UNUSED (name),
4200 tree args,
4201 int ARG_UNUSED (flags))
4203 struct cl_target_option cur_target;
4204 bool ret = true;
4205 tree old_optimize = build_optimization_node ();
4206 tree new_target, new_optimize;
4207 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4209 /* If the function changed the optimization levels as well as setting target
4210 options, start with the optimizations specified. */
4211 if (func_optimize && func_optimize != old_optimize)
4212 cl_optimization_restore (&global_options,
4213 TREE_OPTIMIZATION (func_optimize));
4215 /* The target attributes may also change some optimization flags, so update
4216 the optimization options if necessary. */
4217 cl_target_option_save (&cur_target, &global_options);
4218 new_target = ix86_valid_target_attribute_tree (args);
4219 new_optimize = build_optimization_node ();
4221 if (!new_target)
4222 ret = false;
4224 else if (fndecl)
4226 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4228 if (old_optimize != new_optimize)
4229 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4232 cl_target_option_restore (&global_options, &cur_target);
4234 if (old_optimize != new_optimize)
4235 cl_optimization_restore (&global_options,
4236 TREE_OPTIMIZATION (old_optimize));
4238 return ret;
4242 /* Hook to determine if one function can safely inline another. */
4244 static bool
4245 ix86_can_inline_p (tree caller, tree callee)
4247 bool ret = false;
4248 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4249 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4251 /* If callee has no option attributes, then it is ok to inline. */
4252 if (!callee_tree)
4253 ret = true;
4255 /* If caller has no option attributes, but callee does then it is not ok to
4256 inline. */
4257 else if (!caller_tree)
4258 ret = false;
4260 else
4262 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4263 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4265 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4266 function can inline an SSE2 function but an SSE2 function can't inline
4267 an SSE4 function. */
4268 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4269 != callee_opts->x_ix86_isa_flags)
4270 ret = false;
4272 /* See if we have the same non-isa options. */
4273 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4274 ret = false;
4276 /* See if arch, tune, etc. are the same. */
4277 else if (caller_opts->arch != callee_opts->arch)
4278 ret = false;
4280 else if (caller_opts->tune != callee_opts->tune)
4281 ret = false;
4283 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4284 ret = false;
4286 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4287 ret = false;
4289 else
4290 ret = true;
4293 return ret;
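/* Consequence of the checks above: inlining is only allowed "downward" in
   ISA terms.  For example, a caller whose ISA flags include everything the
   callee needs (say an AVX caller and an SSE2-only callee) may inline it,
   while the reverse is rejected; arch, tune, fpmath, branch cost and the
   other target flags must match exactly.  */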
4297 /* Remember the last target of ix86_set_current_function. */
4298 static GTY(()) tree ix86_previous_fndecl;
4300 /* Establish appropriate back-end context for processing the function
4301 FNDECL. The argument might be NULL to indicate processing at top
4302 level, outside of any function scope. */
4303 static void
4304 ix86_set_current_function (tree fndecl)
4306 /* Only change the context if the function changes. This hook is called
4307 several times in the course of compiling a function, and we don't want to
4308 slow things down too much or call target_reinit when it isn't safe. */
4309 if (fndecl && fndecl != ix86_previous_fndecl)
4311 tree old_tree = (ix86_previous_fndecl
4312 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4313 : NULL_TREE);
4315 tree new_tree = (fndecl
4316 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4317 : NULL_TREE);
4319 ix86_previous_fndecl = fndecl;
4320 if (old_tree == new_tree)
4323 else if (new_tree)
4325 cl_target_option_restore (&global_options,
4326 TREE_TARGET_OPTION (new_tree));
4327 target_reinit ();
4330 else if (old_tree)
4332 struct cl_target_option *def
4333 = TREE_TARGET_OPTION (target_option_current_node);
4335 cl_target_option_restore (&global_options, def);
4336 target_reinit ();
4342 /* Return true if this goes in large data/bss. */
4344 static bool
4345 ix86_in_large_data_p (tree exp)
4347 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4348 return false;
4350 /* Functions are never large data. */
4351 if (TREE_CODE (exp) == FUNCTION_DECL)
4352 return false;
4354 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4356 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4357 if (strcmp (section, ".ldata") == 0
4358 || strcmp (section, ".lbss") == 0)
4359 return true;
4360 return false;
4362 else
4364 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4366 /* If this is an incomplete type with size 0, then we can't put it
4367 in data because it might be too big when completed. */
4368 if (!size || size > ix86_section_threshold)
4369 return true;
4372 return false;
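/* ix86_section_threshold is presumably the value of -mlarge-data-threshold=,
   so only medium-model objects larger than that threshold (or incomplete
   types of unknown size) are routed to the .ldata/.lbss sections.  */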
4375 /* Switch to the appropriate section for output of DECL.
4376 DECL is either a `VAR_DECL' node or a constant of some sort.
4377 RELOC indicates whether forming the initial value of DECL requires
4378 link-time relocations. */
4380 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4381 ATTRIBUTE_UNUSED;
4383 static section *
4384 x86_64_elf_select_section (tree decl, int reloc,
4385 unsigned HOST_WIDE_INT align)
4387 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4388 && ix86_in_large_data_p (decl))
4390 const char *sname = NULL;
4391 unsigned int flags = SECTION_WRITE;
4392 switch (categorize_decl_for_section (decl, reloc))
4394 case SECCAT_DATA:
4395 sname = ".ldata";
4396 break;
4397 case SECCAT_DATA_REL:
4398 sname = ".ldata.rel";
4399 break;
4400 case SECCAT_DATA_REL_LOCAL:
4401 sname = ".ldata.rel.local";
4402 break;
4403 case SECCAT_DATA_REL_RO:
4404 sname = ".ldata.rel.ro";
4405 break;
4406 case SECCAT_DATA_REL_RO_LOCAL:
4407 sname = ".ldata.rel.ro.local";
4408 break;
4409 case SECCAT_BSS:
4410 sname = ".lbss";
4411 flags |= SECTION_BSS;
4412 break;
4413 case SECCAT_RODATA:
4414 case SECCAT_RODATA_MERGE_STR:
4415 case SECCAT_RODATA_MERGE_STR_INIT:
4416 case SECCAT_RODATA_MERGE_CONST:
4417 sname = ".lrodata";
4418 flags = 0;
4419 break;
4420 case SECCAT_SRODATA:
4421 case SECCAT_SDATA:
4422 case SECCAT_SBSS:
4423 gcc_unreachable ();
4424 case SECCAT_TEXT:
4425 case SECCAT_TDATA:
4426 case SECCAT_TBSS:
4427 /* We don't split these for the medium model. Place them into
4428 default sections and hope for the best. */
4429 break;
4431 if (sname)
4433 /* We might get called with string constants, but get_named_section
4434 doesn't like them as they are not DECLs. Also, we need to set
4435 flags in that case. */
4436 if (!DECL_P (decl))
4437 return get_section (sname, flags, NULL);
4438 return get_named_section (decl, sname, reloc);
4441 return default_elf_select_section (decl, reloc, align);
4444 /* Build up a unique section name, expressed as a
4445 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4446 RELOC indicates whether the initial value of EXP requires
4447 link-time relocations. */
4449 static void ATTRIBUTE_UNUSED
4450 x86_64_elf_unique_section (tree decl, int reloc)
4452 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4453 && ix86_in_large_data_p (decl))
4455 const char *prefix = NULL;
4456 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4457 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4459 switch (categorize_decl_for_section (decl, reloc))
4461 case SECCAT_DATA:
4462 case SECCAT_DATA_REL:
4463 case SECCAT_DATA_REL_LOCAL:
4464 case SECCAT_DATA_REL_RO:
4465 case SECCAT_DATA_REL_RO_LOCAL:
4466 prefix = one_only ? ".ld" : ".ldata";
4467 break;
4468 case SECCAT_BSS:
4469 prefix = one_only ? ".lb" : ".lbss";
4470 break;
4471 case SECCAT_RODATA:
4472 case SECCAT_RODATA_MERGE_STR:
4473 case SECCAT_RODATA_MERGE_STR_INIT:
4474 case SECCAT_RODATA_MERGE_CONST:
4475 prefix = one_only ? ".lr" : ".lrodata";
4476 break;
4477 case SECCAT_SRODATA:
4478 case SECCAT_SDATA:
4479 case SECCAT_SBSS:
4480 gcc_unreachable ();
4481 case SECCAT_TEXT:
4482 case SECCAT_TDATA:
4483 case SECCAT_TBSS:
4484 /* We don't split these for the medium model. Place them into
4485 default sections and hope for the best. */
4486 break;
4488 if (prefix)
4490 const char *name, *linkonce;
4491 char *string;
4493 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4494 name = targetm.strip_name_encoding (name);
4496 /* If we're using one_only, then there needs to be a .gnu.linkonce
4497 prefix to the section name. */
4498 linkonce = one_only ? ".gnu.linkonce" : "";
4500 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4502 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4503 return;
4506 default_unique_section (decl, reloc);
4509 #ifdef COMMON_ASM_OP
4510 /* This says how to output assembler code to declare an
4511 uninitialized external linkage data object.
4513 For medium model x86-64 we need to use the .largecomm directive for
4514 large objects. */
4515 void
4516 x86_elf_aligned_common (FILE *file,
4517 const char *name, unsigned HOST_WIDE_INT size,
4518 int align)
4520 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4521 && size > (unsigned int)ix86_section_threshold)
4522 fputs (".largecomm\t", file);
4523 else
4524 fputs (COMMON_ASM_OP, file);
4525 assemble_name (file, name);
4526 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4527 size, align / BITS_PER_UNIT);
4529 #endif
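/* For instance, a 1 MiB medium-model common object with 32-byte alignment
   would come out roughly as

     .largecomm	buf,1048576,32

   while small objects fall back to COMMON_ASM_OP (typically "\t.comm\t").
   The symbol name "buf" is of course just illustrative.  */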
4531 /* Utility function for targets to use in implementing
4532 ASM_OUTPUT_ALIGNED_BSS. */
4534 void
4535 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4536 const char *name, unsigned HOST_WIDE_INT size,
4537 int align)
4539 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4540 && size > (unsigned int)ix86_section_threshold)
4541 switch_to_section (get_named_section (decl, ".lbss", 0));
4542 else
4543 switch_to_section (bss_section);
4544 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4545 #ifdef ASM_DECLARE_OBJECT_NAME
4546 last_assemble_variable_decl = decl;
4547 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4548 #else
4549 /* Standard thing is just output label for the object. */
4550 ASM_OUTPUT_LABEL (file, name);
4551 #endif /* ASM_DECLARE_OBJECT_NAME */
4552 ASM_OUTPUT_SKIP (file, size ? size : 1);
4555 /* Decide whether we must probe the stack before any space allocation
4556 on this target. It's essentially TARGET_STACK_PROBE except when
4557 -fstack-check causes the stack to be already probed differently. */
4559 bool
4560 ix86_target_stack_probe (void)
4562 /* Do not probe the stack twice if static stack checking is enabled. */
4563 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4564 return false;
4566 return TARGET_STACK_PROBE;
4569 /* Decide whether we can make a sibling call to a function. DECL is the
4570 declaration of the function being targeted by the call and EXP is the
4571 CALL_EXPR representing the call. */
4573 static bool
4574 ix86_function_ok_for_sibcall (tree decl, tree exp)
4576 tree type, decl_or_type;
4577 rtx a, b;
4579 /* If we are generating position-independent code, we cannot sibcall
4580 optimize any indirect call, or a direct call to a global function,
4581 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4582 if (!TARGET_MACHO
4583 && !TARGET_64BIT
4584 && flag_pic
4585 && (!decl || !targetm.binds_local_p (decl)))
4586 return false;
4588 /* If we need to align the outgoing stack, then sibcalling would
4589 unalign the stack, which may break the called function. */
4590 if (ix86_minimum_incoming_stack_boundary (true)
4591 < PREFERRED_STACK_BOUNDARY)
4592 return false;
4594 if (decl)
4596 decl_or_type = decl;
4597 type = TREE_TYPE (decl);
4599 else
4601 /* We're looking at the CALL_EXPR, we need the type of the function. */
4602 type = CALL_EXPR_FN (exp); /* pointer expression */
4603 type = TREE_TYPE (type); /* pointer type */
4604 type = TREE_TYPE (type); /* function type */
4605 decl_or_type = type;
4608 /* Check that the return value locations are the same. Like
4609 if we are returning floats on the 80387 register stack, we cannot
4610 make a sibcall from a function that doesn't return a float to a
4611 function that does or, conversely, from a function that does return
4612 a float to a function that doesn't; the necessary stack adjustment
4613 would not be executed. This is also the place we notice
4614 differences in the return value ABI. Note that it is ok for one
4615 of the functions to have void return type as long as the return
4616 value of the other is passed in a register. */
4617 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4618 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4619 cfun->decl, false);
4620 if (STACK_REG_P (a) || STACK_REG_P (b))
4622 if (!rtx_equal_p (a, b))
4623 return false;
4625 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4627 else if (!rtx_equal_p (a, b))
4628 return false;
4630 if (TARGET_64BIT)
4632 /* The SYSV ABI has more call-clobbered registers;
4633 disallow sibcalls from MS to SYSV. */
4634 if (cfun->machine->call_abi == MS_ABI
4635 && ix86_function_type_abi (type) == SYSV_ABI)
4636 return false;
4638 else
4640 /* If this call is indirect, we'll need to be able to use a
4641 call-clobbered register for the address of the target function.
4642 Make sure that all such registers are not used for passing
4643 parameters. Note that DLLIMPORT functions are indirect. */
4644 if (!decl
4645 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4647 if (ix86_function_regparm (type, NULL) >= 3)
4649 /* ??? Need to count the actual number of registers to be used,
4650 not the possible number of registers. Fix later. */
4651 return false;
4656 /* Otherwise okay. That also includes certain types of indirect calls. */
4657 return true;
4660 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4661 and "sseregparm" calling convention attributes;
4662 arguments as in struct attribute_spec.handler. */
4664 static tree
4665 ix86_handle_cconv_attribute (tree *node, tree name,
4666 tree args,
4667 int flags ATTRIBUTE_UNUSED,
4668 bool *no_add_attrs)
4670 if (TREE_CODE (*node) != FUNCTION_TYPE
4671 && TREE_CODE (*node) != METHOD_TYPE
4672 && TREE_CODE (*node) != FIELD_DECL
4673 && TREE_CODE (*node) != TYPE_DECL)
4675 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4676 name);
4677 *no_add_attrs = true;
4678 return NULL_TREE;
4681 /* Can combine regparm with all attributes but fastcall and thiscall. */
4682 if (is_attribute_p ("regparm", name))
4684 tree cst;
4686 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4688 error ("fastcall and regparm attributes are not compatible");
4691 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4693 error ("regparam and thiscall attributes are not compatible");
4696 cst = TREE_VALUE (args);
4697 if (TREE_CODE (cst) != INTEGER_CST)
4699 warning (OPT_Wattributes,
4700 "%qE attribute requires an integer constant argument",
4701 name);
4702 *no_add_attrs = true;
4704 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4706 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4707 name, REGPARM_MAX);
4708 *no_add_attrs = true;
4711 return NULL_TREE;
4714 if (TARGET_64BIT)
4716 /* Do not warn when emulating the MS ABI. */
4717 if ((TREE_CODE (*node) != FUNCTION_TYPE
4718 && TREE_CODE (*node) != METHOD_TYPE)
4719 || ix86_function_type_abi (*node) != MS_ABI)
4720 warning (OPT_Wattributes, "%qE attribute ignored",
4721 name);
4722 *no_add_attrs = true;
4723 return NULL_TREE;
4726 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4727 if (is_attribute_p ("fastcall", name))
4729 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4731 error ("fastcall and cdecl attributes are not compatible");
4733 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4735 error ("fastcall and stdcall attributes are not compatible");
4737 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4739 error ("fastcall and regparm attributes are not compatible");
4741 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4743 error ("fastcall and thiscall attributes are not compatible");
4747 /* Can combine stdcall with fastcall (redundant), regparm and
4748 sseregparm. */
4749 else if (is_attribute_p ("stdcall", name))
4751 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4753 error ("stdcall and cdecl attributes are not compatible");
4755 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4757 error ("stdcall and fastcall attributes are not compatible");
4759 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4761 error ("stdcall and thiscall attributes are not compatible");
4765 /* Can combine cdecl with regparm and sseregparm. */
4766 else if (is_attribute_p ("cdecl", name))
4768 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4770 error ("stdcall and cdecl attributes are not compatible");
4772 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4774 error ("fastcall and cdecl attributes are not compatible");
4776 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4778 error ("cdecl and thiscall attributes are not compatible");
4781 else if (is_attribute_p ("thiscall", name))
4783 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
4784 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
4785 name);
4786 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4788 error ("stdcall and thiscall attributes are not compatible");
4790 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4792 error ("fastcall and thiscall attributes are not compatible");
4794 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4796 error ("cdecl and thiscall attributes are not compatible");
4800 /* Can combine sseregparm with all attributes. */
4802 return NULL_TREE;
4805 /* The transactional memory builtins are implicitly regparm or fastcall
4806 depending on the ABI. Override the generic do-nothing attribute that
4807 these builtins were declared with, and replace it with one of the two
4808 attributes that we expect elsewhere. */
4810 static tree
4811 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
4812 tree args ATTRIBUTE_UNUSED,
4813 int flags ATTRIBUTE_UNUSED,
4814 bool *no_add_attrs)
4816 tree alt;
4818 /* In no case do we want to add the placeholder attribute. */
4819 *no_add_attrs = true;
4821 /* The 64-bit ABI is unchanged for transactional memory. */
4822 if (TARGET_64BIT)
4823 return NULL_TREE;
4825 /* ??? Is there a better way to validate 32-bit windows? We have
4826 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
4827 if (CHECK_STACK_LIMIT > 0)
4828 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
4829 else
4831 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
4832 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
4834 decl_attributes (node, alt, flags);
4836 return NULL_TREE;
4839 /* This function determines from TYPE the calling-convention. */
4841 unsigned int
4842 ix86_get_callcvt (const_tree type)
4844 unsigned int ret = 0;
4845 bool is_stdarg;
4846 tree attrs;
4848 if (TARGET_64BIT)
4849 return IX86_CALLCVT_CDECL;
4851 attrs = TYPE_ATTRIBUTES (type);
4852 if (attrs != NULL_TREE)
4854 if (lookup_attribute ("cdecl", attrs))
4855 ret |= IX86_CALLCVT_CDECL;
4856 else if (lookup_attribute ("stdcall", attrs))
4857 ret |= IX86_CALLCVT_STDCALL;
4858 else if (lookup_attribute ("fastcall", attrs))
4859 ret |= IX86_CALLCVT_FASTCALL;
4860 else if (lookup_attribute ("thiscall", attrs))
4861 ret |= IX86_CALLCVT_THISCALL;
4863 /* Regparm isn't allowed for thiscall and fastcall. */
4864 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
4866 if (lookup_attribute ("regparm", attrs))
4867 ret |= IX86_CALLCVT_REGPARM;
4868 if (lookup_attribute ("sseregparm", attrs))
4869 ret |= IX86_CALLCVT_SSEREGPARM;
4872 if (IX86_BASE_CALLCVT(ret) != 0)
4873 return ret;
4876 is_stdarg = stdarg_p (type);
4877 if (TARGET_RTD && !is_stdarg)
4878 return IX86_CALLCVT_STDCALL | ret;
4880 if (ret != 0
4881 || is_stdarg
4882 || TREE_CODE (type) != METHOD_TYPE
4883 || ix86_function_type_abi (type) != MS_ABI)
4884 return IX86_CALLCVT_CDECL | ret;
4886 return IX86_CALLCVT_THISCALL;
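/* Editorial illustration (not part of the original source): a sketch of
   how the mapping above plays out for 32-bit code, assuming default
   options and the hypothetical declarations below.

     int __attribute__ ((fastcall)) f1 (int a, int b);   -> IX86_CALLCVT_FASTCALL
     int __attribute__ ((stdcall))  f2 (int a, int b);   -> IX86_CALLCVT_STDCALL
     int f3 (int a, ...);                                -> IX86_CALLCVT_CDECL

   With -mrtd (TARGET_RTD), an unattributed prototype with a fixed
   argument list is treated as IX86_CALLCVT_STDCALL instead.  */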
4889 /* Return 0 if the attributes for two types are incompatible, 1 if they
4890 are compatible, and 2 if they are nearly compatible (which causes a
4891 warning to be generated). */
4893 static int
4894 ix86_comp_type_attributes (const_tree type1, const_tree type2)
4896 unsigned int ccvt1, ccvt2;
4898 if (TREE_CODE (type1) != FUNCTION_TYPE
4899 && TREE_CODE (type1) != METHOD_TYPE)
4900 return 1;
4902 ccvt1 = ix86_get_callcvt (type1);
4903 ccvt2 = ix86_get_callcvt (type2);
4904 if (ccvt1 != ccvt2)
4905 return 0;
4906 if (ix86_function_regparm (type1, NULL)
4907 != ix86_function_regparm (type2, NULL))
4908 return 0;
4910 return 1;
4913 /* Return the regparm value for a function with the indicated TYPE and DECL.
4914 DECL may be NULL when calling function indirectly
4915 or considering a libcall. */
4917 static int
4918 ix86_function_regparm (const_tree type, const_tree decl)
4920 tree attr;
4921 int regparm;
4922 unsigned int ccvt;
4924 if (TARGET_64BIT)
4925 return (ix86_function_type_abi (type) == SYSV_ABI
4926 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
4927 ccvt = ix86_get_callcvt (type);
4928 regparm = ix86_regparm;
4930 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
4932 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
4933 if (attr)
4935 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
4936 return regparm;
4939 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
4940 return 2;
4941 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
4942 return 1;
4944 /* Use register calling convention for local functions when possible. */
4945 if (decl
4946 && TREE_CODE (decl) == FUNCTION_DECL
4947 && optimize
4948 && !(profile_flag && !flag_fentry))
4950 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
4951 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
4952 if (i && i->local && i->can_change_signature)
4954 int local_regparm, globals = 0, regno;
4956 /* Make sure no regparm register is taken by a
4957 fixed register variable. */
4958 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
4959 if (fixed_regs[local_regparm])
4960 break;
4962 /* We don't want to use regparm(3) for nested functions as
4963 these use a static chain pointer in the third argument. */
4964 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
4965 local_regparm = 2;
4967 /* In 32-bit mode save a register for the split stack. */
4968 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
4969 local_regparm = 2;
4971 /* Each fixed register usage increases register pressure,
4972 so fewer registers should be used for argument passing.
4973 This functionality can be overridden by an explicit
4974 regparm value. */
4975 for (regno = AX_REG; regno <= DI_REG; regno++)
4976 if (fixed_regs[regno])
4977 globals++;
4979 local_regparm
4980 = globals < local_regparm ? local_regparm - globals : 0;
4982 if (local_regparm > regparm)
4983 regparm = local_regparm;
4987 return regparm;
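/* Editorial illustration (not part of the original source): with the
   hypothetical 32-bit declaration

     int __attribute__ ((regparm (3))) f (int a, int b, int c);

   ix86_function_regparm returns 3, so A, B and C travel in %eax, %edx
   and %ecx.  The cgraph path above may promote plain local functions
   to a similar register convention when it is safe to do so.  */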
4990 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
4991 DFmode (2) arguments in SSE registers for a function with the
4992 indicated TYPE and DECL. DECL may be NULL when calling function
4993 indirectly or considering a libcall. Otherwise return 0. */
4995 static int
4996 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
4998 gcc_assert (!TARGET_64BIT);
5000 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5001 by the sseregparm attribute. */
5002 if (TARGET_SSEREGPARM
5003 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5005 if (!TARGET_SSE)
5007 if (warn)
5009 if (decl)
5010 error ("calling %qD with attribute sseregparm without "
5011 "SSE/SSE2 enabled", decl);
5012 else
5013 error ("calling %qT with attribute sseregparm without "
5014 "SSE/SSE2 enabled", type);
5016 return 0;
5019 return 2;
5022 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5023 (and DFmode for SSE2) arguments in SSE registers. */
5024 if (decl && TARGET_SSE_MATH && optimize
5025 && !(profile_flag && !flag_fentry))
5027 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5028 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5029 if (i && i->local && i->can_change_signature)
5030 return TARGET_SSE2 ? 2 : 1;
5033 return 0;
5036 /* Return true if EAX is live at the start of the function. Used by
5037 ix86_expand_prologue to determine if we need special help before
5038 calling allocate_stack_worker. */
5040 static bool
5041 ix86_eax_live_at_start_p (void)
5043 /* Cheat. Don't bother working forward from ix86_function_regparm
5044 to the function type to whether an actual argument is located in
5045 eax. Instead just look at cfg info, which is still close enough
5046 to correct at this point. This gives false positives for broken
5047 functions that might use uninitialized data that happens to be
5048 allocated in eax, but who cares? */
5049 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5052 static bool
5053 ix86_keep_aggregate_return_pointer (tree fntype)
5055 tree attr;
5057 if (!TARGET_64BIT)
5059 attr = lookup_attribute ("callee_pop_aggregate_return",
5060 TYPE_ATTRIBUTES (fntype));
5061 if (attr)
5062 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5064 /* For 32-bit MS-ABI the default is to keep aggregate
5065 return pointer. */
5066 if (ix86_function_type_abi (fntype) == MS_ABI)
5067 return true;
5069 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5072 /* Value is the number of bytes of arguments automatically
5073 popped when returning from a subroutine call.
5074 FUNDECL is the declaration node of the function (as a tree),
5075 FUNTYPE is the data type of the function (as a tree),
5076 or for a library call it is an identifier node for the subroutine name.
5077 SIZE is the number of bytes of arguments passed on the stack.
5079 On the 80386, the RTD insn may be used to pop them if the number
5080 of args is fixed, but if the number is variable then the caller
5081 must pop them all. RTD can't be used for library calls now
5082 because the library is compiled with the Unix compiler.
5083 Use of RTD is a selectable option, since it is incompatible with
5084 standard Unix calling sequences. If the option is not selected,
5085 the caller must always pop the args.
5087 The attribute stdcall is equivalent to RTD on a per module basis. */
5089 static int
5090 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5092 unsigned int ccvt;
5094 /* None of the 64-bit ABIs pop arguments. */
5095 if (TARGET_64BIT)
5096 return 0;
5098 ccvt = ix86_get_callcvt (funtype);
5100 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5101 | IX86_CALLCVT_THISCALL)) != 0
5102 && ! stdarg_p (funtype))
5103 return size;
5105 /* Lose any fake structure return argument if it is passed on the stack. */
5106 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5107 && !ix86_keep_aggregate_return_pointer (funtype))
5109 int nregs = ix86_function_regparm (funtype, fundecl);
5110 if (nregs == 0)
5111 return GET_MODE_SIZE (Pmode);
5114 return 0;
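/* Editorial illustration (not part of the original source): for the
   hypothetical 32-bit declaration

     void __attribute__ ((stdcall)) f (int a, int b);

   ix86_return_pops_args returns 8, so the callee ends with "ret $8"
   and the caller does not adjust %esp after the call; a cdecl function
   returns 0 here and leaves the argument cleanup to the caller.  */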
5117 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5119 static bool
5120 ix86_legitimate_combined_insn (rtx insn)
5122 /* Check operand constraints in case hard registers were propagated
5123 into insn pattern. This check prevents combine pass from
5124 generating insn patterns with invalid hard register operands.
5125 These invalid insns can eventually confuse reload to error out
5126 with a spill failure. See also PRs 46829 and 46843. */
5127 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5129 int i;
5131 extract_insn (insn);
5132 preprocess_constraints ();
5134 for (i = 0; i < recog_data.n_operands; i++)
5136 rtx op = recog_data.operand[i];
5137 enum machine_mode mode = GET_MODE (op);
5138 struct operand_alternative *op_alt;
5139 int offset = 0;
5140 bool win;
5141 int j;
5143 /* A unary operator may be accepted by the predicate, but it
5144 is irrelevant for matching constraints. */
5145 if (UNARY_P (op))
5146 op = XEXP (op, 0);
5148 if (GET_CODE (op) == SUBREG)
5150 if (REG_P (SUBREG_REG (op))
5151 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5152 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5153 GET_MODE (SUBREG_REG (op)),
5154 SUBREG_BYTE (op),
5155 GET_MODE (op));
5156 op = SUBREG_REG (op);
5159 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5160 continue;
5162 op_alt = recog_op_alt[i];
5164 /* Operand has no constraints, anything is OK. */
5165 win = !recog_data.n_alternatives;
5167 for (j = 0; j < recog_data.n_alternatives; j++)
5169 if (op_alt[j].anything_ok
5170 || (op_alt[j].matches != -1
5171 && operands_match_p
5172 (recog_data.operand[i],
5173 recog_data.operand[op_alt[j].matches]))
5174 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5176 win = true;
5177 break;
5181 if (!win)
5182 return false;
5186 return true;
5189 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5191 static unsigned HOST_WIDE_INT
5192 ix86_asan_shadow_offset (void)
5194 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
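/* Editorial illustration (not part of the original source): asan maps
   the application byte at ADDR to the shadow byte at

     (ADDR >> 3) + ix86_asan_shadow_offset ()

   i.e. an offset of 1 << 44 for LP64 and 1 << 29 otherwise; only the
   constant offset is provided by this hook, the shift is the usual
   asan shadow scale.  */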
5197 /* Argument support functions. */
5199 /* Return true when register may be used to pass function parameters. */
5200 bool
5201 ix86_function_arg_regno_p (int regno)
5203 int i;
5204 const int *parm_regs;
5206 if (!TARGET_64BIT)
5208 if (TARGET_MACHO)
5209 return (regno < REGPARM_MAX
5210 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5211 else
5212 return (regno < REGPARM_MAX
5213 || (TARGET_MMX && MMX_REGNO_P (regno)
5214 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5215 || (TARGET_SSE && SSE_REGNO_P (regno)
5216 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5219 if (TARGET_MACHO)
5221 if (SSE_REGNO_P (regno) && TARGET_SSE)
5222 return true;
5224 else
5226 if (TARGET_SSE && SSE_REGNO_P (regno)
5227 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5228 return true;
5231 /* TODO: The function should depend on current function ABI but
5232 builtins.c would need updating then. Therefore we use the
5233 default ABI. */
5235 /* RAX is used as hidden argument to va_arg functions. */
5236 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5237 return true;
5239 if (ix86_abi == MS_ABI)
5240 parm_regs = x86_64_ms_abi_int_parameter_registers;
5241 else
5242 parm_regs = x86_64_int_parameter_registers;
5243 for (i = 0; i < (ix86_abi == MS_ABI
5244 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5245 if (regno == parm_regs[i])
5246 return true;
5247 return false;
5250 /* Return true if we do not know how to pass TYPE solely in registers. */
5252 static bool
5253 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5255 if (must_pass_in_stack_var_size_or_pad (mode, type))
5256 return true;
5258 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5259 The layout_type routine is crafty and tries to trick us into passing
5260 currently unsupported vector types on the stack by using TImode. */
5261 return (!TARGET_64BIT && mode == TImode
5262 && type && TREE_CODE (type) != VECTOR_TYPE);
5265 /* Return the size, in bytes, of the area reserved for arguments passed
5266 in registers for the function represented by FNDECL, which depends on
5267 the ABI format used. */
5268 int
5269 ix86_reg_parm_stack_space (const_tree fndecl)
5271 enum calling_abi call_abi = SYSV_ABI;
5272 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5273 call_abi = ix86_function_abi (fndecl);
5274 else
5275 call_abi = ix86_function_type_abi (fndecl);
5276 if (TARGET_64BIT && call_abi == MS_ABI)
5277 return 32;
5278 return 0;
5281 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5282 call abi used. */
5283 enum calling_abi
5284 ix86_function_type_abi (const_tree fntype)
5286 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5288 enum calling_abi abi = ix86_abi;
5289 if (abi == SYSV_ABI)
5291 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5292 abi = MS_ABI;
5294 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5295 abi = SYSV_ABI;
5296 return abi;
5298 return ix86_abi;
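/* Editorial illustration (not part of the original source): on a
   target whose default ix86_abi is SYSV_ABI, the hypothetical
   declaration

     void __attribute__ ((ms_abi)) f (void);

   makes ix86_function_type_abi return MS_ABI for f's type, and the
   symmetric sysv_abi attribute overrides the default the other way on
   an MS_ABI target.  */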
5301 static bool
5302 ix86_function_ms_hook_prologue (const_tree fn)
5304 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5306 if (decl_function_context (fn) != NULL_TREE)
5307 error_at (DECL_SOURCE_LOCATION (fn),
5308 "ms_hook_prologue is not compatible with nested function");
5309 else
5310 return true;
5312 return false;
5315 static enum calling_abi
5316 ix86_function_abi (const_tree fndecl)
5318 if (! fndecl)
5319 return ix86_abi;
5320 return ix86_function_type_abi (TREE_TYPE (fndecl));
5323 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5324 call abi used. */
5325 enum calling_abi
5326 ix86_cfun_abi (void)
5328 if (! cfun)
5329 return ix86_abi;
5330 return cfun->machine->call_abi;
5333 /* Write the extra assembler code needed to declare a function properly. */
5335 void
5336 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5337 tree decl)
5339 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5341 if (is_ms_hook)
5343 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5344 unsigned int filler_cc = 0xcccccccc;
5346 for (i = 0; i < filler_count; i += 4)
5347 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5350 #ifdef SUBTARGET_ASM_UNWIND_INIT
5351 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5352 #endif
5354 ASM_OUTPUT_LABEL (asm_out_file, fname);
5356 /* Output magic byte marker, if hot-patch attribute is set. */
5357 if (is_ms_hook)
5359 if (TARGET_64BIT)
5361 /* leaq [%rsp + 0], %rsp */
5362 asm_fprintf (asm_out_file, ASM_BYTE
5363 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5365 else
5367 /* movl.s %edi, %edi
5368 push %ebp
5369 movl.s %esp, %ebp */
5370 asm_fprintf (asm_out_file, ASM_BYTE
5371 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5376 /* regclass.c */
5377 extern void init_regs (void);
5379 /* Implementation of the call ABI switching target hook. The call
5380 register sets specific to FNDECL are selected. See also
5381 ix86_conditional_register_usage for more details. */
5382 void
5383 ix86_call_abi_override (const_tree fndecl)
5385 if (fndecl == NULL_TREE)
5386 cfun->machine->call_abi = ix86_abi;
5387 else
5388 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5391 /* 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5392 expensive re-initialization of init_regs each time we switch function context
5393 since this is needed only during RTL expansion. */
5394 static void
5395 ix86_maybe_switch_abi (void)
5397 if (TARGET_64BIT &&
5398 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5399 reinit_regs ();
5402 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5403 for a call to a function whose data type is FNTYPE.
5404 For a library call, FNTYPE is 0. */
5406 void
5407 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5408 tree fntype, /* tree ptr for function decl */
5409 rtx libname, /* SYMBOL_REF of library name or 0 */
5410 tree fndecl,
5411 int caller)
5413 struct cgraph_local_info *i;
5415 memset (cum, 0, sizeof (*cum));
5417 if (fndecl)
5419 i = cgraph_local_info (fndecl);
5420 cum->call_abi = ix86_function_abi (fndecl);
5422 else
5424 i = NULL;
5425 cum->call_abi = ix86_function_type_abi (fntype);
5428 cum->caller = caller;
5430 /* Set up the number of registers to use for passing arguments. */
5432 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5433 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5434 "or subtarget optimization implying it");
5435 cum->nregs = ix86_regparm;
5436 if (TARGET_64BIT)
5438 cum->nregs = (cum->call_abi == SYSV_ABI
5439 ? X86_64_REGPARM_MAX
5440 : X86_64_MS_REGPARM_MAX);
5442 if (TARGET_SSE)
5444 cum->sse_nregs = SSE_REGPARM_MAX;
5445 if (TARGET_64BIT)
5447 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5448 ? X86_64_SSE_REGPARM_MAX
5449 : X86_64_MS_SSE_REGPARM_MAX);
5452 if (TARGET_MMX)
5453 cum->mmx_nregs = MMX_REGPARM_MAX;
5454 cum->warn_avx = true;
5455 cum->warn_sse = true;
5456 cum->warn_mmx = true;
5458 /* Because the type might mismatch between caller and callee, we need to
5459 use the actual type of the function for local calls.
5460 FIXME: cgraph_analyze can be told to actually record if the function uses
5461 va_start, so for local functions maybe_vaarg can be made more aggressive,
5462 helping K&R code.
5463 FIXME: once the type system is fixed, we won't need this code anymore. */
5464 if (i && i->local && i->can_change_signature)
5465 fntype = TREE_TYPE (fndecl);
5466 cum->maybe_vaarg = (fntype
5467 ? (!prototype_p (fntype) || stdarg_p (fntype))
5468 : !libname);
5470 if (!TARGET_64BIT)
5472 /* If there are variable arguments, then we won't pass anything
5473 in registers in 32-bit mode. */
5474 if (stdarg_p (fntype))
5476 cum->nregs = 0;
5477 cum->sse_nregs = 0;
5478 cum->mmx_nregs = 0;
5479 cum->warn_avx = 0;
5480 cum->warn_sse = 0;
5481 cum->warn_mmx = 0;
5482 return;
5485 /* Use ecx and edx registers if function has fastcall attribute,
5486 else look for regparm information. */
5487 if (fntype)
5489 unsigned int ccvt = ix86_get_callcvt (fntype);
5490 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5492 cum->nregs = 1;
5493 cum->fastcall = 1; /* Same first register as in fastcall. */
5495 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5497 cum->nregs = 2;
5498 cum->fastcall = 1;
5500 else
5501 cum->nregs = ix86_function_regparm (fntype, fndecl);
5504 /* Set up the number of SSE registers used for passing SFmode
5505 and DFmode arguments. Warn for mismatching ABI. */
5506 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5510 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5511 But in the case of vector types, it is some vector mode.
5513 When we have only some of our vector isa extensions enabled, then there
5514 are some modes for which vector_mode_supported_p is false. For these
5515 modes, the generic vector support in gcc will choose some non-vector mode
5516 in order to implement the type. By computing the natural mode, we'll
5517 select the proper ABI location for the operand and not depend on whatever
5518 the middle-end decides to do with these vector types.
5520 The middle-end can't deal with vector types > 16 bytes. In this
5521 case, we return the original mode and warn about the ABI change if CUM isn't
5522 NULL. */
5524 static enum machine_mode
5525 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5527 enum machine_mode mode = TYPE_MODE (type);
5529 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5531 HOST_WIDE_INT size = int_size_in_bytes (type);
5532 if ((size == 8 || size == 16 || size == 32)
5533 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5534 && TYPE_VECTOR_SUBPARTS (type) > 1)
5536 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5538 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5539 mode = MIN_MODE_VECTOR_FLOAT;
5540 else
5541 mode = MIN_MODE_VECTOR_INT;
5543 /* Get the mode which has this inner mode and number of units. */
5544 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5545 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5546 && GET_MODE_INNER (mode) == innermode)
5548 if (size == 32 && !TARGET_AVX)
5550 static bool warnedavx;
5552 if (cum
5553 && !warnedavx
5554 && cum->warn_avx)
5556 warnedavx = true;
5557 warning (0, "AVX vector argument without AVX "
5558 "enabled changes the ABI");
5560 return TYPE_MODE (type);
5562 else if ((size == 8 || size == 16) && !TARGET_SSE)
5564 static bool warnedsse;
5566 if (cum
5567 && !warnedsse
5568 && cum->warn_sse)
5570 warnedsse = true;
5571 warning (0, "SSE vector argument without SSE "
5572 "enabled changes the ABI");
5574 return mode;
5576 else
5577 return mode;
5580 gcc_unreachable ();
5584 return mode;
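/* Editorial illustration (not part of the original source): for the
   hypothetical generic vector type

     typedef int v4si __attribute__ ((vector_size (16)));

   type_natural_mode returns V4SImode even when SSE is disabled (after
   emitting the "changes the ABI" warning above), so the argument
   passing decision is made on the vector mode rather than on whatever
   fallback mode the middle-end picked for the type.  */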
5587 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5588 this may not agree with the mode that the type system has chosen for the
5589 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5590 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5592 static rtx
5593 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5594 unsigned int regno)
5596 rtx tmp;
5598 if (orig_mode != BLKmode)
5599 tmp = gen_rtx_REG (orig_mode, regno);
5600 else
5602 tmp = gen_rtx_REG (mode, regno);
5603 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5604 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5607 return tmp;
5610 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5611 of this code is to classify each 8bytes of incoming argument by the register
5612 class and assign registers accordingly. */
5614 /* Return the union class of CLASS1 and CLASS2.
5615 See the x86-64 PS ABI for details. */
5617 static enum x86_64_reg_class
5618 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5620 /* Rule #1: If both classes are equal, this is the resulting class. */
5621 if (class1 == class2)
5622 return class1;
5624 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5625 the other class. */
5626 if (class1 == X86_64_NO_CLASS)
5627 return class2;
5628 if (class2 == X86_64_NO_CLASS)
5629 return class1;
5631 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5632 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5633 return X86_64_MEMORY_CLASS;
5635 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5636 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5637 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5638 return X86_64_INTEGERSI_CLASS;
5639 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5640 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5641 return X86_64_INTEGER_CLASS;
5643 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5644 MEMORY is used. */
5645 if (class1 == X86_64_X87_CLASS
5646 || class1 == X86_64_X87UP_CLASS
5647 || class1 == X86_64_COMPLEX_X87_CLASS
5648 || class2 == X86_64_X87_CLASS
5649 || class2 == X86_64_X87UP_CLASS
5650 || class2 == X86_64_COMPLEX_X87_CLASS)
5651 return X86_64_MEMORY_CLASS;
5653 /* Rule #6: Otherwise class SSE is used. */
5654 return X86_64_SSE_CLASS;
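/* Editorial illustration (not part of the original source): for the
   hypothetical argument

     struct s { int i; float f; };

   classify_argument below sees one eightbyte: the int piece is
   X86_64_INTEGERSI_CLASS and the float piece (at bit offset 32) is
   X86_64_SSE_CLASS; rule #4 merges them to X86_64_INTEGER_CLASS, so
   the whole struct is passed in a single general purpose register.  */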
5657 /* Classify the argument of type TYPE and mode MODE.
5658 CLASSES will be filled by the register class used to pass each word
5659 of the operand. The number of words is returned. In case the parameter
5660 should be passed in memory, 0 is returned. As a special case for zero
5661 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5663 BIT_OFFSET is used internally for handling records and specifies the
5664 offset in bits modulo 256 to avoid overflow cases.
5666 See the x86-64 PS ABI for details.
5669 static int
5670 classify_argument (enum machine_mode mode, const_tree type,
5671 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5673 HOST_WIDE_INT bytes =
5674 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5675 int words
5676 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5678 /* Variable sized entities are always passed/returned in memory. */
5679 if (bytes < 0)
5680 return 0;
5682 if (mode != VOIDmode
5683 && targetm.calls.must_pass_in_stack (mode, type))
5684 return 0;
5686 if (type && AGGREGATE_TYPE_P (type))
5688 int i;
5689 tree field;
5690 enum x86_64_reg_class subclasses[MAX_CLASSES];
5692 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5693 if (bytes > 32)
5694 return 0;
5696 for (i = 0; i < words; i++)
5697 classes[i] = X86_64_NO_CLASS;
5699 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5700 signal the memory class, so handle this as a special case. */
5701 if (!words)
5703 classes[0] = X86_64_NO_CLASS;
5704 return 1;
5707 /* Classify each field of record and merge classes. */
5708 switch (TREE_CODE (type))
5710 case RECORD_TYPE:
5711 /* And now merge the fields of structure. */
5712 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5714 if (TREE_CODE (field) == FIELD_DECL)
5716 int num;
5718 if (TREE_TYPE (field) == error_mark_node)
5719 continue;
5721 /* Bitfields are always classified as integer. Handle them
5722 early, since later code would consider them to be
5723 misaligned integers. */
5724 if (DECL_BIT_FIELD (field))
5726 for (i = (int_bit_position (field)
5727 + (bit_offset % 64)) / 8 / 8;
5728 i < ((int_bit_position (field) + (bit_offset % 64))
5729 + tree_low_cst (DECL_SIZE (field), 0)
5730 + 63) / 8 / 8; i++)
5731 classes[i] =
5732 merge_classes (X86_64_INTEGER_CLASS,
5733 classes[i]);
5735 else
5737 int pos;
5739 type = TREE_TYPE (field);
5741 /* Flexible array member is ignored. */
5742 if (TYPE_MODE (type) == BLKmode
5743 && TREE_CODE (type) == ARRAY_TYPE
5744 && TYPE_SIZE (type) == NULL_TREE
5745 && TYPE_DOMAIN (type) != NULL_TREE
5746 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5747 == NULL_TREE))
5749 static bool warned;
5751 if (!warned && warn_psabi)
5753 warned = true;
5754 inform (input_location,
5755 "the ABI of passing struct with"
5756 " a flexible array member has"
5757 " changed in GCC 4.4");
5759 continue;
5761 num = classify_argument (TYPE_MODE (type), type,
5762 subclasses,
5763 (int_bit_position (field)
5764 + bit_offset) % 256);
5765 if (!num)
5766 return 0;
5767 pos = (int_bit_position (field)
5768 + (bit_offset % 64)) / 8 / 8;
5769 for (i = 0; i < num && (i + pos) < words; i++)
5770 classes[i + pos] =
5771 merge_classes (subclasses[i], classes[i + pos]);
5775 break;
5777 case ARRAY_TYPE:
5778 /* Arrays are handled as small records. */
5780 int num;
5781 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5782 TREE_TYPE (type), subclasses, bit_offset);
5783 if (!num)
5784 return 0;
5786 /* The partial classes are now full classes. */
5787 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5788 subclasses[0] = X86_64_SSE_CLASS;
5789 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5790 && !((bit_offset % 64) == 0 && bytes == 4))
5791 subclasses[0] = X86_64_INTEGER_CLASS;
5793 for (i = 0; i < words; i++)
5794 classes[i] = subclasses[i % num];
5796 break;
5798 case UNION_TYPE:
5799 case QUAL_UNION_TYPE:
5800 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
5802 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5804 if (TREE_CODE (field) == FIELD_DECL)
5806 int num;
5808 if (TREE_TYPE (field) == error_mark_node)
5809 continue;
5811 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5812 TREE_TYPE (field), subclasses,
5813 bit_offset);
5814 if (!num)
5815 return 0;
5816 for (i = 0; i < num; i++)
5817 classes[i] = merge_classes (subclasses[i], classes[i]);
5820 break;
5822 default:
5823 gcc_unreachable ();
5826 if (words > 2)
5828 /* When the size is more than 16 bytes, if the first class isn't
5829 X86_64_SSE_CLASS or any of the others isn't
5830 X86_64_SSEUP_CLASS, everything should be passed in
5831 memory. */
5832 if (classes[0] != X86_64_SSE_CLASS)
5833 return 0;
5835 for (i = 1; i < words; i++)
5836 if (classes[i] != X86_64_SSEUP_CLASS)
5837 return 0;
5840 /* Final merger cleanup. */
5841 for (i = 0; i < words; i++)
5843 /* If one class is MEMORY, everything should be passed in
5844 memory. */
5845 if (classes[i] == X86_64_MEMORY_CLASS)
5846 return 0;
5848 /* The X86_64_SSEUP_CLASS should be always preceded by
5849 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
5850 if (classes[i] == X86_64_SSEUP_CLASS
5851 && classes[i - 1] != X86_64_SSE_CLASS
5852 && classes[i - 1] != X86_64_SSEUP_CLASS)
5854 /* The first one should never be X86_64_SSEUP_CLASS. */
5855 gcc_assert (i != 0);
5856 classes[i] = X86_64_SSE_CLASS;
5859 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
5860 everything should be passed in memory. */
5861 if (classes[i] == X86_64_X87UP_CLASS
5862 && (classes[i - 1] != X86_64_X87_CLASS))
5864 static bool warned;
5866 /* The first one should never be X86_64_X87UP_CLASS. */
5867 gcc_assert (i != 0);
5868 if (!warned && warn_psabi)
5870 warned = true;
5871 inform (input_location,
5872 "the ABI of passing union with long double"
5873 " has changed in GCC 4.4");
5875 return 0;
5878 return words;
5881 /* Compute alignment needed. We align all types to natural boundaries with
5882 exception of XFmode that is aligned to 64bits. */
5883 if (mode != VOIDmode && mode != BLKmode)
5885 int mode_alignment = GET_MODE_BITSIZE (mode);
5887 if (mode == XFmode)
5888 mode_alignment = 128;
5889 else if (mode == XCmode)
5890 mode_alignment = 256;
5891 if (COMPLEX_MODE_P (mode))
5892 mode_alignment /= 2;
5893 /* Misaligned fields are always returned in memory. */
5894 if (bit_offset % mode_alignment)
5895 return 0;
5898 /* for V1xx modes, just use the base mode */
5899 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
5900 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
5901 mode = GET_MODE_INNER (mode);
5903 /* Classification of atomic types. */
5904 switch (mode)
5906 case SDmode:
5907 case DDmode:
5908 classes[0] = X86_64_SSE_CLASS;
5909 return 1;
5910 case TDmode:
5911 classes[0] = X86_64_SSE_CLASS;
5912 classes[1] = X86_64_SSEUP_CLASS;
5913 return 2;
5914 case DImode:
5915 case SImode:
5916 case HImode:
5917 case QImode:
5918 case CSImode:
5919 case CHImode:
5920 case CQImode:
5922 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
5924 if (size <= 32)
5926 classes[0] = X86_64_INTEGERSI_CLASS;
5927 return 1;
5929 else if (size <= 64)
5931 classes[0] = X86_64_INTEGER_CLASS;
5932 return 1;
5934 else if (size <= 64+32)
5936 classes[0] = X86_64_INTEGER_CLASS;
5937 classes[1] = X86_64_INTEGERSI_CLASS;
5938 return 2;
5940 else if (size <= 64+64)
5942 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5943 return 2;
5945 else
5946 gcc_unreachable ();
5948 case CDImode:
5949 case TImode:
5950 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5951 return 2;
5952 case COImode:
5953 case OImode:
5954 /* OImode shouldn't be used directly. */
5955 gcc_unreachable ();
5956 case CTImode:
5957 return 0;
5958 case SFmode:
5959 if (!(bit_offset % 64))
5960 classes[0] = X86_64_SSESF_CLASS;
5961 else
5962 classes[0] = X86_64_SSE_CLASS;
5963 return 1;
5964 case DFmode:
5965 classes[0] = X86_64_SSEDF_CLASS;
5966 return 1;
5967 case XFmode:
5968 classes[0] = X86_64_X87_CLASS;
5969 classes[1] = X86_64_X87UP_CLASS;
5970 return 2;
5971 case TFmode:
5972 classes[0] = X86_64_SSE_CLASS;
5973 classes[1] = X86_64_SSEUP_CLASS;
5974 return 2;
5975 case SCmode:
5976 classes[0] = X86_64_SSE_CLASS;
5977 if (!(bit_offset % 64))
5978 return 1;
5979 else
5981 static bool warned;
5983 if (!warned && warn_psabi)
5985 warned = true;
5986 inform (input_location,
5987 "the ABI of passing structure with complex float"
5988 " member has changed in GCC 4.4");
5990 classes[1] = X86_64_SSESF_CLASS;
5991 return 2;
5993 case DCmode:
5994 classes[0] = X86_64_SSEDF_CLASS;
5995 classes[1] = X86_64_SSEDF_CLASS;
5996 return 2;
5997 case XCmode:
5998 classes[0] = X86_64_COMPLEX_X87_CLASS;
5999 return 1;
6000 case TCmode:
6001 /* This mode is larger than 16 bytes. */
6002 return 0;
6003 case V8SFmode:
6004 case V8SImode:
6005 case V32QImode:
6006 case V16HImode:
6007 case V4DFmode:
6008 case V4DImode:
6009 classes[0] = X86_64_SSE_CLASS;
6010 classes[1] = X86_64_SSEUP_CLASS;
6011 classes[2] = X86_64_SSEUP_CLASS;
6012 classes[3] = X86_64_SSEUP_CLASS;
6013 return 4;
6014 case V4SFmode:
6015 case V4SImode:
6016 case V16QImode:
6017 case V8HImode:
6018 case V2DFmode:
6019 case V2DImode:
6020 classes[0] = X86_64_SSE_CLASS;
6021 classes[1] = X86_64_SSEUP_CLASS;
6022 return 2;
6023 case V1TImode:
6024 case V1DImode:
6025 case V2SFmode:
6026 case V2SImode:
6027 case V4HImode:
6028 case V8QImode:
6029 classes[0] = X86_64_SSE_CLASS;
6030 return 1;
6031 case BLKmode:
6032 case VOIDmode:
6033 return 0;
6034 default:
6035 gcc_assert (VECTOR_MODE_P (mode));
6037 if (bytes > 16)
6038 return 0;
6040 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6042 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6043 classes[0] = X86_64_INTEGERSI_CLASS;
6044 else
6045 classes[0] = X86_64_INTEGER_CLASS;
6046 classes[1] = X86_64_INTEGER_CLASS;
6047 return 1 + (bytes > 8);
6051 /* Examine the argument and return set number of register required in each
6052 class. Return 0 iff parameter should be passed in memory. */
6053 static int
6054 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6055 int *int_nregs, int *sse_nregs)
6057 enum x86_64_reg_class regclass[MAX_CLASSES];
6058 int n = classify_argument (mode, type, regclass, 0);
6060 *int_nregs = 0;
6061 *sse_nregs = 0;
6062 if (!n)
6063 return 0;
6064 for (n--; n >= 0; n--)
6065 switch (regclass[n])
6067 case X86_64_INTEGER_CLASS:
6068 case X86_64_INTEGERSI_CLASS:
6069 (*int_nregs)++;
6070 break;
6071 case X86_64_SSE_CLASS:
6072 case X86_64_SSESF_CLASS:
6073 case X86_64_SSEDF_CLASS:
6074 (*sse_nregs)++;
6075 break;
6076 case X86_64_NO_CLASS:
6077 case X86_64_SSEUP_CLASS:
6078 break;
6079 case X86_64_X87_CLASS:
6080 case X86_64_X87UP_CLASS:
6081 if (!in_return)
6082 return 0;
6083 break;
6084 case X86_64_COMPLEX_X87_CLASS:
6085 return in_return ? 2 : 0;
6086 case X86_64_MEMORY_CLASS:
6087 gcc_unreachable ();
6089 return 1;
6092 /* Construct container for the argument used by GCC interface. See
6093 FUNCTION_ARG for the detailed description. */
6095 static rtx
6096 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6097 const_tree type, int in_return, int nintregs, int nsseregs,
6098 const int *intreg, int sse_regno)
6100 /* The following variables hold the static issued_error state. */
6101 static bool issued_sse_arg_error;
6102 static bool issued_sse_ret_error;
6103 static bool issued_x87_ret_error;
6105 enum machine_mode tmpmode;
6106 int bytes =
6107 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6108 enum x86_64_reg_class regclass[MAX_CLASSES];
6109 int n;
6110 int i;
6111 int nexps = 0;
6112 int needed_sseregs, needed_intregs;
6113 rtx exp[MAX_CLASSES];
6114 rtx ret;
6116 n = classify_argument (mode, type, regclass, 0);
6117 if (!n)
6118 return NULL;
6119 if (!examine_argument (mode, type, in_return, &needed_intregs,
6120 &needed_sseregs))
6121 return NULL;
6122 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6123 return NULL;
6125 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6126 some less clueful developer tries to use floating-point anyway. */
6127 if (needed_sseregs && !TARGET_SSE)
6129 if (in_return)
6131 if (!issued_sse_ret_error)
6133 error ("SSE register return with SSE disabled");
6134 issued_sse_ret_error = true;
6137 else if (!issued_sse_arg_error)
6139 error ("SSE register argument with SSE disabled");
6140 issued_sse_arg_error = true;
6142 return NULL;
6145 /* Likewise, error if the ABI requires us to return values in the
6146 x87 registers and the user specified -mno-80387. */
6147 if (!TARGET_80387 && in_return)
6148 for (i = 0; i < n; i++)
6149 if (regclass[i] == X86_64_X87_CLASS
6150 || regclass[i] == X86_64_X87UP_CLASS
6151 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6153 if (!issued_x87_ret_error)
6155 error ("x87 register return with x87 disabled");
6156 issued_x87_ret_error = true;
6158 return NULL;
6161 /* First construct simple cases. Avoid SCmode, since we want to use a
6162 single register to pass this type. */
6163 if (n == 1 && mode != SCmode)
6164 switch (regclass[0])
6166 case X86_64_INTEGER_CLASS:
6167 case X86_64_INTEGERSI_CLASS:
6168 return gen_rtx_REG (mode, intreg[0]);
6169 case X86_64_SSE_CLASS:
6170 case X86_64_SSESF_CLASS:
6171 case X86_64_SSEDF_CLASS:
6172 if (mode != BLKmode)
6173 return gen_reg_or_parallel (mode, orig_mode,
6174 SSE_REGNO (sse_regno));
6175 break;
6176 case X86_64_X87_CLASS:
6177 case X86_64_COMPLEX_X87_CLASS:
6178 return gen_rtx_REG (mode, FIRST_STACK_REG);
6179 case X86_64_NO_CLASS:
6180 /* Zero sized array, struct or class. */
6181 return NULL;
6182 default:
6183 gcc_unreachable ();
6185 if (n == 2
6186 && regclass[0] == X86_64_SSE_CLASS
6187 && regclass[1] == X86_64_SSEUP_CLASS
6188 && mode != BLKmode)
6189 return gen_reg_or_parallel (mode, orig_mode,
6190 SSE_REGNO (sse_regno));
6191 if (n == 4
6192 && regclass[0] == X86_64_SSE_CLASS
6193 && regclass[1] == X86_64_SSEUP_CLASS
6194 && regclass[2] == X86_64_SSEUP_CLASS
6195 && regclass[3] == X86_64_SSEUP_CLASS
6196 && mode != BLKmode)
6197 return gen_reg_or_parallel (mode, orig_mode,
6198 SSE_REGNO (sse_regno));
6199 if (n == 2
6200 && regclass[0] == X86_64_X87_CLASS
6201 && regclass[1] == X86_64_X87UP_CLASS)
6202 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6204 if (n == 2
6205 && regclass[0] == X86_64_INTEGER_CLASS
6206 && regclass[1] == X86_64_INTEGER_CLASS
6207 && (mode == CDImode || mode == TImode || mode == TFmode)
6208 && intreg[0] + 1 == intreg[1])
6209 return gen_rtx_REG (mode, intreg[0]);
6211 /* Otherwise figure out the entries of the PARALLEL. */
6212 for (i = 0; i < n; i++)
6214 int pos;
6216 switch (regclass[i])
6218 case X86_64_NO_CLASS:
6219 break;
6220 case X86_64_INTEGER_CLASS:
6221 case X86_64_INTEGERSI_CLASS:
6222 /* Merge TImodes on aligned occasions here too. */
6223 if (i * 8 + 8 > bytes)
6224 tmpmode
6225 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6226 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6227 tmpmode = SImode;
6228 else
6229 tmpmode = DImode;
6230 /* We've requested 24 bytes for which we
6231 don't have a mode. Use DImode. */
6232 if (tmpmode == BLKmode)
6233 tmpmode = DImode;
6234 exp [nexps++]
6235 = gen_rtx_EXPR_LIST (VOIDmode,
6236 gen_rtx_REG (tmpmode, *intreg),
6237 GEN_INT (i*8));
6238 intreg++;
6239 break;
6240 case X86_64_SSESF_CLASS:
6241 exp [nexps++]
6242 = gen_rtx_EXPR_LIST (VOIDmode,
6243 gen_rtx_REG (SFmode,
6244 SSE_REGNO (sse_regno)),
6245 GEN_INT (i*8));
6246 sse_regno++;
6247 break;
6248 case X86_64_SSEDF_CLASS:
6249 exp [nexps++]
6250 = gen_rtx_EXPR_LIST (VOIDmode,
6251 gen_rtx_REG (DFmode,
6252 SSE_REGNO (sse_regno)),
6253 GEN_INT (i*8));
6254 sse_regno++;
6255 break;
6256 case X86_64_SSE_CLASS:
6257 pos = i;
6258 switch (n)
6260 case 1:
6261 tmpmode = DImode;
6262 break;
6263 case 2:
6264 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6266 tmpmode = TImode;
6267 i++;
6269 else
6270 tmpmode = DImode;
6271 break;
6272 case 4:
6273 gcc_assert (i == 0
6274 && regclass[1] == X86_64_SSEUP_CLASS
6275 && regclass[2] == X86_64_SSEUP_CLASS
6276 && regclass[3] == X86_64_SSEUP_CLASS);
6277 tmpmode = OImode;
6278 i += 3;
6279 break;
6280 default:
6281 gcc_unreachable ();
6283 exp [nexps++]
6284 = gen_rtx_EXPR_LIST (VOIDmode,
6285 gen_rtx_REG (tmpmode,
6286 SSE_REGNO (sse_regno)),
6287 GEN_INT (pos*8));
6288 sse_regno++;
6289 break;
6290 default:
6291 gcc_unreachable ();
6295 /* Empty aligned struct, union or class. */
6296 if (nexps == 0)
6297 return NULL;
6299 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6300 for (i = 0; i < nexps; i++)
6301 XVECEXP (ret, 0, i) = exp [i];
6302 return ret;
6305 /* Update the data in CUM to advance over an argument of mode MODE
6306 and data type TYPE. (TYPE is null for libcalls where that information
6307 may not be available.) */
6309 static void
6310 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6311 const_tree type, HOST_WIDE_INT bytes,
6312 HOST_WIDE_INT words)
6314 switch (mode)
6316 default:
6317 break;
6319 case BLKmode:
6320 if (bytes < 0)
6321 break;
6322 /* FALLTHRU */
6324 case DImode:
6325 case SImode:
6326 case HImode:
6327 case QImode:
6328 cum->words += words;
6329 cum->nregs -= words;
6330 cum->regno += words;
6332 if (cum->nregs <= 0)
6334 cum->nregs = 0;
6335 cum->regno = 0;
6337 break;
6339 case OImode:
6340 /* OImode shouldn't be used directly. */
6341 gcc_unreachable ();
6343 case DFmode:
6344 if (cum->float_in_sse < 2)
6345 break;
6346 case SFmode:
6347 if (cum->float_in_sse < 1)
6348 break;
6349 /* FALLTHRU */
6351 case V8SFmode:
6352 case V8SImode:
6353 case V32QImode:
6354 case V16HImode:
6355 case V4DFmode:
6356 case V4DImode:
6357 case TImode:
6358 case V16QImode:
6359 case V8HImode:
6360 case V4SImode:
6361 case V2DImode:
6362 case V4SFmode:
6363 case V2DFmode:
6364 if (!type || !AGGREGATE_TYPE_P (type))
6366 cum->sse_words += words;
6367 cum->sse_nregs -= 1;
6368 cum->sse_regno += 1;
6369 if (cum->sse_nregs <= 0)
6371 cum->sse_nregs = 0;
6372 cum->sse_regno = 0;
6375 break;
6377 case V8QImode:
6378 case V4HImode:
6379 case V2SImode:
6380 case V2SFmode:
6381 case V1TImode:
6382 case V1DImode:
6383 if (!type || !AGGREGATE_TYPE_P (type))
6385 cum->mmx_words += words;
6386 cum->mmx_nregs -= 1;
6387 cum->mmx_regno += 1;
6388 if (cum->mmx_nregs <= 0)
6390 cum->mmx_nregs = 0;
6391 cum->mmx_regno = 0;
6394 break;
6398 static void
6399 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6400 const_tree type, HOST_WIDE_INT words, bool named)
6402 int int_nregs, sse_nregs;
6404 /* Unnamed 256bit vector mode parameters are passed on stack. */
6405 if (!named && VALID_AVX256_REG_MODE (mode))
6406 return;
6408 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6409 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6411 cum->nregs -= int_nregs;
6412 cum->sse_nregs -= sse_nregs;
6413 cum->regno += int_nregs;
6414 cum->sse_regno += sse_nregs;
6416 else
6418 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6419 cum->words = (cum->words + align - 1) & ~(align - 1);
6420 cum->words += words;
6424 static void
6425 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6426 HOST_WIDE_INT words)
6428 /* Otherwise, this should be passed indirectly. */
6429 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6431 cum->words += words;
6432 if (cum->nregs > 0)
6434 cum->nregs -= 1;
6435 cum->regno += 1;
6439 /* Update the data in CUM to advance over an argument of mode MODE and
6440 data type TYPE. (TYPE is null for libcalls where that information
6441 may not be available.) */
6443 static void
6444 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6445 const_tree type, bool named)
6447 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6448 HOST_WIDE_INT bytes, words;
6450 if (mode == BLKmode)
6451 bytes = int_size_in_bytes (type);
6452 else
6453 bytes = GET_MODE_SIZE (mode);
6454 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6456 if (type)
6457 mode = type_natural_mode (type, NULL);
6459 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6460 function_arg_advance_ms_64 (cum, bytes, words);
6461 else if (TARGET_64BIT)
6462 function_arg_advance_64 (cum, mode, type, words, named);
6463 else
6464 function_arg_advance_32 (cum, mode, type, bytes, words);
6467 /* Define where to put the arguments to a function.
6468 Value is zero to push the argument on the stack,
6469 or a hard register in which to store the argument.
6471 MODE is the argument's machine mode.
6472 TYPE is the data type of the argument (as a tree).
6473 This is null for libcalls where that information may
6474 not be available.
6475 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6476 the preceding args and about the function being called.
6477 NAMED is nonzero if this argument is a named parameter
6478 (otherwise it is an extra parameter matching an ellipsis). */
6480 static rtx
6481 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6482 enum machine_mode orig_mode, const_tree type,
6483 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6485 static bool warnedsse, warnedmmx;
6487 /* Avoid the AL settings for the Unix64 ABI. */
6488 if (mode == VOIDmode)
6489 return constm1_rtx;
6491 switch (mode)
6493 default:
6494 break;
6496 case BLKmode:
6497 if (bytes < 0)
6498 break;
6499 /* FALLTHRU */
6500 case DImode:
6501 case SImode:
6502 case HImode:
6503 case QImode:
6504 if (words <= cum->nregs)
6506 int regno = cum->regno;
6508 /* Fastcall allocates the first two DWORD (SImode) or
6509 smaller arguments to ECX and EDX if the argument isn't an
6510 aggregate type. */
6511 if (cum->fastcall)
6513 if (mode == BLKmode
6514 || mode == DImode
6515 || (type && AGGREGATE_TYPE_P (type)))
6516 break;
6518 /* ECX not EAX is the first allocated register. */
6519 if (regno == AX_REG)
6520 regno = CX_REG;
6522 return gen_rtx_REG (mode, regno);
6524 break;
6526 case DFmode:
6527 if (cum->float_in_sse < 2)
6528 break;
6529 case SFmode:
6530 if (cum->float_in_sse < 1)
6531 break;
6532 /* FALLTHRU */
6533 case TImode:
6534 /* In 32bit, we pass TImode in xmm registers. */
6535 case V16QImode:
6536 case V8HImode:
6537 case V4SImode:
6538 case V2DImode:
6539 case V4SFmode:
6540 case V2DFmode:
6541 if (!type || !AGGREGATE_TYPE_P (type))
6543 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6545 warnedsse = true;
6546 warning (0, "SSE vector argument without SSE enabled "
6547 "changes the ABI");
6549 if (cum->sse_nregs)
6550 return gen_reg_or_parallel (mode, orig_mode,
6551 cum->sse_regno + FIRST_SSE_REG);
6553 break;
6555 case OImode:
6556 /* OImode shouldn't be used directly. */
6557 gcc_unreachable ();
6559 case V8SFmode:
6560 case V8SImode:
6561 case V32QImode:
6562 case V16HImode:
6563 case V4DFmode:
6564 case V4DImode:
6565 if (!type || !AGGREGATE_TYPE_P (type))
6567 if (cum->sse_nregs)
6568 return gen_reg_or_parallel (mode, orig_mode,
6569 cum->sse_regno + FIRST_SSE_REG);
6571 break;
6573 case V8QImode:
6574 case V4HImode:
6575 case V2SImode:
6576 case V2SFmode:
6577 case V1TImode:
6578 case V1DImode:
6579 if (!type || !AGGREGATE_TYPE_P (type))
6581 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6583 warnedmmx = true;
6584 warning (0, "MMX vector argument without MMX enabled "
6585 "changes the ABI");
6587 if (cum->mmx_nregs)
6588 return gen_reg_or_parallel (mode, orig_mode,
6589 cum->mmx_regno + FIRST_MMX_REG);
6591 break;
6594 return NULL_RTX;
6597 static rtx
6598 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6599 enum machine_mode orig_mode, const_tree type, bool named)
6601 /* Handle a hidden AL argument containing number of registers
6602 for varargs x86-64 functions. */
6603 if (mode == VOIDmode)
6604 return GEN_INT (cum->maybe_vaarg
6605 ? (cum->sse_nregs < 0
6606 ? X86_64_SSE_REGPARM_MAX
6607 : cum->sse_regno)
6608 : -1);
6610 switch (mode)
6612 default:
6613 break;
6615 case V8SFmode:
6616 case V8SImode:
6617 case V32QImode:
6618 case V16HImode:
6619 case V4DFmode:
6620 case V4DImode:
6621 /* Unnamed 256bit vector mode parameters are passed on stack. */
6622 if (!named)
6623 return NULL;
6624 break;
6627 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6628 cum->sse_nregs,
6629 &x86_64_int_parameter_registers [cum->regno],
6630 cum->sse_regno);
6633 static rtx
6634 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6635 enum machine_mode orig_mode, bool named,
6636 HOST_WIDE_INT bytes)
6638 unsigned int regno;
6640 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6641 We use the value -2 to specify that the current function call uses the MS ABI. */
6642 if (mode == VOIDmode)
6643 return GEN_INT (-2);
6645 /* If we've run out of registers, it goes on the stack. */
6646 if (cum->nregs == 0)
6647 return NULL_RTX;
6649 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6651 /* Only floating point modes are passed in anything but integer regs. */
6652 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6654 if (named)
6655 regno = cum->regno + FIRST_SSE_REG;
6656 else
6658 rtx t1, t2;
6660 /* Unnamed floating parameters are passed in both the
6661 SSE and integer registers. */
6662 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6663 t2 = gen_rtx_REG (mode, regno);
6664 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6665 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6666 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6669 /* Handle aggregate types passed in registers. */
6670 if (orig_mode == BLKmode)
6672 if (bytes > 0 && bytes <= 8)
6673 mode = (bytes > 4 ? DImode : SImode);
6674 if (mode == BLKmode)
6675 mode = DImode;
6678 return gen_reg_or_parallel (mode, orig_mode, regno);
6681 /* Return where to put the arguments to a function.
6682 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6684 MODE is the argument's machine mode. TYPE is the data type of the
6685 argument. It is null for libcalls where that information may not be
6686 available. CUM gives information about the preceding args and about
6687 the function being called. NAMED is nonzero if this argument is a
6688 named parameter (otherwise it is an extra parameter matching an
6689 ellipsis). */
6691 static rtx
6692 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6693 const_tree type, bool named)
6695 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6696 enum machine_mode mode = omode;
6697 HOST_WIDE_INT bytes, words;
6698 rtx arg;
6700 if (mode == BLKmode)
6701 bytes = int_size_in_bytes (type);
6702 else
6703 bytes = GET_MODE_SIZE (mode);
6704 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6706 /* To simplify the code below, represent vector types with a vector mode
6707 even if MMX/SSE are not active. */
6708 if (type && TREE_CODE (type) == VECTOR_TYPE)
6709 mode = type_natural_mode (type, cum);
6711 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6712 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6713 else if (TARGET_64BIT)
6714 arg = function_arg_64 (cum, mode, omode, type, named);
6715 else
6716 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6718 return arg;
6721 /* A C expression that indicates when an argument must be passed by
6722 reference. If nonzero for an argument, a copy of that argument is
6723 made in memory and a pointer to the argument is passed instead of
6724 the argument itself. The pointer is passed in whatever way is
6725 appropriate for passing a pointer to that type. */
6727 static bool
6728 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6729 enum machine_mode mode ATTRIBUTE_UNUSED,
6730 const_tree type, bool named ATTRIBUTE_UNUSED)
6732 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6734 /* See Windows x64 Software Convention. */
6735 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6737 int msize = (int) GET_MODE_SIZE (mode);
6738 if (type)
6740 /* Arrays are passed by reference. */
6741 if (TREE_CODE (type) == ARRAY_TYPE)
6742 return true;
6744 if (AGGREGATE_TYPE_P (type))
6746 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6747 are passed by reference. */
6748 msize = int_size_in_bytes (type);
6752 /* __m128 is passed by reference. */
6753 switch (msize) {
6754 case 1: case 2: case 4: case 8:
6755 break;
6756 default:
6757 return true;
6760 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6761 return 1;
6763 return 0;
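/* Editorial illustration (not part of the original source): under the
   Windows x64 convention handled above, the hypothetical arguments

     struct s8  { char c[8]; };    -> passed by value in a register
     struct s12 { char c[12]; };   -> copied, pointer passed instead

   follow the 1/2/4/8-byte rule in the switch: any other size,
   including __m128, is passed by reference.  */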
6766 /* Return true when TYPE should be 128bit aligned for 32bit argument
6767 passing ABI. XXX: This function is obsolete and is only used for
6768 checking psABI compatibility with previous versions of GCC. */
6770 static bool
6771 ix86_compat_aligned_value_p (const_tree type)
6773 enum machine_mode mode = TYPE_MODE (type);
6774 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6775 || mode == TDmode
6776 || mode == TFmode
6777 || mode == TCmode)
6778 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6779 return true;
6780 if (TYPE_ALIGN (type) < 128)
6781 return false;
6783 if (AGGREGATE_TYPE_P (type))
6785 /* Walk the aggregates recursively. */
6786 switch (TREE_CODE (type))
6788 case RECORD_TYPE:
6789 case UNION_TYPE:
6790 case QUAL_UNION_TYPE:
6792 tree field;
6794 /* Walk all the structure fields. */
6795 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6797 if (TREE_CODE (field) == FIELD_DECL
6798 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6799 return true;
6801 break;
6804 case ARRAY_TYPE:
6805 /* Just for use if some languages pass arrays by value. */
6806 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6807 return true;
6808 break;
6810 default:
6811 gcc_unreachable ();
6814 return false;
6817 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6818 XXX: This function is obsolete and is only used for checking psABI
6819 compatibility with previous versions of GCC. */
6821 static unsigned int
6822 ix86_compat_function_arg_boundary (enum machine_mode mode,
6823 const_tree type, unsigned int align)
6825 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6826 natural boundaries. */
6827 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6829 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6830 make an exception for SSE modes since these require 128bit
6831 alignment.
6833 The handling here differs from field_alignment. ICC aligns MMX
6834 arguments to 4 byte boundaries, while structure fields are aligned
6835 to 8 byte boundaries. */
6836 if (!type)
6838 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6839 align = PARM_BOUNDARY;
6841 else
6843 if (!ix86_compat_aligned_value_p (type))
6844 align = PARM_BOUNDARY;
6847 if (align > BIGGEST_ALIGNMENT)
6848 align = BIGGEST_ALIGNMENT;
6849 return align;
6852 /* Return true when TYPE should be 128bit aligned for 32bit argument
6853 passing ABI. */
6855 static bool
6856 ix86_contains_aligned_value_p (const_tree type)
6858 enum machine_mode mode = TYPE_MODE (type);
6860 if (mode == XFmode || mode == XCmode)
6861 return false;
6863 if (TYPE_ALIGN (type) < 128)
6864 return false;
6866 if (AGGREGATE_TYPE_P (type))
6868 /* Walk the aggregates recursively. */
6869 switch (TREE_CODE (type))
6871 case RECORD_TYPE:
6872 case UNION_TYPE:
6873 case QUAL_UNION_TYPE:
6875 tree field;
6877 /* Walk all the structure fields. */
6878 for (field = TYPE_FIELDS (type);
6879 field;
6880 field = DECL_CHAIN (field))
6882 if (TREE_CODE (field) == FIELD_DECL
6883 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
6884 return true;
6886 break;
6889 case ARRAY_TYPE:
6890 /* Just for use if some languages pass arrays by value. */
6891 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
6892 return true;
6893 break;
6895 default:
6896 gcc_unreachable ();
6899 else
6900 return TYPE_ALIGN (type) >= 128;
6902 return false;
6905 /* Gives the alignment boundary, in bits, of an argument with the
6906 specified mode and type. */
6908 static unsigned int
6909 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
6911 unsigned int align;
6912 if (type)
6914 /* Since the main variant type is used for the call, we convert TYPE to
6915 its main variant. */
6916 type = TYPE_MAIN_VARIANT (type);
6917 align = TYPE_ALIGN (type);
6919 else
6920 align = GET_MODE_ALIGNMENT (mode);
6921 if (align < PARM_BOUNDARY)
6922 align = PARM_BOUNDARY;
6923 else
6925 static bool warned;
6926 unsigned int saved_align = align;
6928 if (!TARGET_64BIT)
6930 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
6931 if (!type)
6933 if (mode == XFmode || mode == XCmode)
6934 align = PARM_BOUNDARY;
6936 else if (!ix86_contains_aligned_value_p (type))
6937 align = PARM_BOUNDARY;
6939 if (align < 128)
6940 align = PARM_BOUNDARY;
6943 if (warn_psabi
6944 && !warned
6945 && align != ix86_compat_function_arg_boundary (mode, type,
6946 saved_align))
6948 warned = true;
6949 inform (input_location,
6950 "The ABI for passing parameters with %d-byte"
6951 " alignment has changed in GCC 4.6",
6952 align / BITS_PER_UNIT);
6956 return align;
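/* Editorial illustration (not part of the original source): on ia32 a
   plain

     void f (double d);

   keeps the 32-bit PARM_BOUNDARY for D, while an argument whose type
   is 16-byte aligned (e.g. __m128) gets a 128-bit boundary from the
   code above; the inform () only fires when this differs from what
   GCC releases before 4.6 would have chosen.  */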
6959 /* Return true if N is a possible register number of function value. */
6961 static bool
6962 ix86_function_value_regno_p (const unsigned int regno)
6964 switch (regno)
6966 case AX_REG:
6967 return true;
6969 case FIRST_FLOAT_REG:
6970 /* TODO: The function should depend on current function ABI but
6971 builtins.c would need updating then. Therefore we use the
6972 default ABI. */
6973 if (TARGET_64BIT && ix86_abi == MS_ABI)
6974 return false;
6975 return TARGET_FLOAT_RETURNS_IN_80387;
6977 case FIRST_SSE_REG:
6978 return TARGET_SSE;
6980 case FIRST_MMX_REG:
6981 if (TARGET_MACHO || TARGET_64BIT)
6982 return false;
6983 return TARGET_MMX;
6986 return false;
6989 /* Define how to find the value returned by a function.
6990 VALTYPE is the data type of the value (as a tree).
6991 If the precise function being called is known, FUNC is its FUNCTION_DECL;
6992 otherwise, FUNC is 0. */
6994 static rtx
6995 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
6996 const_tree fntype, const_tree fn)
6998 unsigned int regno;
7000 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7001 we normally prevent this case when mmx is not available. However
7002 some ABIs may require the result to be returned like DImode. */
7003 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7004 regno = FIRST_MMX_REG;
7006 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7007 we prevent this case when sse is not available. However some ABIs
7008 may require the result to be returned like integer TImode. */
7009 else if (mode == TImode
7010 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7011 regno = FIRST_SSE_REG;
7013 /* 32-byte vector modes in %ymm0. */
7014 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7015 regno = FIRST_SSE_REG;
7017 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7018 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7019 regno = FIRST_FLOAT_REG;
7020 else
7021 /* Most things go in %eax. */
7022 regno = AX_REG;
7024 /* Override FP return register with %xmm0 for local functions when
7025 SSE math is enabled or for functions with sseregparm attribute. */
7026 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7028 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7029 if ((sse_level >= 1 && mode == SFmode)
7030 || (sse_level == 2 && mode == DFmode))
7031 regno = FIRST_SSE_REG;
7034 /* OImode shouldn't be used directly. */
7035 gcc_assert (mode != OImode);
7037 return gen_rtx_REG (orig_mode, regno);
7040 static rtx
7041 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7042 const_tree valtype)
7044 rtx ret;
7046 /* Handle libcalls, which don't provide a type node. */
7047 if (valtype == NULL)
7049 unsigned int regno;
7051 switch (mode)
7053 case SFmode:
7054 case SCmode:
7055 case DFmode:
7056 case DCmode:
7057 case TFmode:
7058 case SDmode:
7059 case DDmode:
7060 case TDmode:
7061 regno = FIRST_SSE_REG;
7062 break;
7063 case XFmode:
7064 case XCmode:
7065 regno = FIRST_FLOAT_REG;
7066 break;
7067 case TCmode:
7068 return NULL;
7069 default:
7070 regno = AX_REG;
7073 return gen_rtx_REG (mode, regno);
7075 else if (POINTER_TYPE_P (valtype))
7077 /* Pointers are always returned in word_mode. */
7078 mode = word_mode;
7081 ret = construct_container (mode, orig_mode, valtype, 1,
7082 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7083 x86_64_int_return_registers, 0);
7085 /* For zero-sized structures, construct_container returns NULL, but we
7086 need to keep the rest of the compiler happy by returning a meaningful value. */
7087 if (!ret)
7088 ret = gen_rtx_REG (orig_mode, AX_REG);
7090 return ret;
7093 static rtx
7094 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7096 unsigned int regno = AX_REG;
7098 if (TARGET_SSE)
7100 switch (GET_MODE_SIZE (mode))
7102 case 16:
7103 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7104 && !COMPLEX_MODE_P (mode))
7105 regno = FIRST_SSE_REG;
7106 break;
7107 case 8:
7108 case 4:
7109 if (mode == SFmode || mode == DFmode)
7110 regno = FIRST_SSE_REG;
7111 break;
7112 default:
7113 break;
7116 return gen_rtx_REG (orig_mode, regno);
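/* A rough summary of the return-register selection implemented by the three
   helpers above (illustrative only; the code itself is authoritative):

     32-bit:      integers in %eax, x87 floats in %st(0) (or %xmm0 with
                  sseregparm), 8-byte vectors in %mm0, 16/32-byte vectors
                  in %xmm0/%ymm0.
     64-bit SysV: classified via construct_container, typically %rax/%rdx
                  for integers and %xmm0/%xmm1 for floating point.
     64-bit MS:   %rax by default, %xmm0 for SFmode/DFmode and for 16-byte
                  scalar or vector modes when SSE is enabled.  */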
7119 static rtx
7120 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7121 enum machine_mode orig_mode, enum machine_mode mode)
7123 const_tree fn, fntype;
7125 fn = NULL_TREE;
7126 if (fntype_or_decl && DECL_P (fntype_or_decl))
7127 fn = fntype_or_decl;
7128 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7130 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7131 return function_value_ms_64 (orig_mode, mode);
7132 else if (TARGET_64BIT)
7133 return function_value_64 (orig_mode, mode, valtype);
7134 else
7135 return function_value_32 (orig_mode, mode, fntype, fn);
7138 static rtx
7139 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7140 bool outgoing ATTRIBUTE_UNUSED)
7142 enum machine_mode mode, orig_mode;
7144 orig_mode = TYPE_MODE (valtype);
7145 mode = type_natural_mode (valtype, NULL);
7146 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7149 /* Pointer function arguments and return values are promoted to
7150 word_mode. */
7152 static enum machine_mode
7153 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7154 int *punsignedp, const_tree fntype,
7155 int for_return)
7157 if (type != NULL_TREE && POINTER_TYPE_P (type))
7159 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7160 return word_mode;
7162 return default_promote_function_mode (type, mode, punsignedp, fntype,
7163 for_return);
7166 /* Return true if a structure, union or array with MODE containing FIELD
7167 should be accessed using BLKmode. */
7169 static bool
7170 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7172 /* Union with XFmode must be in BLKmode. */
7173 return (mode == XFmode
7174 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7175 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7178 static rtx
7179 ix86_libcall_value (enum machine_mode mode)
7181 return ix86_function_value_1 (NULL, NULL, mode, mode);
7184 /* Return true iff type is returned in memory. */
7186 static bool ATTRIBUTE_UNUSED
7187 return_in_memory_32 (const_tree type, enum machine_mode mode)
7189 HOST_WIDE_INT size;
7191 if (mode == BLKmode)
7192 return true;
7194 size = int_size_in_bytes (type);
7196 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7197 return false;
7199 if (VECTOR_MODE_P (mode) || mode == TImode)
7201 /* User-created vectors small enough to fit in EAX. */
7202 if (size < 8)
7203 return false;
7205 /* MMX/3dNow values are returned in MM0,
7206 except when it doesn't exist or the ABI prescribes otherwise. */
7207 if (size == 8)
7208 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7210 /* SSE values are returned in XMM0, except when it doesn't exist. */
7211 if (size == 16)
7212 return !TARGET_SSE;
7214 /* AVX values are returned in YMM0, except when it doesn't exist. */
7215 if (size == 32)
7216 return !TARGET_AVX;
7219 if (mode == XFmode)
7220 return false;
7222 if (size > 12)
7223 return true;
7225 /* OImode shouldn't be used directly. */
7226 gcc_assert (mode != OImode);
7228 return false;
7231 static bool ATTRIBUTE_UNUSED
7232 return_in_memory_64 (const_tree type, enum machine_mode mode)
7234 int needed_intregs, needed_sseregs;
7235 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7238 static bool ATTRIBUTE_UNUSED
7239 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7241 HOST_WIDE_INT size = int_size_in_bytes (type);
7243 /* __m128 is returned in xmm0. */
7244 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7245 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7246 return false;
7248 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7249 return size != 1 && size != 2 && size != 4 && size != 8;
7252 static bool
7253 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7255 #ifdef SUBTARGET_RETURN_IN_MEMORY
7256 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7257 #else
7258 const enum machine_mode mode = type_natural_mode (type, NULL);
7260 if (TARGET_64BIT)
7262 if (ix86_function_type_abi (fntype) == MS_ABI)
7263 return return_in_memory_ms_64 (type, mode);
7264 else
7265 return return_in_memory_64 (type, mode);
7267 else
7268 return return_in_memory_32 (type, mode);
7269 #endif
7272 /* When returning SSE vector types, we have a choice of either
7273 (1) being abi incompatible with a -march switch, or
7274 (2) generating an error.
7275 Given no good solution, I think the safest thing is one warning.
7276 The user won't be able to use -Werror, but....
7278 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7279 called in response to actually generating a caller or callee that
7280 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7281 via aggregate_value_p for general type probing from tree-ssa. */
7283 static rtx
7284 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7286 static bool warnedsse, warnedmmx;
7288 if (!TARGET_64BIT && type)
7290 /* Look at the return type of the function, not the function type. */
7291 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7293 if (!TARGET_SSE && !warnedsse)
7295 if (mode == TImode
7296 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7298 warnedsse = true;
7299 warning (0, "SSE vector return without SSE enabled "
7300 "changes the ABI");
7304 if (!TARGET_MMX && !warnedmmx)
7306 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7308 warnedmmx = true;
7309 warning (0, "MMX vector return without MMX enabled "
7310 "changes the ABI");
7315 return NULL;
7319 /* Create the va_list data type. */
7321 /* Returns the calling convention specific va_list data type.
7322 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7324 static tree
7325 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7327 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7329 /* For i386 we use plain pointer to argument area. */
7330 if (!TARGET_64BIT || abi == MS_ABI)
7331 return build_pointer_type (char_type_node);
7333 record = lang_hooks.types.make_type (RECORD_TYPE);
7334 type_decl = build_decl (BUILTINS_LOCATION,
7335 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7337 f_gpr = build_decl (BUILTINS_LOCATION,
7338 FIELD_DECL, get_identifier ("gp_offset"),
7339 unsigned_type_node);
7340 f_fpr = build_decl (BUILTINS_LOCATION,
7341 FIELD_DECL, get_identifier ("fp_offset"),
7342 unsigned_type_node);
7343 f_ovf = build_decl (BUILTINS_LOCATION,
7344 FIELD_DECL, get_identifier ("overflow_arg_area"),
7345 ptr_type_node);
7346 f_sav = build_decl (BUILTINS_LOCATION,
7347 FIELD_DECL, get_identifier ("reg_save_area"),
7348 ptr_type_node);
7350 va_list_gpr_counter_field = f_gpr;
7351 va_list_fpr_counter_field = f_fpr;
7353 DECL_FIELD_CONTEXT (f_gpr) = record;
7354 DECL_FIELD_CONTEXT (f_fpr) = record;
7355 DECL_FIELD_CONTEXT (f_ovf) = record;
7356 DECL_FIELD_CONTEXT (f_sav) = record;
7358 TYPE_STUB_DECL (record) = type_decl;
7359 TYPE_NAME (record) = type_decl;
7360 TYPE_FIELDS (record) = f_gpr;
7361 DECL_CHAIN (f_gpr) = f_fpr;
7362 DECL_CHAIN (f_fpr) = f_ovf;
7363 DECL_CHAIN (f_ovf) = f_sav;
7365 layout_type (record);
7367 /* The correct type is an array type of one element. */
7368 return build_array_type (record, build_index_type (size_zero_node));
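/* In C terms, the record built above corresponds to the familiar SysV AMD64
   va_list layout (a sketch for illustration; field order and types follow
   the declarations above):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];  */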
7371 /* Set up the builtin va_list data type and, for 64-bit, the additional
7372 calling convention specific va_list data types. */
7374 static tree
7375 ix86_build_builtin_va_list (void)
7377 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7379 /* Initialize abi specific va_list builtin types. */
7380 if (TARGET_64BIT)
7382 tree t;
7383 if (ix86_abi == MS_ABI)
7385 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7386 if (TREE_CODE (t) != RECORD_TYPE)
7387 t = build_variant_type_copy (t);
7388 sysv_va_list_type_node = t;
7390 else
7392 t = ret;
7393 if (TREE_CODE (t) != RECORD_TYPE)
7394 t = build_variant_type_copy (t);
7395 sysv_va_list_type_node = t;
7397 if (ix86_abi != MS_ABI)
7399 t = ix86_build_builtin_va_list_abi (MS_ABI);
7400 if (TREE_CODE (t) != RECORD_TYPE)
7401 t = build_variant_type_copy (t);
7402 ms_va_list_type_node = t;
7404 else
7406 t = ret;
7407 if (TREE_CODE (t) != RECORD_TYPE)
7408 t = build_variant_type_copy (t);
7409 ms_va_list_type_node = t;
7413 return ret;
7416 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7418 static void
7419 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7421 rtx save_area, mem;
7422 alias_set_type set;
7423 int i, max;
7425 /* GPR size of varargs save area. */
7426 if (cfun->va_list_gpr_size)
7427 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7428 else
7429 ix86_varargs_gpr_size = 0;
7431 /* FPR size of varargs save area. We don't need it if we don't pass
7432 anything in SSE registers. */
7433 if (TARGET_SSE && cfun->va_list_fpr_size)
7434 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7435 else
7436 ix86_varargs_fpr_size = 0;
7438 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7439 return;
7441 save_area = frame_pointer_rtx;
7442 set = get_varargs_alias_set ();
7444 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7445 if (max > X86_64_REGPARM_MAX)
7446 max = X86_64_REGPARM_MAX;
7448 for (i = cum->regno; i < max; i++)
7450 mem = gen_rtx_MEM (word_mode,
7451 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7452 MEM_NOTRAP_P (mem) = 1;
7453 set_mem_alias_set (mem, set);
7454 emit_move_insn (mem,
7455 gen_rtx_REG (word_mode,
7456 x86_64_int_parameter_registers[i]));
7459 if (ix86_varargs_fpr_size)
7461 enum machine_mode smode;
7462 rtx label, test;
7464 /* Now emit code to save SSE registers. The AX parameter contains the number
7465 of SSE parameter registers used to call this function, though all we
7466 actually check here is the zero/non-zero status. */
7468 label = gen_label_rtx ();
7469 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7470 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7471 label));
7473 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7474 we used movdqa (i.e. TImode) instead? Perhaps even better would
7475 be if we could determine the real mode of the data, via a hook
7476 into pass_stdarg. Ignore all that for now. */
7477 smode = V4SFmode;
7478 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7479 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7481 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7482 if (max > X86_64_SSE_REGPARM_MAX)
7483 max = X86_64_SSE_REGPARM_MAX;
7485 for (i = cum->sse_regno; i < max; ++i)
7487 mem = plus_constant (Pmode, save_area,
7488 i * 16 + ix86_varargs_gpr_size);
7489 mem = gen_rtx_MEM (smode, mem);
7490 MEM_NOTRAP_P (mem) = 1;
7491 set_mem_alias_set (mem, set);
7492 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7494 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7497 emit_label (label);
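/* The register save area written above thus has the following layout relative
   to frame_pointer_rtx (a sketch assuming the full SysV register set; the
   actual extent depends on cfun->va_list_gpr_size and va_list_fpr_size):

     offset   0 ..  47   up to six GPR argument registers
                         (%rdi, %rsi, %rdx, %rcx, %r8, %r9), 8 bytes each
     offset  48 .. 175   up to eight SSE argument registers, 16 bytes each
                         (skipped entirely when %al is zero at run time)  */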
7501 static void
7502 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7504 alias_set_type set = get_varargs_alias_set ();
7505 int i;
7507 /* Reset to zero, as a sysv va_arg may have been
7508 used before. */
7509 ix86_varargs_gpr_size = 0;
7510 ix86_varargs_fpr_size = 0;
7512 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7514 rtx reg, mem;
7516 mem = gen_rtx_MEM (Pmode,
7517 plus_constant (Pmode, virtual_incoming_args_rtx,
7518 i * UNITS_PER_WORD));
7519 MEM_NOTRAP_P (mem) = 1;
7520 set_mem_alias_set (mem, set);
7522 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7523 emit_move_insn (mem, reg);
7527 static void
7528 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7529 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7530 int no_rtl)
7532 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7533 CUMULATIVE_ARGS next_cum;
7534 tree fntype;
7536 /* This argument doesn't appear to be used anymore, which is good,
7537 because the old code here didn't suppress rtl generation. */
7538 gcc_assert (!no_rtl);
7540 if (!TARGET_64BIT)
7541 return;
7543 fntype = TREE_TYPE (current_function_decl);
7545 /* For varargs, we do not want to skip the dummy va_dcl argument.
7546 For stdargs, we do want to skip the last named argument. */
7547 next_cum = *cum;
7548 if (stdarg_p (fntype))
7549 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7550 true);
7552 if (cum->call_abi == MS_ABI)
7553 setup_incoming_varargs_ms_64 (&next_cum);
7554 else
7555 setup_incoming_varargs_64 (&next_cum);
7558 /* Check whether TYPE is a va_list that is just a plain char pointer. */
7560 static bool
7561 is_va_list_char_pointer (tree type)
7563 tree canonic;
7565 /* For 32-bit it is always true. */
7566 if (!TARGET_64BIT)
7567 return true;
7568 canonic = ix86_canonical_va_list_type (type);
7569 return (canonic == ms_va_list_type_node
7570 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7573 /* Implement va_start. */
7575 static void
7576 ix86_va_start (tree valist, rtx nextarg)
7578 HOST_WIDE_INT words, n_gpr, n_fpr;
7579 tree f_gpr, f_fpr, f_ovf, f_sav;
7580 tree gpr, fpr, ovf, sav, t;
7581 tree type;
7582 rtx ovf_rtx;
7584 if (flag_split_stack
7585 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7587 unsigned int scratch_regno;
7589 /* When we are splitting the stack, we can't refer to the stack
7590 arguments using internal_arg_pointer, because they may be on
7591 the old stack. The split stack prologue will arrange to
7592 leave a pointer to the old stack arguments in a scratch
7593 register, which we here copy to a pseudo-register. The split
7594 stack prologue can't set the pseudo-register directly because
7595 it (the prologue) runs before any registers have been saved. */
7597 scratch_regno = split_stack_prologue_scratch_regno ();
7598 if (scratch_regno != INVALID_REGNUM)
7600 rtx reg, seq;
7602 reg = gen_reg_rtx (Pmode);
7603 cfun->machine->split_stack_varargs_pointer = reg;
7605 start_sequence ();
7606 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7607 seq = get_insns ();
7608 end_sequence ();
7610 push_topmost_sequence ();
7611 emit_insn_after (seq, entry_of_function ());
7612 pop_topmost_sequence ();
7616 /* Only the 64-bit target needs anything special. */
7617 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7619 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7620 std_expand_builtin_va_start (valist, nextarg);
7621 else
7623 rtx va_r, next;
7625 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7626 next = expand_binop (ptr_mode, add_optab,
7627 cfun->machine->split_stack_varargs_pointer,
7628 crtl->args.arg_offset_rtx,
7629 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7630 convert_move (va_r, next, 0);
7632 return;
7635 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7636 f_fpr = DECL_CHAIN (f_gpr);
7637 f_ovf = DECL_CHAIN (f_fpr);
7638 f_sav = DECL_CHAIN (f_ovf);
7640 valist = build_simple_mem_ref (valist);
7641 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7642 /* The following should be folded into the MEM_REF offset. */
7643 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7644 f_gpr, NULL_TREE);
7645 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7646 f_fpr, NULL_TREE);
7647 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7648 f_ovf, NULL_TREE);
7649 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7650 f_sav, NULL_TREE);
7652 /* Count number of gp and fp argument registers used. */
7653 words = crtl->args.info.words;
7654 n_gpr = crtl->args.info.regno;
7655 n_fpr = crtl->args.info.sse_regno;
7657 if (cfun->va_list_gpr_size)
7659 type = TREE_TYPE (gpr);
7660 t = build2 (MODIFY_EXPR, type,
7661 gpr, build_int_cst (type, n_gpr * 8));
7662 TREE_SIDE_EFFECTS (t) = 1;
7663 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7666 if (TARGET_SSE && cfun->va_list_fpr_size)
7668 type = TREE_TYPE (fpr);
7669 t = build2 (MODIFY_EXPR, type, fpr,
7670 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7671 TREE_SIDE_EFFECTS (t) = 1;
7672 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7675 /* Find the overflow area. */
7676 type = TREE_TYPE (ovf);
7677 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7678 ovf_rtx = crtl->args.internal_arg_pointer;
7679 else
7680 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7681 t = make_tree (type, ovf_rtx);
7682 if (words != 0)
7683 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7684 t = build2 (MODIFY_EXPR, type, ovf, t);
7685 TREE_SIDE_EFFECTS (t) = 1;
7686 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7688 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7690 /* Find the register save area.
7691 The function prologue saves it right above the stack frame. */
7692 type = TREE_TYPE (sav);
7693 t = make_tree (type, frame_pointer_rtx);
7694 if (!ix86_varargs_gpr_size)
7695 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7696 t = build2 (MODIFY_EXPR, type, sav, t);
7697 TREE_SIDE_EFFECTS (t) = 1;
7698 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
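/* Worked example of the va_start expansion above (illustrative; exact values
   depend on the prototype): for

     void f (const char *fmt, ...);

   one GPR (%rdi) is consumed by the named argument, so va_start sets
   gp_offset = 1 * 8 = 8, fp_offset = 0 * 16 + 8 * X86_64_REGPARM_MAX = 48,
   overflow_arg_area to the incoming stack arguments, and reg_save_area to
   the register save block set up above.  */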
7702 /* Implement va_arg. */
7704 static tree
7705 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7706 gimple_seq *post_p)
7708 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7709 tree f_gpr, f_fpr, f_ovf, f_sav;
7710 tree gpr, fpr, ovf, sav, t;
7711 int size, rsize;
7712 tree lab_false, lab_over = NULL_TREE;
7713 tree addr, t2;
7714 rtx container;
7715 int indirect_p = 0;
7716 tree ptrtype;
7717 enum machine_mode nat_mode;
7718 unsigned int arg_boundary;
7720 /* Only the 64-bit target needs anything special. */
7721 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7722 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7724 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7725 f_fpr = DECL_CHAIN (f_gpr);
7726 f_ovf = DECL_CHAIN (f_fpr);
7727 f_sav = DECL_CHAIN (f_ovf);
7729 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7730 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7731 valist = build_va_arg_indirect_ref (valist);
7732 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7733 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7734 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7736 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7737 if (indirect_p)
7738 type = build_pointer_type (type);
7739 size = int_size_in_bytes (type);
7740 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7742 nat_mode = type_natural_mode (type, NULL);
7743 switch (nat_mode)
7745 case V8SFmode:
7746 case V8SImode:
7747 case V32QImode:
7748 case V16HImode:
7749 case V4DFmode:
7750 case V4DImode:
7751 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
7752 if (!TARGET_64BIT_MS_ABI)
7754 container = NULL;
7755 break;
7758 default:
7759 container = construct_container (nat_mode, TYPE_MODE (type),
7760 type, 0, X86_64_REGPARM_MAX,
7761 X86_64_SSE_REGPARM_MAX, intreg,
7763 break;
7766 /* Pull the value out of the saved registers. */
7768 addr = create_tmp_var (ptr_type_node, "addr");
7770 if (container)
7772 int needed_intregs, needed_sseregs;
7773 bool need_temp;
7774 tree int_addr, sse_addr;
7776 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7777 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7779 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7781 need_temp = (!REG_P (container)
7782 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7783 || TYPE_ALIGN (type) > 128));
7785 /* When passing a structure, verify that it forms a consecutive block
7786 in the register save area. If not, we need to do moves. */
7787 if (!need_temp && !REG_P (container))
7789 /* Verify that all registers are strictly consecutive. */
7790 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7792 int i;
7794 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7796 rtx slot = XVECEXP (container, 0, i);
7797 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7798 || INTVAL (XEXP (slot, 1)) != i * 16)
7799 need_temp = 1;
7802 else
7804 int i;
7806 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7808 rtx slot = XVECEXP (container, 0, i);
7809 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7810 || INTVAL (XEXP (slot, 1)) != i * 8)
7811 need_temp = 1;
7815 if (!need_temp)
7817 int_addr = addr;
7818 sse_addr = addr;
7820 else
7822 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7823 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7826 /* First ensure that we fit completely in registers. */
7827 if (needed_intregs)
7829 t = build_int_cst (TREE_TYPE (gpr),
7830 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7831 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7832 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7833 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7834 gimplify_and_add (t, pre_p);
7836 if (needed_sseregs)
7838 t = build_int_cst (TREE_TYPE (fpr),
7839 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7840 + X86_64_REGPARM_MAX * 8);
7841 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7842 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7843 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7844 gimplify_and_add (t, pre_p);
7847 /* Compute index to start of area used for integer regs. */
7848 if (needed_intregs)
7850 /* int_addr = gpr + sav; */
7851 t = fold_build_pointer_plus (sav, gpr);
7852 gimplify_assign (int_addr, t, pre_p);
7854 if (needed_sseregs)
7856 /* sse_addr = fpr + sav; */
7857 t = fold_build_pointer_plus (sav, fpr);
7858 gimplify_assign (sse_addr, t, pre_p);
7860 if (need_temp)
7862 int i, prev_size = 0;
7863 tree temp = create_tmp_var (type, "va_arg_tmp");
7865 /* addr = &temp; */
7866 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
7867 gimplify_assign (addr, t, pre_p);
7869 for (i = 0; i < XVECLEN (container, 0); i++)
7871 rtx slot = XVECEXP (container, 0, i);
7872 rtx reg = XEXP (slot, 0);
7873 enum machine_mode mode = GET_MODE (reg);
7874 tree piece_type;
7875 tree addr_type;
7876 tree daddr_type;
7877 tree src_addr, src;
7878 int src_offset;
7879 tree dest_addr, dest;
7880 int cur_size = GET_MODE_SIZE (mode);
7882 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
7883 prev_size = INTVAL (XEXP (slot, 1));
7884 if (prev_size + cur_size > size)
7886 cur_size = size - prev_size;
7887 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
7888 if (mode == BLKmode)
7889 mode = QImode;
7891 piece_type = lang_hooks.types.type_for_mode (mode, 1);
7892 if (mode == GET_MODE (reg))
7893 addr_type = build_pointer_type (piece_type);
7894 else
7895 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7896 true);
7897 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7898 true);
7900 if (SSE_REGNO_P (REGNO (reg)))
7902 src_addr = sse_addr;
7903 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
7905 else
7907 src_addr = int_addr;
7908 src_offset = REGNO (reg) * 8;
7910 src_addr = fold_convert (addr_type, src_addr);
7911 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
7913 dest_addr = fold_convert (daddr_type, addr);
7914 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
7915 if (cur_size == GET_MODE_SIZE (mode))
7917 src = build_va_arg_indirect_ref (src_addr);
7918 dest = build_va_arg_indirect_ref (dest_addr);
7920 gimplify_assign (dest, src, pre_p);
7922 else
7924 tree copy
7925 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
7926 3, dest_addr, src_addr,
7927 size_int (cur_size));
7928 gimplify_and_add (copy, pre_p);
7930 prev_size += cur_size;
7934 if (needed_intregs)
7936 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
7937 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
7938 gimplify_assign (gpr, t, pre_p);
7941 if (needed_sseregs)
7943 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
7944 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
7945 gimplify_assign (fpr, t, pre_p);
7948 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
7950 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
7953 /* ... otherwise out of the overflow area. */
7955 /* When the caller aligns a parameter on the stack, any alignment
7956 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
7957 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with the
7958 caller here. */
7959 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
7960 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
7961 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
7963 /* Care for on-stack alignment if needed. */
7964 if (arg_boundary <= 64 || size == 0)
7965 t = ovf;
7966 else
7968 HOST_WIDE_INT align = arg_boundary / 8;
7969 t = fold_build_pointer_plus_hwi (ovf, align - 1);
7970 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7971 build_int_cst (TREE_TYPE (t), -align));
7974 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
7975 gimplify_assign (addr, t, pre_p);
7977 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
7978 gimplify_assign (unshare_expr (ovf), t, pre_p);
7980 if (container)
7981 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
7983 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
7984 addr = fold_convert (ptrtype, addr);
7986 if (indirect_p)
7987 addr = build_va_arg_indirect_ref (addr);
7988 return build_va_arg_indirect_ref (addr);
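/* The gimple emitted above for a simple integer va_arg amounts roughly to
   the following pseudo-C (an illustrative sketch only):

     if (ap->gp_offset >= 48)              // 6 GPRs * 8 bytes, none left
       goto overflow;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = align (ap->overflow_arg_area, boundary);
     ap->overflow_arg_area = addr + rsize * UNITS_PER_WORD;
   done:
     result = *(int *) addr;  */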
7991 /* Return true if OPNUM's MEM should be matched
7992 in movabs* patterns. */
7994 bool
7995 ix86_check_movabs (rtx insn, int opnum)
7997 rtx set, mem;
7999 set = PATTERN (insn);
8000 if (GET_CODE (set) == PARALLEL)
8001 set = XVECEXP (set, 0, 0);
8002 gcc_assert (GET_CODE (set) == SET);
8003 mem = XEXP (set, opnum);
8004 while (GET_CODE (mem) == SUBREG)
8005 mem = SUBREG_REG (mem);
8006 gcc_assert (MEM_P (mem));
8007 return volatile_ok || !MEM_VOLATILE_P (mem);
8010 /* Initialize the table of extra 80387 mathematical constants. */
8012 static void
8013 init_ext_80387_constants (void)
8015 static const char * cst[5] =
8017 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8018 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8019 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8020 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8021 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8023 int i;
8025 for (i = 0; i < 5; i++)
8027 real_from_string (&ext_80387_constants_table[i], cst[i]);
8028 /* Ensure each constant is rounded to XFmode precision. */
8029 real_convert (&ext_80387_constants_table[i],
8030 XFmode, &ext_80387_constants_table[i]);
8033 ext_80387_constants_init = 1;
8036 /* Return non-zero if the constant is something that
8037 can be loaded with a special instruction. */
8039 int
8040 standard_80387_constant_p (rtx x)
8042 enum machine_mode mode = GET_MODE (x);
8044 REAL_VALUE_TYPE r;
8046 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8047 return -1;
8049 if (x == CONST0_RTX (mode))
8050 return 1;
8051 if (x == CONST1_RTX (mode))
8052 return 2;
8054 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8056 /* For XFmode constants, try to find a special 80387 instruction when
8057 optimizing for size or on those CPUs that benefit from them. */
8058 if (mode == XFmode
8059 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8061 int i;
8063 if (! ext_80387_constants_init)
8064 init_ext_80387_constants ();
8066 for (i = 0; i < 5; i++)
8067 if (real_identical (&r, &ext_80387_constants_table[i]))
8068 return i + 3;
8071 /* Load of the constant -0.0 or -1.0 will be split as
8072 fldz;fchs or fld1;fchs sequence. */
8073 if (real_isnegzero (&r))
8074 return 8;
8075 if (real_identical (&r, &dconstm1))
8076 return 9;
8078 return 0;
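/* Example return values of standard_80387_constant_p, together with the
   opcode later chosen by standard_80387_constant_opcode (illustrative):

     +0.0     -> 1  fldz
     +1.0     -> 2  fld1
     log2(e)  -> 5  fldl2e
     pi       -> 7  fldpi
     -0.0     -> 8  "#" (split later into fldz; fchs)
     -1.0     -> 9  "#" (split later into fld1; fchs)  */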
8081 /* Return the opcode of the special instruction to be used to load
8082 the constant X. */
8084 const char *
8085 standard_80387_constant_opcode (rtx x)
8087 switch (standard_80387_constant_p (x))
8089 case 1:
8090 return "fldz";
8091 case 2:
8092 return "fld1";
8093 case 3:
8094 return "fldlg2";
8095 case 4:
8096 return "fldln2";
8097 case 5:
8098 return "fldl2e";
8099 case 6:
8100 return "fldl2t";
8101 case 7:
8102 return "fldpi";
8103 case 8:
8104 case 9:
8105 return "#";
8106 default:
8107 gcc_unreachable ();
8111 /* Return the CONST_DOUBLE representing the 80387 constant that is
8112 loaded by the specified special instruction. The argument IDX
8113 matches the return value from standard_80387_constant_p. */
8115 rtx
8116 standard_80387_constant_rtx (int idx)
8118 int i;
8120 if (! ext_80387_constants_init)
8121 init_ext_80387_constants ();
8123 switch (idx)
8125 case 3:
8126 case 4:
8127 case 5:
8128 case 6:
8129 case 7:
8130 i = idx - 3;
8131 break;
8133 default:
8134 gcc_unreachable ();
8137 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8138 XFmode);
8141 /* Return 1 if X is all 0s and 2 if X is all 1s
8142 in a supported SSE/AVX vector mode. */
8144 int
8145 standard_sse_constant_p (rtx x)
8147 enum machine_mode mode = GET_MODE (x);
8149 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8150 return 1;
8151 if (vector_all_ones_operand (x, mode))
8152 switch (mode)
8154 case V16QImode:
8155 case V8HImode:
8156 case V4SImode:
8157 case V2DImode:
8158 if (TARGET_SSE2)
8159 return 2;
8160 case V32QImode:
8161 case V16HImode:
8162 case V8SImode:
8163 case V4DImode:
8164 if (TARGET_AVX2)
8165 return 2;
8166 default:
8167 break;
8170 return 0;
8173 /* Return the opcode of the special instruction to be used to load
8174 the constant X. */
8176 const char *
8177 standard_sse_constant_opcode (rtx insn, rtx x)
8179 switch (standard_sse_constant_p (x))
8181 case 1:
8182 switch (get_attr_mode (insn))
8184 case MODE_TI:
8185 return "%vpxor\t%0, %d0";
8186 case MODE_V2DF:
8187 return "%vxorpd\t%0, %d0";
8188 case MODE_V4SF:
8189 return "%vxorps\t%0, %d0";
8191 case MODE_OI:
8192 return "vpxor\t%x0, %x0, %x0";
8193 case MODE_V4DF:
8194 return "vxorpd\t%x0, %x0, %x0";
8195 case MODE_V8SF:
8196 return "vxorps\t%x0, %x0, %x0";
8198 default:
8199 break;
8202 case 2:
8203 if (TARGET_AVX)
8204 return "vpcmpeqd\t%0, %0, %0";
8205 else
8206 return "pcmpeqd\t%0, %0";
8208 default:
8209 break;
8211 gcc_unreachable ();
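/* Illustrative examples of the two special SSE constants handled above,
   assuming the corresponding ISA is enabled (%xmm0 is just an example
   destination):

     CONST0_RTX (V4SFmode)         -> 1 -> "xorps %xmm0, %xmm0"
                                          (vxorps/vpxor variants with AVX)
     all-ones in V4SImode (SSE2)   -> 2 -> "pcmpeqd %xmm0, %xmm0"
                                          ("vpcmpeqd" with AVX)  */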
8214 /* Return true if OP contains a symbol reference. */
8216 bool
8217 symbolic_reference_mentioned_p (rtx op)
8219 const char *fmt;
8220 int i;
8222 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8223 return true;
8225 fmt = GET_RTX_FORMAT (GET_CODE (op));
8226 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8228 if (fmt[i] == 'E')
8230 int j;
8232 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8233 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8234 return true;
8237 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8238 return true;
8241 return false;
8244 /* Return true if it is appropriate to emit `ret' instructions in the
8245 body of a function. Do this only if the epilogue is simple, needing a
8246 couple of insns. Prior to reloading, we can't tell how many registers
8247 must be saved, so return false then. Return false if there is no frame
8248 marker to de-allocate. */
8250 bool
8251 ix86_can_use_return_insn_p (void)
8253 struct ix86_frame frame;
8255 if (! reload_completed || frame_pointer_needed)
8256 return 0;
8258 /* Don't allow more than 32k pop, since that's all we can do
8259 with one instruction. */
8260 if (crtl->args.pops_args && crtl->args.size >= 32768)
8261 return 0;
8263 ix86_compute_frame_layout (&frame);
8264 return (frame.stack_pointer_offset == UNITS_PER_WORD
8265 && (frame.nregs + frame.nsseregs) == 0);
8268 /* Value should be nonzero if functions must have frame pointers.
8269 Zero means the frame pointer need not be set up (and parms may
8270 be accessed via the stack pointer) in functions that seem suitable. */
8272 static bool
8273 ix86_frame_pointer_required (void)
8275 /* If we accessed previous frames, then the generated code expects
8276 to be able to access the saved ebp value in our frame. */
8277 if (cfun->machine->accesses_prev_frame)
8278 return true;
8280 /* Several x86 OSes need a frame pointer for other reasons,
8281 usually pertaining to setjmp. */
8282 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8283 return true;
8285 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
8286 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8287 return true;
8289 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8290 stack allocation is 4GB. */
8291 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8292 return true;
8294 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8295 turns off the frame pointer by default. Turn it back on now if
8296 we've not got a leaf function. */
8297 if (TARGET_OMIT_LEAF_FRAME_POINTER
8298 && (!crtl->is_leaf
8299 || ix86_current_function_calls_tls_descriptor))
8300 return true;
8302 if (crtl->profile && !flag_fentry)
8303 return true;
8305 return false;
8308 /* Record that the current function accesses previous call frames. */
8310 void
8311 ix86_setup_frame_addresses (void)
8313 cfun->machine->accesses_prev_frame = 1;
8316 #ifndef USE_HIDDEN_LINKONCE
8317 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8318 # define USE_HIDDEN_LINKONCE 1
8319 # else
8320 # define USE_HIDDEN_LINKONCE 0
8321 # endif
8322 #endif
8324 static int pic_labels_used;
8326 /* Fills in the label name that should be used for a pc thunk for
8327 the given register. */
8329 static void
8330 get_pc_thunk_name (char name[32], unsigned int regno)
8332 gcc_assert (!TARGET_64BIT);
8334 if (USE_HIDDEN_LINKONCE)
8335 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8336 else
8337 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8341 /* This function generates the pc thunks used for -fpic: each one loads
8342 its register with the return address of the caller and then returns. */
8344 static void
8345 ix86_code_end (void)
8347 rtx xops[2];
8348 int regno;
8350 for (regno = AX_REG; regno <= SP_REG; regno++)
8352 char name[32];
8353 tree decl;
8355 if (!(pic_labels_used & (1 << regno)))
8356 continue;
8358 get_pc_thunk_name (name, regno);
8360 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8361 get_identifier (name),
8362 build_function_type_list (void_type_node, NULL_TREE));
8363 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8364 NULL_TREE, void_type_node);
8365 TREE_PUBLIC (decl) = 1;
8366 TREE_STATIC (decl) = 1;
8367 DECL_IGNORED_P (decl) = 1;
8369 #if TARGET_MACHO
8370 if (TARGET_MACHO)
8372 switch_to_section (darwin_sections[text_coal_section]);
8373 fputs ("\t.weak_definition\t", asm_out_file);
8374 assemble_name (asm_out_file, name);
8375 fputs ("\n\t.private_extern\t", asm_out_file);
8376 assemble_name (asm_out_file, name);
8377 putc ('\n', asm_out_file);
8378 ASM_OUTPUT_LABEL (asm_out_file, name);
8379 DECL_WEAK (decl) = 1;
8381 else
8382 #endif
8383 if (USE_HIDDEN_LINKONCE)
8385 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8387 targetm.asm_out.unique_section (decl, 0);
8388 switch_to_section (get_named_section (decl, NULL, 0));
8390 targetm.asm_out.globalize_label (asm_out_file, name);
8391 fputs ("\t.hidden\t", asm_out_file);
8392 assemble_name (asm_out_file, name);
8393 putc ('\n', asm_out_file);
8394 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8396 else
8398 switch_to_section (text_section);
8399 ASM_OUTPUT_LABEL (asm_out_file, name);
8402 DECL_INITIAL (decl) = make_node (BLOCK);
8403 current_function_decl = decl;
8404 init_function_start (decl);
8405 first_function_block_is_cold = false;
8406 /* Make sure unwind info is emitted for the thunk if needed. */
8407 final_start_function (emit_barrier (), asm_out_file, 1);
8409 /* Pad stack IP move with 4 instructions (two NOPs count
8410 as one instruction). */
8411 if (TARGET_PAD_SHORT_FUNCTION)
8413 int i = 8;
8415 while (i--)
8416 fputs ("\tnop\n", asm_out_file);
8419 xops[0] = gen_rtx_REG (Pmode, regno);
8420 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8421 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8422 fputs ("\tret\n", asm_out_file);
8423 final_end_function ();
8424 init_insn_lengths ();
8425 free_after_compilation (cfun);
8426 set_cfun (NULL);
8427 current_function_decl = NULL;
8430 if (flag_split_stack)
8431 file_end_indicate_split_stack ();
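/* For reference, an emitted pc thunk looks roughly like this when
   USE_HIDDEN_LINKONCE is in effect (a sketch; section directives and
   padding vary with the target and tuning):

   __x86.get_pc_thunk.bx:
           movl    (%esp), %ebx
           ret                                                        */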
8434 /* Emit code for the SET_GOT patterns. */
8436 const char *
8437 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8439 rtx xops[3];
8441 xops[0] = dest;
8443 if (TARGET_VXWORKS_RTP && flag_pic)
8445 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8446 xops[2] = gen_rtx_MEM (Pmode,
8447 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8448 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8450 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8451 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8452 an unadorned address. */
8453 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8454 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8455 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8456 return "";
8459 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8461 if (!flag_pic)
8463 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8465 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8467 #if TARGET_MACHO
8468 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8469 is what will be referenced by the Mach-O PIC subsystem. */
8470 if (!label)
8471 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8472 #endif
8474 targetm.asm_out.internal_label (asm_out_file, "L",
8475 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8477 else
8479 char name[32];
8480 get_pc_thunk_name (name, REGNO (dest));
8481 pic_labels_used |= 1 << REGNO (dest);
8483 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8484 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8485 output_asm_insn ("call\t%X2", xops);
8486 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8487 is what will be referenced by the Mach-O PIC subsystem. */
8488 #if TARGET_MACHO
8489 if (!label)
8490 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8491 else
8492 targetm.asm_out.internal_label (asm_out_file, "L",
8493 CODE_LABEL_NUMBER (label));
8494 #endif
8497 if (!TARGET_MACHO)
8498 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8500 return "";
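/* The typical -fpic sequence produced together with the thunks above is
   therefore (illustrative; the non-PIC and VxWorks RTP variants differ):

           call    __x86.get_pc_thunk.bx
           addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   after which %ebx holds the GOT base for subsequent @GOT/@GOTOFF
   references.  */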
8503 /* Generate a "push" pattern for input ARG. */
8505 static rtx
8506 gen_push (rtx arg)
8508 struct machine_function *m = cfun->machine;
8510 if (m->fs.cfa_reg == stack_pointer_rtx)
8511 m->fs.cfa_offset += UNITS_PER_WORD;
8512 m->fs.sp_offset += UNITS_PER_WORD;
8514 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8515 arg = gen_rtx_REG (word_mode, REGNO (arg));
8517 return gen_rtx_SET (VOIDmode,
8518 gen_rtx_MEM (word_mode,
8519 gen_rtx_PRE_DEC (Pmode,
8520 stack_pointer_rtx)),
8521 arg);
8524 /* Generate a "pop" pattern for input ARG. */
8526 static rtx
8527 gen_pop (rtx arg)
8529 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8530 arg = gen_rtx_REG (word_mode, REGNO (arg));
8532 return gen_rtx_SET (VOIDmode,
8533 arg,
8534 gen_rtx_MEM (word_mode,
8535 gen_rtx_POST_INC (Pmode,
8536 stack_pointer_rtx)));
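/* The two helpers above produce the familiar push/pop RTL, e.g. in 64-bit
   mode (illustrative):

     gen_push (di):  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di))
     gen_pop (di):   (set (reg:DI di) (mem:DI (post_inc:DI (reg:DI sp))))  */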
8539 /* Return >= 0 if there is an unused call-clobbered register available
8540 for the entire function. */
8542 static unsigned int
8543 ix86_select_alt_pic_regnum (void)
8545 if (crtl->is_leaf
8546 && !crtl->profile
8547 && !ix86_current_function_calls_tls_descriptor)
8549 int i, drap;
8550 /* Can't use the same register for both PIC and DRAP. */
8551 if (crtl->drap_reg)
8552 drap = REGNO (crtl->drap_reg);
8553 else
8554 drap = -1;
8555 for (i = 2; i >= 0; --i)
8556 if (i != drap && !df_regs_ever_live_p (i))
8557 return i;
8560 return INVALID_REGNUM;
8563 /* Return TRUE if we need to save REGNO. */
8565 static bool
8566 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8568 if (pic_offset_table_rtx
8569 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8570 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8571 || crtl->profile
8572 || crtl->calls_eh_return
8573 || crtl->uses_const_pool))
8574 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8576 if (crtl->calls_eh_return && maybe_eh_return)
8578 unsigned i;
8579 for (i = 0; ; i++)
8581 unsigned test = EH_RETURN_DATA_REGNO (i);
8582 if (test == INVALID_REGNUM)
8583 break;
8584 if (test == regno)
8585 return true;
8589 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8590 return true;
8592 return (df_regs_ever_live_p (regno)
8593 && !call_used_regs[regno]
8594 && !fixed_regs[regno]
8595 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8598 /* Return the number of saved general purpose registers. */
8600 static int
8601 ix86_nsaved_regs (void)
8603 int nregs = 0;
8604 int regno;
8606 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8607 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8608 nregs ++;
8609 return nregs;
8612 /* Return the number of saved SSE registers. */
8614 static int
8615 ix86_nsaved_sseregs (void)
8617 int nregs = 0;
8618 int regno;
8620 if (!TARGET_64BIT_MS_ABI)
8621 return 0;
8622 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8623 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8624 nregs ++;
8625 return nregs;
8628 /* Given FROM and TO register numbers, say whether this elimination is
8629 allowed. If stack alignment is needed, we can only replace argument
8630 pointer with hard frame pointer, or replace frame pointer with stack
8631 pointer. Otherwise, frame pointer elimination is automatically
8632 handled and all other eliminations are valid. */
8634 static bool
8635 ix86_can_eliminate (const int from, const int to)
8637 if (stack_realign_fp)
8638 return ((from == ARG_POINTER_REGNUM
8639 && to == HARD_FRAME_POINTER_REGNUM)
8640 || (from == FRAME_POINTER_REGNUM
8641 && to == STACK_POINTER_REGNUM));
8642 else
8643 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8646 /* Return the offset between two registers, one to be eliminated, and the other
8647 its replacement, at the start of a routine. */
8649 HOST_WIDE_INT
8650 ix86_initial_elimination_offset (int from, int to)
8652 struct ix86_frame frame;
8653 ix86_compute_frame_layout (&frame);
8655 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8656 return frame.hard_frame_pointer_offset;
8657 else if (from == FRAME_POINTER_REGNUM
8658 && to == HARD_FRAME_POINTER_REGNUM)
8659 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8660 else
8662 gcc_assert (to == STACK_POINTER_REGNUM);
8664 if (from == ARG_POINTER_REGNUM)
8665 return frame.stack_pointer_offset;
8667 gcc_assert (from == FRAME_POINTER_REGNUM);
8668 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8672 /* In a dynamically-aligned function, we can't know the offset from
8673 stack pointer to frame pointer, so we must ensure that setjmp
8674 eliminates fp against the hard fp (%ebp) rather than trying to
8675 index from %esp up to the top of the frame across a gap that is
8676 of unknown (at compile-time) size. */
8677 static rtx
8678 ix86_builtin_setjmp_frame_value (void)
8680 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8683 /* When using -fsplit-stack, the allocation routines set a field in
8684 the TCB to the bottom of the stack plus this much space, measured
8685 in bytes. */
8687 #define SPLIT_STACK_AVAILABLE 256
8689 /* Fill the ix86_frame structure describing the frame of the current function. */
8691 static void
8692 ix86_compute_frame_layout (struct ix86_frame *frame)
8694 unsigned HOST_WIDE_INT stack_alignment_needed;
8695 HOST_WIDE_INT offset;
8696 unsigned HOST_WIDE_INT preferred_alignment;
8697 HOST_WIDE_INT size = get_frame_size ();
8698 HOST_WIDE_INT to_allocate;
8700 frame->nregs = ix86_nsaved_regs ();
8701 frame->nsseregs = ix86_nsaved_sseregs ();
8703 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8704 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8706 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8707 except in function prologues and leaf functions. */
8708 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8709 && (!crtl->is_leaf || cfun->calls_alloca != 0
8710 || ix86_current_function_calls_tls_descriptor))
8712 preferred_alignment = 16;
8713 stack_alignment_needed = 16;
8714 crtl->preferred_stack_boundary = 128;
8715 crtl->stack_alignment_needed = 128;
8718 gcc_assert (!size || stack_alignment_needed);
8719 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8720 gcc_assert (preferred_alignment <= stack_alignment_needed);
8722 /* For SEH we have to limit the amount of code movement into the prologue.
8723 At present we do this via a BLOCKAGE, at which point there's very little
8724 scheduling that can be done, which means that there's very little point
8725 in doing anything except PUSHs. */
8726 if (TARGET_SEH)
8727 cfun->machine->use_fast_prologue_epilogue = false;
8729 /* During reload iteration the number of registers saved can change.
8730 Recompute the value as needed. Do not recompute when the number of
8731 registers didn't change, as reload makes multiple calls to the function
8732 and does not expect the decision to change within a single iteration. */
8733 else if (!optimize_function_for_size_p (cfun)
8734 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8736 int count = frame->nregs;
8737 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8739 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8741 /* The fast prologue uses move instead of push to save registers. This
8742 is significantly longer, but also executes faster as modern hardware
8743 can execute the moves in parallel, but can't do that for push/pop.
8745 Be careful about choosing which prologue to emit: when the function takes
8746 many instructions to execute, we may use the slow version, as well as when
8747 the function is known to be outside a hot spot (known only with feedback).
8748 Weight the size of the function by the number of registers to save, as it
8749 is cheap to use one or two push instructions but very slow to use many of
8750 them. */
8751 if (count)
8752 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8753 if (node->frequency < NODE_FREQUENCY_NORMAL
8754 || (flag_branch_probabilities
8755 && node->frequency < NODE_FREQUENCY_HOT))
8756 cfun->machine->use_fast_prologue_epilogue = false;
8757 else
8758 cfun->machine->use_fast_prologue_epilogue
8759 = !expensive_function_p (count);
8762 frame->save_regs_using_mov
8763 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8764 /* If static stack checking is enabled and done with probes,
8765 the registers need to be saved before allocating the frame. */
8766 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8768 /* Skip return address. */
8769 offset = UNITS_PER_WORD;
8771 /* Skip pushed static chain. */
8772 if (ix86_static_chain_on_stack)
8773 offset += UNITS_PER_WORD;
8775 /* Skip saved base pointer. */
8776 if (frame_pointer_needed)
8777 offset += UNITS_PER_WORD;
8778 frame->hfp_save_offset = offset;
8780 /* The traditional frame pointer location is at the top of the frame. */
8781 frame->hard_frame_pointer_offset = offset;
8783 /* Register save area */
8784 offset += frame->nregs * UNITS_PER_WORD;
8785 frame->reg_save_offset = offset;
8787 /* On SEH target, registers are pushed just before the frame pointer
8788 location. */
8789 if (TARGET_SEH)
8790 frame->hard_frame_pointer_offset = offset;
8792 /* Align and set SSE register save area. */
8793 if (frame->nsseregs)
8795 /* The only ABI that has saved SSE registers (Win64) also has a
8796 16-byte aligned default stack, and thus we don't need to be
8797 within the re-aligned local stack frame to save them. */
8798 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8799 offset = (offset + 16 - 1) & -16;
8800 offset += frame->nsseregs * 16;
8802 frame->sse_reg_save_offset = offset;
8804 /* The re-aligned stack starts here. Values before this point are not
8805 directly comparable with values below this point. In order to make
8806 sure that no value happens to be the same before and after, force
8807 the alignment computation below to add a non-zero value. */
8808 if (stack_realign_fp)
8809 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8811 /* Va-arg area */
8812 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8813 offset += frame->va_arg_size;
8815 /* Align start of frame for local function. */
8816 if (stack_realign_fp
8817 || offset != frame->sse_reg_save_offset
8818 || size != 0
8819 || !crtl->is_leaf
8820 || cfun->calls_alloca
8821 || ix86_current_function_calls_tls_descriptor)
8822 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8824 /* Frame pointer points here. */
8825 frame->frame_pointer_offset = offset;
8827 offset += size;
8829 /* Add the outgoing arguments area. It can be skipped if we eliminated
8830 all the function calls as dead code.
8831 Skipping is however impossible when the function calls alloca, since
8832 the alloca expander assumes that the last crtl->outgoing_args_size
8833 bytes of the stack frame are unused. */
8834 if (ACCUMULATE_OUTGOING_ARGS
8835 && (!crtl->is_leaf || cfun->calls_alloca
8836 || ix86_current_function_calls_tls_descriptor))
8838 offset += crtl->outgoing_args_size;
8839 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8841 else
8842 frame->outgoing_arguments_size = 0;
8844 /* Align stack boundary. Only needed if we're calling another function
8845 or using alloca. */
8846 if (!crtl->is_leaf || cfun->calls_alloca
8847 || ix86_current_function_calls_tls_descriptor)
8848 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8850 /* We've reached end of stack frame. */
8851 frame->stack_pointer_offset = offset;
8853 /* Size prologue needs to allocate. */
8854 to_allocate = offset - frame->sse_reg_save_offset;
8856 if ((!to_allocate && frame->nregs <= 1)
8857 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8858 frame->save_regs_using_mov = false;
8860 if (ix86_using_red_zone ()
8861 && crtl->sp_is_unchanging
8862 && crtl->is_leaf
8863 && !ix86_current_function_calls_tls_descriptor)
8865 frame->red_zone_size = to_allocate;
8866 if (frame->save_regs_using_mov)
8867 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8868 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8869 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8871 else
8872 frame->red_zone_size = 0;
8873 frame->stack_pointer_offset -= frame->red_zone_size;
8875 /* The SEH frame pointer location is near the bottom of the frame.
8876 This is enforced by the fact that the difference between the
8877 stack pointer and the frame pointer is limited to 240 bytes in
8878 the unwind data structure. */
8879 if (TARGET_SEH)
8881 HOST_WIDE_INT diff;
8883 /* If we can leave the frame pointer where it is, do so. Also, returns
8884 the establisher frame for __builtin_frame_address (0). */
8885 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
8886 if (diff <= SEH_MAX_FRAME_SIZE
8887 && (diff > 240 || (diff & 15) != 0)
8888 && !crtl->accesses_prior_frames)
8890 /* Ideally we'd determine what portion of the local stack frame
8891 (within the constraint of the lowest 240) is most heavily used.
8892 But without that complication, simply bias the frame pointer
8893 by 128 bytes so as to maximize the amount of the local stack
8894 frame that is addressable with 8-bit offsets. */
8895 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
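/* Summary of the offsets computed above, measured from the CFA downward
   (a sketch; every region other than the return address may be empty, and
   SEH moves hard_frame_pointer_offset near the bottom of the frame):

     return address
     pushed static chain (if ix86_static_chain_on_stack)
     saved frame pointer            <- hfp_save_offset / hard_frame_pointer_offset
     GPR save area                  <- reg_save_offset
     16-byte aligned SSE save area  <- sse_reg_save_offset
     va_arg register save area
     local variables                <- frame_pointer_offset
     outgoing arguments             <- stack_pointer_offset (minus red zone)  */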
8900 /* This is semi-inlined memory_address_length, but simplified
8901 since we know that we're always dealing with reg+offset, and
8902 to avoid having to create and discard all that rtl. */
8904 static inline int
8905 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
8907 int len = 4;
8909 if (offset == 0)
8911 /* EBP and R13 cannot be encoded without an offset. */
8912 len = (regno == BP_REG || regno == R13_REG);
8914 else if (IN_RANGE (offset, -128, 127))
8915 len = 1;
8917 /* ESP and R12 must be encoded with a SIB byte. */
8918 if (regno == SP_REG || regno == R12_REG)
8919 len++;
8921 return len;
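/* Worked examples of the lengths computed above (displacement bytes plus an
   optional SIB byte; illustrative):

     (%eax)       -> 0
     (%ebp)       -> 1  (disp8 of 0 required)
     (%r12)       -> 1  (SIB byte)
     16(%esp)     -> 2  (disp8 + SIB)
     1024(%ebx)   -> 4  (disp32)  */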
8924 /* Return an RTX that points to CFA_OFFSET within the stack frame.
8925 The valid base registers are taken from CFUN->MACHINE->FS. */
8927 static rtx
8928 choose_baseaddr (HOST_WIDE_INT cfa_offset)
8930 const struct machine_function *m = cfun->machine;
8931 rtx base_reg = NULL;
8932 HOST_WIDE_INT base_offset = 0;
8934 if (m->use_fast_prologue_epilogue)
8936 /* Choose the base register most likely to allow the most scheduling
8937 opportunities. Generally FP is valid throughout the function,
8938 while DRAP must be reloaded within the epilogue. But choose either
8939 over the SP due to increased encoding size. */
8941 if (m->fs.fp_valid)
8943 base_reg = hard_frame_pointer_rtx;
8944 base_offset = m->fs.fp_offset - cfa_offset;
8946 else if (m->fs.drap_valid)
8948 base_reg = crtl->drap_reg;
8949 base_offset = 0 - cfa_offset;
8951 else if (m->fs.sp_valid)
8953 base_reg = stack_pointer_rtx;
8954 base_offset = m->fs.sp_offset - cfa_offset;
8957 else
8959 HOST_WIDE_INT toffset;
8960 int len = 16, tlen;
8962 /* Choose the base register with the smallest address encoding.
8963 With a tie, choose FP > DRAP > SP. */
8964 if (m->fs.sp_valid)
8966 base_reg = stack_pointer_rtx;
8967 base_offset = m->fs.sp_offset - cfa_offset;
8968 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
8970 if (m->fs.drap_valid)
8972 toffset = 0 - cfa_offset;
8973 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
8974 if (tlen <= len)
8976 base_reg = crtl->drap_reg;
8977 base_offset = toffset;
8978 len = tlen;
8981 if (m->fs.fp_valid)
8983 toffset = m->fs.fp_offset - cfa_offset;
8984 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
8985 if (tlen <= len)
8987 base_reg = hard_frame_pointer_rtx;
8988 base_offset = toffset;
8989 len = tlen;
8993 gcc_assert (base_reg != NULL);
8995 return plus_constant (Pmode, base_reg, base_offset);
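/* For illustration only (example values, not from the original source): in
   the fast prologue/epilogue case with fs.fp_valid set, fs.fp_offset == 16
   and CFA_OFFSET == 8, base_offset is 16 - 8 and the function returns
   (plus (reg fp) (const_int 8)). */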
8998 /* Emit code to save registers in the prologue. */
9000 static void
9001 ix86_emit_save_regs (void)
9003 unsigned int regno;
9004 rtx insn;
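/* Note added for clarity: the loop below walks the hard registers from high
   regno to low, so the highest-numbered saved register is pushed first and
   lands at the highest address of the save area; this presumably matches
   the save-area layout assumed by ix86_compute_frame_layout. */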
9006 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9007 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9009 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9010 RTX_FRAME_RELATED_P (insn) = 1;
9014 /* Emit a single register save at CFA - CFA_OFFSET. */
9016 static void
9017 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9018 HOST_WIDE_INT cfa_offset)
9020 struct machine_function *m = cfun->machine;
9021 rtx reg = gen_rtx_REG (mode, regno);
9022 rtx mem, addr, base, insn;
9024 addr = choose_baseaddr (cfa_offset);
9025 mem = gen_frame_mem (mode, addr);
9027 /* For SSE saves, we need to indicate the 128-bit alignment. */
9028 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9030 insn = emit_move_insn (mem, reg);
9031 RTX_FRAME_RELATED_P (insn) = 1;
9033 base = addr;
9034 if (GET_CODE (base) == PLUS)
9035 base = XEXP (base, 0);
9036 gcc_checking_assert (REG_P (base));
9038 /* When saving registers into a re-aligned local stack frame, avoid
9039 any tricky guessing by dwarf2out. */
9040 if (m->fs.realigned)
9042 gcc_checking_assert (stack_realign_drap);
9044 if (regno == REGNO (crtl->drap_reg))
9046 /* A bit of a hack. We force the DRAP register to be saved in
9047 the re-aligned stack frame, which provides us with a copy
9048 of the CFA that will last past the prologue. Install it. */
9049 gcc_checking_assert (cfun->machine->fs.fp_valid);
9050 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9051 cfun->machine->fs.fp_offset - cfa_offset);
9052 mem = gen_rtx_MEM (mode, addr);
9053 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9055 else
9057 /* The frame pointer is a stable reference within the
9058 aligned frame. Use it. */
9059 gcc_checking_assert (cfun->machine->fs.fp_valid);
9060 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9061 cfun->machine->fs.fp_offset - cfa_offset);
9062 mem = gen_rtx_MEM (mode, addr);
9063 add_reg_note (insn, REG_CFA_EXPRESSION,
9064 gen_rtx_SET (VOIDmode, mem, reg));
9068 /* The memory may not be relative to the current CFA register,
9069 which means that we may need to generate a new pattern for
9070 use by the unwind info. */
9071 else if (base != m->fs.cfa_reg)
9073 addr = plus_constant (Pmode, m->fs.cfa_reg,
9074 m->fs.cfa_offset - cfa_offset);
9075 mem = gen_rtx_MEM (mode, addr);
9076 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9080 /* Emit code to save registers using MOV insns.
9081 First register is stored at CFA - CFA_OFFSET. */
9082 static void
9083 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9085 unsigned int regno;
9087 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9088 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9090 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9091 cfa_offset -= UNITS_PER_WORD;
9095 /* Emit code to save SSE registers using MOV insns.
9096 First register is stored at CFA - CFA_OFFSET. */
9097 static void
9098 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9100 unsigned int regno;
9102 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9103 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9105 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9106 cfa_offset -= 16;
9110 static GTY(()) rtx queued_cfa_restores;
9112 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9113 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9114 Don't add the note if the previously saved value will be left untouched
9115 within stack red-zone till return, as unwinders can find the same value
9116 in the register and on the stack. */
9118 static void
9119 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9121 if (!crtl->shrink_wrapped
9122 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9123 return;
9125 if (insn)
9127 add_reg_note (insn, REG_CFA_RESTORE, reg);
9128 RTX_FRAME_RELATED_P (insn) = 1;
9130 else
9131 queued_cfa_restores
9132 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9135 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9137 static void
9138 ix86_add_queued_cfa_restore_notes (rtx insn)
9140 rtx last;
9141 if (!queued_cfa_restores)
9142 return;
9143 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9145 XEXP (last, 1) = REG_NOTES (insn);
9146 REG_NOTES (insn) = queued_cfa_restores;
9147 queued_cfa_restores = NULL_RTX;
9148 RTX_FRAME_RELATED_P (insn) = 1;
9151 /* Expand prologue or epilogue stack adjustment.
9152 The pattern exists to put a dependency on all ebp-based memory accesses.
9153 STYLE should be negative if instructions should be marked as frame related,
9154 zero if the %r11 register is live and cannot be freely used, and positive
9155 otherwise. */
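/* Illustrative call (added; not from the original source), mirroring the
   uses later in this file:
     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-16), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);
   i.e. subtract 16 bytes from the stack pointer, mark the insn as frame
   related (STYLE == -1), and update the CFA bookkeeping only while the
   stack pointer is still the CFA register. */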
9157 static void
9158 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9159 int style, bool set_cfa)
9161 struct machine_function *m = cfun->machine;
9162 rtx insn;
9163 bool add_frame_related_expr = false;
9165 if (Pmode == SImode)
9166 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9167 else if (x86_64_immediate_operand (offset, DImode))
9168 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9169 else
9171 rtx tmp;
9172 /* r11 is used by indirect sibcall return as well, set before the
9173 epilogue and used after the epilogue. */
9174 if (style)
9175 tmp = gen_rtx_REG (DImode, R11_REG);
9176 else
9178 gcc_assert (src != hard_frame_pointer_rtx
9179 && dest != hard_frame_pointer_rtx);
9180 tmp = hard_frame_pointer_rtx;
9182 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9183 if (style < 0)
9184 add_frame_related_expr = true;
9186 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9189 insn = emit_insn (insn);
9190 if (style >= 0)
9191 ix86_add_queued_cfa_restore_notes (insn);
9193 if (set_cfa)
9195 rtx r;
9197 gcc_assert (m->fs.cfa_reg == src);
9198 m->fs.cfa_offset += INTVAL (offset);
9199 m->fs.cfa_reg = dest;
9201 r = gen_rtx_PLUS (Pmode, src, offset);
9202 r = gen_rtx_SET (VOIDmode, dest, r);
9203 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9204 RTX_FRAME_RELATED_P (insn) = 1;
9206 else if (style < 0)
9208 RTX_FRAME_RELATED_P (insn) = 1;
9209 if (add_frame_related_expr)
9211 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9212 r = gen_rtx_SET (VOIDmode, dest, r);
9213 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9217 if (dest == stack_pointer_rtx)
9219 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9220 bool valid = m->fs.sp_valid;
9222 if (src == hard_frame_pointer_rtx)
9224 valid = m->fs.fp_valid;
9225 ooffset = m->fs.fp_offset;
9227 else if (src == crtl->drap_reg)
9229 valid = m->fs.drap_valid;
9230 ooffset = 0;
9232 else
9234 /* Else there are two possibilities: SP itself, which we set
9235 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9236 taken care of by hand along the eh_return path. */
9237 gcc_checking_assert (src == stack_pointer_rtx
9238 || offset == const0_rtx);
9241 m->fs.sp_offset = ooffset - INTVAL (offset);
9242 m->fs.sp_valid = valid;
9246 /* Find an available register to be used as the dynamic realign argument
9247 pointer register. Such a register will be written in the prologue and
9248 used at the beginning of the body, so it must not be
9249 1. a parameter passing register.
9250 2. the GOT pointer.
9251 We reuse the static-chain register if it is available. Otherwise, we
9252 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9253 shorter encoding.
9255 Return: the regno of the chosen register. */
9257 static unsigned int
9258 find_drap_reg (void)
9260 tree decl = cfun->decl;
9262 if (TARGET_64BIT)
9264 /* Use R13 for a nested function or a function that needs a static
9265 chain. Since a function with a tail call may use any caller-saved
9266 register in the epilogue, DRAP must not use a caller-saved
9267 register in that case. */
9268 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9269 return R13_REG;
9271 return R10_REG;
9273 else
9275 /* Use DI for a nested function or a function that needs a static
9276 chain. Since a function with a tail call may use any caller-saved
9277 register in the epilogue, DRAP must not use a caller-saved
9278 register in that case. */
9279 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9280 return DI_REG;
9282 /* Reuse static chain register if it isn't used for parameter
9283 passing. */
9284 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9286 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9287 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9288 return CX_REG;
9290 return DI_REG;
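/* Illustrative behaviour (added; derived from the code above, not a comment
   from the original source): a plain 64-bit function gets R10, a nested or
   tail-calling 64-bit function gets R13, a 32-bit function with at most two
   register parameters and no fastcall/thiscall convention gets ECX, and
   otherwise EDI is used. */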
9294 /* Return minimum incoming stack alignment. */
9296 static unsigned int
9297 ix86_minimum_incoming_stack_boundary (bool sibcall)
9299 unsigned int incoming_stack_boundary;
9301 /* Prefer the one specified at command line. */
9302 if (ix86_user_incoming_stack_boundary)
9303 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9304 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9305 when -mstackrealign is used, this isn't a sibcall check, and the
9306 estimated stack alignment is 128 bits. */
9307 else if (!sibcall
9308 && !TARGET_64BIT
9309 && ix86_force_align_arg_pointer
9310 && crtl->stack_alignment_estimated == 128)
9311 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9312 else
9313 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9315 /* Incoming stack alignment can be changed on individual functions
9316 via force_align_arg_pointer attribute. We use the smallest
9317 incoming stack boundary. */
9318 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9319 && lookup_attribute (ix86_force_align_arg_pointer_string,
9320 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9321 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9323 /* The incoming stack frame has to be aligned at least at
9324 parm_stack_boundary. */
9325 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9326 incoming_stack_boundary = crtl->parm_stack_boundary;
9328 /* The stack at the entrance of main is aligned by the runtime. We use
9329 the smallest incoming stack boundary. */
9330 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9331 && DECL_NAME (current_function_decl)
9332 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9333 && DECL_FILE_SCOPE_P (current_function_decl))
9334 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9336 return incoming_stack_boundary;
9339 /* Update incoming stack boundary and estimated stack alignment. */
9341 static void
9342 ix86_update_stack_boundary (void)
9344 ix86_incoming_stack_boundary
9345 = ix86_minimum_incoming_stack_boundary (false);
9347 /* An x86_64 varargs function needs 16-byte stack alignment for the
9348 register save area. */
9349 if (TARGET_64BIT
9350 && cfun->stdarg
9351 && crtl->stack_alignment_estimated < 128)
9352 crtl->stack_alignment_estimated = 128;
9355 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9356 needed or an rtx for DRAP otherwise. */
9358 static rtx
9359 ix86_get_drap_rtx (void)
9361 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9362 crtl->need_drap = true;
9364 if (stack_realign_drap)
9366 /* Assign DRAP to vDRAP and return vDRAP. */
9367 unsigned int regno = find_drap_reg ();
9368 rtx drap_vreg;
9369 rtx arg_ptr;
9370 rtx seq, insn;
9372 arg_ptr = gen_rtx_REG (Pmode, regno);
9373 crtl->drap_reg = arg_ptr;
9375 start_sequence ();
9376 drap_vreg = copy_to_reg (arg_ptr);
9377 seq = get_insns ();
9378 end_sequence ();
9380 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9381 if (!optimize)
9383 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9384 RTX_FRAME_RELATED_P (insn) = 1;
9386 return drap_vreg;
9388 else
9389 return NULL;
9392 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9394 static rtx
9395 ix86_internal_arg_pointer (void)
9397 return virtual_incoming_args_rtx;
9400 struct scratch_reg {
9401 rtx reg;
9402 bool saved;
9405 /* Return a short-lived scratch register for use on function entry.
9406 In 32-bit mode, it is valid only after the registers are saved
9407 in the prologue. This register must be released by means of
9408 release_scratch_register_on_entry once it is dead. */
9410 static void
9411 get_scratch_register_on_entry (struct scratch_reg *sr)
9413 int regno;
9415 sr->saved = false;
9417 if (TARGET_64BIT)
9419 /* We always use R11 in 64-bit mode. */
9420 regno = R11_REG;
9422 else
9424 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9425 bool fastcall_p
9426 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9427 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9428 int regparm = ix86_function_regparm (fntype, decl);
9429 int drap_regno
9430 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9432 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9433 for the static chain register. */
9434 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9435 && drap_regno != AX_REG)
9436 regno = AX_REG;
9437 else if (regparm < 2 && drap_regno != DX_REG)
9438 regno = DX_REG;
9439 /* ecx is the static chain register. */
9440 else if (regparm < 3 && !fastcall_p && !static_chain_p
9441 && drap_regno != CX_REG)
9442 regno = CX_REG;
9443 else if (ix86_save_reg (BX_REG, true))
9444 regno = BX_REG;
9445 /* esi is the static chain register. */
9446 else if (!(regparm == 3 && static_chain_p)
9447 && ix86_save_reg (SI_REG, true))
9448 regno = SI_REG;
9449 else if (ix86_save_reg (DI_REG, true))
9450 regno = DI_REG;
9451 else
9453 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9454 sr->saved = true;
9458 sr->reg = gen_rtx_REG (Pmode, regno);
9459 if (sr->saved)
9461 rtx insn = emit_insn (gen_push (sr->reg));
9462 RTX_FRAME_RELATED_P (insn) = 1;
9466 /* Release a scratch register obtained from the preceding function. */
9468 static void
9469 release_scratch_register_on_entry (struct scratch_reg *sr)
9471 if (sr->saved)
9473 struct machine_function *m = cfun->machine;
9474 rtx x, insn = emit_insn (gen_pop (sr->reg));
9476 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9477 RTX_FRAME_RELATED_P (insn) = 1;
9478 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9479 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9480 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9481 m->fs.sp_offset -= UNITS_PER_WORD;
9485 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9487 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9489 static void
9490 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9492 /* We skip the probe for the first interval + a small dope of 4 words and
9493 probe that many bytes past the specified size to maintain a protection
9494 area at the bottom of the stack. */
9495 const int dope = 4 * UNITS_PER_WORD;
9496 rtx size_rtx = GEN_INT (size), last;
9498 /* See if we have a constant small number of probes to generate. If so,
9499 that's the easy case. The run-time loop is made up of 11 insns in the
9500 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9501 for n # of intervals. */
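/* Worked example (added; illustrative, assuming the default 4096-byte probe
   interval and 8-byte words): for SIZE == 3 * 4096 the unrolled case below
   first drops SP by 2 * 4096 + 32 (the dope) and probes, then drops by 4096
   and probes twice more, and finally re-adjusts SP upward by 4096 + 32, for
   a net adjustment of exactly SIZE. */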
9502 if (size <= 5 * PROBE_INTERVAL)
9504 HOST_WIDE_INT i, adjust;
9505 bool first_probe = true;
9507 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9508 values of N from 1 until it exceeds SIZE. If only one probe is
9509 needed, this will not generate any code. Then adjust and probe
9510 to PROBE_INTERVAL + SIZE. */
9511 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9513 if (first_probe)
9515 adjust = 2 * PROBE_INTERVAL + dope;
9516 first_probe = false;
9518 else
9519 adjust = PROBE_INTERVAL;
9521 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9522 plus_constant (Pmode, stack_pointer_rtx,
9523 -adjust)));
9524 emit_stack_probe (stack_pointer_rtx);
9527 if (first_probe)
9528 adjust = size + PROBE_INTERVAL + dope;
9529 else
9530 adjust = size + PROBE_INTERVAL - i;
9532 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9533 plus_constant (Pmode, stack_pointer_rtx,
9534 -adjust)));
9535 emit_stack_probe (stack_pointer_rtx);
9537 /* Adjust back to account for the additional first interval. */
9538 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9539 plus_constant (Pmode, stack_pointer_rtx,
9540 PROBE_INTERVAL + dope)));
9543 /* Otherwise, do the same as above, but in a loop. Note that we must be
9544 extra careful with variables wrapping around because we might be at
9545 the very top (or the very bottom) of the address space and we have
9546 to be able to handle this case properly; in particular, we use an
9547 equality test for the loop condition. */
9548 else
9550 HOST_WIDE_INT rounded_size;
9551 struct scratch_reg sr;
9553 get_scratch_register_on_entry (&sr);
9556 /* Step 1: round SIZE to the previous multiple of the interval. */
9558 rounded_size = size & -PROBE_INTERVAL;
9561 /* Step 2: compute initial and final value of the loop counter. */
9563 /* SP = SP_0 + PROBE_INTERVAL. */
9564 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9565 plus_constant (Pmode, stack_pointer_rtx,
9566 - (PROBE_INTERVAL + dope))));
9568 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9569 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9570 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9571 gen_rtx_PLUS (Pmode, sr.reg,
9572 stack_pointer_rtx)));
9575 /* Step 3: the loop
9577 while (SP != LAST_ADDR)
9579 SP = SP + PROBE_INTERVAL
9580 probe at SP
9583 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9584 values of N from 1 until it is equal to ROUNDED_SIZE. */
9586 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9589 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9590 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9592 if (size != rounded_size)
9594 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9595 plus_constant (Pmode, stack_pointer_rtx,
9596 rounded_size - size)));
9597 emit_stack_probe (stack_pointer_rtx);
9600 /* Adjust back to account for the additional first interval. */
9601 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9602 plus_constant (Pmode, stack_pointer_rtx,
9603 PROBE_INTERVAL + dope)));
9605 release_scratch_register_on_entry (&sr);
9608 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9610 /* Even if the stack pointer isn't the CFA register, we need to correctly
9611 describe the adjustments made to it, in particular differentiate the
9612 frame-related ones from the frame-unrelated ones. */
9613 if (size > 0)
9615 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9616 XVECEXP (expr, 0, 0)
9617 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9618 plus_constant (Pmode, stack_pointer_rtx, -size));
9619 XVECEXP (expr, 0, 1)
9620 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9621 plus_constant (Pmode, stack_pointer_rtx,
9622 PROBE_INTERVAL + dope + size));
9623 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9624 RTX_FRAME_RELATED_P (last) = 1;
9626 cfun->machine->fs.sp_offset += size;
9629 /* Make sure nothing is scheduled before we are done. */
9630 emit_insn (gen_blockage ());
9633 /* Adjust the stack pointer up to REG while probing it. */
9635 const char *
9636 output_adjust_stack_and_probe (rtx reg)
9638 static int labelno = 0;
9639 char loop_lab[32], end_lab[32];
9640 rtx xops[2];
9642 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9643 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9645 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9647 /* Jump to END_LAB if SP == LAST_ADDR. */
9648 xops[0] = stack_pointer_rtx;
9649 xops[1] = reg;
9650 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9651 fputs ("\tje\t", asm_out_file);
9652 assemble_name_raw (asm_out_file, end_lab);
9653 fputc ('\n', asm_out_file);
9655 /* SP = SP + PROBE_INTERVAL. */
9656 xops[1] = GEN_INT (PROBE_INTERVAL);
9657 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9659 /* Probe at SP. */
9660 xops[1] = const0_rtx;
9661 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9663 fprintf (asm_out_file, "\tjmp\t");
9664 assemble_name_raw (asm_out_file, loop_lab);
9665 fputc ('\n', asm_out_file);
9667 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9669 return "";
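/* For reference only (added; illustrative sketch, not from the original
   source): with the default 4096-byte probe interval the 64-bit AT&T output
   of the loop above looks roughly like
       .LPSRL0: cmpq  %r11, %rsp
                je    .LPSRE0
                subq  $4096, %rsp
                orq   $0, (%rsp)
                jmp   .LPSRL0
       .LPSRE0:
   where %r11 is the scratch register holding the final address. */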
9672 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9673 inclusive. These are offsets from the current stack pointer. */
9675 static void
9676 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9678 /* See if we have a constant small number of probes to generate. If so,
9679 that's the easy case. The run-time loop is made up of 7 insns in the
9680 generic case while the compile-time loop is made up of n insns for n #
9681 of intervals. */
9682 if (size <= 7 * PROBE_INTERVAL)
9684 HOST_WIDE_INT i;
9686 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9687 it exceeds SIZE. If only one probe is needed, this will not
9688 generate any code. Then probe at FIRST + SIZE. */
9689 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9690 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9691 -(first + i)));
9693 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9694 -(first + size)));
9697 /* Otherwise, do the same as above, but in a loop. Note that we must be
9698 extra careful with variables wrapping around because we might be at
9699 the very top (or the very bottom) of the address space and we have
9700 to be able to handle this case properly; in particular, we use an
9701 equality test for the loop condition. */
9702 else
9704 HOST_WIDE_INT rounded_size, last;
9705 struct scratch_reg sr;
9707 get_scratch_register_on_entry (&sr);
9710 /* Step 1: round SIZE to the previous multiple of the interval. */
9712 rounded_size = size & -PROBE_INTERVAL;
9715 /* Step 2: compute initial and final value of the loop counter. */
9717 /* TEST_OFFSET = FIRST. */
9718 emit_move_insn (sr.reg, GEN_INT (-first));
9720 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9721 last = first + rounded_size;
9724 /* Step 3: the loop
9726 while (TEST_ADDR != LAST_ADDR)
9728 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9729 probe at TEST_ADDR
9732 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9733 until it is equal to ROUNDED_SIZE. */
9735 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9738 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9739 that SIZE is equal to ROUNDED_SIZE. */
9741 if (size != rounded_size)
9742 emit_stack_probe (plus_constant (Pmode,
9743 gen_rtx_PLUS (Pmode,
9744 stack_pointer_rtx,
9745 sr.reg),
9746 rounded_size - size));
9748 release_scratch_register_on_entry (&sr);
9751 /* Make sure nothing is scheduled before we are done. */
9752 emit_insn (gen_blockage ());
9755 /* Probe a range of stack addresses from REG to END, inclusive. These are
9756 offsets from the current stack pointer. */
9758 const char *
9759 output_probe_stack_range (rtx reg, rtx end)
9761 static int labelno = 0;
9762 char loop_lab[32], end_lab[32];
9763 rtx xops[3];
9765 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9766 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9768 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9770 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9771 xops[0] = reg;
9772 xops[1] = end;
9773 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9774 fputs ("\tje\t", asm_out_file);
9775 assemble_name_raw (asm_out_file, end_lab);
9776 fputc ('\n', asm_out_file);
9778 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9779 xops[1] = GEN_INT (PROBE_INTERVAL);
9780 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9782 /* Probe at TEST_ADDR. */
9783 xops[0] = stack_pointer_rtx;
9784 xops[1] = reg;
9785 xops[2] = const0_rtx;
9786 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9788 fprintf (asm_out_file, "\tjmp\t");
9789 assemble_name_raw (asm_out_file, loop_lab);
9790 fputc ('\n', asm_out_file);
9792 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9794 return "";
9797 /* Finalize the stack_realign_needed flag, which will guide the
9798 prologue/epilogue so that it is generated in the correct form. */
9799 static void
9800 ix86_finalize_stack_realign_flags (void)
9802 /* Check if stack realignment is really needed after reload, and
9803 store the result in cfun. */
9804 unsigned int incoming_stack_boundary
9805 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9806 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9807 unsigned int stack_realign = (incoming_stack_boundary
9808 < (crtl->is_leaf
9809 ? crtl->max_used_stack_slot_alignment
9810 : crtl->stack_alignment_needed));
9812 if (crtl->stack_realign_finalized)
9814 /* After stack_realign_needed is finalized, we can no longer
9815 change it. */
9816 gcc_assert (crtl->stack_realign_needed == stack_realign);
9817 return;
9820 /* If the only reason for frame_pointer_needed is that we conservatively
9821 assumed stack realignment might be needed, but in the end nothing that
9822 needed the stack alignment had been spilled, clear frame_pointer_needed
9823 and say we don't need stack realignment. */
9824 if (stack_realign
9825 && !crtl->need_drap
9826 && frame_pointer_needed
9827 && crtl->is_leaf
9828 && flag_omit_frame_pointer
9829 && crtl->sp_is_unchanging
9830 && !ix86_current_function_calls_tls_descriptor
9831 && !crtl->accesses_prior_frames
9832 && !cfun->calls_alloca
9833 && !crtl->calls_eh_return
9834 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9835 && !ix86_frame_pointer_required ()
9836 && get_frame_size () == 0
9837 && ix86_nsaved_sseregs () == 0
9838 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9840 HARD_REG_SET set_up_by_prologue, prologue_used;
9841 basic_block bb;
9843 CLEAR_HARD_REG_SET (prologue_used);
9844 CLEAR_HARD_REG_SET (set_up_by_prologue);
9845 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9846 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9847 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9848 HARD_FRAME_POINTER_REGNUM);
9849 FOR_EACH_BB (bb)
9851 rtx insn;
9852 FOR_BB_INSNS (bb, insn)
9853 if (NONDEBUG_INSN_P (insn)
9854 && requires_stack_frame_p (insn, prologue_used,
9855 set_up_by_prologue))
9857 crtl->stack_realign_needed = stack_realign;
9858 crtl->stack_realign_finalized = true;
9859 return;
9863 frame_pointer_needed = false;
9864 stack_realign = false;
9865 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
9866 crtl->stack_alignment_needed = incoming_stack_boundary;
9867 crtl->stack_alignment_estimated = incoming_stack_boundary;
9868 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
9869 crtl->preferred_stack_boundary = incoming_stack_boundary;
9870 df_finish_pass (true);
9871 df_scan_alloc (NULL);
9872 df_scan_blocks ();
9873 df_compute_regs_ever_live (true);
9874 df_analyze ();
9877 crtl->stack_realign_needed = stack_realign;
9878 crtl->stack_realign_finalized = true;
9881 /* Expand the prologue into a bunch of separate insns. */
9883 void
9884 ix86_expand_prologue (void)
9886 struct machine_function *m = cfun->machine;
9887 rtx insn, t;
9888 bool pic_reg_used;
9889 struct ix86_frame frame;
9890 HOST_WIDE_INT allocate;
9891 bool int_registers_saved;
9892 bool sse_registers_saved;
9894 ix86_finalize_stack_realign_flags ();
9896 /* DRAP should not coexist with stack_realign_fp */
9897 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
9899 memset (&m->fs, 0, sizeof (m->fs));
9901 /* Initialize CFA state for before the prologue. */
9902 m->fs.cfa_reg = stack_pointer_rtx;
9903 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
9905 /* Track SP offset to the CFA. We continue tracking this after we've
9906 swapped the CFA register away from SP. In the case of re-alignment
9907 this is fudged; we're interested in offsets within the local frame. */
9908 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9909 m->fs.sp_valid = true;
9911 ix86_compute_frame_layout (&frame);
9913 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
9915 /* We should have already generated an error for any use of
9916 ms_hook on a nested function. */
9917 gcc_checking_assert (!ix86_static_chain_on_stack);
9919 /* Check if profiling is active and we shall use the profiling-before-
9920 prologue variant. If so, sorry. */
9921 if (crtl->profile && flag_fentry != 0)
9922 sorry ("ms_hook_prologue attribute isn%'t compatible "
9923 "with -mfentry for 32-bit");
9925 /* In ix86_asm_output_function_label we emitted:
9926 8b ff movl.s %edi,%edi
9927 55 push %ebp
9928 8b ec movl.s %esp,%ebp
9930 This matches the hookable function prologue in Win32 API
9931 functions in Microsoft Windows XP Service Pack 2 and newer.
9932 Wine uses this to enable Windows apps to hook the Win32 API
9933 functions provided by Wine.
9935 What that means is that we've already set up the frame pointer. */
9937 if (frame_pointer_needed
9938 && !(crtl->drap_reg && crtl->stack_realign_needed))
9940 rtx push, mov;
9942 /* We've decided to use the frame pointer already set up.
9943 Describe this to the unwinder by pretending that both
9944 push and mov insns happen right here.
9946 Putting the unwind info here at the end of the ms_hook
9947 is done so that we can make absolutely certain we get
9948 the required byte sequence at the start of the function,
9949 rather than relying on an assembler that can produce
9950 the exact encoding required.
9952 However it does mean (in the unpatched case) that we have
9953 a 1 insn window where the asynchronous unwind info is
9954 incorrect. However, if we placed the unwind info at
9955 its correct location we would have incorrect unwind info
9956 in the patched case. Which is probably all moot since
9957 I don't expect Wine generates dwarf2 unwind info for the
9958 system libraries that use this feature. */
9960 insn = emit_insn (gen_blockage ());
9962 push = gen_push (hard_frame_pointer_rtx);
9963 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
9964 stack_pointer_rtx);
9965 RTX_FRAME_RELATED_P (push) = 1;
9966 RTX_FRAME_RELATED_P (mov) = 1;
9968 RTX_FRAME_RELATED_P (insn) = 1;
9969 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
9970 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
9972 /* Note that gen_push incremented m->fs.cfa_offset, even
9973 though we didn't emit the push insn here. */
9974 m->fs.cfa_reg = hard_frame_pointer_rtx;
9975 m->fs.fp_offset = m->fs.cfa_offset;
9976 m->fs.fp_valid = true;
9978 else
9980 /* The frame pointer is not needed so pop %ebp again.
9981 This leaves us with a pristine state. */
9982 emit_insn (gen_pop (hard_frame_pointer_rtx));
9986 /* The first insn of a function that accepts its static chain on the
9987 stack is to push the register that would be filled in by a direct
9988 call. This insn will be skipped by the trampoline. */
9989 else if (ix86_static_chain_on_stack)
9991 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
9992 emit_insn (gen_blockage ());
9994 /* We don't want to interpret this push insn as a register save,
9995 only as a stack adjustment. The real copy of the register as
9996 a save will be done later, if needed. */
9997 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
9998 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
9999 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10000 RTX_FRAME_RELATED_P (insn) = 1;
10003 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10004 DRAP is needed and stack realignment is really needed after reload. */
10005 if (stack_realign_drap)
10007 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10009 /* Only need to push parameter pointer reg if it is caller saved. */
10010 if (!call_used_regs[REGNO (crtl->drap_reg)])
10012 /* Push arg pointer reg */
10013 insn = emit_insn (gen_push (crtl->drap_reg));
10014 RTX_FRAME_RELATED_P (insn) = 1;
10017 /* Grab the argument pointer. */
10018 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10019 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10020 RTX_FRAME_RELATED_P (insn) = 1;
10021 m->fs.cfa_reg = crtl->drap_reg;
10022 m->fs.cfa_offset = 0;
10024 /* Align the stack. */
10025 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10026 stack_pointer_rtx,
10027 GEN_INT (-align_bytes)));
10028 RTX_FRAME_RELATED_P (insn) = 1;
10030 /* Replicate the return address on the stack so that return
10031 address can be reached via (argp - 1) slot. This is needed
10032 to implement macro RETURN_ADDR_RTX and intrinsic function
10033 expand_builtin_return_addr etc. */
10034 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10035 t = gen_frame_mem (word_mode, t);
10036 insn = emit_insn (gen_push (t));
10037 RTX_FRAME_RELATED_P (insn) = 1;
10039 /* For the purposes of frame and register save area addressing,
10040 we've started over with a new frame. */
10041 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10042 m->fs.realigned = true;
10045 int_registers_saved = (frame.nregs == 0);
10046 sse_registers_saved = (frame.nsseregs == 0);
10048 if (frame_pointer_needed && !m->fs.fp_valid)
10050 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10051 slower on all targets. Also sdb doesn't like it. */
10052 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10053 RTX_FRAME_RELATED_P (insn) = 1;
10055 /* Push registers now, before setting the frame pointer
10056 on SEH target. */
10057 if (!int_registers_saved
10058 && TARGET_SEH
10059 && !frame.save_regs_using_mov)
10061 ix86_emit_save_regs ();
10062 int_registers_saved = true;
10063 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10066 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10068 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10069 RTX_FRAME_RELATED_P (insn) = 1;
10071 if (m->fs.cfa_reg == stack_pointer_rtx)
10072 m->fs.cfa_reg = hard_frame_pointer_rtx;
10073 m->fs.fp_offset = m->fs.sp_offset;
10074 m->fs.fp_valid = true;
10078 if (!int_registers_saved)
10080 /* If saving registers via PUSH, do so now. */
10081 if (!frame.save_regs_using_mov)
10083 ix86_emit_save_regs ();
10084 int_registers_saved = true;
10085 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10088 /* When using the red zone we may start register saving before allocating
10089 the stack frame, saving one cycle of the prologue. However, avoid
10090 doing this if we have to probe the stack; at least on x86_64 the
10091 stack probe can turn into a call that clobbers a red zone location. */
10092 else if (ix86_using_red_zone ()
10093 && (! TARGET_STACK_PROBE
10094 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10096 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10097 int_registers_saved = true;
10101 if (stack_realign_fp)
10103 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10104 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10106 /* The computation of the size of the re-aligned stack frame means
10107 that we must allocate the size of the register save area before
10108 performing the actual alignment. Otherwise we cannot guarantee
10109 that there's enough storage above the realignment point. */
10110 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10111 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10112 GEN_INT (m->fs.sp_offset
10113 - frame.sse_reg_save_offset),
10114 -1, false);
10116 /* Align the stack. */
10117 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10118 stack_pointer_rtx,
10119 GEN_INT (-align_bytes)));
10121 /* For the purposes of register save area addressing, the stack
10122 pointer is no longer valid. As for the value of sp_offset,
10123 see ix86_compute_frame_layout, which we need to match in order
10124 to pass verification of stack_pointer_offset at the end. */
10125 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10126 m->fs.sp_valid = false;
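/* Comment added for clarity: ALLOCATE below is the number of bytes the
   prologue still has to subtract from the stack pointer to reach the final
   frame.stack_pointer_offset computed by ix86_compute_frame_layout. */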
10129 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10131 if (flag_stack_usage_info)
10133 /* We start to count from ARG_POINTER. */
10134 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10136 /* If it was realigned, take into account the fake frame. */
10137 if (stack_realign_drap)
10139 if (ix86_static_chain_on_stack)
10140 stack_size += UNITS_PER_WORD;
10142 if (!call_used_regs[REGNO (crtl->drap_reg)])
10143 stack_size += UNITS_PER_WORD;
10145 /* This over-estimates by 1 minimal-stack-alignment-unit but
10146 mitigates that by counting in the new return address slot. */
10147 current_function_dynamic_stack_size
10148 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10151 current_function_static_stack_size = stack_size;
10154 /* On SEH target with very large frame size, allocate an area to save
10155 SSE registers (as the very large allocation won't be described). */
10156 if (TARGET_SEH
10157 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10158 && !sse_registers_saved)
10160 HOST_WIDE_INT sse_size =
10161 frame.sse_reg_save_offset - frame.reg_save_offset;
10163 gcc_assert (int_registers_saved);
10165 /* No need to do stack checking as the area will be immediately
10166 written. */
10167 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10168 GEN_INT (-sse_size), -1,
10169 m->fs.cfa_reg == stack_pointer_rtx);
10170 allocate -= sse_size;
10171 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10172 sse_registers_saved = true;
10175 /* The stack has already been decremented by the instruction calling us
10176 so probe if the size is non-negative to preserve the protection area. */
10177 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10179 /* We expect the registers to be saved when probes are used. */
10180 gcc_assert (int_registers_saved);
10182 if (STACK_CHECK_MOVING_SP)
10184 ix86_adjust_stack_and_probe (allocate);
10185 allocate = 0;
10187 else
10189 HOST_WIDE_INT size = allocate;
10191 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10192 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10194 if (TARGET_STACK_PROBE)
10195 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10196 else
10197 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10201 if (allocate == 0)
10203 else if (!ix86_target_stack_probe ()
10204 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10206 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10207 GEN_INT (-allocate), -1,
10208 m->fs.cfa_reg == stack_pointer_rtx);
10210 else
10212 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10213 rtx r10 = NULL;
10214 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10215 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10216 bool eax_live = false;
10217 bool r10_live = false;
10219 if (TARGET_64BIT)
10220 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10221 if (!TARGET_64BIT_MS_ABI)
10222 eax_live = ix86_eax_live_at_start_p ();
10224 /* Note that SEH directives need to continue tracking the stack
10225 pointer even after the frame pointer has been set up. */
10226 if (eax_live)
10228 insn = emit_insn (gen_push (eax));
10229 allocate -= UNITS_PER_WORD;
10230 if (sp_is_cfa_reg || TARGET_SEH)
10232 if (sp_is_cfa_reg)
10233 m->fs.cfa_offset += UNITS_PER_WORD;
10234 RTX_FRAME_RELATED_P (insn) = 1;
10238 if (r10_live)
10240 r10 = gen_rtx_REG (Pmode, R10_REG);
10241 insn = emit_insn (gen_push (r10));
10242 allocate -= UNITS_PER_WORD;
10243 if (sp_is_cfa_reg || TARGET_SEH)
10245 if (sp_is_cfa_reg)
10246 m->fs.cfa_offset += UNITS_PER_WORD;
10247 RTX_FRAME_RELATED_P (insn) = 1;
10251 emit_move_insn (eax, GEN_INT (allocate));
10252 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10254 /* Use the fact that AX still contains ALLOCATE. */
10255 adjust_stack_insn = (Pmode == DImode
10256 ? gen_pro_epilogue_adjust_stack_di_sub
10257 : gen_pro_epilogue_adjust_stack_si_sub);
10259 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10260 stack_pointer_rtx, eax));
10262 if (sp_is_cfa_reg || TARGET_SEH)
10264 if (sp_is_cfa_reg)
10265 m->fs.cfa_offset += allocate;
10266 RTX_FRAME_RELATED_P (insn) = 1;
10267 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10268 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10269 plus_constant (Pmode, stack_pointer_rtx,
10270 -allocate)));
10272 m->fs.sp_offset += allocate;
10274 if (r10_live && eax_live)
10276 t = choose_baseaddr (m->fs.sp_offset - allocate);
10277 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10278 gen_frame_mem (word_mode, t));
10279 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10280 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10281 gen_frame_mem (word_mode, t));
10283 else if (eax_live || r10_live)
10285 t = choose_baseaddr (m->fs.sp_offset - allocate);
10286 emit_move_insn (gen_rtx_REG (word_mode,
10287 (eax_live ? AX_REG : R10_REG)),
10288 gen_frame_mem (word_mode, t));
10291 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10293 /* If we haven't already set up the frame pointer, do so now. */
10294 if (frame_pointer_needed && !m->fs.fp_valid)
10296 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10297 GEN_INT (frame.stack_pointer_offset
10298 - frame.hard_frame_pointer_offset));
10299 insn = emit_insn (insn);
10300 RTX_FRAME_RELATED_P (insn) = 1;
10301 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10303 if (m->fs.cfa_reg == stack_pointer_rtx)
10304 m->fs.cfa_reg = hard_frame_pointer_rtx;
10305 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10306 m->fs.fp_valid = true;
10309 if (!int_registers_saved)
10310 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10311 if (!sse_registers_saved)
10312 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10314 pic_reg_used = false;
10315 if (pic_offset_table_rtx
10316 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10317 || crtl->profile))
10319 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10321 if (alt_pic_reg_used != INVALID_REGNUM)
10322 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10324 pic_reg_used = true;
10327 if (pic_reg_used)
10329 if (TARGET_64BIT)
10331 if (ix86_cmodel == CM_LARGE_PIC)
10333 rtx label, tmp_reg;
10335 gcc_assert (Pmode == DImode);
10336 label = gen_label_rtx ();
10337 emit_label (label);
10338 LABEL_PRESERVE_P (label) = 1;
10339 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10340 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10341 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10342 label));
10343 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10344 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10345 pic_offset_table_rtx, tmp_reg));
10347 else
10348 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10350 else
10352 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10353 RTX_FRAME_RELATED_P (insn) = 1;
10354 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10358 /* In the pic_reg_used case, make sure that the got load isn't deleted
10359 when mcount needs it. Blockage to avoid call movement across mcount
10360 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10361 note. */
10362 if (crtl->profile && !flag_fentry && pic_reg_used)
10363 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10365 if (crtl->drap_reg && !crtl->stack_realign_needed)
10367 /* vDRAP is set up, but after reload it turns out that stack realignment
10368 isn't necessary; here we emit prologue code to set up DRAP
10369 without the stack realignment adjustment. */
10370 t = choose_baseaddr (0);
10371 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10374 /* Prevent instructions from being scheduled into register save push
10375 sequence when access to the redzone area is done through frame pointer.
10376 The offset between the frame pointer and the stack pointer is calculated
10377 relative to the value of the stack pointer at the end of the function
10378 prologue, and moving instructions that access redzone area via frame
10379 pointer inside push sequence violates this assumption. */
10380 if (frame_pointer_needed && frame.red_zone_size)
10381 emit_insn (gen_memory_blockage ());
10383 /* Emit cld instruction if stringops are used in the function. */
10384 if (TARGET_CLD && ix86_current_function_needs_cld)
10385 emit_insn (gen_cld ());
10387 /* SEH requires that the prologue end within 256 bytes of the start of
10388 the function. Prevent instruction schedules that would extend that.
10389 Further, prevent alloca modifications to the stack pointer from being
10390 combined with prologue modifications. */
10391 if (TARGET_SEH)
10392 emit_insn (gen_prologue_use (stack_pointer_rtx));
10395 /* Emit code to restore REG using a POP insn. */
10397 static void
10398 ix86_emit_restore_reg_using_pop (rtx reg)
10400 struct machine_function *m = cfun->machine;
10401 rtx insn = emit_insn (gen_pop (reg));
10403 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10404 m->fs.sp_offset -= UNITS_PER_WORD;
10406 if (m->fs.cfa_reg == crtl->drap_reg
10407 && REGNO (reg) == REGNO (crtl->drap_reg))
10409 /* Previously we'd represented the CFA as an expression
10410 like *(%ebp - 8). We've just popped that value from
10411 the stack, which means we need to reset the CFA to
10412 the drap register. This will remain until we restore
10413 the stack pointer. */
10414 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10415 RTX_FRAME_RELATED_P (insn) = 1;
10417 /* This means that the DRAP register is valid for addressing too. */
10418 m->fs.drap_valid = true;
10419 return;
10422 if (m->fs.cfa_reg == stack_pointer_rtx)
10424 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10425 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10426 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10427 RTX_FRAME_RELATED_P (insn) = 1;
10429 m->fs.cfa_offset -= UNITS_PER_WORD;
10432 /* When the frame pointer is the CFA, and we pop it, we are
10433 swapping back to the stack pointer as the CFA. This happens
10434 for stack frames that don't allocate other data, so we assume
10435 the stack pointer is now pointing at the return address, i.e.
10436 the function entry state, which makes the offset be 1 word. */
10437 if (reg == hard_frame_pointer_rtx)
10439 m->fs.fp_valid = false;
10440 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10442 m->fs.cfa_reg = stack_pointer_rtx;
10443 m->fs.cfa_offset -= UNITS_PER_WORD;
10445 add_reg_note (insn, REG_CFA_DEF_CFA,
10446 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10447 GEN_INT (m->fs.cfa_offset)));
10448 RTX_FRAME_RELATED_P (insn) = 1;
10453 /* Emit code to restore saved registers using POP insns. */
10455 static void
10456 ix86_emit_restore_regs_using_pop (void)
10458 unsigned int regno;
10460 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10461 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10462 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10465 /* Emit code and notes for the LEAVE instruction. */
10467 static void
10468 ix86_emit_leave (void)
10470 struct machine_function *m = cfun->machine;
10471 rtx insn = emit_insn (ix86_gen_leave ());
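/* Comment added for clarity: leave is equivalent to "mov %ebp, %esp; pop
   %ebp" (or the 64-bit forms), so afterwards the stack pointer is valid
   again and sits one word above the slot that held the saved frame pointer;
   the frame-state updates below mirror exactly that. */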
10473 ix86_add_queued_cfa_restore_notes (insn);
10475 gcc_assert (m->fs.fp_valid);
10476 m->fs.sp_valid = true;
10477 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10478 m->fs.fp_valid = false;
10480 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10482 m->fs.cfa_reg = stack_pointer_rtx;
10483 m->fs.cfa_offset = m->fs.sp_offset;
10485 add_reg_note (insn, REG_CFA_DEF_CFA,
10486 plus_constant (Pmode, stack_pointer_rtx,
10487 m->fs.sp_offset));
10488 RTX_FRAME_RELATED_P (insn) = 1;
10490 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10491 m->fs.fp_offset);
10494 /* Emit code to restore saved registers using MOV insns.
10495 First register is restored from CFA - CFA_OFFSET. */
10496 static void
10497 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10498 bool maybe_eh_return)
10500 struct machine_function *m = cfun->machine;
10501 unsigned int regno;
10503 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10504 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10506 rtx reg = gen_rtx_REG (word_mode, regno);
10507 rtx insn, mem;
10509 mem = choose_baseaddr (cfa_offset);
10510 mem = gen_frame_mem (word_mode, mem);
10511 insn = emit_move_insn (reg, mem);
10513 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10515 /* Previously we'd represented the CFA as an expression
10516 like *(%ebp - 8). We've just popped that value from
10517 the stack, which means we need to reset the CFA to
10518 the drap register. This will remain until we restore
10519 the stack pointer. */
10520 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10521 RTX_FRAME_RELATED_P (insn) = 1;
10523 /* This means that the DRAP register is valid for addressing. */
10524 m->fs.drap_valid = true;
10526 else
10527 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10529 cfa_offset -= UNITS_PER_WORD;
10533 /* Emit code to restore saved SSE registers using MOV insns.
10534 First register is restored from CFA - CFA_OFFSET. */
10535 static void
10536 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10537 bool maybe_eh_return)
10539 unsigned int regno;
10541 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10542 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10544 rtx reg = gen_rtx_REG (V4SFmode, regno);
10545 rtx mem;
10547 mem = choose_baseaddr (cfa_offset);
10548 mem = gen_rtx_MEM (V4SFmode, mem);
10549 set_mem_align (mem, 128);
10550 emit_move_insn (reg, mem);
10552 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10554 cfa_offset -= 16;
10558 /* Restore function stack, frame, and registers. */
10560 void
10561 ix86_expand_epilogue (int style)
10563 struct machine_function *m = cfun->machine;
10564 struct machine_frame_state frame_state_save = m->fs;
10565 struct ix86_frame frame;
10566 bool restore_regs_via_mov;
10567 bool using_drap;
10569 ix86_finalize_stack_realign_flags ();
10570 ix86_compute_frame_layout (&frame);
10572 m->fs.sp_valid = (!frame_pointer_needed
10573 || (crtl->sp_is_unchanging
10574 && !stack_realign_fp));
10575 gcc_assert (!m->fs.sp_valid
10576 || m->fs.sp_offset == frame.stack_pointer_offset);
10578 /* The FP must be valid if the frame pointer is present. */
10579 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10580 gcc_assert (!m->fs.fp_valid
10581 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10583 /* We must have *some* valid pointer to the stack frame. */
10584 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10586 /* The DRAP is never valid at this point. */
10587 gcc_assert (!m->fs.drap_valid);
10589 /* See the comment about red zone and frame
10590 pointer usage in ix86_expand_prologue. */
10591 if (frame_pointer_needed && frame.red_zone_size)
10592 emit_insn (gen_memory_blockage ());
10594 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10595 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10597 /* Determine the CFA offset of the end of the red-zone. */
10598 m->fs.red_zone_offset = 0;
10599 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10601 /* The red-zone begins below the return address. */
10602 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10604 /* When the register save area is in the aligned portion of
10605 the stack, determine the maximum runtime displacement that
10606 matches up with the aligned frame. */
10607 if (stack_realign_drap)
10608 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10609 + UNITS_PER_WORD);
10612 /* Special care must be taken for the normal return case of a function
10613 using eh_return: the eax and edx registers are marked as saved, but
10614 not restored along this path. Adjust the save location to match. */
10615 if (crtl->calls_eh_return && style != 2)
10616 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10618 /* EH_RETURN requires the use of moves to function properly. */
10619 if (crtl->calls_eh_return)
10620 restore_regs_via_mov = true;
10621 /* SEH requires the use of pops to identify the epilogue. */
10622 else if (TARGET_SEH)
10623 restore_regs_via_mov = false;
10624 /* If we're only restoring one register and sp is not valid, then
10625 use a move instruction to restore the register, since it's
10626 less work than reloading sp and popping the register. */
10627 else if (!m->fs.sp_valid && frame.nregs <= 1)
10628 restore_regs_via_mov = true;
10629 else if (TARGET_EPILOGUE_USING_MOVE
10630 && cfun->machine->use_fast_prologue_epilogue
10631 && (frame.nregs > 1
10632 || m->fs.sp_offset != frame.reg_save_offset))
10633 restore_regs_via_mov = true;
10634 else if (frame_pointer_needed
10635 && !frame.nregs
10636 && m->fs.sp_offset != frame.reg_save_offset)
10637 restore_regs_via_mov = true;
10638 else if (frame_pointer_needed
10639 && TARGET_USE_LEAVE
10640 && cfun->machine->use_fast_prologue_epilogue
10641 && frame.nregs == 1)
10642 restore_regs_via_mov = true;
10643 else
10644 restore_regs_via_mov = false;
10646 if (restore_regs_via_mov || frame.nsseregs)
10648 /* Ensure that the entire register save area is addressable via
10649 the stack pointer, if we will restore via sp. */
10650 if (TARGET_64BIT
10651 && m->fs.sp_offset > 0x7fffffff
10652 && !(m->fs.fp_valid || m->fs.drap_valid)
10653 && (frame.nsseregs + frame.nregs) != 0)
10655 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10656 GEN_INT (m->fs.sp_offset
10657 - frame.sse_reg_save_offset),
10658 style,
10659 m->fs.cfa_reg == stack_pointer_rtx);
10663 /* If there are any SSE registers to restore, then we have to do it
10664 via moves, since there's obviously no pop for SSE regs. */
10665 if (frame.nsseregs)
10666 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10667 style == 2);
10669 if (restore_regs_via_mov)
10671 rtx t;
10673 if (frame.nregs)
10674 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10676 /* eh_return epilogues need %ecx added to the stack pointer. */
10677 if (style == 2)
10679 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10681 /* Stack align doesn't work with eh_return. */
10682 gcc_assert (!stack_realign_drap);
10683 /* Neither do regparm nested functions. */
10684 gcc_assert (!ix86_static_chain_on_stack);
10686 if (frame_pointer_needed)
10688 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10689 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10690 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10692 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10693 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10695 /* Note that we use SA as a temporary CFA, as the return
10696 address is at the proper place relative to it. We
10697 pretend this happens at the FP restore insn because
10698 prior to this insn the FP would be stored at the wrong
10699 offset relative to SA, and after this insn we have no
10700 other reasonable register to use for the CFA. We don't
10701 bother resetting the CFA to the SP for the duration of
10702 the return insn. */
10703 add_reg_note (insn, REG_CFA_DEF_CFA,
10704 plus_constant (Pmode, sa, UNITS_PER_WORD));
10705 ix86_add_queued_cfa_restore_notes (insn);
10706 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10707 RTX_FRAME_RELATED_P (insn) = 1;
10709 m->fs.cfa_reg = sa;
10710 m->fs.cfa_offset = UNITS_PER_WORD;
10711 m->fs.fp_valid = false;
10713 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10714 const0_rtx, style, false);
10716 else
10718 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10719 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10720 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10721 ix86_add_queued_cfa_restore_notes (insn);
10723 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10724 if (m->fs.cfa_offset != UNITS_PER_WORD)
10726 m->fs.cfa_offset = UNITS_PER_WORD;
10727 add_reg_note (insn, REG_CFA_DEF_CFA,
10728 plus_constant (Pmode, stack_pointer_rtx,
10729 UNITS_PER_WORD));
10730 RTX_FRAME_RELATED_P (insn) = 1;
10733 m->fs.sp_offset = UNITS_PER_WORD;
10734 m->fs.sp_valid = true;
10737 else
10739 /* SEH requires that the function end with (1) a stack adjustment
10740 if necessary, (2) a sequence of pops, and (3) a return or
10741 jump instruction. Prevent insns from the function body from
10742 being scheduled into this sequence. */
10743 if (TARGET_SEH)
10745 /* Prevent a catch region from being adjacent to the standard
10746 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10747 several other flags that would be interesting to test are
10748 set up yet. */
10749 if (flag_non_call_exceptions)
10750 emit_insn (gen_nops (const1_rtx));
10751 else
10752 emit_insn (gen_blockage ());
10755 /* The first step is to deallocate the stack frame so that we can
10756 pop the registers. Also do it on SEH targets for a very large
10757 frame, as the emitted instructions aren't allowed by the ABI in
10758 epilogues. */
10759 if (!m->fs.sp_valid
10760 || (TARGET_SEH
10761 && (m->fs.sp_offset - frame.reg_save_offset
10762 >= SEH_MAX_FRAME_SIZE)))
10764 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10765 GEN_INT (m->fs.fp_offset
10766 - frame.reg_save_offset),
10767 style, false);
10769 else if (m->fs.sp_offset != frame.reg_save_offset)
10771 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10772 GEN_INT (m->fs.sp_offset
10773 - frame.reg_save_offset),
10774 style,
10775 m->fs.cfa_reg == stack_pointer_rtx);
10778 ix86_emit_restore_regs_using_pop ();
10781 /* If we used a frame pointer and haven't already got rid of it,
10782 then do so now. */
10783 if (m->fs.fp_valid)
10785 /* If the stack pointer is valid and pointing at the frame
10786 pointer store address, then we only need a pop. */
10787 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10788 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10789 /* Leave results in shorter dependency chains on CPUs that are
10790 able to grok it fast. */
10791 else if (TARGET_USE_LEAVE
10792 || optimize_function_for_size_p (cfun)
10793 || !cfun->machine->use_fast_prologue_epilogue)
10794 ix86_emit_leave ();
10795 else
10797 pro_epilogue_adjust_stack (stack_pointer_rtx,
10798 hard_frame_pointer_rtx,
10799 const0_rtx, style, !using_drap);
10800 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10804 if (using_drap)
10806 int param_ptr_offset = UNITS_PER_WORD;
10807 rtx insn;
10809 gcc_assert (stack_realign_drap);
10811 if (ix86_static_chain_on_stack)
10812 param_ptr_offset += UNITS_PER_WORD;
10813 if (!call_used_regs[REGNO (crtl->drap_reg)])
10814 param_ptr_offset += UNITS_PER_WORD;
10816 insn = emit_insn (gen_rtx_SET
10817 (VOIDmode, stack_pointer_rtx,
10818 gen_rtx_PLUS (Pmode,
10819 crtl->drap_reg,
10820 GEN_INT (-param_ptr_offset))));
10821 m->fs.cfa_reg = stack_pointer_rtx;
10822 m->fs.cfa_offset = param_ptr_offset;
10823 m->fs.sp_offset = param_ptr_offset;
10824 m->fs.realigned = false;
10826 add_reg_note (insn, REG_CFA_DEF_CFA,
10827 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10828 GEN_INT (param_ptr_offset)));
10829 RTX_FRAME_RELATED_P (insn) = 1;
10831 if (!call_used_regs[REGNO (crtl->drap_reg)])
10832 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10835 /* At this point the stack pointer must be valid, and we must have
10836 restored all of the registers. We may not have deallocated the
10837 entire stack frame. We've delayed this until now because it may
10838 be possible to merge the local stack deallocation with the
10839 deallocation forced by ix86_static_chain_on_stack. */
10840 gcc_assert (m->fs.sp_valid);
10841 gcc_assert (!m->fs.fp_valid);
10842 gcc_assert (!m->fs.realigned);
10843 if (m->fs.sp_offset != UNITS_PER_WORD)
10845 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10846 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10847 style, true);
10849 else
10850 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10852 /* Sibcall epilogues don't want a return instruction. */
10853 if (style == 0)
10855 m->fs = frame_state_save;
10856 return;
10859 if (crtl->args.pops_args && crtl->args.size)
10861 rtx popc = GEN_INT (crtl->args.pops_args);
10863 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10864 address, do an explicit add, and jump indirectly to the caller. */
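/* For illustration, the (32-bit only) sequence emitted below is roughly:
       popl  %ecx          # pop the return address into a scratch register
       addl  $N, %esp      # N == crtl->args.pops_args, which is >= 64K here
       jmp   *%ecx         # return to the caller indirectly
   The register and operand spellings are illustrative.  */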
10866 if (crtl->args.pops_args >= 65536)
10868 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10869 rtx insn;
10871 /* There is no "pascal" calling convention in any 64bit ABI. */
10872 gcc_assert (!TARGET_64BIT);
10874 insn = emit_insn (gen_pop (ecx));
10875 m->fs.cfa_offset -= UNITS_PER_WORD;
10876 m->fs.sp_offset -= UNITS_PER_WORD;
10878 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10879 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10880 add_reg_note (insn, REG_CFA_REGISTER,
10881 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10882 RTX_FRAME_RELATED_P (insn) = 1;
10884 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10885 popc, -1, true);
10886 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10888 else
10889 emit_jump_insn (gen_simple_return_pop_internal (popc));
10891 else
10892 emit_jump_insn (gen_simple_return_internal ());
10894 /* Restore the state back to the state from the prologue,
10895 so that it's correct for the next epilogue. */
10896 m->fs = frame_state_save;
10899 /* Reset from the function's potential modifications. */
10901 static void
10902 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10903 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10905 if (pic_offset_table_rtx)
10906 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10907 #if TARGET_MACHO
10908 /* Mach-O doesn't support labels at the end of objects, so if
10909 it looks like we might want one, insert a NOP. */
10911 rtx insn = get_last_insn ();
10912 rtx deleted_debug_label = NULL_RTX;
10913 while (insn
10914 && NOTE_P (insn)
10915 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10917 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
10918 a nop; instead set their CODE_LABEL_NUMBER to -1,
10919 otherwise there would be code generation differences
10920 between -g and -g0. */
10921 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10922 deleted_debug_label = insn;
10923 insn = PREV_INSN (insn);
10925 if (insn
10926 && (LABEL_P (insn)
10927 || (NOTE_P (insn)
10928 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10929 fputs ("\tnop\n", file);
10930 else if (deleted_debug_label)
10931 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
10932 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10933 CODE_LABEL_NUMBER (insn) = -1;
10935 #endif
10939 /* Return a scratch register to use in the split stack prologue. The
10940 split stack prologue is used for -fsplit-stack. It consists of the first
10941 instructions in the function, even before the regular prologue.
10942 The scratch register can be any caller-saved register which is not
10943 used for parameters or for the static chain. */
10945 static unsigned int
10946 split_stack_prologue_scratch_regno (void)
10948 if (TARGET_64BIT)
10949 return R11_REG;
10950 else
10952 bool is_fastcall;
10953 int regparm;
10955 is_fastcall = (lookup_attribute ("fastcall",
10956 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
10957 != NULL);
10958 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
10960 if (is_fastcall)
10962 if (DECL_STATIC_CHAIN (cfun->decl))
10964 sorry ("-fsplit-stack does not support fastcall with "
10965 "nested function");
10966 return INVALID_REGNUM;
10968 return AX_REG;
10970 else if (regparm < 3)
10972 if (!DECL_STATIC_CHAIN (cfun->decl))
10973 return CX_REG;
10974 else
10976 if (regparm >= 2)
10978 sorry ("-fsplit-stack does not support 2 register "
10979 " parameters for a nested function");
10980 return INVALID_REGNUM;
10982 return DX_REG;
10985 else
10987 /* FIXME: We could make this work by pushing a register
10988 around the addition and comparison. */
10989 sorry ("-fsplit-stack does not support 3 register parameters");
10990 return INVALID_REGNUM;
10995 /* A SYMBOL_REF for the function which allocates new stack space for
10996 -fsplit-stack. */
10998 static GTY(()) rtx split_stack_fn;
11000 /* A SYMBOL_REF for the more stack function when using the large
11001 model. */
11003 static GTY(()) rtx split_stack_fn_large;
11005 /* Handle -fsplit-stack. These are the first instructions in the
11006 function, even before the regular prologue. */
11008 void
11009 ix86_expand_split_stack_prologue (void)
11011 struct ix86_frame frame;
11012 HOST_WIDE_INT allocate;
11013 unsigned HOST_WIDE_INT args_size;
11014 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11015 rtx scratch_reg = NULL_RTX;
11016 rtx varargs_label = NULL_RTX;
11017 rtx fn;
11019 gcc_assert (flag_split_stack && reload_completed);
11021 ix86_finalize_stack_realign_flags ();
11022 ix86_compute_frame_layout (&frame);
11023 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11025 /* This is the label we will branch to if we have enough stack
11026 space. We expect the basic block reordering pass to reverse this
11027 branch if optimizing, so that we branch in the unlikely case. */
11028 label = gen_label_rtx ();
11030 /* We need to compare the stack pointer minus the frame size with
11031 the stack boundary in the TCB. The stack boundary always gives
11032 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11033 can compare directly. Otherwise we need to do an addition. */
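/* A sketch of the check emitted below (x86-64 case; the TCB slot offset and
   the %r11 scratch register are illustrative):
       leaq  -FRAME(%rsp), %r11   # only when FRAME >= SPLIT_STACK_AVAILABLE
       cmpq  %fs:BOUNDARY, %r11   # BOUNDARY = stack-limit slot in the TCB
       jae   .Lhave_stack         # enough stack: skip the __morestack call
   The 32-bit case is analogous, using %esp and the %gs segment.  */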
11035 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11036 UNSPEC_STACK_CHECK);
11037 limit = gen_rtx_CONST (Pmode, limit);
11038 limit = gen_rtx_MEM (Pmode, limit);
11039 if (allocate < SPLIT_STACK_AVAILABLE)
11040 current = stack_pointer_rtx;
11041 else
11043 unsigned int scratch_regno;
11044 rtx offset;
11046 /* We need a scratch register to hold the stack pointer minus
11047 the required frame size. Since this is the very start of the
11048 function, the scratch register can be any caller-saved
11049 register which is not used for parameters. */
11050 offset = GEN_INT (- allocate);
11051 scratch_regno = split_stack_prologue_scratch_regno ();
11052 if (scratch_regno == INVALID_REGNUM)
11053 return;
11054 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11055 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11057 /* We don't use ix86_gen_add3 in this case because it will
11058 want to split to lea, but when not optimizing the insn
11059 will not be split after this point. */
11060 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11061 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11062 offset)));
11064 else
11066 emit_move_insn (scratch_reg, offset);
11067 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11068 stack_pointer_rtx));
11070 current = scratch_reg;
11073 ix86_expand_branch (GEU, current, limit, label);
11074 jump_insn = get_last_insn ();
11075 JUMP_LABEL (jump_insn) = label;
11077 /* Mark the jump as very likely to be taken. */
11078 add_reg_note (jump_insn, REG_BR_PROB,
11079 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11081 if (split_stack_fn == NULL_RTX)
11082 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11083 fn = split_stack_fn;
11085 /* Get more stack space. We pass in the desired stack space and the
11086 size of the arguments to copy to the new stack. In 32-bit mode
11087 we push the parameters; __morestack will return on a new stack
11088 anyhow. In 64-bit mode we pass the parameters in r10 and
11089 r11. */
11090 allocate_rtx = GEN_INT (allocate);
11091 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11092 call_fusage = NULL_RTX;
11093 if (TARGET_64BIT)
11095 rtx reg10, reg11;
11097 reg10 = gen_rtx_REG (Pmode, R10_REG);
11098 reg11 = gen_rtx_REG (Pmode, R11_REG);
11100 /* If this function uses a static chain, it will be in %r10.
11101 Preserve it across the call to __morestack. */
11102 if (DECL_STATIC_CHAIN (cfun->decl))
11104 rtx rax;
11106 rax = gen_rtx_REG (word_mode, AX_REG);
11107 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11108 use_reg (&call_fusage, rax);
11111 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11113 HOST_WIDE_INT argval;
11115 gcc_assert (Pmode == DImode);
11116 /* When using the large model we need to load the address
11117 into a register, and we've run out of registers. So we
11118 switch to a different calling convention, and we call a
11119 different function: __morestack_large. We pass the
11120 argument size in the upper 32 bits of r10 and pass the
11121 frame size in the lower 32 bits. */
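/* Worked example with hypothetical values: args_size == 0x20 and
   allocate == 0x1000 give argval == (0x20 << 32) + 0x1000
   == 0x0000002000001000, so %r10 carries the argument size in its upper
   half and the frame size in its lower half.  */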
11122 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11123 gcc_assert ((args_size & 0xffffffff) == args_size);
11125 if (split_stack_fn_large == NULL_RTX)
11126 split_stack_fn_large =
11127 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11129 if (ix86_cmodel == CM_LARGE_PIC)
11131 rtx label, x;
11133 label = gen_label_rtx ();
11134 emit_label (label);
11135 LABEL_PRESERVE_P (label) = 1;
11136 emit_insn (gen_set_rip_rex64 (reg10, label));
11137 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11138 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11139 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11140 UNSPEC_GOT);
11141 x = gen_rtx_CONST (Pmode, x);
11142 emit_move_insn (reg11, x);
11143 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11144 x = gen_const_mem (Pmode, x);
11145 emit_move_insn (reg11, x);
11147 else
11148 emit_move_insn (reg11, split_stack_fn_large);
11150 fn = reg11;
11152 argval = ((args_size << 16) << 16) + allocate;
11153 emit_move_insn (reg10, GEN_INT (argval));
11155 else
11157 emit_move_insn (reg10, allocate_rtx);
11158 emit_move_insn (reg11, GEN_INT (args_size));
11159 use_reg (&call_fusage, reg11);
11162 use_reg (&call_fusage, reg10);
11164 else
11166 emit_insn (gen_push (GEN_INT (args_size)));
11167 emit_insn (gen_push (allocate_rtx));
11169 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11170 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11171 NULL_RTX, false);
11172 add_function_usage_to (call_insn, call_fusage);
11174 /* In order to make call/return prediction work right, we now need
11175 to execute a return instruction. See
11176 libgcc/config/i386/morestack.S for the details on how this works.
11178 For flow purposes gcc must not see this as a return
11179 instruction--we need control flow to continue at the subsequent
11180 label. Therefore, we use an unspec. */
11181 gcc_assert (crtl->args.pops_args < 65536);
11182 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11184 /* If we are in 64-bit mode and this function uses a static chain,
11185 we saved %r10 in %rax before calling __morestack.
11186 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11187 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11188 gen_rtx_REG (word_mode, AX_REG));
11190 /* If this function calls va_start, we need to store a pointer to
11191 the arguments on the old stack, because they may not have been
11192 all copied to the new stack. At this point the old stack can be
11193 found at the frame pointer value used by __morestack, because
11194 __morestack has set that up before calling back to us. Here we
11195 store that pointer in a scratch register, and in
11196 ix86_expand_prologue we store the scratch register in a stack
11197 slot. */
11198 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11200 unsigned int scratch_regno;
11201 rtx frame_reg;
11202 int words;
11204 scratch_regno = split_stack_prologue_scratch_regno ();
11205 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11206 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11208 /* 64-bit:
11209 fp -> old fp value
11210 return address within this function
11211 return address of caller of this function
11212 stack arguments
11213 So we add three words to get to the stack arguments.
11215 32-bit:
11216 fp -> old fp value
11217 return address within this function
11218 first argument to __morestack
11219 second argument to __morestack
11220 return address of caller of this function
11221 stack arguments
11222 So we add five words to get to the stack arguments.
11224 words = TARGET_64BIT ? 3 : 5;
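/* E.g. with UNITS_PER_WORD == 8 this skips 3 * 8 == 24 bytes above the
   saved frame pointer on 64-bit targets, and 5 * 4 == 20 bytes on
   32-bit targets.  */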
11225 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11226 gen_rtx_PLUS (Pmode, frame_reg,
11227 GEN_INT (words * UNITS_PER_WORD))));
11229 varargs_label = gen_label_rtx ();
11230 emit_jump_insn (gen_jump (varargs_label));
11231 JUMP_LABEL (get_last_insn ()) = varargs_label;
11233 emit_barrier ();
11236 emit_label (label);
11237 LABEL_NUSES (label) = 1;
11239 /* If this function calls va_start, we now have to set the scratch
11240 register for the case where we do not call __morestack. In this
11241 case we need to set it based on the stack pointer. */
11242 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11244 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11245 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11246 GEN_INT (UNITS_PER_WORD))));
11248 emit_label (varargs_label);
11249 LABEL_NUSES (varargs_label) = 1;
11253 /* We may have to tell the dataflow pass that the split stack prologue
11254 is initializing a scratch register. */
11256 static void
11257 ix86_live_on_entry (bitmap regs)
11259 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11261 gcc_assert (flag_split_stack);
11262 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11266 /* Determine if OP is a suitable SUBREG RTX for use in an address. */
11268 static bool
11269 ix86_address_subreg_operand (rtx op)
11271 enum machine_mode mode;
11273 if (!REG_P (op))
11274 return false;
11276 mode = GET_MODE (op);
11278 if (GET_MODE_CLASS (mode) != MODE_INT)
11279 return false;
11281 /* Don't allow SUBREGs that span more than a word. They can lead to spill
11282 failures when the register is one word out of a two word structure. */
11283 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11284 return false;
11286 /* Allow only SUBREGs of non-eliminable hard registers. */
11287 return register_no_elim_operand (op, mode);
11290 /* Extract the parts of an RTL expression that is a valid memory address
11291 for an instruction. Return 0 if the structure of the address is
11292 grossly off. Return -1 if the address contains ASHIFT, so it is not
11293 strictly valid, but is still used for computing the length of an lea instruction. */
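/* An illustrative decomposition (the register names are placeholders):
       (plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 12))
   yields base == A, index == B, scale == 4 and disp == (const_int 12),
   i.e. the operand printed as 12(%A,%B,4).  */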
11296 ix86_decompose_address (rtx addr, struct ix86_address *out)
11298 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11299 rtx base_reg, index_reg;
11300 HOST_WIDE_INT scale = 1;
11301 rtx scale_rtx = NULL_RTX;
11302 rtx tmp;
11303 int retval = 1;
11304 enum ix86_address_seg seg = SEG_DEFAULT;
11306 /* Allow zero-extended SImode addresses,
11307 they will be emitted with addr32 prefix. */
11308 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11310 if (GET_CODE (addr) == ZERO_EXTEND
11311 && GET_MODE (XEXP (addr, 0)) == SImode)
11313 addr = XEXP (addr, 0);
11314 if (CONST_INT_P (addr))
11315 return 0;
11317 else if (GET_CODE (addr) == AND
11318 && const_32bit_mask (XEXP (addr, 1), DImode))
11320 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11321 if (addr == NULL_RTX)
11322 return 0;
11324 if (CONST_INT_P (addr))
11325 return 0;
11329 /* Allow SImode subregs of DImode addresses,
11330 they will be emitted with addr32 prefix. */
11331 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11333 if (GET_CODE (addr) == SUBREG
11334 && GET_MODE (SUBREG_REG (addr)) == DImode)
11336 addr = SUBREG_REG (addr);
11337 if (CONST_INT_P (addr))
11338 return 0;
11342 if (REG_P (addr))
11343 base = addr;
11344 else if (GET_CODE (addr) == SUBREG)
11346 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11347 base = addr;
11348 else
11349 return 0;
11351 else if (GET_CODE (addr) == PLUS)
11353 rtx addends[4], op;
11354 int n = 0, i;
11356 op = addr;
11359 if (n >= 4)
11360 return 0;
11361 addends[n++] = XEXP (op, 1);
11362 op = XEXP (op, 0);
11364 while (GET_CODE (op) == PLUS);
11365 if (n >= 4)
11366 return 0;
11367 addends[n] = op;
11369 for (i = n; i >= 0; --i)
11371 op = addends[i];
11372 switch (GET_CODE (op))
11374 case MULT:
11375 if (index)
11376 return 0;
11377 index = XEXP (op, 0);
11378 scale_rtx = XEXP (op, 1);
11379 break;
11381 case ASHIFT:
11382 if (index)
11383 return 0;
11384 index = XEXP (op, 0);
11385 tmp = XEXP (op, 1);
11386 if (!CONST_INT_P (tmp))
11387 return 0;
11388 scale = INTVAL (tmp);
11389 if ((unsigned HOST_WIDE_INT) scale > 3)
11390 return 0;
11391 scale = 1 << scale;
11392 break;
11394 case ZERO_EXTEND:
11395 op = XEXP (op, 0);
11396 if (GET_CODE (op) != UNSPEC)
11397 return 0;
11398 /* FALLTHRU */
11400 case UNSPEC:
11401 if (XINT (op, 1) == UNSPEC_TP
11402 && TARGET_TLS_DIRECT_SEG_REFS
11403 && seg == SEG_DEFAULT)
11404 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11405 else
11406 return 0;
11407 break;
11409 case SUBREG:
11410 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11411 return 0;
11412 /* FALLTHRU */
11414 case REG:
11415 if (!base)
11416 base = op;
11417 else if (!index)
11418 index = op;
11419 else
11420 return 0;
11421 break;
11423 case CONST:
11424 case CONST_INT:
11425 case SYMBOL_REF:
11426 case LABEL_REF:
11427 if (disp)
11428 return 0;
11429 disp = op;
11430 break;
11432 default:
11433 return 0;
11437 else if (GET_CODE (addr) == MULT)
11439 index = XEXP (addr, 0); /* index*scale */
11440 scale_rtx = XEXP (addr, 1);
11442 else if (GET_CODE (addr) == ASHIFT)
11444 /* We're called for lea too, which implements ashift on occasion. */
11445 index = XEXP (addr, 0);
11446 tmp = XEXP (addr, 1);
11447 if (!CONST_INT_P (tmp))
11448 return 0;
11449 scale = INTVAL (tmp);
11450 if ((unsigned HOST_WIDE_INT) scale > 3)
11451 return 0;
11452 scale = 1 << scale;
11453 retval = -1;
11455 else if (CONST_INT_P (addr))
11457 if (!x86_64_immediate_operand (addr, VOIDmode))
11458 return 0;
11460 /* Constant addresses are sign extended to 64bit; we have to
11461 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
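/* E.g. (const_int 0x80000000) used as an address would be sign extended
   by the hardware to 0xffffffff80000000, which lies outside the 32-bit
   address space available to x32 code.  */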
11462 if (TARGET_X32
11463 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11464 return 0;
11466 disp = addr;
11468 else
11469 disp = addr; /* displacement */
11471 if (index)
11473 if (REG_P (index))
11475 else if (GET_CODE (index) == SUBREG
11476 && ix86_address_subreg_operand (SUBREG_REG (index)))
11478 else
11479 return 0;
11482 /* Address override works only on the (%reg) part of %fs:(%reg). */
11483 if (seg != SEG_DEFAULT
11484 && ((base && GET_MODE (base) != word_mode)
11485 || (index && GET_MODE (index) != word_mode)))
11486 return 0;
11488 /* Extract the integral value of scale. */
11489 if (scale_rtx)
11491 if (!CONST_INT_P (scale_rtx))
11492 return 0;
11493 scale = INTVAL (scale_rtx);
11496 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11497 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11499 /* Avoid useless 0 displacement. */
11500 if (disp == const0_rtx && (base || index))
11501 disp = NULL_RTX;
11503 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11504 if (base_reg && index_reg && scale == 1
11505 && (index_reg == arg_pointer_rtx
11506 || index_reg == frame_pointer_rtx
11507 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11509 rtx tmp;
11510 tmp = base, base = index, index = tmp;
11511 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11514 /* Special case: %ebp cannot be encoded as a base without a displacement.
11515 Similarly %r13. */
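/* In the ModRM/SIB encoding, mod == 00 together with a base encoding of
   101 (%ebp, and %r13 with REX.B) does not mean "base with no
   displacement"; it selects a disp32-only (or RIP-relative) form instead,
   so plain (%ebp) has to be emitted as 0(%ebp).  */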
11516 if (!disp
11517 && base_reg
11518 && (base_reg == hard_frame_pointer_rtx
11519 || base_reg == frame_pointer_rtx
11520 || base_reg == arg_pointer_rtx
11521 || (REG_P (base_reg)
11522 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11523 || REGNO (base_reg) == R13_REG))))
11524 disp = const0_rtx;
11526 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11527 Avoid this by transforming to [%esi+0].
11528 Reload calls address legitimization without cfun defined, so we need
11529 to test cfun for being non-NULL. */
11530 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11531 && base_reg && !index_reg && !disp
11532 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11533 disp = const0_rtx;
11535 /* Special case: encode reg+reg instead of reg*2. */
11536 if (!base && index && scale == 2)
11537 base = index, base_reg = index_reg, scale = 1;
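/* E.g. (mult (reg A) (const_int 2)) with no base becomes base A plus
   index A with scale 1, i.e. (%A,%A,1) rather than (,%A,2); the latter
   base-less form would require an explicit 32-bit displacement.  */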
11539 /* Special case: scaling cannot be encoded without base or displacement. */
11540 if (!base && !disp && index && scale != 1)
11541 disp = const0_rtx;
11543 out->base = base;
11544 out->index = index;
11545 out->disp = disp;
11546 out->scale = scale;
11547 out->seg = seg;
11549 return retval;
11552 /* Return cost of the memory address x.
11553 For i386, it is better to use a complex address than let gcc copy
11554 the address into a reg and make a new pseudo. But not if the address
11555 requires two regs - that would mean more pseudos with longer
11556 lifetimes. */
11557 static int
11558 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11559 addr_space_t as ATTRIBUTE_UNUSED,
11560 bool speed ATTRIBUTE_UNUSED)
11562 struct ix86_address parts;
11563 int cost = 1;
11564 int ok = ix86_decompose_address (x, &parts);
11566 gcc_assert (ok);
11568 if (parts.base && GET_CODE (parts.base) == SUBREG)
11569 parts.base = SUBREG_REG (parts.base);
11570 if (parts.index && GET_CODE (parts.index) == SUBREG)
11571 parts.index = SUBREG_REG (parts.index);
11573 /* Attempt to minimize number of registers in the address. */
11574 if ((parts.base
11575 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11576 || (parts.index
11577 && (!REG_P (parts.index)
11578 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11579 cost++;
11581 if (parts.base
11582 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11583 && parts.index
11584 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11585 && parts.base != parts.index)
11586 cost++;
11588 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11589 since its predecode logic can't detect the length of instructions
11590 and it degenerates to vector decoding. Increase the cost of such
11591 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11592 to split such addresses or even refuse such addresses at all.
11594 The following addressing modes are affected:
11595 [base+scale*index]
11596 [scale*index+disp]
11597 [base+index]
11599 The first and last case may be avoidable by explicitly coding the zero into
11600 the memory address, but I don't have an AMD-K6 machine handy to check this
11601 theory. */
11603 if (TARGET_K6
11604 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11605 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11606 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11607 cost += 10;
11609 return cost;
11612 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11613 this is used to form addresses to local data when -fPIC is in
11614 use. */
11616 static bool
11617 darwin_local_data_pic (rtx disp)
11619 return (GET_CODE (disp) == UNSPEC
11620 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11623 /* Determine if a given RTX is a valid constant. We already know this
11624 satisfies CONSTANT_P. */
11626 static bool
11627 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11629 switch (GET_CODE (x))
11631 case CONST:
11632 x = XEXP (x, 0);
11634 if (GET_CODE (x) == PLUS)
11636 if (!CONST_INT_P (XEXP (x, 1)))
11637 return false;
11638 x = XEXP (x, 0);
11641 if (TARGET_MACHO && darwin_local_data_pic (x))
11642 return true;
11644 /* Only some unspecs are valid as "constants". */
11645 if (GET_CODE (x) == UNSPEC)
11646 switch (XINT (x, 1))
11648 case UNSPEC_GOT:
11649 case UNSPEC_GOTOFF:
11650 case UNSPEC_PLTOFF:
11651 return TARGET_64BIT;
11652 case UNSPEC_TPOFF:
11653 case UNSPEC_NTPOFF:
11654 x = XVECEXP (x, 0, 0);
11655 return (GET_CODE (x) == SYMBOL_REF
11656 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11657 case UNSPEC_DTPOFF:
11658 x = XVECEXP (x, 0, 0);
11659 return (GET_CODE (x) == SYMBOL_REF
11660 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11661 default:
11662 return false;
11665 /* We must have drilled down to a symbol. */
11666 if (GET_CODE (x) == LABEL_REF)
11667 return true;
11668 if (GET_CODE (x) != SYMBOL_REF)
11669 return false;
11670 /* FALLTHRU */
11672 case SYMBOL_REF:
11673 /* TLS symbols are never valid. */
11674 if (SYMBOL_REF_TLS_MODEL (x))
11675 return false;
11677 /* DLLIMPORT symbols are never valid. */
11678 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11679 && SYMBOL_REF_DLLIMPORT_P (x))
11680 return false;
11682 #if TARGET_MACHO
11683 /* mdynamic-no-pic */
11684 if (MACHO_DYNAMIC_NO_PIC_P)
11685 return machopic_symbol_defined_p (x);
11686 #endif
11687 break;
11689 case CONST_DOUBLE:
11690 if (GET_MODE (x) == TImode
11691 && x != CONST0_RTX (TImode)
11692 && !TARGET_64BIT)
11693 return false;
11694 break;
11696 case CONST_VECTOR:
11697 if (!standard_sse_constant_p (x))
11698 return false;
11700 default:
11701 break;
11704 /* Otherwise we handle everything else in the move patterns. */
11705 return true;
11708 /* Determine if it's legal to put X into the constant pool. This
11709 is not possible for the address of thread-local symbols, which
11710 is checked above. */
11712 static bool
11713 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11715 /* We can always put integral constants and vectors in memory. */
11716 switch (GET_CODE (x))
11718 case CONST_INT:
11719 case CONST_DOUBLE:
11720 case CONST_VECTOR:
11721 return false;
11723 default:
11724 break;
11726 return !ix86_legitimate_constant_p (mode, x);
11730 /* Nonzero if the constant value X is a legitimate general operand
11731 when generating PIC code. It is given that flag_pic is on and
11732 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11734 bool
11735 legitimate_pic_operand_p (rtx x)
11737 rtx inner;
11739 switch (GET_CODE (x))
11741 case CONST:
11742 inner = XEXP (x, 0);
11743 if (GET_CODE (inner) == PLUS
11744 && CONST_INT_P (XEXP (inner, 1)))
11745 inner = XEXP (inner, 0);
11747 /* Only some unspecs are valid as "constants". */
11748 if (GET_CODE (inner) == UNSPEC)
11749 switch (XINT (inner, 1))
11751 case UNSPEC_GOT:
11752 case UNSPEC_GOTOFF:
11753 case UNSPEC_PLTOFF:
11754 return TARGET_64BIT;
11755 case UNSPEC_TPOFF:
11756 x = XVECEXP (inner, 0, 0);
11757 return (GET_CODE (x) == SYMBOL_REF
11758 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11759 case UNSPEC_MACHOPIC_OFFSET:
11760 return legitimate_pic_address_disp_p (x);
11761 default:
11762 return false;
11764 /* FALLTHRU */
11766 case SYMBOL_REF:
11767 case LABEL_REF:
11768 return legitimate_pic_address_disp_p (x);
11770 default:
11771 return true;
11775 /* Determine if a given CONST RTX is a valid memory displacement
11776 in PIC mode. */
11778 bool
11779 legitimate_pic_address_disp_p (rtx disp)
11781 bool saw_plus;
11783 /* In 64bit mode we can allow direct addresses of symbols and labels
11784 when they are not dynamic symbols. */
11785 if (TARGET_64BIT)
11787 rtx op0 = disp, op1;
11789 switch (GET_CODE (disp))
11791 case LABEL_REF:
11792 return true;
11794 case CONST:
11795 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11796 break;
11797 op0 = XEXP (XEXP (disp, 0), 0);
11798 op1 = XEXP (XEXP (disp, 0), 1);
11799 if (!CONST_INT_P (op1)
11800 || INTVAL (op1) >= 16*1024*1024
11801 || INTVAL (op1) < -16*1024*1024)
11802 break;
11803 if (GET_CODE (op0) == LABEL_REF)
11804 return true;
11805 if (GET_CODE (op0) == CONST
11806 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11807 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11808 return true;
11809 if (GET_CODE (op0) == UNSPEC
11810 && XINT (op0, 1) == UNSPEC_PCREL)
11811 return true;
11812 if (GET_CODE (op0) != SYMBOL_REF)
11813 break;
11814 /* FALLTHRU */
11816 case SYMBOL_REF:
11817 /* TLS references should always be enclosed in UNSPEC. */
11818 if (SYMBOL_REF_TLS_MODEL (op0))
11819 return false;
11820 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11821 && ix86_cmodel != CM_LARGE_PIC)
11822 return true;
11823 break;
11825 default:
11826 break;
11829 if (GET_CODE (disp) != CONST)
11830 return false;
11831 disp = XEXP (disp, 0);
11833 if (TARGET_64BIT)
11835 /* It is unsafe to allow PLUS expressions here; this limits the allowed
11836 distance of GOT table references. We should not need these anyway. */
11837 if (GET_CODE (disp) != UNSPEC
11838 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11839 && XINT (disp, 1) != UNSPEC_GOTOFF
11840 && XINT (disp, 1) != UNSPEC_PCREL
11841 && XINT (disp, 1) != UNSPEC_PLTOFF))
11842 return false;
11844 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11845 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11846 return false;
11847 return true;
11850 saw_plus = false;
11851 if (GET_CODE (disp) == PLUS)
11853 if (!CONST_INT_P (XEXP (disp, 1)))
11854 return false;
11855 disp = XEXP (disp, 0);
11856 saw_plus = true;
11859 if (TARGET_MACHO && darwin_local_data_pic (disp))
11860 return true;
11862 if (GET_CODE (disp) != UNSPEC)
11863 return false;
11865 switch (XINT (disp, 1))
11867 case UNSPEC_GOT:
11868 if (saw_plus)
11869 return false;
11870 /* We need to check for both symbols and labels because VxWorks loads
11871 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11872 details. */
11873 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11874 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11875 case UNSPEC_GOTOFF:
11876 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11877 While the ABI also specifies a 32bit relocation, we don't produce it in
11878 the small PIC model at all. */
11879 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11880 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11881 && !TARGET_64BIT)
11882 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11883 return false;
11884 case UNSPEC_GOTTPOFF:
11885 case UNSPEC_GOTNTPOFF:
11886 case UNSPEC_INDNTPOFF:
11887 if (saw_plus)
11888 return false;
11889 disp = XVECEXP (disp, 0, 0);
11890 return (GET_CODE (disp) == SYMBOL_REF
11891 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11892 case UNSPEC_NTPOFF:
11893 disp = XVECEXP (disp, 0, 0);
11894 return (GET_CODE (disp) == SYMBOL_REF
11895 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11896 case UNSPEC_DTPOFF:
11897 disp = XVECEXP (disp, 0, 0);
11898 return (GET_CODE (disp) == SYMBOL_REF
11899 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11902 return false;
11905 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
11906 replace the input X, or the original X if no replacement is called for.
11907 The output parameter *WIN is 1 if the calling macro should goto WIN,
11908 0 if it should not. */
11910 bool
11911 ix86_legitimize_reload_address (rtx x,
11912 enum machine_mode mode ATTRIBUTE_UNUSED,
11913 int opnum, int type,
11914 int ind_levels ATTRIBUTE_UNUSED)
11916 /* Reload can generate:
11918 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
11919 (reg:DI 97))
11920 (reg:DI 2 cx))
11922 This RTX is rejected by ix86_legitimate_address_p due to
11923 the non-strictness of base register 97. Following this rejection,
11924 reload pushes all three components into separate registers,
11925 creating an invalid memory address RTX.
11927 The following code reloads only the invalid part of the
11928 memory address RTX. */
11930 if (GET_CODE (x) == PLUS
11931 && REG_P (XEXP (x, 1))
11932 && GET_CODE (XEXP (x, 0)) == PLUS
11933 && REG_P (XEXP (XEXP (x, 0), 1)))
11935 rtx base, index;
11936 bool something_reloaded = false;
11938 base = XEXP (XEXP (x, 0), 1);
11939 if (!REG_OK_FOR_BASE_STRICT_P (base))
11941 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
11942 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
11943 opnum, (enum reload_type) type);
11944 something_reloaded = true;
11947 index = XEXP (x, 1);
11948 if (!REG_OK_FOR_INDEX_STRICT_P (index))
11950 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
11951 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
11952 opnum, (enum reload_type) type);
11953 something_reloaded = true;
11956 gcc_assert (something_reloaded);
11957 return true;
11960 return false;
11963 /* Recognizes RTL expressions that are valid memory addresses for an
11964 instruction. The MODE argument is the machine mode for the MEM
11965 expression that wants to use this address.
11967 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11968 convert common non-canonical forms to canonical form so that they will
11969 be recognized. */
11971 static bool
11972 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11973 rtx addr, bool strict)
11975 struct ix86_address parts;
11976 rtx base, index, disp;
11977 HOST_WIDE_INT scale;
11979 if (ix86_decompose_address (addr, &parts) <= 0)
11980 /* Decomposition failed. */
11981 return false;
11983 base = parts.base;
11984 index = parts.index;
11985 disp = parts.disp;
11986 scale = parts.scale;
11988 /* Validate base register. */
11989 if (base)
11991 rtx reg;
11993 if (REG_P (base))
11994 reg = base;
11995 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11996 reg = SUBREG_REG (base);
11997 else
11998 /* Base is not a register. */
11999 return false;
12001 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12002 return false;
12004 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12005 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12006 /* Base is not valid. */
12007 return false;
12010 /* Validate index register. */
12011 if (index)
12013 rtx reg;
12015 if (REG_P (index))
12016 reg = index;
12017 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12018 reg = SUBREG_REG (index);
12019 else
12020 /* Index is not a register. */
12021 return false;
12023 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12024 return false;
12026 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12027 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12028 /* Index is not valid. */
12029 return false;
12032 /* Index and base should have the same mode. */
12033 if (base && index
12034 && GET_MODE (base) != GET_MODE (index))
12035 return false;
12037 /* Validate scale factor. */
12038 if (scale != 1)
12040 if (!index)
12041 /* Scale without index. */
12042 return false;
12044 if (scale != 2 && scale != 4 && scale != 8)
12045 /* Scale is not a valid multiplier. */
12046 return false;
12049 /* Validate displacement. */
12050 if (disp)
12052 if (GET_CODE (disp) == CONST
12053 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12054 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12055 switch (XINT (XEXP (disp, 0), 1))
12057 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12058 used. While the ABI also specifies 32bit relocations, we don't produce
12059 them at all and use IP-relative addressing instead. */
12060 case UNSPEC_GOT:
12061 case UNSPEC_GOTOFF:
12062 gcc_assert (flag_pic);
12063 if (!TARGET_64BIT)
12064 goto is_legitimate_pic;
12066 /* 64bit address unspec. */
12067 return false;
12069 case UNSPEC_GOTPCREL:
12070 case UNSPEC_PCREL:
12071 gcc_assert (flag_pic);
12072 goto is_legitimate_pic;
12074 case UNSPEC_GOTTPOFF:
12075 case UNSPEC_GOTNTPOFF:
12076 case UNSPEC_INDNTPOFF:
12077 case UNSPEC_NTPOFF:
12078 case UNSPEC_DTPOFF:
12079 break;
12081 case UNSPEC_STACK_CHECK:
12082 gcc_assert (flag_split_stack);
12083 break;
12085 default:
12086 /* Invalid address unspec. */
12087 return false;
12090 else if (SYMBOLIC_CONST (disp)
12091 && (flag_pic
12092 || (TARGET_MACHO
12093 #if TARGET_MACHO
12094 && MACHOPIC_INDIRECT
12095 && !machopic_operand_p (disp)
12096 #endif
12100 is_legitimate_pic:
12101 if (TARGET_64BIT && (index || base))
12103 /* foo@dtpoff(%rX) is ok. */
12104 if (GET_CODE (disp) != CONST
12105 || GET_CODE (XEXP (disp, 0)) != PLUS
12106 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12107 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12108 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12109 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12110 /* Non-constant pic memory reference. */
12111 return false;
12113 else if ((!TARGET_MACHO || flag_pic)
12114 && ! legitimate_pic_address_disp_p (disp))
12115 /* Displacement is an invalid pic construct. */
12116 return false;
12117 #if TARGET_MACHO
12118 else if (MACHO_DYNAMIC_NO_PIC_P
12119 && !ix86_legitimate_constant_p (Pmode, disp))
12120 /* displacement must be referenced via non_lazy_pointer */
12121 return false;
12122 #endif
12124 /* This code used to verify that a symbolic pic displacement
12125 includes the pic_offset_table_rtx register.
12127 While this is a good idea, unfortunately these constructs may
12128 be created by the "adds using lea" optimization for incorrect
12129 code like:
12131 int a;
12132 int foo(int i)
12134 return *(&a+i);
12137 This code is nonsensical, but results in addressing the
12138 GOT table with pic_offset_table_rtx as the base. We can't
12139 just refuse it easily, since it gets matched by the
12140 "addsi3" pattern, which later gets split to lea when the
12141 output register differs from the input. While this
12142 could be handled by a separate addsi pattern for this case
12143 that never results in lea, disabling this test seems to be
12144 the easier and correct fix for the crash. */
12146 else if (GET_CODE (disp) != LABEL_REF
12147 && !CONST_INT_P (disp)
12148 && (GET_CODE (disp) != CONST
12149 || !ix86_legitimate_constant_p (Pmode, disp))
12150 && (GET_CODE (disp) != SYMBOL_REF
12151 || !ix86_legitimate_constant_p (Pmode, disp)))
12152 /* Displacement is not constant. */
12153 return false;
12154 else if (TARGET_64BIT
12155 && !x86_64_immediate_operand (disp, VOIDmode))
12156 /* Displacement is out of range. */
12157 return false;
12160 /* Everything looks valid. */
12161 return true;
12164 /* Determine if a given RTX is a valid constant address. */
12166 bool
12167 constant_address_p (rtx x)
12169 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12172 /* Return a unique alias set for the GOT. */
12174 static alias_set_type
12175 ix86_GOT_alias_set (void)
12177 static alias_set_type set = -1;
12178 if (set == -1)
12179 set = new_alias_set ();
12180 return set;
12183 /* Return a legitimate reference for ORIG (an address) using the
12184 register REG. If REG is 0, a new pseudo is generated.
12186 There are two types of references that must be handled:
12188 1. Global data references must load the address from the GOT, via
12189 the PIC reg. An insn is emitted to do this load, and the reg is
12190 returned.
12192 2. Static data references, constant pool addresses, and code labels
12193 compute the address as an offset from the GOT, whose base is in
12194 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12195 differentiate them from global data objects. The returned
12196 address is the PIC reg + an unspec constant.
12198 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12199 reg also appears in the address. */
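/* Illustrative 32-bit assembly for the two cases (assuming %ebx holds the
   PIC register):
       movl  foo@GOT(%ebx), %eax     # case 1: load the address from the GOT
       leal  bar@GOTOFF(%ebx), %eax  # case 2: PIC reg + unspec offset
   The 64-bit small-PIC path instead uses a RIP-relative @GOTPCREL load,
   handled further below.  */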
12201 static rtx
12202 legitimize_pic_address (rtx orig, rtx reg)
12204 rtx addr = orig;
12205 rtx new_rtx = orig;
12206 rtx base;
12208 #if TARGET_MACHO
12209 if (TARGET_MACHO && !TARGET_64BIT)
12211 if (reg == 0)
12212 reg = gen_reg_rtx (Pmode);
12213 /* Use the generic Mach-O PIC machinery. */
12214 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12216 #endif
12218 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12219 new_rtx = addr;
12220 else if (TARGET_64BIT
12221 && ix86_cmodel != CM_SMALL_PIC
12222 && gotoff_operand (addr, Pmode))
12224 rtx tmpreg;
12225 /* This symbol may be referenced via a displacement from the PIC
12226 base address (@GOTOFF). */
12228 if (reload_in_progress)
12229 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12230 if (GET_CODE (addr) == CONST)
12231 addr = XEXP (addr, 0);
12232 if (GET_CODE (addr) == PLUS)
12234 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12235 UNSPEC_GOTOFF);
12236 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12238 else
12239 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12240 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12241 if (!reg)
12242 tmpreg = gen_reg_rtx (Pmode);
12243 else
12244 tmpreg = reg;
12245 emit_move_insn (tmpreg, new_rtx);
12247 if (reg != 0)
12249 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12250 tmpreg, 1, OPTAB_DIRECT);
12251 new_rtx = reg;
12253 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12255 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12257 /* This symbol may be referenced via a displacement from the PIC
12258 base address (@GOTOFF). */
12260 if (reload_in_progress)
12261 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12262 if (GET_CODE (addr) == CONST)
12263 addr = XEXP (addr, 0);
12264 if (GET_CODE (addr) == PLUS)
12266 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12267 UNSPEC_GOTOFF);
12268 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12270 else
12271 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12272 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12273 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12275 if (reg != 0)
12277 emit_move_insn (reg, new_rtx);
12278 new_rtx = reg;
12281 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12282 /* We can't use @GOTOFF for text labels on VxWorks;
12283 see gotoff_operand. */
12284 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12286 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12288 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12289 return legitimize_dllimport_symbol (addr, true);
12290 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12291 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12292 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12294 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12295 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12299 /* For x64 PE-COFF there is no GOT table, so we use the address
12300 directly. */
12301 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12303 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12304 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12306 if (reg == 0)
12307 reg = gen_reg_rtx (Pmode);
12308 emit_move_insn (reg, new_rtx);
12309 new_rtx = reg;
12311 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12313 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12314 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12315 new_rtx = gen_const_mem (Pmode, new_rtx);
12316 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12318 if (reg == 0)
12319 reg = gen_reg_rtx (Pmode);
12320 /* Use gen_movsi directly, otherwise the address is loaded
12321 into a register for CSE. We don't want to CSE these addresses;
12322 instead we CSE addresses from the GOT table, so skip this. */
12323 emit_insn (gen_movsi (reg, new_rtx));
12324 new_rtx = reg;
12326 else
12328 /* This symbol must be referenced via a load from the
12329 Global Offset Table (@GOT). */
12331 if (reload_in_progress)
12332 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12333 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12334 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12335 if (TARGET_64BIT)
12336 new_rtx = force_reg (Pmode, new_rtx);
12337 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12338 new_rtx = gen_const_mem (Pmode, new_rtx);
12339 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12341 if (reg == 0)
12342 reg = gen_reg_rtx (Pmode);
12343 emit_move_insn (reg, new_rtx);
12344 new_rtx = reg;
12347 else
12349 if (CONST_INT_P (addr)
12350 && !x86_64_immediate_operand (addr, VOIDmode))
12352 if (reg)
12354 emit_move_insn (reg, addr);
12355 new_rtx = reg;
12357 else
12358 new_rtx = force_reg (Pmode, addr);
12360 else if (GET_CODE (addr) == CONST)
12362 addr = XEXP (addr, 0);
12364 /* We must match stuff we generate before. Assume the only
12365 unspecs that can get here are ours. Not that we could do
12366 anything with them anyway.... */
12367 if (GET_CODE (addr) == UNSPEC
12368 || (GET_CODE (addr) == PLUS
12369 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12370 return orig;
12371 gcc_assert (GET_CODE (addr) == PLUS);
12373 if (GET_CODE (addr) == PLUS)
12375 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12377 /* Check first to see if this is a constant offset from a @GOTOFF
12378 symbol reference. */
12379 if (gotoff_operand (op0, Pmode)
12380 && CONST_INT_P (op1))
12382 if (!TARGET_64BIT)
12384 if (reload_in_progress)
12385 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12386 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12387 UNSPEC_GOTOFF);
12388 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12389 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12390 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12392 if (reg != 0)
12394 emit_move_insn (reg, new_rtx);
12395 new_rtx = reg;
12398 else
12400 if (INTVAL (op1) < -16*1024*1024
12401 || INTVAL (op1) >= 16*1024*1024)
12403 if (!x86_64_immediate_operand (op1, Pmode))
12404 op1 = force_reg (Pmode, op1);
12405 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12409 else
12411 base = legitimize_pic_address (XEXP (addr, 0), reg);
12412 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12413 base == reg ? NULL_RTX : reg);
12415 if (CONST_INT_P (new_rtx))
12416 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12417 else
12419 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12421 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12422 new_rtx = XEXP (new_rtx, 1);
12424 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12429 return new_rtx;
12432 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12434 static rtx
12435 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12437 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12439 if (GET_MODE (tp) != tp_mode)
12441 gcc_assert (GET_MODE (tp) == SImode);
12442 gcc_assert (tp_mode == DImode);
12444 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12447 if (to_reg)
12448 tp = copy_to_mode_reg (tp_mode, tp);
12450 return tp;
12453 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12455 static GTY(()) rtx ix86_tls_symbol;
12457 static rtx
12458 ix86_tls_get_addr (void)
12460 if (!ix86_tls_symbol)
12462 const char *sym
12463 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12464 ? "___tls_get_addr" : "__tls_get_addr");
12466 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12469 return ix86_tls_symbol;
12472 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12474 static GTY(()) rtx ix86_tls_module_base_symbol;
12477 ix86_tls_module_base (void)
12479 if (!ix86_tls_module_base_symbol)
12481 ix86_tls_module_base_symbol
12482 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12484 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12485 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12488 return ix86_tls_module_base_symbol;
12491 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12492 false if we expect this to be used for a memory address and true if
12493 we expect to load the address into a register. */
12495 static rtx
12496 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12498 rtx dest, base, off;
12499 rtx pic = NULL_RTX, tp = NULL_RTX;
12500 enum machine_mode tp_mode = Pmode;
12501 int type;
12503 switch (model)
12505 case TLS_MODEL_GLOBAL_DYNAMIC:
12506 dest = gen_reg_rtx (Pmode);
12508 if (!TARGET_64BIT)
12510 if (flag_pic)
12511 pic = pic_offset_table_rtx;
12512 else
12514 pic = gen_reg_rtx (Pmode);
12515 emit_insn (gen_set_got (pic));
12519 if (TARGET_GNU2_TLS)
12521 if (TARGET_64BIT)
12522 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12523 else
12524 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12526 tp = get_thread_pointer (Pmode, true);
12527 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12529 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12531 else
12533 rtx caddr = ix86_tls_get_addr ();
12535 if (TARGET_64BIT)
12537 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12539 start_sequence ();
12540 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12541 caddr));
12542 insns = get_insns ();
12543 end_sequence ();
12545 RTL_CONST_CALL_P (insns) = 1;
12546 emit_libcall_block (insns, dest, rax, x);
12548 else
12549 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12551 break;
12553 case TLS_MODEL_LOCAL_DYNAMIC:
12554 base = gen_reg_rtx (Pmode);
12556 if (!TARGET_64BIT)
12558 if (flag_pic)
12559 pic = pic_offset_table_rtx;
12560 else
12562 pic = gen_reg_rtx (Pmode);
12563 emit_insn (gen_set_got (pic));
12567 if (TARGET_GNU2_TLS)
12569 rtx tmp = ix86_tls_module_base ();
12571 if (TARGET_64BIT)
12572 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12573 else
12574 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12576 tp = get_thread_pointer (Pmode, true);
12577 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12578 gen_rtx_MINUS (Pmode, tmp, tp));
12580 else
12582 rtx caddr = ix86_tls_get_addr ();
12584 if (TARGET_64BIT)
12586 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12588 start_sequence ();
12589 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12590 caddr));
12591 insns = get_insns ();
12592 end_sequence ();
12594 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12595 share the LD_BASE result with other LD model accesses. */
12596 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12597 UNSPEC_TLS_LD_BASE);
12599 RTL_CONST_CALL_P (insns) = 1;
12600 emit_libcall_block (insns, base, rax, eqv);
12602 else
12603 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12606 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12607 off = gen_rtx_CONST (Pmode, off);
12609 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12611 if (TARGET_GNU2_TLS)
12613 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12615 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12617 break;
12619 case TLS_MODEL_INITIAL_EXEC:
12620 if (TARGET_64BIT)
12622 if (TARGET_SUN_TLS && !TARGET_X32)
12624 /* The Sun linker took the AMD64 TLS spec literally
12625 and can only handle %rax as destination of the
12626 initial executable code sequence. */
12628 dest = gen_reg_rtx (DImode);
12629 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12630 return dest;
12633 /* Generate DImode references to avoid %fs:(%reg32)
12634 problems and linker IE->LE relaxation bug. */
12635 tp_mode = DImode;
12636 pic = NULL;
12637 type = UNSPEC_GOTNTPOFF;
12639 else if (flag_pic)
12641 if (reload_in_progress)
12642 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12643 pic = pic_offset_table_rtx;
12644 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12646 else if (!TARGET_ANY_GNU_TLS)
12648 pic = gen_reg_rtx (Pmode);
12649 emit_insn (gen_set_got (pic));
12650 type = UNSPEC_GOTTPOFF;
12652 else
12654 pic = NULL;
12655 type = UNSPEC_INDNTPOFF;
12658 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12659 off = gen_rtx_CONST (tp_mode, off);
12660 if (pic)
12661 off = gen_rtx_PLUS (tp_mode, pic, off);
12662 off = gen_const_mem (tp_mode, off);
12663 set_mem_alias_set (off, ix86_GOT_alias_set ());
12665 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12667 base = get_thread_pointer (tp_mode,
12668 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12669 off = force_reg (tp_mode, off);
12670 return gen_rtx_PLUS (tp_mode, base, off);
12672 else
12674 base = get_thread_pointer (Pmode, true);
12675 dest = gen_reg_rtx (Pmode);
12676 emit_insn (ix86_gen_sub3 (dest, base, off));
12678 break;
12680 case TLS_MODEL_LOCAL_EXEC:
12681 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12682 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12683 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12684 off = gen_rtx_CONST (Pmode, off);
12686 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12688 base = get_thread_pointer (Pmode,
12689 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12690 return gen_rtx_PLUS (Pmode, base, off);
12692 else
12694 base = get_thread_pointer (Pmode, true);
12695 dest = gen_reg_rtx (Pmode);
12696 emit_insn (ix86_gen_sub3 (dest, base, off));
12698 break;
12700 default:
12701 gcc_unreachable ();
12704 return dest;
12707 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12708 to symbol DECL. */
12710 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12711 htab_t dllimport_map;
12713 static tree
12714 get_dllimport_decl (tree decl)
12716 struct tree_map *h, in;
12717 void **loc;
12718 const char *name;
12719 const char *prefix;
12720 size_t namelen, prefixlen;
12721 char *imp_name;
12722 tree to;
12723 rtx rtl;
12725 if (!dllimport_map)
12726 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12728 in.hash = htab_hash_pointer (decl);
12729 in.base.from = decl;
12730 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12731 h = (struct tree_map *) *loc;
12732 if (h)
12733 return h->to;
12735 *loc = h = ggc_alloc_tree_map ();
12736 h->hash = in.hash;
12737 h->base.from = decl;
12738 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12739 VAR_DECL, NULL, ptr_type_node);
12740 DECL_ARTIFICIAL (to) = 1;
12741 DECL_IGNORED_P (to) = 1;
12742 DECL_EXTERNAL (to) = 1;
12743 TREE_READONLY (to) = 1;
12745 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12746 name = targetm.strip_name_encoding (name);
12747 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12748 ? "*__imp_" : "*__imp__";
12749 namelen = strlen (name);
12750 prefixlen = strlen (prefix);
12751 imp_name = (char *) alloca (namelen + prefixlen + 1);
12752 memcpy (imp_name, prefix, prefixlen);
12753 memcpy (imp_name + prefixlen, name, namelen + 1);
12755 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12756 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12757 SET_SYMBOL_REF_DECL (rtl, to);
12758 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12760 rtl = gen_const_mem (Pmode, rtl);
12761 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12763 SET_DECL_RTL (to, rtl);
12764 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12766 return to;
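/* Editorial note (illustration, not part of the original source): for a
   dllimport'd declaration named "foo", the code above creates a cached
   artificial, read-only VAR_DECL whose DECL_RTL is a constant memory
   reference through the import stub, roughly

       (mem/u (symbol_ref "*__imp__foo"))   when the user label prefix is "_"
       (mem/u (symbol_ref "*__imp_foo"))    when there is no user label prefix

   so later uses of "foo" load the real address from the import table.  */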
12769 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12770 true if we require the result be a register. */
12772 static rtx
12773 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12775 tree imp_decl;
12776 rtx x;
12778 gcc_assert (SYMBOL_REF_DECL (symbol));
12779 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12781 x = DECL_RTL (imp_decl);
12782 if (want_reg)
12783 x = force_reg (Pmode, x);
12784 return x;
12787 /* Try machine-dependent ways of modifying an illegitimate address
12788 to be legitimate. If we find one, return the new, valid address.
12789 This macro is used in only one place: `memory_address' in explow.c.
12791 OLDX is the address as it was before break_out_memory_refs was called.
12792 In some cases it is useful to look at this to decide what needs to be done.
12794 It is always safe for this macro to do nothing. It exists to recognize
12795 opportunities to optimize the output.
12797 For the 80386, we handle X+REG by loading X into a register R and
12798 using R+REG. R will go in a general reg and indexing will be used.
12799 However, if REG is a broken-out memory address or multiplication,
12800 nothing needs to be done because REG can certainly go in a general reg.
12802 When -fpic is used, special handling is needed for symbolic references.
12803 See comments by legitimize_pic_address in i386.c for details. */
12805 static rtx
12806 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12807 enum machine_mode mode)
12809 int changed = 0;
12810 unsigned log;
12812 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12813 if (log)
12814 return legitimize_tls_address (x, (enum tls_model) log, false);
12815 if (GET_CODE (x) == CONST
12816 && GET_CODE (XEXP (x, 0)) == PLUS
12817 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12818 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12820 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12821 (enum tls_model) log, false);
12822 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12825 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12827 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12828 return legitimize_dllimport_symbol (x, true);
12829 if (GET_CODE (x) == CONST
12830 && GET_CODE (XEXP (x, 0)) == PLUS
12831 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12832 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12834 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12835 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12839 if (flag_pic && SYMBOLIC_CONST (x))
12840 return legitimize_pic_address (x, 0);
12842 #if TARGET_MACHO
12843 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12844 return machopic_indirect_data_reference (x, 0);
12845 #endif
12847 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
12848 if (GET_CODE (x) == ASHIFT
12849 && CONST_INT_P (XEXP (x, 1))
12850 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12852 changed = 1;
12853 log = INTVAL (XEXP (x, 1));
12854 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12855 GEN_INT (1 << log));
12858 if (GET_CODE (x) == PLUS)
12860 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12862 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12863 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12864 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12866 changed = 1;
12867 log = INTVAL (XEXP (XEXP (x, 0), 1));
12868 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12869 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12870 GEN_INT (1 << log));
12873 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12874 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12875 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12877 changed = 1;
12878 log = INTVAL (XEXP (XEXP (x, 1), 1));
12879 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12880 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12881 GEN_INT (1 << log));
12884 /* Put multiply first if it isn't already. */
12885 if (GET_CODE (XEXP (x, 1)) == MULT)
12887 rtx tmp = XEXP (x, 0);
12888 XEXP (x, 0) = XEXP (x, 1);
12889 XEXP (x, 1) = tmp;
12890 changed = 1;
12893 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12894 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12895 created by virtual register instantiation, register elimination, and
12896 similar optimizations. */
12897 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12899 changed = 1;
12900 x = gen_rtx_PLUS (Pmode,
12901 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12902 XEXP (XEXP (x, 1), 0)),
12903 XEXP (XEXP (x, 1), 1));
12906 /* Canonicalize
12907 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12908 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12909 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12910 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12911 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12912 && CONSTANT_P (XEXP (x, 1)))
12914 rtx constant;
12915 rtx other = NULL_RTX;
12917 if (CONST_INT_P (XEXP (x, 1)))
12919 constant = XEXP (x, 1);
12920 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12922 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12924 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12925 other = XEXP (x, 1);
12927 else
12928 constant = 0;
12930 if (constant)
12932 changed = 1;
12933 x = gen_rtx_PLUS (Pmode,
12934 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12935 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12936 plus_constant (Pmode, other,
12937 INTVAL (constant)));
12941 if (changed && ix86_legitimate_address_p (mode, x, false))
12942 return x;
12944 if (GET_CODE (XEXP (x, 0)) == MULT)
12946 changed = 1;
12947 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12950 if (GET_CODE (XEXP (x, 1)) == MULT)
12952 changed = 1;
12953 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12956 if (changed
12957 && REG_P (XEXP (x, 1))
12958 && REG_P (XEXP (x, 0)))
12959 return x;
12961 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12963 changed = 1;
12964 x = legitimize_pic_address (x, 0);
12967 if (changed && ix86_legitimate_address_p (mode, x, false))
12968 return x;
12970 if (REG_P (XEXP (x, 0)))
12972 rtx temp = gen_reg_rtx (Pmode);
12973 rtx val = force_operand (XEXP (x, 1), temp);
12974 if (val != temp)
12976 if (GET_MODE (val) != Pmode)
12977 val = convert_to_mode (Pmode, val, 1);
12978 emit_move_insn (temp, val);
12981 XEXP (x, 1) = temp;
12982 return x;
12985 else if (REG_P (XEXP (x, 1)))
12987 rtx temp = gen_reg_rtx (Pmode);
12988 rtx val = force_operand (XEXP (x, 0), temp);
12989 if (val != temp)
12991 if (GET_MODE (val) != Pmode)
12992 val = convert_to_mode (Pmode, val, 1);
12993 emit_move_insn (temp, val);
12996 XEXP (x, 0) = temp;
12997 return x;
13001 return x;
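/* Editorial note (illustration, not part of the original source): the
   shift-to-multiply canonicalization above turns, for example,

       (plus (ashift (reg A) (const_int 2)) (reg B))
   into
       (plus (mult (reg A) (const_int 4)) (reg B))

   which matches the scaled-index addressing form (%B,%A,4) that
   ix86_legitimate_address_p accepts.  */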
13004 /* Print an integer constant expression in assembler syntax. Addition
13005 and subtraction are the only arithmetic that may appear in these
13006 expressions. FILE is the stdio stream to write to, X is the rtx, and
13007 CODE is the operand print code from the output string. */
13009 static void
13010 output_pic_addr_const (FILE *file, rtx x, int code)
13012 char buf[256];
13014 switch (GET_CODE (x))
13016 case PC:
13017 gcc_assert (flag_pic);
13018 putc ('.', file);
13019 break;
13021 case SYMBOL_REF:
13022 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13023 output_addr_const (file, x);
13024 else
13026 const char *name = XSTR (x, 0);
13028 /* Mark the decl as referenced so that cgraph will
13029 output the function. */
13030 if (SYMBOL_REF_DECL (x))
13031 mark_decl_referenced (SYMBOL_REF_DECL (x));
13033 #if TARGET_MACHO
13034 if (MACHOPIC_INDIRECT
13035 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13036 name = machopic_indirection_name (x, /*stub_p=*/true);
13037 #endif
13038 assemble_name (file, name);
13040 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13041 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13042 fputs ("@PLT", file);
13043 break;
13045 case LABEL_REF:
13046 x = XEXP (x, 0);
13047 /* FALLTHRU */
13048 case CODE_LABEL:
13049 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13050 assemble_name (asm_out_file, buf);
13051 break;
13053 case CONST_INT:
13054 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13055 break;
13057 case CONST:
13058 /* This used to output parentheses around the expression,
13059 but that does not work on the 386 (either ATT or BSD assembler). */
13060 output_pic_addr_const (file, XEXP (x, 0), code);
13061 break;
13063 case CONST_DOUBLE:
13064 if (GET_MODE (x) == VOIDmode)
13066 /* We can use %d if the number is <32 bits and positive. */
13067 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13068 fprintf (file, "0x%lx%08lx",
13069 (unsigned long) CONST_DOUBLE_HIGH (x),
13070 (unsigned long) CONST_DOUBLE_LOW (x));
13071 else
13072 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13074 else
13075 /* We can't handle floating point constants;
13076 TARGET_PRINT_OPERAND must handle them. */
13077 output_operand_lossage ("floating constant misused");
13078 break;
13080 case PLUS:
13081 /* Some assemblers need integer constants to appear first. */
13082 if (CONST_INT_P (XEXP (x, 0)))
13084 output_pic_addr_const (file, XEXP (x, 0), code);
13085 putc ('+', file);
13086 output_pic_addr_const (file, XEXP (x, 1), code);
13088 else
13090 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13091 output_pic_addr_const (file, XEXP (x, 1), code);
13092 putc ('+', file);
13093 output_pic_addr_const (file, XEXP (x, 0), code);
13095 break;
13097 case MINUS:
13098 if (!TARGET_MACHO)
13099 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13100 output_pic_addr_const (file, XEXP (x, 0), code);
13101 putc ('-', file);
13102 output_pic_addr_const (file, XEXP (x, 1), code);
13103 if (!TARGET_MACHO)
13104 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13105 break;
13107 case UNSPEC:
13108 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13110 bool f = i386_asm_output_addr_const_extra (file, x);
13111 gcc_assert (f);
13112 break;
13115 gcc_assert (XVECLEN (x, 0) == 1);
13116 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13117 switch (XINT (x, 1))
13119 case UNSPEC_GOT:
13120 fputs ("@GOT", file);
13121 break;
13122 case UNSPEC_GOTOFF:
13123 fputs ("@GOTOFF", file);
13124 break;
13125 case UNSPEC_PLTOFF:
13126 fputs ("@PLTOFF", file);
13127 break;
13128 case UNSPEC_PCREL:
13129 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13130 "(%rip)" : "[rip]", file);
13131 break;
13132 case UNSPEC_GOTPCREL:
13133 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13134 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13135 break;
13136 case UNSPEC_GOTTPOFF:
13137 /* FIXME: This might be @TPOFF in Sun ld too. */
13138 fputs ("@gottpoff", file);
13139 break;
13140 case UNSPEC_TPOFF:
13141 fputs ("@tpoff", file);
13142 break;
13143 case UNSPEC_NTPOFF:
13144 if (TARGET_64BIT)
13145 fputs ("@tpoff", file);
13146 else
13147 fputs ("@ntpoff", file);
13148 break;
13149 case UNSPEC_DTPOFF:
13150 fputs ("@dtpoff", file);
13151 break;
13152 case UNSPEC_GOTNTPOFF:
13153 if (TARGET_64BIT)
13154 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13155 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13156 else
13157 fputs ("@gotntpoff", file);
13158 break;
13159 case UNSPEC_INDNTPOFF:
13160 fputs ("@indntpoff", file);
13161 break;
13162 #if TARGET_MACHO
13163 case UNSPEC_MACHOPIC_OFFSET:
13164 putc ('-', file);
13165 machopic_output_function_base_name (file);
13166 break;
13167 #endif
13168 default:
13169 output_operand_lossage ("invalid UNSPEC as operand");
13170 break;
13172 break;
13174 default:
13175 output_operand_lossage ("invalid expression as operand");
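/* Editorial note (illustration, not part of the original source): for a
   32-bit PIC access the UNSPEC wrappers above print as relocation
   suffixes, e.g.

       (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)  ->  foo@GOTOFF
       (unspec [(symbol_ref "foo")] UNSPEC_GOT)     ->  foo@GOT

   and a non-local SYMBOL_REF printed with operand code 'P' gets an
   "@PLT" suffix on ELF-style (non-Mach-O, non-MS-ABI) targets.  */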
13179 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13180 We need to emit DTP-relative relocations. */
13182 static void ATTRIBUTE_UNUSED
13183 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13185 fputs (ASM_LONG, file);
13186 output_addr_const (file, x);
13187 fputs ("@dtpoff", file);
13188 switch (size)
13190 case 4:
13191 break;
13192 case 8:
13193 fputs (", 0", file);
13194 break;
13195 default:
13196 gcc_unreachable ();
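/* Editorial note (illustration, not part of the original source):
   assuming ASM_LONG expands to a ".long" directive, the routine above
   emits roughly ".long foo@dtpoff" for SIZE 4 and ".long foo@dtpoff, 0"
   for SIZE 8, i.e. the DTP-relative offset padded out to eight bytes.  */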
13200 /* Return true if X is a representation of the PIC register. This copes
13201 with calls from ix86_find_base_term, where the register might have
13202 been replaced by a cselib value. */
13204 static bool
13205 ix86_pic_register_p (rtx x)
13207 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13208 return (pic_offset_table_rtx
13209 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13210 else
13211 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13214 /* Helper function for ix86_delegitimize_address.
13215 Attempt to delegitimize TLS local-exec accesses. */
13217 static rtx
13218 ix86_delegitimize_tls_address (rtx orig_x)
13220 rtx x = orig_x, unspec;
13221 struct ix86_address addr;
13223 if (!TARGET_TLS_DIRECT_SEG_REFS)
13224 return orig_x;
13225 if (MEM_P (x))
13226 x = XEXP (x, 0);
13227 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13228 return orig_x;
13229 if (ix86_decompose_address (x, &addr) == 0
13230 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13231 || addr.disp == NULL_RTX
13232 || GET_CODE (addr.disp) != CONST)
13233 return orig_x;
13234 unspec = XEXP (addr.disp, 0);
13235 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13236 unspec = XEXP (unspec, 0);
13237 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13238 return orig_x;
13239 x = XVECEXP (unspec, 0, 0);
13240 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13241 if (unspec != XEXP (addr.disp, 0))
13242 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13243 if (addr.index)
13245 rtx idx = addr.index;
13246 if (addr.scale != 1)
13247 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13248 x = gen_rtx_PLUS (Pmode, idx, x);
13250 if (addr.base)
13251 x = gen_rtx_PLUS (Pmode, addr.base, x);
13252 if (MEM_P (orig_x))
13253 x = replace_equiv_address_nv (orig_x, x);
13254 return x;
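/* Editorial note (illustration, not part of the original source): the
   helper above undoes a legitimized local-exec access; an address such as

       (plus (reg B)
             (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)))

   carrying the %gs (32-bit) or %fs (64-bit) segment override is folded
   back into a reference to "x" plus the original base and index, which
   is the form the delegitimize and find_base_term hooks expect.  */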
13257 /* In the name of slightly smaller debug output, and to cater to
13258 general assembler lossage, recognize PIC+GOTOFF and turn it back
13259 into a direct symbol reference.
13261 On Darwin, this is necessary to avoid a crash, because Darwin
13262 has a different PIC label for each routine but the DWARF debugging
13263 information is not associated with any particular routine, so it's
13264 necessary to remove references to the PIC label from RTL stored by
13265 the DWARF output code. */
13267 static rtx
13268 ix86_delegitimize_address (rtx x)
13270 rtx orig_x = delegitimize_mem_from_attrs (x);
13271 /* addend is NULL or some rtx if x is something+GOTOFF where
13272 something doesn't include the PIC register. */
13273 rtx addend = NULL_RTX;
13274 /* reg_addend is NULL or a multiple of some register. */
13275 rtx reg_addend = NULL_RTX;
13276 /* const_addend is NULL or a const_int. */
13277 rtx const_addend = NULL_RTX;
13278 /* This is the result, or NULL. */
13279 rtx result = NULL_RTX;
13281 x = orig_x;
13283 if (MEM_P (x))
13284 x = XEXP (x, 0);
13286 if (TARGET_64BIT)
13288 if (GET_CODE (x) == CONST
13289 && GET_CODE (XEXP (x, 0)) == PLUS
13290 && GET_MODE (XEXP (x, 0)) == Pmode
13291 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13292 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13293 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13295 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13296 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13297 if (MEM_P (orig_x))
13298 x = replace_equiv_address_nv (orig_x, x);
13299 return x;
13301 if (GET_CODE (x) != CONST
13302 || GET_CODE (XEXP (x, 0)) != UNSPEC
13303 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13304 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13305 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13306 return ix86_delegitimize_tls_address (orig_x);
13307 x = XVECEXP (XEXP (x, 0), 0, 0);
13308 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13310 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13311 GET_MODE (x), 0);
13312 if (x == NULL_RTX)
13313 return orig_x;
13315 return x;
13318 if (GET_CODE (x) != PLUS
13319 || GET_CODE (XEXP (x, 1)) != CONST)
13320 return ix86_delegitimize_tls_address (orig_x);
13322 if (ix86_pic_register_p (XEXP (x, 0)))
13323 /* %ebx + GOT/GOTOFF */
13325 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13327 /* %ebx + %reg * scale + GOT/GOTOFF */
13328 reg_addend = XEXP (x, 0);
13329 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13330 reg_addend = XEXP (reg_addend, 1);
13331 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13332 reg_addend = XEXP (reg_addend, 0);
13333 else
13335 reg_addend = NULL_RTX;
13336 addend = XEXP (x, 0);
13339 else
13340 addend = XEXP (x, 0);
13342 x = XEXP (XEXP (x, 1), 0);
13343 if (GET_CODE (x) == PLUS
13344 && CONST_INT_P (XEXP (x, 1)))
13346 const_addend = XEXP (x, 1);
13347 x = XEXP (x, 0);
13350 if (GET_CODE (x) == UNSPEC
13351 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13352 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13353 result = XVECEXP (x, 0, 0);
13355 if (TARGET_MACHO && darwin_local_data_pic (x)
13356 && !MEM_P (orig_x))
13357 result = XVECEXP (x, 0, 0);
13359 if (! result)
13360 return ix86_delegitimize_tls_address (orig_x);
13362 if (const_addend)
13363 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13364 if (reg_addend)
13365 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13366 if (addend)
13368 /* If the rest of original X doesn't involve the PIC register, add
13369 addend and subtract pic_offset_table_rtx. This can happen e.g.
13370 for code like:
13371 leal (%ebx, %ecx, 4), %ecx
13373 movl foo@GOTOFF(%ecx), %edx
13374 in which case we return (%ecx - %ebx) + foo. */
13375 if (pic_offset_table_rtx)
13376 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13377 pic_offset_table_rtx),
13378 result);
13379 else
13380 return orig_x;
13382 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13384 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13385 if (result == NULL_RTX)
13386 return orig_x;
13388 return result;
13391 /* If X is a machine specific address (i.e. a symbol or label being
13392 referenced as a displacement from the GOT implemented using an
13393 UNSPEC), then return the base term. Otherwise return X. */
13396 ix86_find_base_term (rtx x)
13398 rtx term;
13400 if (TARGET_64BIT)
13402 if (GET_CODE (x) != CONST)
13403 return x;
13404 term = XEXP (x, 0);
13405 if (GET_CODE (term) == PLUS
13406 && (CONST_INT_P (XEXP (term, 1))
13407 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13408 term = XEXP (term, 0);
13409 if (GET_CODE (term) != UNSPEC
13410 || (XINT (term, 1) != UNSPEC_GOTPCREL
13411 && XINT (term, 1) != UNSPEC_PCREL))
13412 return x;
13414 return XVECEXP (term, 0, 0);
13417 return ix86_delegitimize_address (x);
13420 static void
13421 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13422 bool fp, FILE *file)
13424 const char *suffix;
13426 if (mode == CCFPmode || mode == CCFPUmode)
13428 code = ix86_fp_compare_code_to_integer (code);
13429 mode = CCmode;
13431 if (reverse)
13432 code = reverse_condition (code);
13434 switch (code)
13436 case EQ:
13437 switch (mode)
13439 case CCAmode:
13440 suffix = "a";
13441 break;
13443 case CCCmode:
13444 suffix = "c";
13445 break;
13447 case CCOmode:
13448 suffix = "o";
13449 break;
13451 case CCSmode:
13452 suffix = "s";
13453 break;
13455 default:
13456 suffix = "e";
13458 break;
13459 case NE:
13460 switch (mode)
13462 case CCAmode:
13463 suffix = "na";
13464 break;
13466 case CCCmode:
13467 suffix = "nc";
13468 break;
13470 case CCOmode:
13471 suffix = "no";
13472 break;
13474 case CCSmode:
13475 suffix = "ns";
13476 break;
13478 default:
13479 suffix = "ne";
13481 break;
13482 case GT:
13483 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13484 suffix = "g";
13485 break;
13486 case GTU:
13487 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13488 Those same assemblers have the same but opposite lossage on cmov. */
13489 if (mode == CCmode)
13490 suffix = fp ? "nbe" : "a";
13491 else if (mode == CCCmode)
13492 suffix = "b";
13493 else
13494 gcc_unreachable ();
13495 break;
13496 case LT:
13497 switch (mode)
13499 case CCNOmode:
13500 case CCGOCmode:
13501 suffix = "s";
13502 break;
13504 case CCmode:
13505 case CCGCmode:
13506 suffix = "l";
13507 break;
13509 default:
13510 gcc_unreachable ();
13512 break;
13513 case LTU:
13514 gcc_assert (mode == CCmode || mode == CCCmode);
13515 suffix = "b";
13516 break;
13517 case GE:
13518 switch (mode)
13520 case CCNOmode:
13521 case CCGOCmode:
13522 suffix = "ns";
13523 break;
13525 case CCmode:
13526 case CCGCmode:
13527 suffix = "ge";
13528 break;
13530 default:
13531 gcc_unreachable ();
13533 break;
13534 case GEU:
13535 /* ??? As above. */
13536 gcc_assert (mode == CCmode || mode == CCCmode);
13537 suffix = fp ? "nb" : "ae";
13538 break;
13539 case LE:
13540 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13541 suffix = "le";
13542 break;
13543 case LEU:
13544 /* ??? As above. */
13545 if (mode == CCmode)
13546 suffix = "be";
13547 else if (mode == CCCmode)
13548 suffix = fp ? "nb" : "ae";
13549 else
13550 gcc_unreachable ();
13551 break;
13552 case UNORDERED:
13553 suffix = fp ? "u" : "p";
13554 break;
13555 case ORDERED:
13556 suffix = fp ? "nu" : "np";
13557 break;
13558 default:
13559 gcc_unreachable ();
13561 fputs (suffix, file);
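/* Editorial note (illustration, not part of the original source): a few
   sample mappings produced by the switch above:

       (GT,  CCGCmode, reverse=false, fp=false)  ->  "g"    (setg, jg)
       (GT,  CCGCmode, reverse=true,  fp=false)  ->  "le"
       (GTU, CCmode,   reverse=false, fp=true)   ->  "nbe"  (fcmovnbe)
       (LTU, CCmode,   reverse=false, fp=false)  ->  "b"    */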
13564 /* Print the name of register X to FILE based on its machine mode and number.
13565 If CODE is 'w', pretend the mode is HImode.
13566 If CODE is 'b', pretend the mode is QImode.
13567 If CODE is 'k', pretend the mode is SImode.
13568 If CODE is 'q', pretend the mode is DImode.
13569 If CODE is 'x', pretend the mode is V4SFmode.
13570 If CODE is 't', pretend the mode is V8SFmode.
13571 If CODE is 'h', pretend the reg is the 'high' byte register.
13572 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13573 If CODE is 'd', duplicate the operand for AVX instruction.
13576 void
13577 print_reg (rtx x, int code, FILE *file)
13579 const char *reg;
13580 unsigned int regno;
13581 bool duplicated = code == 'd' && TARGET_AVX;
13583 if (ASSEMBLER_DIALECT == ASM_ATT)
13584 putc ('%', file);
13586 if (x == pc_rtx)
13588 gcc_assert (TARGET_64BIT);
13589 fputs ("rip", file);
13590 return;
13593 regno = true_regnum (x);
13594 gcc_assert (regno != ARG_POINTER_REGNUM
13595 && regno != FRAME_POINTER_REGNUM
13596 && regno != FLAGS_REG
13597 && regno != FPSR_REG
13598 && regno != FPCR_REG);
13600 if (code == 'w' || MMX_REG_P (x))
13601 code = 2;
13602 else if (code == 'b')
13603 code = 1;
13604 else if (code == 'k')
13605 code = 4;
13606 else if (code == 'q')
13607 code = 8;
13608 else if (code == 'y')
13609 code = 3;
13610 else if (code == 'h')
13611 code = 0;
13612 else if (code == 'x')
13613 code = 16;
13614 else if (code == 't')
13615 code = 32;
13616 else
13617 code = GET_MODE_SIZE (GET_MODE (x));
13619 /* Irritatingly, AMD extended registers use a different naming convention
13620 from the normal registers: "r%d[bwd]". */
13621 if (REX_INT_REGNO_P (regno))
13623 gcc_assert (TARGET_64BIT);
13624 putc ('r', file);
13625 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13626 switch (code)
13628 case 0:
13629 error ("extended registers have no high halves");
13630 break;
13631 case 1:
13632 putc ('b', file);
13633 break;
13634 case 2:
13635 putc ('w', file);
13636 break;
13637 case 4:
13638 putc ('d', file);
13639 break;
13640 case 8:
13641 /* no suffix */
13642 break;
13643 default:
13644 error ("unsupported operand size for extended register");
13645 break;
13647 return;
13650 reg = NULL;
13651 switch (code)
13653 case 3:
13654 if (STACK_TOP_P (x))
13656 reg = "st(0)";
13657 break;
13659 /* FALLTHRU */
13660 case 8:
13661 case 4:
13662 case 12:
13663 if (! ANY_FP_REG_P (x))
13664 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13665 /* FALLTHRU */
13666 case 16:
13667 case 2:
13668 normal:
13669 reg = hi_reg_name[regno];
13670 break;
13671 case 1:
13672 if (regno >= ARRAY_SIZE (qi_reg_name))
13673 goto normal;
13674 reg = qi_reg_name[regno];
13675 break;
13676 case 0:
13677 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13678 goto normal;
13679 reg = qi_high_reg_name[regno];
13680 break;
13681 case 32:
13682 if (SSE_REG_P (x))
13684 gcc_assert (!duplicated);
13685 putc ('y', file);
13686 fputs (hi_reg_name[regno] + 1, file);
13687 return;
13689 break;
13690 default:
13691 gcc_unreachable ();
13694 fputs (reg, file);
13695 if (duplicated)
13697 if (ASSEMBLER_DIALECT == ASM_ATT)
13698 fprintf (file, ", %%%s", reg);
13699 else
13700 fprintf (file, ", %s", reg);
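/* Editorial note (illustration, not part of the original source): sample
   outputs of print_reg in AT&T syntax:

       (reg:SI ax) with code 'w'                      ->  %ax
       (reg:SI ax) with code 'h'                      ->  %ah
       (reg:DI r8) with code 'k'                      ->  %r8d
       (reg:V4SF xmm0) with code 'd' and TARGET_AVX   ->  %xmm0, %xmm0  */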
13704 /* Locate some local-dynamic symbol still in use by this function
13705 so that we can print its name in some tls_local_dynamic_base
13706 pattern. */
13708 static int
13709 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13711 rtx x = *px;
13713 if (GET_CODE (x) == SYMBOL_REF
13714 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13716 cfun->machine->some_ld_name = XSTR (x, 0);
13717 return 1;
13720 return 0;
13723 static const char *
13724 get_some_local_dynamic_name (void)
13726 rtx insn;
13728 if (cfun->machine->some_ld_name)
13729 return cfun->machine->some_ld_name;
13731 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13732 if (NONDEBUG_INSN_P (insn)
13733 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13734 return cfun->machine->some_ld_name;
13736 return NULL;
13739 /* Meaning of CODE:
13740 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13741 C -- print opcode suffix for set/cmov insn.
13742 c -- like C, but print reversed condition
13743 F,f -- likewise, but for floating-point.
13744 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13745 otherwise nothing
13746 R -- print the prefix for register names.
13747 z -- print the opcode suffix for the size of the current operand.
13748 Z -- likewise, with special suffixes for x87 instructions.
13749 * -- print a star (in certain assembler syntax)
13750 A -- print an absolute memory reference.
13751 E -- print address with DImode register names if TARGET_64BIT.
13752 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13753 s -- print a shift double count, followed by the assembler's argument
13754 delimiter.
13755 b -- print the QImode name of the register for the indicated operand.
13756 %b0 would print %al if operands[0] is reg 0.
13757 w -- likewise, print the HImode name of the register.
13758 k -- likewise, print the SImode name of the register.
13759 q -- likewise, print the DImode name of the register.
13760 x -- likewise, print the V4SFmode name of the register.
13761 t -- likewise, print the V8SFmode name of the register.
13762 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13763 y -- print "st(0)" instead of "st" as a register.
13764 d -- print duplicated register operand for AVX instruction.
13765 D -- print condition for SSE cmp instruction.
13766 P -- if PIC, print an @PLT suffix.
13767 p -- print raw symbol name.
13768 X -- don't print any sort of PIC '@' suffix for a symbol.
13769 & -- print some in-use local-dynamic symbol name.
13770 H -- print a memory address offset by 8; used for sse high-parts
13771 Y -- print condition for XOP pcom* instruction.
13772 + -- print a branch hint as 'cs' or 'ds' prefix
13773 ; -- print a semicolon (after prefixes due to bug in older gas).
13774 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13775 @ -- print a segment register of thread base pointer load
13776 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13779 void
13780 ix86_print_operand (FILE *file, rtx x, int code)
13782 if (code)
13784 switch (code)
13786 case 'A':
13787 switch (ASSEMBLER_DIALECT)
13789 case ASM_ATT:
13790 putc ('*', file);
13791 break;
13793 case ASM_INTEL:
13794 /* Intel syntax. For absolute addresses, registers should not
13795 be surrounded by brackets. */
13796 if (!REG_P (x))
13798 putc ('[', file);
13799 ix86_print_operand (file, x, 0);
13800 putc (']', file);
13801 return;
13803 break;
13805 default:
13806 gcc_unreachable ();
13809 ix86_print_operand (file, x, 0);
13810 return;
13812 case 'E':
13813 /* Wrap address in an UNSPEC to declare special handling. */
13814 if (TARGET_64BIT)
13815 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13817 output_address (x);
13818 return;
13820 case 'L':
13821 if (ASSEMBLER_DIALECT == ASM_ATT)
13822 putc ('l', file);
13823 return;
13825 case 'W':
13826 if (ASSEMBLER_DIALECT == ASM_ATT)
13827 putc ('w', file);
13828 return;
13830 case 'B':
13831 if (ASSEMBLER_DIALECT == ASM_ATT)
13832 putc ('b', file);
13833 return;
13835 case 'Q':
13836 if (ASSEMBLER_DIALECT == ASM_ATT)
13837 putc ('l', file);
13838 return;
13840 case 'S':
13841 if (ASSEMBLER_DIALECT == ASM_ATT)
13842 putc ('s', file);
13843 return;
13845 case 'T':
13846 if (ASSEMBLER_DIALECT == ASM_ATT)
13847 putc ('t', file);
13848 return;
13850 case 'O':
13851 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13852 if (ASSEMBLER_DIALECT != ASM_ATT)
13853 return;
13855 switch (GET_MODE_SIZE (GET_MODE (x)))
13857 case 2:
13858 putc ('w', file);
13859 break;
13861 case 4:
13862 putc ('l', file);
13863 break;
13865 case 8:
13866 putc ('q', file);
13867 break;
13869 default:
13870 output_operand_lossage
13871 ("invalid operand size for operand code 'O'");
13872 return;
13875 putc ('.', file);
13876 #endif
13877 return;
13879 case 'z':
13880 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13882 /* Opcodes don't get size suffixes if using Intel syntax. */
13883 if (ASSEMBLER_DIALECT == ASM_INTEL)
13884 return;
13886 switch (GET_MODE_SIZE (GET_MODE (x)))
13888 case 1:
13889 putc ('b', file);
13890 return;
13892 case 2:
13893 putc ('w', file);
13894 return;
13896 case 4:
13897 putc ('l', file);
13898 return;
13900 case 8:
13901 putc ('q', file);
13902 return;
13904 default:
13905 output_operand_lossage
13906 ("invalid operand size for operand code 'z'");
13907 return;
13911 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13912 warning
13913 (0, "non-integer operand used with operand code 'z'");
13914 /* FALLTHRU */
13916 case 'Z':
13917 /* 387 opcodes don't get size suffixes if using Intel syntax. */
13918 if (ASSEMBLER_DIALECT == ASM_INTEL)
13919 return;
13921 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13923 switch (GET_MODE_SIZE (GET_MODE (x)))
13925 case 2:
13926 #ifdef HAVE_AS_IX86_FILDS
13927 putc ('s', file);
13928 #endif
13929 return;
13931 case 4:
13932 putc ('l', file);
13933 return;
13935 case 8:
13936 #ifdef HAVE_AS_IX86_FILDQ
13937 putc ('q', file);
13938 #else
13939 fputs ("ll", file);
13940 #endif
13941 return;
13943 default:
13944 break;
13947 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13949 /* 387 opcodes don't get size suffixes
13950 if the operands are registers. */
13951 if (STACK_REG_P (x))
13952 return;
13954 switch (GET_MODE_SIZE (GET_MODE (x)))
13956 case 4:
13957 putc ('s', file);
13958 return;
13960 case 8:
13961 putc ('l', file);
13962 return;
13964 case 12:
13965 case 16:
13966 putc ('t', file);
13967 return;
13969 default:
13970 break;
13973 else
13975 output_operand_lossage
13976 ("invalid operand type used with operand code 'Z'");
13977 return;
13980 output_operand_lossage
13981 ("invalid operand size for operand code 'Z'");
13982 return;
13984 case 'd':
13985 case 'b':
13986 case 'w':
13987 case 'k':
13988 case 'q':
13989 case 'h':
13990 case 't':
13991 case 'y':
13992 case 'x':
13993 case 'X':
13994 case 'P':
13995 case 'p':
13996 break;
13998 case 's':
13999 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14001 ix86_print_operand (file, x, 0);
14002 fputs (", ", file);
14004 return;
14006 case 'Y':
14007 switch (GET_CODE (x))
14009 case NE:
14010 fputs ("neq", file);
14011 break;
14012 case EQ:
14013 fputs ("eq", file);
14014 break;
14015 case GE:
14016 case GEU:
14017 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14018 break;
14019 case GT:
14020 case GTU:
14021 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14022 break;
14023 case LE:
14024 case LEU:
14025 fputs ("le", file);
14026 break;
14027 case LT:
14028 case LTU:
14029 fputs ("lt", file);
14030 break;
14031 case UNORDERED:
14032 fputs ("unord", file);
14033 break;
14034 case ORDERED:
14035 fputs ("ord", file);
14036 break;
14037 case UNEQ:
14038 fputs ("ueq", file);
14039 break;
14040 case UNGE:
14041 fputs ("nlt", file);
14042 break;
14043 case UNGT:
14044 fputs ("nle", file);
14045 break;
14046 case UNLE:
14047 fputs ("ule", file);
14048 break;
14049 case UNLT:
14050 fputs ("ult", file);
14051 break;
14052 case LTGT:
14053 fputs ("une", file);
14054 break;
14055 default:
14056 output_operand_lossage ("operand is not a condition code, "
14057 "invalid operand code 'Y'");
14058 return;
14060 return;
14062 case 'D':
14063 /* Little bit of braindamage here. The SSE compare instructions
14064 use completely different names for the comparisons than the
14065 fp conditional moves do. */
14066 switch (GET_CODE (x))
14068 case UNEQ:
14069 if (TARGET_AVX)
14071 fputs ("eq_us", file);
14072 break;
14074 case EQ:
14075 fputs ("eq", file);
14076 break;
14077 case UNLT:
14078 if (TARGET_AVX)
14080 fputs ("nge", file);
14081 break;
14083 case LT:
14084 fputs ("lt", file);
14085 break;
14086 case UNLE:
14087 if (TARGET_AVX)
14089 fputs ("ngt", file);
14090 break;
14092 case LE:
14093 fputs ("le", file);
14094 break;
14095 case UNORDERED:
14096 fputs ("unord", file);
14097 break;
14098 case LTGT:
14099 if (TARGET_AVX)
14101 fputs ("neq_oq", file);
14102 break;
14104 case NE:
14105 fputs ("neq", file);
14106 break;
14107 case GE:
14108 if (TARGET_AVX)
14110 fputs ("ge", file);
14111 break;
14113 case UNGE:
14114 fputs ("nlt", file);
14115 break;
14116 case GT:
14117 if (TARGET_AVX)
14119 fputs ("gt", file);
14120 break;
14122 case UNGT:
14123 fputs ("nle", file);
14124 break;
14125 case ORDERED:
14126 fputs ("ord", file);
14127 break;
14128 default:
14129 output_operand_lossage ("operand is not a condition code, "
14130 "invalid operand code 'D'");
14131 return;
14133 return;
14135 case 'F':
14136 case 'f':
14137 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14138 if (ASSEMBLER_DIALECT == ASM_ATT)
14139 putc ('.', file);
14140 #endif
14142 case 'C':
14143 case 'c':
14144 if (!COMPARISON_P (x))
14146 output_operand_lossage ("operand is not a condition code, "
14147 "invalid operand code '%c'", code);
14148 return;
14150 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14151 code == 'c' || code == 'f',
14152 code == 'F' || code == 'f',
14153 file);
14154 return;
14156 case 'H':
14157 if (!offsettable_memref_p (x))
14159 output_operand_lossage ("operand is not an offsettable memory "
14160 "reference, invalid operand code 'H'");
14161 return;
14163 /* It doesn't actually matter what mode we use here, as we're
14164 only going to use this for printing. */
14165 x = adjust_address_nv (x, DImode, 8);
14166 break;
14168 case 'K':
14169 gcc_assert (CONST_INT_P (x));
14171 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14172 #ifdef HAVE_AS_IX86_HLE
14173 fputs ("xacquire ", file);
14174 #else
14175 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14176 #endif
14177 else if (INTVAL (x) & IX86_HLE_RELEASE)
14178 #ifdef HAVE_AS_IX86_HLE
14179 fputs ("xrelease ", file);
14180 #else
14181 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14182 #endif
14183 /* We do not want to print the value of the operand. */
14184 return;
14186 case '*':
14187 if (ASSEMBLER_DIALECT == ASM_ATT)
14188 putc ('*', file);
14189 return;
14191 case '&':
14193 const char *name = get_some_local_dynamic_name ();
14194 if (name == NULL)
14195 output_operand_lossage ("'%%&' used without any "
14196 "local dynamic TLS references");
14197 else
14198 assemble_name (file, name);
14199 return;
14202 case '+':
14204 rtx x;
14206 if (!optimize
14207 || optimize_function_for_size_p (cfun)
14208 || !TARGET_BRANCH_PREDICTION_HINTS)
14209 return;
14211 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14212 if (x)
14214 int pred_val = INTVAL (XEXP (x, 0));
14216 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14217 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14219 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14220 bool cputaken
14221 = final_forward_branch_p (current_output_insn) == 0;
14223 /* Emit hints only in the case where the default branch prediction
14224 heuristics would fail. */
14225 if (taken != cputaken)
14227 /* We use 3e (DS) prefix for taken branches and
14228 2e (CS) prefix for not taken branches. */
14229 if (taken)
14230 fputs ("ds ; ", file);
14231 else
14232 fputs ("cs ; ", file);
14236 return;
14239 case ';':
14240 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14241 putc (';', file);
14242 #endif
14243 return;
14245 case '@':
14246 if (ASSEMBLER_DIALECT == ASM_ATT)
14247 putc ('%', file);
14249 /* The kernel uses a different segment register for performance
14250 reasons; a system call would not have to trash the userspace
14251 segment register, which would be expensive. */
14252 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14253 fputs ("fs", file);
14254 else
14255 fputs ("gs", file);
14256 return;
14258 case '~':
14259 putc (TARGET_AVX2 ? 'i' : 'f', file);
14260 return;
14262 case '^':
14263 if (TARGET_64BIT && Pmode != word_mode)
14264 fputs ("addr32 ", file);
14265 return;
14267 default:
14268 output_operand_lossage ("invalid operand code '%c'", code);
14272 if (REG_P (x))
14273 print_reg (x, code, file);
14275 else if (MEM_P (x))
14277 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14278 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14279 && GET_MODE (x) != BLKmode)
14281 const char * size;
14282 switch (GET_MODE_SIZE (GET_MODE (x)))
14284 case 1: size = "BYTE"; break;
14285 case 2: size = "WORD"; break;
14286 case 4: size = "DWORD"; break;
14287 case 8: size = "QWORD"; break;
14288 case 12: size = "TBYTE"; break;
14289 case 16:
14290 if (GET_MODE (x) == XFmode)
14291 size = "TBYTE";
14292 else
14293 size = "XMMWORD";
14294 break;
14295 case 32: size = "YMMWORD"; break;
14296 default:
14297 gcc_unreachable ();
14300 /* Check for explicit size override (codes 'b', 'w', 'k',
14301 'q' and 'x') */
14302 if (code == 'b')
14303 size = "BYTE";
14304 else if (code == 'w')
14305 size = "WORD";
14306 else if (code == 'k')
14307 size = "DWORD";
14308 else if (code == 'q')
14309 size = "QWORD";
14310 else if (code == 'x')
14311 size = "XMMWORD";
14313 fputs (size, file);
14314 fputs (" PTR ", file);
14317 x = XEXP (x, 0);
14318 /* Avoid (%rip) for call operands. */
14319 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14320 && !CONST_INT_P (x))
14321 output_addr_const (file, x);
14322 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14323 output_operand_lossage ("invalid constraints for operand");
14324 else
14325 output_address (x);
14328 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14330 REAL_VALUE_TYPE r;
14331 long l;
14333 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14334 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14336 if (ASSEMBLER_DIALECT == ASM_ATT)
14337 putc ('$', file);
14338 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14339 if (code == 'q')
14340 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14341 else
14342 fprintf (file, "0x%08x", (unsigned int) l);
14345 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14347 REAL_VALUE_TYPE r;
14348 long l[2];
14350 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14351 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14353 if (ASSEMBLER_DIALECT == ASM_ATT)
14354 putc ('$', file);
14355 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14358 /* These float cases don't actually occur as immediate operands. */
14359 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14361 char dstr[30];
14363 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14364 fputs (dstr, file);
14367 else
14369 /* We have patterns that allow zero sets of memory, for instance.
14370 In 64-bit mode, we should probably support all 8-byte vectors,
14371 since we can in fact encode that into an immediate. */
14372 if (GET_CODE (x) == CONST_VECTOR)
14374 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14375 x = const0_rtx;
14378 if (code != 'P' && code != 'p')
14380 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14382 if (ASSEMBLER_DIALECT == ASM_ATT)
14383 putc ('$', file);
14385 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14386 || GET_CODE (x) == LABEL_REF)
14388 if (ASSEMBLER_DIALECT == ASM_ATT)
14389 putc ('$', file);
14390 else
14391 fputs ("OFFSET FLAT:", file);
14394 if (CONST_INT_P (x))
14395 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14396 else if (flag_pic || MACHOPIC_INDIRECT)
14397 output_pic_addr_const (file, x, code);
14398 else
14399 output_addr_const (file, x);
14403 static bool
14404 ix86_print_operand_punct_valid_p (unsigned char code)
14406 return (code == '@' || code == '*' || code == '+' || code == '&'
14407 || code == ';' || code == '~' || code == '^');
14410 /* Print a memory operand whose address is ADDR. */
14412 static void
14413 ix86_print_operand_address (FILE *file, rtx addr)
14415 struct ix86_address parts;
14416 rtx base, index, disp;
14417 int scale;
14418 int ok;
14419 bool vsib = false;
14420 int code = 0;
14422 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14424 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14425 gcc_assert (parts.index == NULL_RTX);
14426 parts.index = XVECEXP (addr, 0, 1);
14427 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14428 addr = XVECEXP (addr, 0, 0);
14429 vsib = true;
14431 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14433 gcc_assert (TARGET_64BIT);
14434 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14435 code = 'q';
14437 else
14438 ok = ix86_decompose_address (addr, &parts);
14440 gcc_assert (ok);
14442 base = parts.base;
14443 index = parts.index;
14444 disp = parts.disp;
14445 scale = parts.scale;
14447 switch (parts.seg)
14449 case SEG_DEFAULT:
14450 break;
14451 case SEG_FS:
14452 case SEG_GS:
14453 if (ASSEMBLER_DIALECT == ASM_ATT)
14454 putc ('%', file);
14455 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14456 break;
14457 default:
14458 gcc_unreachable ();
14461 /* Use the one byte shorter RIP-relative addressing for 64-bit mode. */
14462 if (TARGET_64BIT && !base && !index)
14464 rtx symbol = disp;
14466 if (GET_CODE (disp) == CONST
14467 && GET_CODE (XEXP (disp, 0)) == PLUS
14468 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14469 symbol = XEXP (XEXP (disp, 0), 0);
14471 if (GET_CODE (symbol) == LABEL_REF
14472 || (GET_CODE (symbol) == SYMBOL_REF
14473 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14474 base = pc_rtx;
14476 if (!base && !index)
14478 /* A displacement-only address requires special attention. */
14480 if (CONST_INT_P (disp))
14482 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14483 fputs ("ds:", file);
14484 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14486 else if (flag_pic)
14487 output_pic_addr_const (file, disp, 0);
14488 else
14489 output_addr_const (file, disp);
14491 else
14493 /* Print SImode register names to force addr32 prefix. */
14494 if (SImode_address_operand (addr, VOIDmode))
14496 #ifdef ENABLE_CHECKING
14497 gcc_assert (TARGET_64BIT);
14498 switch (GET_CODE (addr))
14500 case SUBREG:
14501 gcc_assert (GET_MODE (addr) == SImode);
14502 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14503 break;
14504 case ZERO_EXTEND:
14505 case AND:
14506 gcc_assert (GET_MODE (addr) == DImode);
14507 break;
14508 default:
14509 gcc_unreachable ();
14511 #endif
14512 gcc_assert (!code);
14513 code = 'l';
14516 if (ASSEMBLER_DIALECT == ASM_ATT)
14518 if (disp)
14520 if (flag_pic)
14521 output_pic_addr_const (file, disp, 0);
14522 else if (GET_CODE (disp) == LABEL_REF)
14523 output_asm_label (disp);
14524 else
14525 output_addr_const (file, disp);
14528 putc ('(', file);
14529 if (base)
14530 print_reg (base, code, file);
14531 if (index)
14533 putc (',', file);
14534 print_reg (index, vsib ? 0 : code, file);
14535 if (scale != 1 || vsib)
14536 fprintf (file, ",%d", scale);
14538 putc (')', file);
14540 else
14542 rtx offset = NULL_RTX;
14544 if (disp)
14546 /* Pull out the offset of a symbol; print any symbol itself. */
14547 if (GET_CODE (disp) == CONST
14548 && GET_CODE (XEXP (disp, 0)) == PLUS
14549 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14551 offset = XEXP (XEXP (disp, 0), 1);
14552 disp = gen_rtx_CONST (VOIDmode,
14553 XEXP (XEXP (disp, 0), 0));
14556 if (flag_pic)
14557 output_pic_addr_const (file, disp, 0);
14558 else if (GET_CODE (disp) == LABEL_REF)
14559 output_asm_label (disp);
14560 else if (CONST_INT_P (disp))
14561 offset = disp;
14562 else
14563 output_addr_const (file, disp);
14566 putc ('[', file);
14567 if (base)
14569 print_reg (base, code, file);
14570 if (offset)
14572 if (INTVAL (offset) >= 0)
14573 putc ('+', file);
14574 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14577 else if (offset)
14578 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14579 else
14580 putc ('0', file);
14582 if (index)
14584 putc ('+', file);
14585 print_reg (index, vsib ? 0 : code, file);
14586 if (scale != 1 || vsib)
14587 fprintf (file, "*%d", scale);
14589 putc (']', file);
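/* Editorial note (illustration, not part of the original source): the
   same decomposed address with base=%ebx, index=%ecx, scale=4, disp=16
   is printed as

       16(%ebx,%ecx,4)     in AT&T syntax
       [ebx+16+ecx*4]      in Intel syntax

   and a displacement-only symbolic operand in 64-bit mode prints in the
   RIP-relative form "foo(%rip)" / "foo[rip]" instead.  */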
14594 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14596 static bool
14597 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14599 rtx op;
14601 if (GET_CODE (x) != UNSPEC)
14602 return false;
14604 op = XVECEXP (x, 0, 0);
14605 switch (XINT (x, 1))
14607 case UNSPEC_GOTTPOFF:
14608 output_addr_const (file, op);
14609 /* FIXME: This might be @TPOFF in Sun ld. */
14610 fputs ("@gottpoff", file);
14611 break;
14612 case UNSPEC_TPOFF:
14613 output_addr_const (file, op);
14614 fputs ("@tpoff", file);
14615 break;
14616 case UNSPEC_NTPOFF:
14617 output_addr_const (file, op);
14618 if (TARGET_64BIT)
14619 fputs ("@tpoff", file);
14620 else
14621 fputs ("@ntpoff", file);
14622 break;
14623 case UNSPEC_DTPOFF:
14624 output_addr_const (file, op);
14625 fputs ("@dtpoff", file);
14626 break;
14627 case UNSPEC_GOTNTPOFF:
14628 output_addr_const (file, op);
14629 if (TARGET_64BIT)
14630 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14631 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14632 else
14633 fputs ("@gotntpoff", file);
14634 break;
14635 case UNSPEC_INDNTPOFF:
14636 output_addr_const (file, op);
14637 fputs ("@indntpoff", file);
14638 break;
14639 #if TARGET_MACHO
14640 case UNSPEC_MACHOPIC_OFFSET:
14641 output_addr_const (file, op);
14642 putc ('-', file);
14643 machopic_output_function_base_name (file);
14644 break;
14645 #endif
14647 case UNSPEC_STACK_CHECK:
14649 int offset;
14651 gcc_assert (flag_split_stack);
14653 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14654 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14655 #else
14656 gcc_unreachable ();
14657 #endif
14659 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14661 break;
14663 default:
14664 return false;
14667 return true;
14670 /* Split one or more double-mode RTL references into pairs of half-mode
14671 references. The RTL can be REG, offsettable MEM, integer constant, or
14672 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14673 split and "num" is its length. lo_half and hi_half are output arrays
14674 that parallel "operands". */
14676 void
14677 split_double_mode (enum machine_mode mode, rtx operands[],
14678 int num, rtx lo_half[], rtx hi_half[])
14680 enum machine_mode half_mode;
14681 unsigned int byte;
14683 switch (mode)
14685 case TImode:
14686 half_mode = DImode;
14687 break;
14688 case DImode:
14689 half_mode = SImode;
14690 break;
14691 default:
14692 gcc_unreachable ();
14695 byte = GET_MODE_SIZE (half_mode);
14697 while (num--)
14699 rtx op = operands[num];
14701 /* simplify_subreg refuses to split volatile memory addresses,
14702 but we still have to handle them. */
14703 if (MEM_P (op))
14705 lo_half[num] = adjust_address (op, half_mode, 0);
14706 hi_half[num] = adjust_address (op, half_mode, byte);
14708 else
14710 lo_half[num] = simplify_gen_subreg (half_mode, op,
14711 GET_MODE (op) == VOIDmode
14712 ? mode : GET_MODE (op), 0);
14713 hi_half[num] = simplify_gen_subreg (half_mode, op,
14714 GET_MODE (op) == VOIDmode
14715 ? mode : GET_MODE (op), byte);
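/* Editorial note (usage sketch, not part of the original source): a
   typical DImode splitter in the 32-bit backend calls this as, roughly,

       rtx lo[2], hi[2];
       split_double_mode (DImode, operands, 2, lo, hi);
       emit_move_insn (lo[0], lo[1]);
       emit_move_insn (hi[0], hi[1]);

   so one DImode move becomes two SImode moves; MEM operands are split
   with adjust_address at byte offsets 0 and GET_MODE_SIZE (SImode).  */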
14720 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14721 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14722 is the expression of the binary operation. The output may either be
14723 emitted here, or returned to the caller, like all output_* functions.
14725 There is no guarantee that the operands are the same mode, as they
14726 might be within FLOAT or FLOAT_EXTEND expressions. */
14728 #ifndef SYSV386_COMPAT
14729 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14730 wants to fix the assemblers because that causes incompatibility
14731 with gcc. No-one wants to fix gcc because that causes
14732 incompatibility with assemblers... You can use the option of
14733 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14734 #define SYSV386_COMPAT 1
14735 #endif
14737 const char *
14738 output_387_binary_op (rtx insn, rtx *operands)
14740 static char buf[40];
14741 const char *p;
14742 const char *ssep;
14743 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14745 #ifdef ENABLE_CHECKING
14746 /* Even if we do not want to check the inputs, this documents the input
14747 constraints, which helps in understanding the following code. */
14748 if (STACK_REG_P (operands[0])
14749 && ((REG_P (operands[1])
14750 && REGNO (operands[0]) == REGNO (operands[1])
14751 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14752 || (REG_P (operands[2])
14753 && REGNO (operands[0]) == REGNO (operands[2])
14754 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14755 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14756 ; /* ok */
14757 else
14758 gcc_assert (is_sse);
14759 #endif
14761 switch (GET_CODE (operands[3]))
14763 case PLUS:
14764 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14765 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14766 p = "fiadd";
14767 else
14768 p = "fadd";
14769 ssep = "vadd";
14770 break;
14772 case MINUS:
14773 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14774 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14775 p = "fisub";
14776 else
14777 p = "fsub";
14778 ssep = "vsub";
14779 break;
14781 case MULT:
14782 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14783 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14784 p = "fimul";
14785 else
14786 p = "fmul";
14787 ssep = "vmul";
14788 break;
14790 case DIV:
14791 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14792 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14793 p = "fidiv";
14794 else
14795 p = "fdiv";
14796 ssep = "vdiv";
14797 break;
14799 default:
14800 gcc_unreachable ();
14803 if (is_sse)
14805 if (TARGET_AVX)
14807 strcpy (buf, ssep);
14808 if (GET_MODE (operands[0]) == SFmode)
14809 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14810 else
14811 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14813 else
14815 strcpy (buf, ssep + 1);
14816 if (GET_MODE (operands[0]) == SFmode)
14817 strcat (buf, "ss\t{%2, %0|%0, %2}");
14818 else
14819 strcat (buf, "sd\t{%2, %0|%0, %2}");
14821 return buf;
14823 strcpy (buf, p);
14825 switch (GET_CODE (operands[3]))
14827 case MULT:
14828 case PLUS:
14829 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14831 rtx temp = operands[2];
14832 operands[2] = operands[1];
14833 operands[1] = temp;
14836 /* We know operands[0] == operands[1]. */
14838 if (MEM_P (operands[2]))
14840 p = "%Z2\t%2";
14841 break;
14844 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14846 if (STACK_TOP_P (operands[0]))
14847 /* How is it that we are storing to a dead operand[2]?
14848 Well, presumably operands[1] is dead too. We can't
14849 store the result to st(0) as st(0) gets popped on this
14850 instruction. Instead store to operands[2] (which I
14851 think has to be st(1)). st(1) will be popped later.
14852 gcc <= 2.8.1 didn't have this check and generated
14853 assembly code that the Unixware assembler rejected. */
14854 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14855 else
14856 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14857 break;
14860 if (STACK_TOP_P (operands[0]))
14861 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14862 else
14863 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14864 break;
14866 case MINUS:
14867 case DIV:
14868 if (MEM_P (operands[1]))
14870 p = "r%Z1\t%1";
14871 break;
14874 if (MEM_P (operands[2]))
14876 p = "%Z2\t%2";
14877 break;
14880 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14882 #if SYSV386_COMPAT
14883 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14884 derived assemblers, confusingly reverse the direction of
14885 the operation for fsub{r} and fdiv{r} when the
14886 destination register is not st(0). The Intel assembler
14887 doesn't have this brain damage. Read !SYSV386_COMPAT to
14888 figure out what the hardware really does. */
14889 if (STACK_TOP_P (operands[0]))
14890 p = "{p\t%0, %2|rp\t%2, %0}";
14891 else
14892 p = "{rp\t%2, %0|p\t%0, %2}";
14893 #else
14894 if (STACK_TOP_P (operands[0]))
14895 /* As above for fmul/fadd, we can't store to st(0). */
14896 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14897 else
14898 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14899 #endif
14900 break;
14903 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14905 #if SYSV386_COMPAT
14906 if (STACK_TOP_P (operands[0]))
14907 p = "{rp\t%0, %1|p\t%1, %0}";
14908 else
14909 p = "{p\t%1, %0|rp\t%0, %1}";
14910 #else
14911 if (STACK_TOP_P (operands[0]))
14912 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14913 else
14914 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14915 #endif
14916 break;
14919 if (STACK_TOP_P (operands[0]))
14921 if (STACK_TOP_P (operands[1]))
14922 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14923 else
14924 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14925 break;
14927 else if (STACK_TOP_P (operands[1]))
14929 #if SYSV386_COMPAT
14930 p = "{\t%1, %0|r\t%0, %1}";
14931 #else
14932 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14933 #endif
14935 else
14937 #if SYSV386_COMPAT
14938 p = "{r\t%2, %0|\t%0, %2}";
14939 #else
14940 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14941 #endif
14943 break;
14945 default:
14946 gcc_unreachable ();
14949 strcat (buf, p);
14950 return buf;
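/* Editorial note (illustration, not part of the original source): sample
   templates returned above when operands[3] is a PLUS:

       SSE, SFmode, TARGET_AVX      ->  "vaddss\t{%2, %1, %0|%0, %1, %2}"
       SSE, SFmode, no AVX          ->  "addss\t{%2, %0|%0, %2}"
       x87, operands[2] in memory   ->  "fadd%Z2\t%2"

   The {...|...} syntax in the returned string selects between the AT&T
   and Intel operand orders.  */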
14953 /* Check if a 256bit AVX register is referenced in stores. */
14955 static void
14956 check_avx256_stores (rtx dest, const_rtx set, void *data)
14958 if (((REG_P (dest) || MEM_P (dest))
14959 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (dest)))
14960 || (GET_CODE (set) == SET
14961 && (REG_P (SET_SRC (set)) || MEM_P (SET_SRC (set)))
14962 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (SET_SRC (set)))))
14964 bool *used = (bool *) data;
14965 *used = true;
14969 /* Return needed mode for entity in optimize_mode_switching pass. */
14971 static int
14972 ix86_avx_u128_mode_needed (rtx insn)
14974 bool avx_u128_used;
14976 if (CALL_P (insn))
14978 rtx link;
14980 /* The needed mode is set to AVX_U128_CLEAN if there are
14981 no 256bit modes used in the function arguments. */
14982 for (link = CALL_INSN_FUNCTION_USAGE (insn);
14983 link;
14984 link = XEXP (link, 1))
14986 if (GET_CODE (XEXP (link, 0)) == USE)
14988 rtx arg = XEXP (XEXP (link, 0), 0);
14990 if (REG_P (arg)
14991 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (arg)))
14992 return AVX_U128_ANY;
14996 return AVX_U128_CLEAN;
14999 /* Check if a 256bit AVX register is referenced in stores. */
15000 avx_u128_used = false;
15001 note_stores (PATTERN (insn), check_avx256_stores, &avx_u128_used);
15002 if (avx_u128_used)
15003 return AVX_U128_DIRTY;
15005 return AVX_U128_ANY;
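/* A rough intuition for the AVX_U128 tracking above (a sketch of the
   motivation, not of the exact hardware rules): once the upper 128 bits
   of a ymm register are "dirty", mixing in legacy SSE code can incur an
   AVX/SSE transition penalty, so ix86_emit_mode_set below emits a
   vzeroupper whenever a switch to AVX_U128_CLEAN is required.  For
   example, a call whose arguments are all scalar or 128-bit keeps the
   needed mode at AVX_U128_CLEAN, while passing a 256-bit vector argument
   leaves it at AVX_U128_ANY.  */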
15008 /* Return mode that i387 must be switched into
15009 prior to the execution of insn. */
15011 static int
15012 ix86_i387_mode_needed (int entity, rtx insn)
15014 enum attr_i387_cw mode;
15016 /* The mode UNINITIALIZED is used to store control word after a
15017 function call or ASM pattern. The mode ANY specifies that the function
15018 has no requirements on the control word and makes no changes in the
15019 bits we are interested in. */
15021 if (CALL_P (insn)
15022 || (NONJUMP_INSN_P (insn)
15023 && (asm_noperands (PATTERN (insn)) >= 0
15024 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15025 return I387_CW_UNINITIALIZED;
15027 if (recog_memoized (insn) < 0)
15028 return I387_CW_ANY;
15030 mode = get_attr_i387_cw (insn);
15032 switch (entity)
15034 case I387_TRUNC:
15035 if (mode == I387_CW_TRUNC)
15036 return mode;
15037 break;
15039 case I387_FLOOR:
15040 if (mode == I387_CW_FLOOR)
15041 return mode;
15042 break;
15044 case I387_CEIL:
15045 if (mode == I387_CW_CEIL)
15046 return mode;
15047 break;
15049 case I387_MASK_PM:
15050 if (mode == I387_CW_MASK_PM)
15051 return mode;
15052 break;
15054 default:
15055 gcc_unreachable ();
15058 return I387_CW_ANY;
15061 /* Return mode that entity must be switched into
15062 prior to the execution of insn. */
15065 ix86_mode_needed (int entity, rtx insn)
15067 switch (entity)
15069 case AVX_U128:
15070 return ix86_avx_u128_mode_needed (insn);
15071 case I387_TRUNC:
15072 case I387_FLOOR:
15073 case I387_CEIL:
15074 case I387_MASK_PM:
15075 return ix86_i387_mode_needed (entity, insn);
15076 default:
15077 gcc_unreachable ();
15079 return 0;
15082 /* Calculate mode of upper 128bit AVX registers after the insn. */
15084 static int
15085 ix86_avx_u128_mode_after (int mode, rtx insn)
15087 rtx pat = PATTERN (insn);
15088 bool avx_u128_used;
15090 if (vzeroupper_operation (pat, VOIDmode)
15091 || vzeroall_operation (pat, VOIDmode))
15092 return AVX_U128_CLEAN;
15094 /* Check if a 256bit AVX register is referenced in stores. */
15095 avx_u128_used = false;
15096 note_stores (pat, check_avx256_stores, &avx_u128_used);
15097 if (avx_u128_used)
15098 return AVX_U128_DIRTY;
15099 /* We know that the state is clean after a CALL insn if there are no
15100 256bit modes used in the function return register. */
15101 else if (CALL_P (insn))
15102 return AVX_U128_CLEAN;
15104 return mode;
15107 /* Return the mode that an insn results in. */
15110 ix86_mode_after (int entity, int mode, rtx insn)
15112 switch (entity)
15114 case AVX_U128:
15115 return ix86_avx_u128_mode_after (mode, insn);
15116 case I387_TRUNC:
15117 case I387_FLOOR:
15118 case I387_CEIL:
15119 case I387_MASK_PM:
15120 return mode;
15121 default:
15122 gcc_unreachable ();
15126 static int
15127 ix86_avx_u128_mode_entry (void)
15129 tree arg;
15131 /* Entry mode is set to AVX_U128_DIRTY if there are
15132 256bit modes used in function arguments. */
15133 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15134 arg = TREE_CHAIN (arg))
15136 rtx incoming = DECL_INCOMING_RTL (arg);
15138 if (incoming && REG_P (incoming)
15139 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (incoming)))
15140 return AVX_U128_DIRTY;
15143 return AVX_U128_CLEAN;
15146 /* Return a mode that ENTITY is assumed to be
15147 switched to at function entry. */
15150 ix86_mode_entry (int entity)
15152 switch (entity)
15154 case AVX_U128:
15155 return ix86_avx_u128_mode_entry ();
15156 case I387_TRUNC:
15157 case I387_FLOOR:
15158 case I387_CEIL:
15159 case I387_MASK_PM:
15160 return I387_CW_ANY;
15161 default:
15162 gcc_unreachable ();
15166 static int
15167 ix86_avx_u128_mode_exit (void)
15169 rtx reg = crtl->return_rtx;
15171 /* Exit mode is set to AVX_U128_DIRTY if there are
15172 256bit modes used in the function return register. */
15173 if (reg && REG_P (reg) && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (reg)))
15174 return AVX_U128_DIRTY;
15176 return AVX_U128_CLEAN;
15179 /* Return a mode that ENTITY is assumed to be
15180 switched to at function exit. */
15183 ix86_mode_exit (int entity)
15185 switch (entity)
15187 case AVX_U128:
15188 return ix86_avx_u128_mode_exit ();
15189 case I387_TRUNC:
15190 case I387_FLOOR:
15191 case I387_CEIL:
15192 case I387_MASK_PM:
15193 return I387_CW_ANY;
15194 default:
15195 gcc_unreachable ();
15199 /* Output code to initialize control word copies used by trunc?f?i and
15200 rounding patterns. The current control word is saved to STORED_MODE,
15201 while NEW_MODE receives the control word variant selected by MODE. */
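/* For reference, a sketch of the x87 control-word bits that the masks
   below rely on (assuming the standard FPU control-word layout): bits
   10-11 form the rounding-control field (00 = to nearest, 01 = toward
   -inf, 10 = toward +inf, 11 = truncate toward zero) and bit 5 is the
   precision-exception mask, hence the 0x0c00, 0x0400, 0x0800 and 0x0020
   constants used in the two code paths.  */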
15203 static void
15204 emit_i387_cw_initialization (int mode)
15206 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15207 rtx new_mode;
15209 enum ix86_stack_slot slot;
15211 rtx reg = gen_reg_rtx (HImode);
15213 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15214 emit_move_insn (reg, copy_rtx (stored_mode));
15216 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15217 || optimize_function_for_size_p (cfun))
15219 switch (mode)
15221 case I387_CW_TRUNC:
15222 /* round toward zero (truncate) */
15223 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15224 slot = SLOT_CW_TRUNC;
15225 break;
15227 case I387_CW_FLOOR:
15228 /* round down toward -oo */
15229 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15230 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15231 slot = SLOT_CW_FLOOR;
15232 break;
15234 case I387_CW_CEIL:
15235 /* round up toward +oo */
15236 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15237 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15238 slot = SLOT_CW_CEIL;
15239 break;
15241 case I387_CW_MASK_PM:
15242 /* mask precision exception for nearbyint() */
15243 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15244 slot = SLOT_CW_MASK_PM;
15245 break;
15247 default:
15248 gcc_unreachable ();
15251 else
15253 switch (mode)
15255 case I387_CW_TRUNC:
15256 /* round toward zero (truncate) */
15257 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15258 slot = SLOT_CW_TRUNC;
15259 break;
15261 case I387_CW_FLOOR:
15262 /* round down toward -oo */
15263 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15264 slot = SLOT_CW_FLOOR;
15265 break;
15267 case I387_CW_CEIL:
15268 /* round up toward +oo */
15269 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15270 slot = SLOT_CW_CEIL;
15271 break;
15273 case I387_CW_MASK_PM:
15274 /* mask precision exception for nearbyint() */
15275 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15276 slot = SLOT_CW_MASK_PM;
15277 break;
15279 default:
15280 gcc_unreachable ();
15284 gcc_assert (slot < MAX_386_STACK_LOCALS);
15286 new_mode = assign_386_stack_local (HImode, slot);
15287 emit_move_insn (new_mode, reg);
15290 /* Generate one or more insns to set ENTITY to MODE. */
15292 void
15293 ix86_emit_mode_set (int entity, int mode)
15295 switch (entity)
15297 case AVX_U128:
15298 if (mode == AVX_U128_CLEAN)
15299 emit_insn (gen_avx_vzeroupper ());
15300 break;
15301 case I387_TRUNC:
15302 case I387_FLOOR:
15303 case I387_CEIL:
15304 case I387_MASK_PM:
15305 if (mode != I387_CW_ANY
15306 && mode != I387_CW_UNINITIALIZED)
15307 emit_i387_cw_initialization (mode);
15308 break;
15309 default:
15310 gcc_unreachable ();
15314 /* Output code for INSN to convert a float to a signed int. OPERANDS
15315 are the insn operands. The output may be [HSD]Imode and the input
15316 operand may be [SDX]Fmode. */
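/* As an illustration (a sketch of the typical output, not an exhaustive
   list of cases), converting the SFmode value on the top of the x87 stack
   to SImode without fisttp usually looks like:

	fldcw	%3	; load the round-toward-zero control word
	fistpl	%0	; store the integer and pop
	fldcw	%2	; restore the original control word

   where operands 2 and 3 are the stack slots prepared by
   emit_i387_cw_initialization.  */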
15318 const char *
15319 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15321 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15322 int dimode_p = GET_MODE (operands[0]) == DImode;
15323 int round_mode = get_attr_i387_cw (insn);
15325 /* Jump through a hoop or two for DImode, since the hardware has no
15326 non-popping instruction. We used to do this a different way, but
15327 that was somewhat fragile and broke with post-reload splitters. */
15328 if ((dimode_p || fisttp) && !stack_top_dies)
15329 output_asm_insn ("fld\t%y1", operands);
15331 gcc_assert (STACK_TOP_P (operands[1]));
15332 gcc_assert (MEM_P (operands[0]));
15333 gcc_assert (GET_MODE (operands[1]) != TFmode);
15335 if (fisttp)
15336 output_asm_insn ("fisttp%Z0\t%0", operands);
15337 else
15339 if (round_mode != I387_CW_ANY)
15340 output_asm_insn ("fldcw\t%3", operands);
15341 if (stack_top_dies || dimode_p)
15342 output_asm_insn ("fistp%Z0\t%0", operands);
15343 else
15344 output_asm_insn ("fist%Z0\t%0", operands);
15345 if (round_mode != I387_CW_ANY)
15346 output_asm_insn ("fldcw\t%2", operands);
15349 return "";
15352 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15353 have the values zero or one, indicates the ffreep insn's operand
15354 from the OPERANDS array. */
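/* A note on the raw encoding used below when the assembler does not
   understand the mnemonic (a sketch of the idea): ffreep %st(N) is
   encoded as the two bytes 0xdf 0xc0+N, so emitting the 16-bit word
   0xc0df on a little-endian target is equivalent to "ffreep %st(0)",
   0xc1df to "ffreep %st(1)", and so on; the exact directive comes from
   ASM_SHORT.  */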
15356 static const char *
15357 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15359 if (TARGET_USE_FFREEP)
15360 #ifdef HAVE_AS_IX86_FFREEP
15361 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15362 #else
15364 static char retval[32];
15365 int regno = REGNO (operands[opno]);
15367 gcc_assert (STACK_REGNO_P (regno));
15369 regno -= FIRST_STACK_REG;
15371 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15372 return retval;
15374 #endif
15376 return opno ? "fstp\t%y1" : "fstp\t%y0";
15380 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15381 should be used. UNORDERED_P is true when fucom should be used. */
15383 const char *
15384 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15386 int stack_top_dies;
15387 rtx cmp_op0, cmp_op1;
15388 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15390 if (eflags_p)
15392 cmp_op0 = operands[0];
15393 cmp_op1 = operands[1];
15395 else
15397 cmp_op0 = operands[1];
15398 cmp_op1 = operands[2];
15401 if (is_sse)
15403 if (GET_MODE (operands[0]) == SFmode)
15404 if (unordered_p)
15405 return "%vucomiss\t{%1, %0|%0, %1}";
15406 else
15407 return "%vcomiss\t{%1, %0|%0, %1}";
15408 else
15409 if (unordered_p)
15410 return "%vucomisd\t{%1, %0|%0, %1}";
15411 else
15412 return "%vcomisd\t{%1, %0|%0, %1}";
15415 gcc_assert (STACK_TOP_P (cmp_op0));
15417 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15419 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15421 if (stack_top_dies)
15423 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15424 return output_387_ffreep (operands, 1);
15426 else
15427 return "ftst\n\tfnstsw\t%0";
15430 if (STACK_REG_P (cmp_op1)
15431 && stack_top_dies
15432 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15433 && REGNO (cmp_op1) != FIRST_STACK_REG)
15435 /* If the top of the 387 stack dies, and the other operand
15436 is also a stack register that dies, then this must be a
15437 `fcompp' float compare. */
15439 if (eflags_p)
15441 /* There is no double popping fcomi variant. Fortunately,
15442 eflags is immune from the fstp's cc clobbering. */
15443 if (unordered_p)
15444 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15445 else
15446 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15447 return output_387_ffreep (operands, 0);
15449 else
15451 if (unordered_p)
15452 return "fucompp\n\tfnstsw\t%0";
15453 else
15454 return "fcompp\n\tfnstsw\t%0";
15457 else
15459 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
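/* Worked example of the encoding (following the mask layout named in the
   comment above): for an fcomi-style compare of two stack registers where
   the top of the stack dies and the unordered variant is requested, the
   mask is (1 << 3) | (0 << 2) | (1 << 1) | 1 = 11, which selects
   "fucomip" below.  */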
15461 static const char * const alt[16] =
15463 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15464 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15465 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15466 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15468 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15469 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15470 NULL,
15471 NULL,
15473 "fcomi\t{%y1, %0|%0, %y1}",
15474 "fcomip\t{%y1, %0|%0, %y1}",
15475 "fucomi\t{%y1, %0|%0, %y1}",
15476 "fucomip\t{%y1, %0|%0, %y1}",
15478 NULL,
15479 NULL,
15480 NULL,
15481 NULL
15484 int mask;
15485 const char *ret;
15487 mask = eflags_p << 3;
15488 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15489 mask |= unordered_p << 1;
15490 mask |= stack_top_dies;
15492 gcc_assert (mask < 16);
15493 ret = alt[mask];
15494 gcc_assert (ret);
15496 return ret;
15500 void
15501 ix86_output_addr_vec_elt (FILE *file, int value)
15503 const char *directive = ASM_LONG;
15505 #ifdef ASM_QUAD
15506 if (TARGET_LP64)
15507 directive = ASM_QUAD;
15508 #else
15509 gcc_assert (!TARGET_64BIT);
15510 #endif
15512 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15515 void
15516 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15518 const char *directive = ASM_LONG;
15520 #ifdef ASM_QUAD
15521 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15522 directive = ASM_QUAD;
15523 #else
15524 gcc_assert (!TARGET_64BIT);
15525 #endif
15526 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15527 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15528 fprintf (file, "%s%s%d-%s%d\n",
15529 directive, LPREFIX, value, LPREFIX, rel);
15530 else if (HAVE_AS_GOTOFF_IN_DATA)
15531 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15532 #if TARGET_MACHO
15533 else if (TARGET_MACHO)
15535 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15536 machopic_output_function_base_name (file);
15537 putc ('\n', file);
15539 #endif
15540 else
15541 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15542 GOT_SYMBOL_NAME, LPREFIX, value);
15545 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15546 for the target. */
15548 void
15549 ix86_expand_clear (rtx dest)
15551 rtx tmp;
15553 /* We play register width games, which are only valid after reload. */
15554 gcc_assert (reload_completed);
15556 /* Avoid HImode and its attendant prefix byte. */
15557 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15558 dest = gen_rtx_REG (SImode, REGNO (dest));
15559 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15561 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15562 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15564 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15565 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15568 emit_insn (tmp);
15571 /* X is an unchanging MEM. If it is a constant pool reference, return
15572 the constant pool rtx, else NULL. */
15575 maybe_get_pool_constant (rtx x)
15577 x = ix86_delegitimize_address (XEXP (x, 0));
15579 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15580 return get_pool_constant (x);
15582 return NULL_RTX;
15585 void
15586 ix86_expand_move (enum machine_mode mode, rtx operands[])
15588 rtx op0, op1;
15589 enum tls_model model;
15591 op0 = operands[0];
15592 op1 = operands[1];
15594 if (GET_CODE (op1) == SYMBOL_REF)
15596 model = SYMBOL_REF_TLS_MODEL (op1);
15597 if (model)
15599 op1 = legitimize_tls_address (op1, model, true);
15600 op1 = force_operand (op1, op0);
15601 if (op1 == op0)
15602 return;
15603 if (GET_MODE (op1) != mode)
15604 op1 = convert_to_mode (mode, op1, 1);
15606 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15607 && SYMBOL_REF_DLLIMPORT_P (op1))
15608 op1 = legitimize_dllimport_symbol (op1, false);
15610 else if (GET_CODE (op1) == CONST
15611 && GET_CODE (XEXP (op1, 0)) == PLUS
15612 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15614 rtx addend = XEXP (XEXP (op1, 0), 1);
15615 rtx symbol = XEXP (XEXP (op1, 0), 0);
15616 rtx tmp = NULL;
15618 model = SYMBOL_REF_TLS_MODEL (symbol);
15619 if (model)
15620 tmp = legitimize_tls_address (symbol, model, true);
15621 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15622 && SYMBOL_REF_DLLIMPORT_P (symbol))
15623 tmp = legitimize_dllimport_symbol (symbol, true);
15625 if (tmp)
15627 tmp = force_operand (tmp, NULL);
15628 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15629 op0, 1, OPTAB_DIRECT);
15630 if (tmp == op0)
15631 return;
15632 if (GET_MODE (tmp) != mode)
15633 op1 = convert_to_mode (mode, tmp, 1);
15637 if ((flag_pic || MACHOPIC_INDIRECT)
15638 && symbolic_operand (op1, mode))
15640 if (TARGET_MACHO && !TARGET_64BIT)
15642 #if TARGET_MACHO
15643 /* dynamic-no-pic */
15644 if (MACHOPIC_INDIRECT)
15646 rtx temp = ((reload_in_progress
15647 || ((op0 && REG_P (op0))
15648 && mode == Pmode))
15649 ? op0 : gen_reg_rtx (Pmode));
15650 op1 = machopic_indirect_data_reference (op1, temp);
15651 if (MACHOPIC_PURE)
15652 op1 = machopic_legitimize_pic_address (op1, mode,
15653 temp == op1 ? 0 : temp);
15655 if (op0 != op1 && GET_CODE (op0) != MEM)
15657 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15658 emit_insn (insn);
15659 return;
15661 if (GET_CODE (op0) == MEM)
15662 op1 = force_reg (Pmode, op1);
15663 else
15665 rtx temp = op0;
15666 if (GET_CODE (temp) != REG)
15667 temp = gen_reg_rtx (Pmode);
15668 temp = legitimize_pic_address (op1, temp);
15669 if (temp == op0)
15670 return;
15671 op1 = temp;
15673 /* dynamic-no-pic */
15674 #endif
15676 else
15678 if (MEM_P (op0))
15679 op1 = force_reg (mode, op1);
15680 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15682 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15683 op1 = legitimize_pic_address (op1, reg);
15684 if (op0 == op1)
15685 return;
15686 if (GET_MODE (op1) != mode)
15687 op1 = convert_to_mode (mode, op1, 1);
15691 else
15693 if (MEM_P (op0)
15694 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15695 || !push_operand (op0, mode))
15696 && MEM_P (op1))
15697 op1 = force_reg (mode, op1);
15699 if (push_operand (op0, mode)
15700 && ! general_no_elim_operand (op1, mode))
15701 op1 = copy_to_mode_reg (mode, op1);
15703 /* Force large constants in 64bit compilation into a register
15704 to get them CSEed. */
15705 if (can_create_pseudo_p ()
15706 && (mode == DImode) && TARGET_64BIT
15707 && immediate_operand (op1, mode)
15708 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15709 && !register_operand (op0, mode)
15710 && optimize)
15711 op1 = copy_to_mode_reg (mode, op1);
15713 if (can_create_pseudo_p ()
15714 && FLOAT_MODE_P (mode)
15715 && GET_CODE (op1) == CONST_DOUBLE)
15717 /* If we are loading a floating point constant to a register,
15718 force the value to memory now, since we'll get better code
15719 out of the back end. */
15721 op1 = validize_mem (force_const_mem (mode, op1));
15722 if (!register_operand (op0, mode))
15724 rtx temp = gen_reg_rtx (mode);
15725 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15726 emit_move_insn (op0, temp);
15727 return;
15732 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15735 void
15736 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15738 rtx op0 = operands[0], op1 = operands[1];
15739 unsigned int align = GET_MODE_ALIGNMENT (mode);
15741 /* Force constants other than zero into memory. We do not know how
15742 the instructions used to build constants modify the upper 64 bits
15743 of the register; once we have that information we may be able
15744 to handle some of them more efficiently. */
15745 if (can_create_pseudo_p ()
15746 && register_operand (op0, mode)
15747 && (CONSTANT_P (op1)
15748 || (GET_CODE (op1) == SUBREG
15749 && CONSTANT_P (SUBREG_REG (op1))))
15750 && !standard_sse_constant_p (op1))
15751 op1 = validize_mem (force_const_mem (mode, op1));
15753 /* We need to check memory alignment for SSE mode since attributes
15754 can make operands unaligned. */
15755 if (can_create_pseudo_p ()
15756 && SSE_REG_MODE_P (mode)
15757 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15758 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15760 rtx tmp[2];
15762 /* ix86_expand_vector_move_misalign() does not like constants ... */
15763 if (CONSTANT_P (op1)
15764 || (GET_CODE (op1) == SUBREG
15765 && CONSTANT_P (SUBREG_REG (op1))))
15766 op1 = validize_mem (force_const_mem (mode, op1));
15768 /* ... nor both arguments in memory. */
15769 if (!register_operand (op0, mode)
15770 && !register_operand (op1, mode))
15771 op1 = force_reg (mode, op1);
15773 tmp[0] = op0; tmp[1] = op1;
15774 ix86_expand_vector_move_misalign (mode, tmp);
15775 return;
15778 /* Make operand1 a register if it isn't already. */
15779 if (can_create_pseudo_p ()
15780 && !register_operand (op0, mode)
15781 && !register_operand (op1, mode))
15783 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15784 return;
15787 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15790 /* Split 32-byte AVX unaligned load and store if needed. */
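/* A sketch of what the split amounts to (the exact instructions depend on
   the mode and on how later passes match the RTL): with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD, a 32-byte unaligned load is emitted
   as a 16-byte load of the low half followed by a VEC_CONCAT with the
   high half (typically matched as vinsertf128), and with
   TARGET_AVX256_SPLIT_UNALIGNED_STORE a 32-byte store becomes two
   vextractf128 stores of the two 16-byte halves.  */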
15792 static void
15793 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15795 rtx m;
15796 rtx (*extract) (rtx, rtx, rtx);
15797 rtx (*load_unaligned) (rtx, rtx);
15798 rtx (*store_unaligned) (rtx, rtx);
15799 enum machine_mode mode;
15801 switch (GET_MODE (op0))
15803 default:
15804 gcc_unreachable ();
15805 case V32QImode:
15806 extract = gen_avx_vextractf128v32qi;
15807 load_unaligned = gen_avx_loaddqu256;
15808 store_unaligned = gen_avx_storedqu256;
15809 mode = V16QImode;
15810 break;
15811 case V8SFmode:
15812 extract = gen_avx_vextractf128v8sf;
15813 load_unaligned = gen_avx_loadups256;
15814 store_unaligned = gen_avx_storeups256;
15815 mode = V4SFmode;
15816 break;
15817 case V4DFmode:
15818 extract = gen_avx_vextractf128v4df;
15819 load_unaligned = gen_avx_loadupd256;
15820 store_unaligned = gen_avx_storeupd256;
15821 mode = V2DFmode;
15822 break;
15825 if (MEM_P (op1))
15827 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15829 rtx r = gen_reg_rtx (mode);
15830 m = adjust_address (op1, mode, 0);
15831 emit_move_insn (r, m);
15832 m = adjust_address (op1, mode, 16);
15833 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15834 emit_move_insn (op0, r);
15836 else
15837 emit_insn (load_unaligned (op0, op1));
15839 else if (MEM_P (op0))
15841 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15843 m = adjust_address (op0, mode, 0);
15844 emit_insn (extract (m, op1, const0_rtx));
15845 m = adjust_address (op0, mode, 16);
15846 emit_insn (extract (m, op1, const1_rtx));
15848 else
15849 emit_insn (store_unaligned (op0, op1));
15851 else
15852 gcc_unreachable ();
15855 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15856 straight to ix86_expand_vector_move. */
15857 /* Code generation for scalar reg-reg moves of single and double precision data:
15858 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15859 movaps reg, reg
15860 else
15861 movss reg, reg
15862 if (x86_sse_partial_reg_dependency == true)
15863 movapd reg, reg
15864 else
15865 movsd reg, reg
15867 Code generation for scalar loads of double precision data:
15868 if (x86_sse_split_regs == true)
15869 movlpd mem, reg (gas syntax)
15870 else
15871 movsd mem, reg
15873 Code generation for unaligned packed loads of single precision data
15874 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15875 if (x86_sse_unaligned_move_optimal)
15876 movups mem, reg
15878 if (x86_sse_partial_reg_dependency == true)
15880 xorps reg, reg
15881 movlps mem, reg
15882 movhps mem+8, reg
15884 else
15886 movlps mem, reg
15887 movhps mem+8, reg
15890 Code generation for unaligned packed loads of double precision data
15891 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15892 if (x86_sse_unaligned_move_optimal)
15893 movupd mem, reg
15895 if (x86_sse_split_regs == true)
15897 movlpd mem, reg
15898 movhpd mem+8, reg
15900 else
15902 movsd mem, reg
15903 movhpd mem+8, reg
15907 void
15908 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15910 rtx op0, op1, m;
15912 op0 = operands[0];
15913 op1 = operands[1];
15915 if (TARGET_AVX
15916 && GET_MODE_SIZE (mode) == 32)
15918 switch (GET_MODE_CLASS (mode))
15920 case MODE_VECTOR_INT:
15921 case MODE_INT:
15922 op0 = gen_lowpart (V32QImode, op0);
15923 op1 = gen_lowpart (V32QImode, op1);
15924 /* FALLTHRU */
15926 case MODE_VECTOR_FLOAT:
15927 ix86_avx256_split_vector_move_misalign (op0, op1);
15928 break;
15930 default:
15931 gcc_unreachable ();
15934 return;
15937 if (MEM_P (op1))
15939 /* ??? If we have typed data, then it would appear that using
15940 movdqu is the only way to get unaligned data loaded with
15941 integer type. */
15942 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15944 op0 = gen_lowpart (V16QImode, op0);
15945 op1 = gen_lowpart (V16QImode, op1);
15946 /* We will eventually emit movups based on insn attributes. */
15947 emit_insn (gen_sse2_loaddqu (op0, op1));
15949 else if (TARGET_SSE2 && mode == V2DFmode)
15951 rtx zero;
15953 if (TARGET_AVX
15954 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15955 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15956 || optimize_function_for_size_p (cfun))
15958 /* We will eventually emit movups based on insn attributes. */
15959 emit_insn (gen_sse2_loadupd (op0, op1));
15960 return;
15963 /* When SSE registers are split into halves, we can avoid
15964 writing to the top half twice. */
15965 if (TARGET_SSE_SPLIT_REGS)
15967 emit_clobber (op0);
15968 zero = op0;
15970 else
15972 /* ??? Not sure about the best option for the Intel chips.
15973 The following would seem to satisfy; the register is
15974 entirely cleared, breaking the dependency chain. We
15975 then store to the upper half, with a dependency depth
15976 of one. A rumor has it that Intel recommends two movsd
15977 followed by an unpacklpd, but this is unconfirmed. And
15978 given that the dependency depth of the unpacklpd would
15979 still be one, I'm not sure why this would be better. */
15980 zero = CONST0_RTX (V2DFmode);
15983 m = adjust_address (op1, DFmode, 0);
15984 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15985 m = adjust_address (op1, DFmode, 8);
15986 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15988 else
15990 if (TARGET_AVX
15991 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15992 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15993 || optimize_function_for_size_p (cfun))
15995 op0 = gen_lowpart (V4SFmode, op0);
15996 op1 = gen_lowpart (V4SFmode, op1);
15997 emit_insn (gen_sse_loadups (op0, op1));
15998 return;
16001 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16002 emit_move_insn (op0, CONST0_RTX (mode));
16003 else
16004 emit_clobber (op0);
16006 if (mode != V4SFmode)
16007 op0 = gen_lowpart (V4SFmode, op0);
16009 m = adjust_address (op1, V2SFmode, 0);
16010 emit_insn (gen_sse_loadlps (op0, op0, m));
16011 m = adjust_address (op1, V2SFmode, 8);
16012 emit_insn (gen_sse_loadhps (op0, op0, m));
16015 else if (MEM_P (op0))
16017 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16019 op0 = gen_lowpart (V16QImode, op0);
16020 op1 = gen_lowpart (V16QImode, op1);
16021 /* We will eventually emit movups based on insn attributes. */
16022 emit_insn (gen_sse2_storedqu (op0, op1));
16024 else if (TARGET_SSE2 && mode == V2DFmode)
16026 if (TARGET_AVX
16027 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16028 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16029 || optimize_function_for_size_p (cfun))
16030 /* We will eventually emit movups based on insn attributes. */
16031 emit_insn (gen_sse2_storeupd (op0, op1));
16032 else
16034 m = adjust_address (op0, DFmode, 0);
16035 emit_insn (gen_sse2_storelpd (m, op1));
16036 m = adjust_address (op0, DFmode, 8);
16037 emit_insn (gen_sse2_storehpd (m, op1));
16040 else
16042 if (mode != V4SFmode)
16043 op1 = gen_lowpart (V4SFmode, op1);
16045 if (TARGET_AVX
16046 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16047 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16048 || optimize_function_for_size_p (cfun))
16050 op0 = gen_lowpart (V4SFmode, op0);
16051 emit_insn (gen_sse_storeups (op0, op1));
16053 else
16055 m = adjust_address (op0, V2SFmode, 0);
16056 emit_insn (gen_sse_storelps (m, op1));
16057 m = adjust_address (op0, V2SFmode, 8);
16058 emit_insn (gen_sse_storehps (m, op1));
16062 else
16063 gcc_unreachable ();
16066 /* Expand a push in MODE. This is some mode for which we do not support
16067 proper push instructions, at least from the registers that we expect
16068 the value to live in. */
16070 void
16071 ix86_expand_push (enum machine_mode mode, rtx x)
16073 rtx tmp;
16075 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16076 GEN_INT (-GET_MODE_SIZE (mode)),
16077 stack_pointer_rtx, 1, OPTAB_DIRECT);
16078 if (tmp != stack_pointer_rtx)
16079 emit_move_insn (stack_pointer_rtx, tmp);
16081 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16083 /* When we push an operand onto the stack, it has to be aligned at least
16084 at the function argument boundary. However, since we don't have
16085 the argument type, we can't determine the actual argument
16086 boundary. */
16087 emit_move_insn (tmp, x);
16090 /* Helper function of ix86_fixup_binary_operands to canonicalize
16091 operand order. Returns true if the operands should be swapped. */
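/* For example (assuming a commutative CODE such as PLUS): for
   "x = 5 + x" the operands arrive as src1 = 5, src2 = x; since src2
   matches the destination this function returns true, and after the swap
   the operation can be emitted as a single "add $5, x" with a matching
   destination and the immediate in the second position.  */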
16093 static bool
16094 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16095 rtx operands[])
16097 rtx dst = operands[0];
16098 rtx src1 = operands[1];
16099 rtx src2 = operands[2];
16101 /* If the operation is not commutative, we can't do anything. */
16102 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16103 return false;
16105 /* Highest priority is that src1 should match dst. */
16106 if (rtx_equal_p (dst, src1))
16107 return false;
16108 if (rtx_equal_p (dst, src2))
16109 return true;
16111 /* Next highest priority is that immediate constants come second. */
16112 if (immediate_operand (src2, mode))
16113 return false;
16114 if (immediate_operand (src1, mode))
16115 return true;
16117 /* Lowest priority is that memory references should come second. */
16118 if (MEM_P (src2))
16119 return false;
16120 if (MEM_P (src1))
16121 return true;
16123 return false;
16127 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16128 destination to use for the operation. If different from the true
16129 destination in operands[0], a copy operation will be required. */
16132 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16133 rtx operands[])
16135 rtx dst = operands[0];
16136 rtx src1 = operands[1];
16137 rtx src2 = operands[2];
16139 /* Canonicalize operand order. */
16140 if (ix86_swap_binary_operands_p (code, mode, operands))
16142 rtx temp;
16144 /* It is invalid to swap operands of different modes. */
16145 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16147 temp = src1;
16148 src1 = src2;
16149 src2 = temp;
16152 /* Both source operands cannot be in memory. */
16153 if (MEM_P (src1) && MEM_P (src2))
16155 /* Optimization: Only read from memory once. */
16156 if (rtx_equal_p (src1, src2))
16158 src2 = force_reg (mode, src2);
16159 src1 = src2;
16161 else
16162 src2 = force_reg (mode, src2);
16165 /* If the destination is memory, and we do not have matching source
16166 operands, do things in registers. */
16167 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16168 dst = gen_reg_rtx (mode);
16170 /* Source 1 cannot be a constant. */
16171 if (CONSTANT_P (src1))
16172 src1 = force_reg (mode, src1);
16174 /* Source 1 cannot be a non-matching memory. */
16175 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16176 src1 = force_reg (mode, src1);
16178 /* Improve address combine. */
16179 if (code == PLUS
16180 && GET_MODE_CLASS (mode) == MODE_INT
16181 && MEM_P (src2))
16182 src2 = force_reg (mode, src2);
16184 operands[1] = src1;
16185 operands[2] = src2;
16186 return dst;
16189 /* Similarly, but assume that the destination has already been
16190 set up properly. */
16192 void
16193 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16194 enum machine_mode mode, rtx operands[])
16196 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16197 gcc_assert (dst == operands[0]);
16200 /* Attempt to expand a binary operator. Make the expansion closer to the
16201 actual machine than just general_operand, which would allow 3 separate
16202 memory references (one output, two input) in a single insn. */
16204 void
16205 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16206 rtx operands[])
16208 rtx src1, src2, dst, op, clob;
16210 dst = ix86_fixup_binary_operands (code, mode, operands);
16211 src1 = operands[1];
16212 src2 = operands[2];
16214 /* Emit the instruction. */
16216 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16217 if (reload_in_progress)
16219 /* Reload doesn't know about the flags register, and doesn't know that
16220 it doesn't want to clobber it. We can only do this with PLUS. */
16221 gcc_assert (code == PLUS);
16222 emit_insn (op);
16224 else if (reload_completed
16225 && code == PLUS
16226 && !rtx_equal_p (dst, src1))
16228 /* This is going to be an LEA; avoid splitting it later. */
16229 emit_insn (op);
16231 else
16233 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16234 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16237 /* Fix up the destination if needed. */
16238 if (dst != operands[0])
16239 emit_move_insn (operands[0], dst);
16242 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16243 the given OPERANDS. */
16245 void
16246 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16247 rtx operands[])
16249 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16250 if (GET_CODE (operands[1]) == SUBREG)
16252 op1 = operands[1];
16253 op2 = operands[2];
16255 else if (GET_CODE (operands[2]) == SUBREG)
16257 op1 = operands[2];
16258 op2 = operands[1];
16260 /* Optimize (__m128i) d | (__m128i) e and similar code
16261 when d and e are float vectors into a float vector logical
16262 insn. In C/C++, without using intrinsics, there is no other way
16263 to express a vector logical operation on float vectors than
16264 to cast them temporarily to integer vectors. */
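/* For instance, for source such as
     __m128 a, b;  ... (__m128) ((__m128i) a | (__m128i) b) ...
   the SUBREGs around the V4SFmode registers are peeled off here and the
   IOR is emitted directly in V4SFmode, so the backend can use orps
   instead of por and stay in the floating-point SSE domain (a sketch of
   the intent; the final instruction choice is left to later passes).  */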
16265 if (op1
16266 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16267 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16268 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16269 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16270 && SUBREG_BYTE (op1) == 0
16271 && (GET_CODE (op2) == CONST_VECTOR
16272 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16273 && SUBREG_BYTE (op2) == 0))
16274 && can_create_pseudo_p ())
16276 rtx dst;
16277 switch (GET_MODE (SUBREG_REG (op1)))
16279 case V4SFmode:
16280 case V8SFmode:
16281 case V2DFmode:
16282 case V4DFmode:
16283 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16284 if (GET_CODE (op2) == CONST_VECTOR)
16286 op2 = gen_lowpart (GET_MODE (dst), op2);
16287 op2 = force_reg (GET_MODE (dst), op2);
16289 else
16291 op1 = operands[1];
16292 op2 = SUBREG_REG (operands[2]);
16293 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16294 op2 = force_reg (GET_MODE (dst), op2);
16296 op1 = SUBREG_REG (op1);
16297 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16298 op1 = force_reg (GET_MODE (dst), op1);
16299 emit_insn (gen_rtx_SET (VOIDmode, dst,
16300 gen_rtx_fmt_ee (code, GET_MODE (dst),
16301 op1, op2)));
16302 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16303 return;
16304 default:
16305 break;
16308 if (!nonimmediate_operand (operands[1], mode))
16309 operands[1] = force_reg (mode, operands[1]);
16310 if (!nonimmediate_operand (operands[2], mode))
16311 operands[2] = force_reg (mode, operands[2]);
16312 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16313 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16314 gen_rtx_fmt_ee (code, mode, operands[1],
16315 operands[2])));
16318 /* Return TRUE or FALSE depending on whether the binary operator meets the
16319 appropriate constraints. */
16321 bool
16322 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16323 rtx operands[3])
16325 rtx dst = operands[0];
16326 rtx src1 = operands[1];
16327 rtx src2 = operands[2];
16329 /* Both source operands cannot be in memory. */
16330 if (MEM_P (src1) && MEM_P (src2))
16331 return false;
16333 /* Canonicalize operand order for commutative operators. */
16334 if (ix86_swap_binary_operands_p (code, mode, operands))
16336 rtx temp = src1;
16337 src1 = src2;
16338 src2 = temp;
16341 /* If the destination is memory, we must have a matching source operand. */
16342 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16343 return false;
16345 /* Source 1 cannot be a constant. */
16346 if (CONSTANT_P (src1))
16347 return false;
16349 /* Source 1 cannot be a non-matching memory. */
16350 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16351 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16352 return (code == AND
16353 && (mode == HImode
16354 || mode == SImode
16355 || (TARGET_64BIT && mode == DImode))
16356 && satisfies_constraint_L (src2));
16358 return true;
16361 /* Attempt to expand a unary operator. Make the expansion closer to the
16362 actual machine than just general_operand, which would allow 2 separate
16363 memory references (one output, one input) in a single insn. */
16365 void
16366 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16367 rtx operands[])
16369 int matching_memory;
16370 rtx src, dst, op, clob;
16372 dst = operands[0];
16373 src = operands[1];
16375 /* If the destination is memory, and we do not have matching source
16376 operands, do things in registers. */
16377 matching_memory = 0;
16378 if (MEM_P (dst))
16380 if (rtx_equal_p (dst, src))
16381 matching_memory = 1;
16382 else
16383 dst = gen_reg_rtx (mode);
16386 /* When source operand is memory, destination must match. */
16387 if (MEM_P (src) && !matching_memory)
16388 src = force_reg (mode, src);
16390 /* Emit the instruction. */
16392 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16393 if (reload_in_progress || code == NOT)
16395 /* Reload doesn't know about the flags register, and doesn't know that
16396 it doesn't want to clobber it. */
16397 gcc_assert (code == NOT);
16398 emit_insn (op);
16400 else
16402 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16403 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16406 /* Fix up the destination if needed. */
16407 if (dst != operands[0])
16408 emit_move_insn (operands[0], dst);
16411 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16412 divisor are within the range [0-255]. */
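/* A sketch of the control flow that is emitted (register names are only
   illustrative): the dividend and divisor are OR-ed into a scratch and
   tested against ~0xff; if both fit in 8 bits a single unsigned 8-bit
   divide is used and the quotient/remainder are read back from AL/AH,
   otherwise the original full-width division is performed:

	mov	op2, scratch
	or	op3, scratch
	test	$-0x100, scratch
	je	.Lqimode
	(full-width idiv/div)
	jmp	.Lend
   .Lqimode:
	(8-bit divide via udivmodhiqi3; AL = quotient, AH = remainder)
   .Lend:  */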
16414 void
16415 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16416 bool signed_p)
16418 rtx end_label, qimode_label;
16419 rtx insn, div, mod;
16420 rtx scratch, tmp0, tmp1, tmp2;
16421 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16422 rtx (*gen_zero_extend) (rtx, rtx);
16423 rtx (*gen_test_ccno_1) (rtx, rtx);
16425 switch (mode)
16427 case SImode:
16428 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16429 gen_test_ccno_1 = gen_testsi_ccno_1;
16430 gen_zero_extend = gen_zero_extendqisi2;
16431 break;
16432 case DImode:
16433 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16434 gen_test_ccno_1 = gen_testdi_ccno_1;
16435 gen_zero_extend = gen_zero_extendqidi2;
16436 break;
16437 default:
16438 gcc_unreachable ();
16441 end_label = gen_label_rtx ();
16442 qimode_label = gen_label_rtx ();
16444 scratch = gen_reg_rtx (mode);
16446 /* Use 8bit unsigned divmod if dividend and divisor are within
16447 the range [0-255]. */
16448 emit_move_insn (scratch, operands[2]);
16449 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16450 scratch, 1, OPTAB_DIRECT);
16451 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16452 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16453 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16454 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16455 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16456 pc_rtx);
16457 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16458 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16459 JUMP_LABEL (insn) = qimode_label;
16461 /* Generate original signed/unsigned divmod. */
16462 div = gen_divmod4_1 (operands[0], operands[1],
16463 operands[2], operands[3]);
16464 emit_insn (div);
16466 /* Branch to the end. */
16467 emit_jump_insn (gen_jump (end_label));
16468 emit_barrier ();
16470 /* Generate 8bit unsigned divide. */
16471 emit_label (qimode_label);
16472 /* Don't use operands[0] for result of 8bit divide since not all
16473 registers support QImode ZERO_EXTRACT. */
16474 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16475 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16476 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16477 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16479 if (signed_p)
16481 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16482 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16484 else
16486 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16487 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16490 /* Extract remainder from AH. */
16491 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16492 if (REG_P (operands[1]))
16493 insn = emit_move_insn (operands[1], tmp1);
16494 else
16496 /* Need a new scratch register since the old one has the result
16497 of the 8bit divide. */
16498 scratch = gen_reg_rtx (mode);
16499 emit_move_insn (scratch, tmp1);
16500 insn = emit_move_insn (operands[1], scratch);
16502 set_unique_reg_note (insn, REG_EQUAL, mod);
16504 /* Zero extend quotient from AL. */
16505 tmp1 = gen_lowpart (QImode, tmp0);
16506 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16507 set_unique_reg_note (insn, REG_EQUAL, div);
16509 emit_label (end_label);
16512 #define LEA_MAX_STALL (3)
16513 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16515 /* Increase given DISTANCE in half-cycles according to
16516 dependencies between PREV and NEXT instructions.
16517 Add 1 half-cycle if there is no dependency and
16518 go to the next cycle if there is some dependency. */
16520 static unsigned int
16521 increase_distance (rtx prev, rtx next, unsigned int distance)
16523 df_ref *use_rec;
16524 df_ref *def_rec;
16526 if (!prev || !next)
16527 return distance + (distance & 1) + 2;
16529 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16530 return distance + 1;
16532 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16533 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16534 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16535 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16536 return distance + (distance & 1) + 2;
16538 return distance + 1;
16541 /* Function checks if instruction INSN defines register number
16542 REGNO1 or REGNO2. */
16544 static bool
16545 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16546 rtx insn)
16548 df_ref *def_rec;
16550 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16551 if (DF_REF_REG_DEF_P (*def_rec)
16552 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16553 && (regno1 == DF_REF_REGNO (*def_rec)
16554 || regno2 == DF_REF_REGNO (*def_rec)))
16556 return true;
16559 return false;
16562 /* Function checks if instruction INSN uses register number
16563 REGNO as a part of address expression. */
16565 static bool
16566 insn_uses_reg_mem (unsigned int regno, rtx insn)
16568 df_ref *use_rec;
16570 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16571 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16572 return true;
16574 return false;
16577 /* Search backward for non-agu definition of register number REGNO1
16578 or register number REGNO2 in basic block starting from instruction
16579 START up to head of basic block or instruction INSN.
16581 Function puts true value into *FOUND var if definition was found
16582 and false otherwise.
16584 Distance in half-cycles between START and found instruction or head
16585 of BB is added to DISTANCE and returned. */
16587 static int
16588 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16589 rtx insn, int distance,
16590 rtx start, bool *found)
16592 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16593 rtx prev = start;
16594 rtx next = NULL;
16596 *found = false;
16598 while (prev
16599 && prev != insn
16600 && distance < LEA_SEARCH_THRESHOLD)
16602 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16604 distance = increase_distance (prev, next, distance);
16605 if (insn_defines_reg (regno1, regno2, prev))
16607 if (recog_memoized (prev) < 0
16608 || get_attr_type (prev) != TYPE_LEA)
16610 *found = true;
16611 return distance;
16615 next = prev;
16617 if (prev == BB_HEAD (bb))
16618 break;
16620 prev = PREV_INSN (prev);
16623 return distance;
16626 /* Search backward for non-agu definition of register number REGNO1
16627 or register number REGNO2 in INSN's basic block until
16628 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16629 2. Reach neighbour BBs boundary, or
16630 3. Reach agu definition.
16631 Returns the distance between the non-agu definition point and INSN.
16632 If no definition point, returns -1. */
16634 static int
16635 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16636 rtx insn)
16638 basic_block bb = BLOCK_FOR_INSN (insn);
16639 int distance = 0;
16640 bool found = false;
16642 if (insn != BB_HEAD (bb))
16643 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16644 distance, PREV_INSN (insn),
16645 &found);
16647 if (!found && distance < LEA_SEARCH_THRESHOLD)
16649 edge e;
16650 edge_iterator ei;
16651 bool simple_loop = false;
16653 FOR_EACH_EDGE (e, ei, bb->preds)
16654 if (e->src == bb)
16656 simple_loop = true;
16657 break;
16660 if (simple_loop)
16661 distance = distance_non_agu_define_in_bb (regno1, regno2,
16662 insn, distance,
16663 BB_END (bb), &found);
16664 else
16666 int shortest_dist = -1;
16667 bool found_in_bb = false;
16669 FOR_EACH_EDGE (e, ei, bb->preds)
16671 int bb_dist
16672 = distance_non_agu_define_in_bb (regno1, regno2,
16673 insn, distance,
16674 BB_END (e->src),
16675 &found_in_bb);
16676 if (found_in_bb)
16678 if (shortest_dist < 0)
16679 shortest_dist = bb_dist;
16680 else if (bb_dist > 0)
16681 shortest_dist = MIN (bb_dist, shortest_dist);
16683 found = true;
16687 distance = shortest_dist;
16691 /* get_attr_type may modify recog data. We want to make sure
16692 that recog data is valid for instruction INSN, on which
16693 distance_non_agu_define is called. INSN is unchanged here. */
16694 extract_insn_cached (insn);
16696 if (!found)
16697 return -1;
16699 return distance >> 1;
16702 /* Return the distance in half-cycles between INSN and the next
16703 insn that uses register number REGNO in a memory address, added
16704 to DISTANCE. Return -1 if REGNO is set.
16706 Put true value into *FOUND if register usage was found and
16707 false otherwise.
16708 Put true value into *REDEFINED if register redefinition was
16709 found and false otherwise. */
16711 static int
16712 distance_agu_use_in_bb (unsigned int regno,
16713 rtx insn, int distance, rtx start,
16714 bool *found, bool *redefined)
16716 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16717 rtx next = start;
16718 rtx prev = NULL;
16720 *found = false;
16721 *redefined = false;
16723 while (next
16724 && next != insn
16725 && distance < LEA_SEARCH_THRESHOLD)
16727 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16729 distance = increase_distance(prev, next, distance);
16730 if (insn_uses_reg_mem (regno, next))
16732 /* Return DISTANCE if OP0 is used in memory
16733 address in NEXT. */
16734 *found = true;
16735 return distance;
16738 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16740 /* Return -1 if OP0 is set in NEXT. */
16741 *redefined = true;
16742 return -1;
16745 prev = next;
16748 if (next == BB_END (bb))
16749 break;
16751 next = NEXT_INSN (next);
16754 return distance;
16757 /* Return the distance between INSN and the next insn that uses
16758 register number REGNO0 in a memory address. Return -1 if no such
16759 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16761 static int
16762 distance_agu_use (unsigned int regno0, rtx insn)
16764 basic_block bb = BLOCK_FOR_INSN (insn);
16765 int distance = 0;
16766 bool found = false;
16767 bool redefined = false;
16769 if (insn != BB_END (bb))
16770 distance = distance_agu_use_in_bb (regno0, insn, distance,
16771 NEXT_INSN (insn),
16772 &found, &redefined);
16774 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16776 edge e;
16777 edge_iterator ei;
16778 bool simple_loop = false;
16780 FOR_EACH_EDGE (e, ei, bb->succs)
16781 if (e->dest == bb)
16783 simple_loop = true;
16784 break;
16787 if (simple_loop)
16788 distance = distance_agu_use_in_bb (regno0, insn,
16789 distance, BB_HEAD (bb),
16790 &found, &redefined);
16791 else
16793 int shortest_dist = -1;
16794 bool found_in_bb = false;
16795 bool redefined_in_bb = false;
16797 FOR_EACH_EDGE (e, ei, bb->succs)
16799 int bb_dist
16800 = distance_agu_use_in_bb (regno0, insn,
16801 distance, BB_HEAD (e->dest),
16802 &found_in_bb, &redefined_in_bb);
16803 if (found_in_bb)
16805 if (shortest_dist < 0)
16806 shortest_dist = bb_dist;
16807 else if (bb_dist > 0)
16808 shortest_dist = MIN (bb_dist, shortest_dist);
16810 found = true;
16814 distance = shortest_dist;
16818 if (!found || redefined)
16819 return -1;
16821 return distance >> 1;
16824 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16825 there is a dilemma of choosing between LEA and ADD:
16826 Negative value: ADD is preferred over LEA
16827 Zero: Neutral
16828 Positive value: LEA is preferred over ADD. */
16829 #define IX86_LEA_PRIORITY 0
16831 /* Return true if usage of lea INSN has performance advantage
16832 over a sequence of instructions. Instructions sequence has
16833 SPLIT_COST cycles higher latency than lea latency. */
16835 static bool
16836 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16837 unsigned int regno2, int split_cost)
16839 int dist_define, dist_use;
16841 dist_define = distance_non_agu_define (regno1, regno2, insn);
16842 dist_use = distance_agu_use (regno0, insn);
16844 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16846 /* If there is no non-AGU operand definition, no AGU
16847 operand usage and the split cost is 0, then both the lea
16848 and non-lea variants have the same priority. Currently
16849 we prefer lea for 64-bit code and non-lea for 32-bit
16850 code. */
16851 if (dist_use < 0 && split_cost == 0)
16852 return TARGET_64BIT || IX86_LEA_PRIORITY;
16853 else
16854 return true;
16857 /* With a longer definition distance, lea is preferable.
16858 Here we change it to take into account splitting cost and
16859 lea priority. */
16860 dist_define += split_cost + IX86_LEA_PRIORITY;
16862 /* If there is no use in a memory address then we just check
16863 that the split cost exceeds the AGU stall. */
16864 if (dist_use < 0)
16865 return dist_define > LEA_MAX_STALL;
16867 /* If this insn has both backward non-agu dependence and forward
16868 agu dependence, the one with short distance takes effect. */
16869 return dist_define >= dist_use;
16872 /* Return true if it is legal to clobber the flags register at INSN,
16873 and false otherwise. */
16875 static bool
16876 ix86_ok_to_clobber_flags (rtx insn)
16878 basic_block bb = BLOCK_FOR_INSN (insn);
16879 df_ref *use;
16880 bitmap live;
16882 while (insn)
16884 if (NONDEBUG_INSN_P (insn))
16886 for (use = DF_INSN_USES (insn); *use; use++)
16887 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16888 return false;
16890 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16891 return true;
16894 if (insn == BB_END (bb))
16895 break;
16897 insn = NEXT_INSN (insn);
16900 live = df_get_live_out(bb);
16901 return !REGNO_REG_SET_P (live, FLAGS_REG);
16904 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16905 move and add to avoid AGU stalls. */
16907 bool
16908 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16910 unsigned int regno0, regno1, regno2;
16912 /* Check if we need to optimize. */
16913 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16914 return false;
16916 /* Check it is correct to split here. */
16917 if (!ix86_ok_to_clobber_flags(insn))
16918 return false;
16920 regno0 = true_regnum (operands[0]);
16921 regno1 = true_regnum (operands[1]);
16922 regno2 = true_regnum (operands[2]);
16924 /* We need to split only adds with a non-destructive
16925 destination operand. */
16926 if (regno0 == regno1 || regno0 == regno2)
16927 return false;
16928 else
16929 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16932 /* Return true if we should emit lea instruction instead of mov
16933 instruction. */
16935 bool
16936 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16938 unsigned int regno0, regno1;
16940 /* Check if we need to optimize. */
16941 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16942 return false;
16944 /* Use lea for reg to reg moves only. */
16945 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16946 return false;
16948 regno0 = true_regnum (operands[0]);
16949 regno1 = true_regnum (operands[1]);
16951 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
16954 /* Return true if we need to split lea into a sequence of
16955 instructions to avoid AGU stalls. */
16957 bool
16958 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16960 unsigned int regno0, regno1, regno2;
16961 int split_cost;
16962 struct ix86_address parts;
16963 int ok;
16965 /* Check we need to optimize. */
16966 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16967 return false;
16969 /* Check it is correct to split here. */
16970 if (!ix86_ok_to_clobber_flags(insn))
16971 return false;
16973 ok = ix86_decompose_address (operands[1], &parts);
16974 gcc_assert (ok);
16976 /* There should be at least two components in the address. */
16977 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
16978 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
16979 return false;
16981 /* We should not split into add if a non-legitimate PIC
16982 operand is used as the displacement. */
16983 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16984 return false;
16986 regno0 = true_regnum (operands[0]) ;
16987 regno1 = INVALID_REGNUM;
16988 regno2 = INVALID_REGNUM;
16990 if (parts.base)
16991 regno1 = true_regnum (parts.base);
16992 if (parts.index)
16993 regno2 = true_regnum (parts.index);
16995 split_cost = 0;
16997 /* Compute how many cycles we will add to execution time
16998 if we split the lea into a sequence of instructions. */
16999 if (parts.base || parts.index)
17001 /* Have to use a mov instruction if the non-destructive
17002 destination form is used. */
17003 if (regno1 != regno0 && regno2 != regno0)
17004 split_cost += 1;
17006 /* Have to add index to base if both exist. */
17007 if (parts.base && parts.index)
17008 split_cost += 1;
17010 /* Have to use shift and adds if scale is 2 or greater. */
17011 if (parts.scale > 1)
17013 if (regno0 != regno1)
17014 split_cost += 1;
17015 else if (regno2 == regno0)
17016 split_cost += 4;
17017 else
17018 split_cost += parts.scale;
17021 /* Have to use an add instruction with an immediate if
17022 disp is nonzero. */
17023 if (parts.disp && parts.disp != const0_rtx)
17024 split_cost += 1;
17026 /* Subtract the price of lea. */
17027 split_cost -= 1;
17030 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17033 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17034 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
17036 static void
17037 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17038 rtx dst, rtx src)
17040 rtx op, clob;
17042 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17043 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17045 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17048 /* Return true if the definition of REGNO1 is found nearer to INSN than that of REGNO2. */
17050 static bool
17051 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17053 rtx prev = insn;
17054 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17056 if (insn == start)
17057 return false;
17058 while (prev && prev != start)
17060 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17062 prev = PREV_INSN (prev);
17063 continue;
17065 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17066 return true;
17067 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17068 return false;
17069 prev = PREV_INSN (prev);
17072 /* None of the regs is defined in the bb. */
17073 return false;
17076 /* Split lea instructions into a sequence of instructions
17077 which are executed on the ALU to avoid AGU stalls.
17078 It is assumed that it is allowed to clobber the flags register
17079 at the lea position. */
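/* A sketch of a typical split (assuming a three-operand address with a
   scale, e.g. "lea disp(%r1,%r2,4), %r0" with r0 distinct from r1 and
   r2): the index is first moved into the destination, the scale is
   applied with a shift, and the base and displacement are added with
   plain ALU instructions:

	mov	%r2, %r0
	sal	$2, %r0
	add	%r1, %r0
	add	$disp, %r0  */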
17081 void
17082 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17084 unsigned int regno0, regno1, regno2;
17085 struct ix86_address parts;
17086 rtx target, tmp;
17087 int ok, adds;
17089 ok = ix86_decompose_address (operands[1], &parts);
17090 gcc_assert (ok);
17092 target = gen_lowpart (mode, operands[0]);
17094 regno0 = true_regnum (target);
17095 regno1 = INVALID_REGNUM;
17096 regno2 = INVALID_REGNUM;
17098 if (parts.base)
17100 parts.base = gen_lowpart (mode, parts.base);
17101 regno1 = true_regnum (parts.base);
17104 if (parts.index)
17106 parts.index = gen_lowpart (mode, parts.index);
17107 regno2 = true_regnum (parts.index);
17110 if (parts.disp)
17111 parts.disp = gen_lowpart (mode, parts.disp);
17113 if (parts.scale > 1)
17115 /* Case r1 = r1 + ... */
17116 if (regno1 == regno0)
17118 /* If we have a case r1 = r1 + C * r1 then we
17119 should use multiplication, which is very
17120 expensive. Assume the cost model is wrong if
17121 we have such a case here. */
17122 gcc_assert (regno2 != regno0);
17124 for (adds = parts.scale; adds > 0; adds--)
17125 ix86_emit_binop (PLUS, mode, target, parts.index);
17127 else
17129 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17130 if (regno0 != regno2)
17131 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17133 /* Use shift for scaling. */
17134 ix86_emit_binop (ASHIFT, mode, target,
17135 GEN_INT (exact_log2 (parts.scale)));
17137 if (parts.base)
17138 ix86_emit_binop (PLUS, mode, target, parts.base);
17140 if (parts.disp && parts.disp != const0_rtx)
17141 ix86_emit_binop (PLUS, mode, target, parts.disp);
17144 else if (!parts.base && !parts.index)
17146 gcc_assert (parts.disp);
17147 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17149 else
17151 if (!parts.base)
17153 if (regno0 != regno2)
17154 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17156 else if (!parts.index)
17158 if (regno0 != regno1)
17159 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17161 else
17163 if (regno0 == regno1)
17164 tmp = parts.index;
17165 else if (regno0 == regno2)
17166 tmp = parts.base;
17167 else
17169 rtx tmp1;
17171 /* Find better operand for SET instruction, depending
17172 on which definition is farther from the insn. */
17173 if (find_nearest_reg_def (insn, regno1, regno2))
17174 tmp = parts.index, tmp1 = parts.base;
17175 else
17176 tmp = parts.base, tmp1 = parts.index;
17178 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17180 if (parts.disp && parts.disp != const0_rtx)
17181 ix86_emit_binop (PLUS, mode, target, parts.disp);
17183 ix86_emit_binop (PLUS, mode, target, tmp1);
17184 return;
17187 ix86_emit_binop (PLUS, mode, target, tmp);
17190 if (parts.disp && parts.disp != const0_rtx)
17191 ix86_emit_binop (PLUS, mode, target, parts.disp);
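/* Illustrative splits produced by the code above (hypothetical registers,
   shown as final assembly rather than RTL):

     lea 8(%ebx,%ecx,4), %eax   ->   mov %ecx, %eax
                                     shl $2,   %eax
                                     add %ebx, %eax
                                     add $8,   %eax

     lea (%eax,%ecx,4), %eax    ->   add %ecx, %eax
                                     add %ecx, %eax
                                     add %ecx, %eax
                                     add %ecx, %eax

   The second form (destination == base) avoids a real multiply by simply
   adding the index SCALE times, as asserted above.  */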
17195 /* Return true if it is ok to optimize an ADD operation to a LEA
17196 operation to avoid flag register consumption. For most processors,
17197 ADD is faster than LEA. For processors like ATOM, if the
17198 destination register of the LEA holds an actual address which will
17199 be used soon, LEA is better; otherwise ADD is better. */
17201 bool
17202 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17204 unsigned int regno0 = true_regnum (operands[0]);
17205 unsigned int regno1 = true_regnum (operands[1]);
17206 unsigned int regno2 = true_regnum (operands[2]);
17208 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17209 if (regno0 != regno1 && regno0 != regno2)
17210 return true;
17212 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17213 return false;
17215 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17218 /* Return true if destination reg of SET_BODY is shift count of
17219 USE_BODY. */
17221 static bool
17222 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17224 rtx set_dest;
17225 rtx shift_rtx;
17226 int i;
17228 /* Retrieve destination of SET_BODY. */
17229 switch (GET_CODE (set_body))
17231 case SET:
17232 set_dest = SET_DEST (set_body);
17233 if (!set_dest || !REG_P (set_dest))
17234 return false;
17235 break;
17236 case PARALLEL:
17237 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17238 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17239 use_body))
17240 return true;
17241 default:
17242 return false;
17243 break;
17246 /* Retrieve shift count of USE_BODY. */
17247 switch (GET_CODE (use_body))
17249 case SET:
17250 shift_rtx = XEXP (use_body, 1);
17251 break;
17252 case PARALLEL:
17253 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17254 if (ix86_dep_by_shift_count_body (set_body,
17255 XVECEXP (use_body, 0, i)))
17256 return true;
17257 default:
17258 return false;
17259 break;
17262 if (shift_rtx
17263 && (GET_CODE (shift_rtx) == ASHIFT
17264 || GET_CODE (shift_rtx) == LSHIFTRT
17265 || GET_CODE (shift_rtx) == ASHIFTRT
17266 || GET_CODE (shift_rtx) == ROTATE
17267 || GET_CODE (shift_rtx) == ROTATERT))
17269 rtx shift_count = XEXP (shift_rtx, 1);
17271 /* Return true if shift count is dest of SET_BODY. */
17272 if (REG_P (shift_count))
17274 /* Add this check since we can be invoked before register
17275 allocation by the pre-reload scheduler. */
17276 if (reload_completed
17277 && true_regnum (set_dest) == true_regnum (shift_count))
17278 return true;
17279 else if (REGNO (set_dest) == REGNO (shift_count))
17280 return true;
17284 return false;
17287 /* Return true if destination reg of SET_INSN is shift count of
17288 USE_INSN. */
17290 bool
17291 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17293 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17294 PATTERN (use_insn));
17297 /* Return TRUE or FALSE depending on whether the unary operator meets the
17298 appropriate constraints. */
17300 bool
17301 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17302 enum machine_mode mode ATTRIBUTE_UNUSED,
17303 rtx operands[2] ATTRIBUTE_UNUSED)
17305 /* If one of operands is memory, source and destination must match. */
17306 if ((MEM_P (operands[0])
17307 || MEM_P (operands[1]))
17308 && ! rtx_equal_p (operands[0], operands[1]))
17309 return false;
17310 return true;
17313 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17314 are ok, keeping in mind the possible movddup alternative. */
17316 bool
17317 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17319 if (MEM_P (operands[0]))
17320 return rtx_equal_p (operands[0], operands[1 + high]);
17321 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17322 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17323 return true;
17326 /* Post-reload splitter for converting an SF or DFmode value in an
17327 SSE register into an unsigned SImode. */
17329 void
17330 ix86_split_convert_uns_si_sse (rtx operands[])
17332 enum machine_mode vecmode;
17333 rtx value, large, zero_or_two31, input, two31, x;
17335 large = operands[1];
17336 zero_or_two31 = operands[2];
17337 input = operands[3];
17338 two31 = operands[4];
17339 vecmode = GET_MODE (large);
17340 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17342 /* Load up the value into the low element. We must ensure that the other
17343 elements are valid floats -- zero is the easiest such value. */
17344 if (MEM_P (input))
17346 if (vecmode == V4SFmode)
17347 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17348 else
17349 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17351 else
17353 input = gen_rtx_REG (vecmode, REGNO (input));
17354 emit_move_insn (value, CONST0_RTX (vecmode));
17355 if (vecmode == V4SFmode)
17356 emit_insn (gen_sse_movss (value, value, input));
17357 else
17358 emit_insn (gen_sse2_movsd (value, value, input));
17361 emit_move_insn (large, two31);
17362 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17364 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17365 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17367 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17368 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17370 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17371 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17373 large = gen_rtx_REG (V4SImode, REGNO (large));
17374 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17376 x = gen_rtx_REG (V4SImode, REGNO (value));
17377 if (vecmode == V4SFmode)
17378 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17379 else
17380 emit_insn (gen_sse2_cvttpd2dq (x, value));
17381 value = x;
17383 emit_insn (gen_xorv4si3 (value, value, large));
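/* Sketch of the conversion above in scalar terms (an illustration of the
   intent, assuming a non-negative input x):

     if (x < 0x1p31)  result = (int) x;
     else             result = (int) (x - 0x1p31) ^ 0x80000000;

   LARGE becomes the lane mask for x >= 2^31, ZERO_OR_TWO31 holds 2^31 in
   exactly those lanes, the subtraction biases them back into the signed
   range for cvttps2dq/cvttpd2dq, and the final xor (with LARGE shifted
   left by 31, leaving only the sign bit) restores the high bit of the
   unsigned result.  */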
17386 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17387 Expects the 64-bit DImode to be supplied in a pair of integral
17388 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17389 -mfpmath=sse, !optimize_size only. */
17391 void
17392 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17394 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17395 rtx int_xmm, fp_xmm;
17396 rtx biases, exponents;
17397 rtx x;
17399 int_xmm = gen_reg_rtx (V4SImode);
17400 if (TARGET_INTER_UNIT_MOVES)
17401 emit_insn (gen_movdi_to_sse (int_xmm, input));
17402 else if (TARGET_SSE_SPLIT_REGS)
17404 emit_clobber (int_xmm);
17405 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17407 else
17409 x = gen_reg_rtx (V2DImode);
17410 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17411 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17414 x = gen_rtx_CONST_VECTOR (V4SImode,
17415 gen_rtvec (4, GEN_INT (0x43300000UL),
17416 GEN_INT (0x45300000UL),
17417 const0_rtx, const0_rtx));
17418 exponents = validize_mem (force_const_mem (V4SImode, x));
17420 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17421 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17423 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17424 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17425 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17426 (0x1.0p84 + double(fp_value_hi_xmm)).
17427 Note these exponents differ by 32. */
17429 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17431 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17432 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17433 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17434 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17435 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17436 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17437 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17438 biases = validize_mem (force_const_mem (V2DFmode, biases));
17439 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17441 /* Add the upper and lower DFmode values together. */
17442 if (TARGET_SSE3)
17443 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17444 else
17446 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17447 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17448 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17451 ix86_expand_vector_extract (false, target, fp_xmm, 0);
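/* Worked example of the exponent-bias trick above (illustrative
   arithmetic only): for the 64-bit input N = hi * 2^32 + lo, the
   interleave builds the double bit patterns

     0x43300000 ## lo  ==  2^52 + lo
     0x45300000 ## hi  ==  2^84 + hi * 2^32

   since 0x433 and 0x453 are the biased exponents of 2^52 and 2^84.
   Subtracting the 2^52/2^84 constants leaves exactly lo and hi * 2^32 in
   the two lanes, and the final horizontal add yields (double) N without
   ever performing a signed 64-bit conversion.  */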
17454 /* Not used, but eases macroization of patterns. */
17455 void
17456 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17457 rtx input ATTRIBUTE_UNUSED)
17459 gcc_unreachable ();
17462 /* Convert an unsigned SImode value into a DFmode. Only currently used
17463 for SSE, but applicable anywhere. */
17465 void
17466 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17468 REAL_VALUE_TYPE TWO31r;
17469 rtx x, fp;
17471 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17472 NULL, 1, OPTAB_DIRECT);
17474 fp = gen_reg_rtx (DFmode);
17475 emit_insn (gen_floatsidf2 (fp, x));
17477 real_ldexp (&TWO31r, &dconst1, 31);
17478 x = const_double_from_real_value (TWO31r, DFmode);
17480 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17481 if (x != target)
17482 emit_move_insn (target, x);
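/* The identity used above, spelled out (illustration):

     (double) (uint32_t) x  ==  (double) (int32_t) (x + INT32_MIN) + 0x1p31

   Adding -2147483648 rebiases the unsigned value into the signed range so
   that the plain signed cvtsi2sd can be used, and the 2^31 constant built
   via real_ldexp adds the bias back in the FP domain, where the addition
   is exact.  */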
17485 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17486 32-bit mode; otherwise we have a direct convert instruction. */
17488 void
17489 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17491 REAL_VALUE_TYPE TWO32r;
17492 rtx fp_lo, fp_hi, x;
17494 fp_lo = gen_reg_rtx (DFmode);
17495 fp_hi = gen_reg_rtx (DFmode);
17497 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17499 real_ldexp (&TWO32r, &dconst1, 32);
17500 x = const_double_from_real_value (TWO32r, DFmode);
17501 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17503 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17505 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17506 0, OPTAB_DIRECT);
17507 if (x != target)
17508 emit_move_insn (target, x);
17511 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17512 For x86_32, -mfpmath=sse, !optimize_size only. */
17513 void
17514 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17516 REAL_VALUE_TYPE ONE16r;
17517 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17519 real_ldexp (&ONE16r, &dconst1, 16);
17520 x = const_double_from_real_value (ONE16r, SFmode);
17521 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17522 NULL, 0, OPTAB_DIRECT);
17523 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17524 NULL, 0, OPTAB_DIRECT);
17525 fp_hi = gen_reg_rtx (SFmode);
17526 fp_lo = gen_reg_rtx (SFmode);
17527 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17528 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17529 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17530 0, OPTAB_DIRECT);
17531 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17532 0, OPTAB_DIRECT);
17533 if (!rtx_equal_p (target, fp_hi))
17534 emit_move_insn (target, fp_hi);
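/* Sketch of the split used above (illustration):

     (float) x  ==  (float) (x >> 16) * 65536.0f  +  (float) (x & 0xffff)

   Each 16-bit half fits easily in the signed SImode range, so the
   ordinary signed cvtsi2ss can be used for both; the multiply by 0x1p16
   is exact and only the final addition rounds.  */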
17537 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17538 a vector of unsigned ints VAL to vector of floats TARGET. */
17540 void
17541 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17543 rtx tmp[8];
17544 REAL_VALUE_TYPE TWO16r;
17545 enum machine_mode intmode = GET_MODE (val);
17546 enum machine_mode fltmode = GET_MODE (target);
17547 rtx (*cvt) (rtx, rtx);
17549 if (intmode == V4SImode)
17550 cvt = gen_floatv4siv4sf2;
17551 else
17552 cvt = gen_floatv8siv8sf2;
17553 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17554 tmp[0] = force_reg (intmode, tmp[0]);
17555 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17556 OPTAB_DIRECT);
17557 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17558 NULL_RTX, 1, OPTAB_DIRECT);
17559 tmp[3] = gen_reg_rtx (fltmode);
17560 emit_insn (cvt (tmp[3], tmp[1]));
17561 tmp[4] = gen_reg_rtx (fltmode);
17562 emit_insn (cvt (tmp[4], tmp[2]));
17563 real_ldexp (&TWO16r, &dconst1, 16);
17564 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17565 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17566 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17567 OPTAB_DIRECT);
17568 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17569 OPTAB_DIRECT);
17570 if (tmp[7] != target)
17571 emit_move_insn (target, tmp[7]);
17574 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17575 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17576 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17577 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17580 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17582 REAL_VALUE_TYPE TWO31r;
17583 rtx two31r, tmp[4];
17584 enum machine_mode mode = GET_MODE (val);
17585 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17586 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17587 rtx (*cmp) (rtx, rtx, rtx, rtx);
17588 int i;
17590 for (i = 0; i < 3; i++)
17591 tmp[i] = gen_reg_rtx (mode);
17592 real_ldexp (&TWO31r, &dconst1, 31);
17593 two31r = const_double_from_real_value (TWO31r, scalarmode);
17594 two31r = ix86_build_const_vector (mode, 1, two31r);
17595 two31r = force_reg (mode, two31r);
17596 switch (mode)
17598 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17599 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17600 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17601 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17602 default: gcc_unreachable ();
17604 tmp[3] = gen_rtx_LE (mode, two31r, val);
17605 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17606 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17607 0, OPTAB_DIRECT);
17608 if (intmode == V4SImode || TARGET_AVX2)
17609 *xorp = expand_simple_binop (intmode, ASHIFT,
17610 gen_lowpart (intmode, tmp[0]),
17611 GEN_INT (31), NULL_RTX, 0,
17612 OPTAB_DIRECT);
17613 else
17615 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17616 two31 = ix86_build_const_vector (intmode, 1, two31);
17617 *xorp = expand_simple_binop (intmode, AND,
17618 gen_lowpart (intmode, tmp[0]),
17619 two31, NULL_RTX, 0,
17620 OPTAB_DIRECT);
17622 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17623 0, OPTAB_DIRECT);
17626 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17627 then replicate the value for all elements of the vector
17628 register. */
17631 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17633 int i, n_elt;
17634 rtvec v;
17635 enum machine_mode scalar_mode;
17637 switch (mode)
17639 case V32QImode:
17640 case V16QImode:
17641 case V16HImode:
17642 case V8HImode:
17643 case V8SImode:
17644 case V4SImode:
17645 case V4DImode:
17646 case V2DImode:
17647 gcc_assert (vect);
17648 case V8SFmode:
17649 case V4SFmode:
17650 case V4DFmode:
17651 case V2DFmode:
17652 n_elt = GET_MODE_NUNITS (mode);
17653 v = rtvec_alloc (n_elt);
17654 scalar_mode = GET_MODE_INNER (mode);
17656 RTVEC_ELT (v, 0) = value;
17658 for (i = 1; i < n_elt; ++i)
17659 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17661 return gen_rtx_CONST_VECTOR (mode, v);
17663 default:
17664 gcc_unreachable ();
17668 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17669 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17670 for an SSE register. If VECT is true, then replicate the mask for
17671 all elements of the vector register. If INVERT is true, then create
17672 a mask excluding the sign bit. */
17675 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17677 enum machine_mode vec_mode, imode;
17678 HOST_WIDE_INT hi, lo;
17679 int shift = 63;
17680 rtx v;
17681 rtx mask;
17683 /* Find the sign bit, sign extended to 2*HWI. */
17684 switch (mode)
17686 case V8SImode:
17687 case V4SImode:
17688 case V8SFmode:
17689 case V4SFmode:
17690 vec_mode = mode;
17691 mode = GET_MODE_INNER (mode);
17692 imode = SImode;
17693 lo = 0x80000000, hi = lo < 0;
17694 break;
17696 case V4DImode:
17697 case V2DImode:
17698 case V4DFmode:
17699 case V2DFmode:
17700 vec_mode = mode;
17701 mode = GET_MODE_INNER (mode);
17702 imode = DImode;
17703 if (HOST_BITS_PER_WIDE_INT >= 64)
17704 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17705 else
17706 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17707 break;
17709 case TImode:
17710 case TFmode:
17711 vec_mode = VOIDmode;
17712 if (HOST_BITS_PER_WIDE_INT >= 64)
17714 imode = TImode;
17715 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17717 else
17719 rtvec vec;
17721 imode = DImode;
17722 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17724 if (invert)
17726 lo = ~lo, hi = ~hi;
17727 v = constm1_rtx;
17729 else
17730 v = const0_rtx;
17732 mask = immed_double_const (lo, hi, imode);
17734 vec = gen_rtvec (2, v, mask);
17735 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17736 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17738 return v;
17740 break;
17742 default:
17743 gcc_unreachable ();
17746 if (invert)
17747 lo = ~lo, hi = ~hi;
17749 /* Force this value into the low part of a fp vector constant. */
17750 mask = immed_double_const (lo, hi, imode);
17751 mask = gen_lowpart (mode, mask);
17753 if (vec_mode == VOIDmode)
17754 return force_reg (mode, mask);
17756 v = ix86_build_const_vector (vec_mode, vect, mask);
17757 return force_reg (vec_mode, v);
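/* Examples of the masks built above (illustration):

     DFmode, !invert :  0x8000000000000000       (sign bit only)
     DFmode,  invert :  0x7fffffffffffffff       (all but the sign bit)
     SFmode, !invert :  0x80000000 per element

   With VECT true the chosen value is replicated into every element of the
   vector constant; otherwise the remaining elements are zero.  */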
17760 /* Generate code for floating point ABS or NEG. */
17762 void
17763 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17764 rtx operands[])
17766 rtx mask, set, dst, src;
17767 bool use_sse = false;
17768 bool vector_mode = VECTOR_MODE_P (mode);
17769 enum machine_mode vmode = mode;
17771 if (vector_mode)
17772 use_sse = true;
17773 else if (mode == TFmode)
17774 use_sse = true;
17775 else if (TARGET_SSE_MATH)
17777 use_sse = SSE_FLOAT_MODE_P (mode);
17778 if (mode == SFmode)
17779 vmode = V4SFmode;
17780 else if (mode == DFmode)
17781 vmode = V2DFmode;
17784 /* NEG and ABS performed with SSE use bitwise mask operations.
17785 Create the appropriate mask now. */
17786 if (use_sse)
17787 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17788 else
17789 mask = NULL_RTX;
17791 dst = operands[0];
17792 src = operands[1];
17794 set = gen_rtx_fmt_e (code, mode, src);
17795 set = gen_rtx_SET (VOIDmode, dst, set);
17797 if (mask)
17799 rtx use, clob;
17800 rtvec par;
17802 use = gen_rtx_USE (VOIDmode, mask);
17803 if (vector_mode)
17804 par = gen_rtvec (2, set, use);
17805 else
17807 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17808 par = gen_rtvec (3, set, use, clob);
17810 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17812 else
17813 emit_insn (set);
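/* In SSE terms the mask built above is consumed as (illustration):

     NEG:  dst = src ^ sign_mask        e.g.  xorps  mask, %xmm0
     ABS:  dst = src & ~sign_mask       e.g.  andps  mask, %xmm0

   which is why ix86_build_signbit_mask is called with INVERT equal to
   (code == ABS); the mask itself is referenced through the USE in the
   emitted parallel, and the concrete logic insn is chosen later when the
   pattern is split.  */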
17816 /* Expand a copysign operation. Special case operand 0 being a constant. */
17818 void
17819 ix86_expand_copysign (rtx operands[])
17821 enum machine_mode mode, vmode;
17822 rtx dest, op0, op1, mask, nmask;
17824 dest = operands[0];
17825 op0 = operands[1];
17826 op1 = operands[2];
17828 mode = GET_MODE (dest);
17830 if (mode == SFmode)
17831 vmode = V4SFmode;
17832 else if (mode == DFmode)
17833 vmode = V2DFmode;
17834 else
17835 vmode = mode;
17837 if (GET_CODE (op0) == CONST_DOUBLE)
17839 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17841 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17842 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17844 if (mode == SFmode || mode == DFmode)
17846 if (op0 == CONST0_RTX (mode))
17847 op0 = CONST0_RTX (vmode);
17848 else
17850 rtx v = ix86_build_const_vector (vmode, false, op0);
17852 op0 = force_reg (vmode, v);
17855 else if (op0 != CONST0_RTX (mode))
17856 op0 = force_reg (mode, op0);
17858 mask = ix86_build_signbit_mask (vmode, 0, 0);
17860 if (mode == SFmode)
17861 copysign_insn = gen_copysignsf3_const;
17862 else if (mode == DFmode)
17863 copysign_insn = gen_copysigndf3_const;
17864 else
17865 copysign_insn = gen_copysigntf3_const;
17867 emit_insn (copysign_insn (dest, op0, op1, mask));
17869 else
17871 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17873 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17874 mask = ix86_build_signbit_mask (vmode, 0, 0);
17876 if (mode == SFmode)
17877 copysign_insn = gen_copysignsf3_var;
17878 else if (mode == DFmode)
17879 copysign_insn = gen_copysigndf3_var;
17880 else
17881 copysign_insn = gen_copysigntf3_var;
17883 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17887 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17888 be a constant, and so has already been expanded into a vector constant. */
17890 void
17891 ix86_split_copysign_const (rtx operands[])
17893 enum machine_mode mode, vmode;
17894 rtx dest, op0, mask, x;
17896 dest = operands[0];
17897 op0 = operands[1];
17898 mask = operands[3];
17900 mode = GET_MODE (dest);
17901 vmode = GET_MODE (mask);
17903 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17904 x = gen_rtx_AND (vmode, dest, mask);
17905 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17907 if (op0 != CONST0_RTX (vmode))
17909 x = gen_rtx_IOR (vmode, dest, op0);
17910 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17914 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17915 so we have to do two masks. */
17917 void
17918 ix86_split_copysign_var (rtx operands[])
17920 enum machine_mode mode, vmode;
17921 rtx dest, scratch, op0, op1, mask, nmask, x;
17923 dest = operands[0];
17924 scratch = operands[1];
17925 op0 = operands[2];
17926 op1 = operands[3];
17927 nmask = operands[4];
17928 mask = operands[5];
17930 mode = GET_MODE (dest);
17931 vmode = GET_MODE (mask);
17933 if (rtx_equal_p (op0, op1))
17935 /* Shouldn't happen often (it's useless, obviously), but when it does
17936 we'd generate incorrect code if we continue below. */
17937 emit_move_insn (dest, op0);
17938 return;
17941 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17943 gcc_assert (REGNO (op1) == REGNO (scratch));
17945 x = gen_rtx_AND (vmode, scratch, mask);
17946 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17948 dest = mask;
17949 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17950 x = gen_rtx_NOT (vmode, dest);
17951 x = gen_rtx_AND (vmode, x, op0);
17952 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17954 else
17956 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17958 x = gen_rtx_AND (vmode, scratch, mask);
17960 else /* alternative 2,4 */
17962 gcc_assert (REGNO (mask) == REGNO (scratch));
17963 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17964 x = gen_rtx_AND (vmode, scratch, op1);
17966 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17968 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17970 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17971 x = gen_rtx_AND (vmode, dest, nmask);
17973 else /* alternative 3,4 */
17975 gcc_assert (REGNO (nmask) == REGNO (dest));
17976 dest = nmask;
17977 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17978 x = gen_rtx_AND (vmode, dest, op0);
17980 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17983 x = gen_rtx_IOR (vmode, dest, scratch);
17984 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17987 /* Return TRUE or FALSE depending on whether the first SET in INSN
17988 has source and destination with matching CC modes, and that the
17989 CC mode is at least as constrained as REQ_MODE. */
17991 bool
17992 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17994 rtx set;
17995 enum machine_mode set_mode;
17997 set = PATTERN (insn);
17998 if (GET_CODE (set) == PARALLEL)
17999 set = XVECEXP (set, 0, 0);
18000 gcc_assert (GET_CODE (set) == SET);
18001 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18003 set_mode = GET_MODE (SET_DEST (set));
18004 switch (set_mode)
18006 case CCNOmode:
18007 if (req_mode != CCNOmode
18008 && (req_mode != CCmode
18009 || XEXP (SET_SRC (set), 1) != const0_rtx))
18010 return false;
18011 break;
18012 case CCmode:
18013 if (req_mode == CCGCmode)
18014 return false;
18015 /* FALLTHRU */
18016 case CCGCmode:
18017 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18018 return false;
18019 /* FALLTHRU */
18020 case CCGOCmode:
18021 if (req_mode == CCZmode)
18022 return false;
18023 /* FALLTHRU */
18024 case CCZmode:
18025 break;
18027 case CCAmode:
18028 case CCCmode:
18029 case CCOmode:
18030 case CCSmode:
18031 if (set_mode != req_mode)
18032 return false;
18033 break;
18035 default:
18036 gcc_unreachable ();
18039 return GET_MODE (SET_SRC (set)) == set_mode;
18042 /* Generate insn patterns to do an integer compare of OPERANDS. */
18044 static rtx
18045 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18047 enum machine_mode cmpmode;
18048 rtx tmp, flags;
18050 cmpmode = SELECT_CC_MODE (code, op0, op1);
18051 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18053 /* This is very simple, but making the interface the same as in the
18054 FP case makes the rest of the code easier. */
18055 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18056 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18058 /* Return the test that should be put into the flags user, i.e.
18059 the bcc, scc, or cmov instruction. */
18060 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18063 /* Figure out whether to use ordered or unordered fp comparisons.
18064 Return the appropriate mode to use. */
18066 enum machine_mode
18067 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18069 /* ??? In order to make all comparisons reversible, we do all comparisons
18070 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18071 all forms of trapping and nontrapping comparisons, we can make inequality
18072 comparisons trapping again, since it results in better code when using
18073 FCOM based compares. */
18074 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18077 enum machine_mode
18078 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18080 enum machine_mode mode = GET_MODE (op0);
18082 if (SCALAR_FLOAT_MODE_P (mode))
18084 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18085 return ix86_fp_compare_mode (code);
18088 switch (code)
18090 /* Only zero flag is needed. */
18091 case EQ: /* ZF=0 */
18092 case NE: /* ZF!=0 */
18093 return CCZmode;
18094 /* Codes needing carry flag. */
18095 case GEU: /* CF=0 */
18096 case LTU: /* CF=1 */
18097 /* Detect overflow checks. They need just the carry flag. */
18098 if (GET_CODE (op0) == PLUS
18099 && rtx_equal_p (op1, XEXP (op0, 0)))
18100 return CCCmode;
18101 else
18102 return CCmode;
18103 case GTU: /* CF=0 & ZF=0 */
18104 case LEU: /* CF=1 | ZF=1 */
18105 /* Detect overflow checks. They need just the carry flag. */
18106 if (GET_CODE (op0) == MINUS
18107 && rtx_equal_p (op1, XEXP (op0, 0)))
18108 return CCCmode;
18109 else
18110 return CCmode;
18111 /* Codes possibly doable only with sign flag when
18112 comparing against zero. */
18113 case GE: /* SF=OF or SF=0 */
18114 case LT: /* SF<>OF or SF=1 */
18115 if (op1 == const0_rtx)
18116 return CCGOCmode;
18117 else
18118 /* For other cases Carry flag is not required. */
18119 return CCGCmode;
18120 /* Codes doable only with the sign flag when comparing
18121 against zero, but we lack a jump instruction for it,
18122 so we need to use relational tests against the overflow
18123 flag, which thus needs to be zero. */
18124 case GT: /* ZF=0 & SF=OF */
18125 case LE: /* ZF=1 | SF<>OF */
18126 if (op1 == const0_rtx)
18127 return CCNOmode;
18128 else
18129 return CCGCmode;
18130 /* The strcmp pattern does (use flags), and combine may ask us for the
18131 proper mode. */
18132 case USE:
18133 return CCmode;
18134 default:
18135 gcc_unreachable ();
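/* Examples of the CC mode selection above (illustration):

     a == b                  ->  CCZmode   (only ZF is inspected)
     (a + b) <u a            ->  CCCmode   (pure carry/overflow check)
     a < 0,  a >= 0          ->  CCGOCmode (sign flag alone suffices)
     a < b   (signed, b!=0)  ->  CCGCmode  (SF/OF/ZF needed, CF not)

   Picking the least constrained mode lets later passes substitute cheaper
   flag-setting instructions (test, inc/dec, ...) whenever they set the
   subset of flags the users actually need.  */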
18139 /* Return the fixed registers used for condition codes. */
18141 static bool
18142 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18144 *p1 = FLAGS_REG;
18145 *p2 = FPSR_REG;
18146 return true;
18149 /* If two condition code modes are compatible, return a condition code
18150 mode which is compatible with both. Otherwise, return
18151 VOIDmode. */
18153 static enum machine_mode
18154 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18156 if (m1 == m2)
18157 return m1;
18159 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18160 return VOIDmode;
18162 if ((m1 == CCGCmode && m2 == CCGOCmode)
18163 || (m1 == CCGOCmode && m2 == CCGCmode))
18164 return CCGCmode;
18166 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18167 return m2;
18168 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18169 return m1;
18171 switch (m1)
18173 default:
18174 gcc_unreachable ();
18176 case CCmode:
18177 case CCGCmode:
18178 case CCGOCmode:
18179 case CCNOmode:
18180 case CCAmode:
18181 case CCCmode:
18182 case CCOmode:
18183 case CCSmode:
18184 case CCZmode:
18185 switch (m2)
18187 default:
18188 return VOIDmode;
18190 case CCmode:
18191 case CCGCmode:
18192 case CCGOCmode:
18193 case CCNOmode:
18194 case CCAmode:
18195 case CCCmode:
18196 case CCOmode:
18197 case CCSmode:
18198 case CCZmode:
18199 return CCmode;
18202 case CCFPmode:
18203 case CCFPUmode:
18204 /* These are only compatible with themselves, which we already
18205 checked above. */
18206 return VOIDmode;
18211 /* Return a comparison we can do that is equivalent to
18212 swap_condition (code), apart possibly from orderedness.
18213 But never change orderedness if TARGET_IEEE_FP, returning
18214 UNKNOWN in that case if necessary. */
18216 static enum rtx_code
18217 ix86_fp_swap_condition (enum rtx_code code)
18219 switch (code)
18221 case GT: /* GTU - CF=0 & ZF=0 */
18222 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18223 case GE: /* GEU - CF=0 */
18224 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18225 case UNLT: /* LTU - CF=1 */
18226 return TARGET_IEEE_FP ? UNKNOWN : GT;
18227 case UNLE: /* LEU - CF=1 | ZF=1 */
18228 return TARGET_IEEE_FP ? UNKNOWN : GE;
18229 default:
18230 return swap_condition (code);
18234 /* Return the cost of comparison CODE using the best strategy for performance.
18235 All following functions use the number of instructions as a cost metric.
18236 In the future this should be tweaked to compute bytes for optimize_size and
18237 take into account the performance of various instructions on various CPUs. */
18239 static int
18240 ix86_fp_comparison_cost (enum rtx_code code)
18242 int arith_cost;
18244 /* The cost of code using bit-twiddling on %ah. */
18245 switch (code)
18247 case UNLE:
18248 case UNLT:
18249 case LTGT:
18250 case GT:
18251 case GE:
18252 case UNORDERED:
18253 case ORDERED:
18254 case UNEQ:
18255 arith_cost = 4;
18256 break;
18257 case LT:
18258 case NE:
18259 case EQ:
18260 case UNGE:
18261 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18262 break;
18263 case LE:
18264 case UNGT:
18265 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18266 break;
18267 default:
18268 gcc_unreachable ();
18271 switch (ix86_fp_comparison_strategy (code))
18273 case IX86_FPCMP_COMI:
18274 return arith_cost > 4 ? 3 : 2;
18275 case IX86_FPCMP_SAHF:
18276 return arith_cost > 4 ? 4 : 3;
18277 default:
18278 return arith_cost;
18282 /* Return the strategy to use for floating-point compares. We assume that
18283 fcomi is always preferable where available, since that is also true when
18284 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18286 enum ix86_fpcmp_strategy
18287 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18289 /* Do fcomi/sahf based test when profitable. */
18291 if (TARGET_CMOVE)
18292 return IX86_FPCMP_COMI;
18294 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18295 return IX86_FPCMP_SAHF;
18297 return IX86_FPCMP_ARITH;
18300 /* Swap, force into registers, or otherwise massage the two operands
18301 to a fp comparison. The operands are updated in place; the new
18302 comparison code is returned. */
18304 static enum rtx_code
18305 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18307 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18308 rtx op0 = *pop0, op1 = *pop1;
18309 enum machine_mode op_mode = GET_MODE (op0);
18310 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18312 /* All of the unordered compare instructions only work on registers.
18313 The same is true of the fcomi compare instructions. The XFmode
18314 compare instructions require registers except when comparing
18315 against zero or when converting operand 1 from fixed point to
18316 floating point. */
18318 if (!is_sse
18319 && (fpcmp_mode == CCFPUmode
18320 || (op_mode == XFmode
18321 && ! (standard_80387_constant_p (op0) == 1
18322 || standard_80387_constant_p (op1) == 1)
18323 && GET_CODE (op1) != FLOAT)
18324 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18326 op0 = force_reg (op_mode, op0);
18327 op1 = force_reg (op_mode, op1);
18329 else
18331 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18332 things around if they appear profitable, otherwise force op0
18333 into a register. */
18335 if (standard_80387_constant_p (op0) == 0
18336 || (MEM_P (op0)
18337 && ! (standard_80387_constant_p (op1) == 0
18338 || MEM_P (op1))))
18340 enum rtx_code new_code = ix86_fp_swap_condition (code);
18341 if (new_code != UNKNOWN)
18343 rtx tmp;
18344 tmp = op0, op0 = op1, op1 = tmp;
18345 code = new_code;
18349 if (!REG_P (op0))
18350 op0 = force_reg (op_mode, op0);
18352 if (CONSTANT_P (op1))
18354 int tmp = standard_80387_constant_p (op1);
18355 if (tmp == 0)
18356 op1 = validize_mem (force_const_mem (op_mode, op1));
18357 else if (tmp == 1)
18359 if (TARGET_CMOVE)
18360 op1 = force_reg (op_mode, op1);
18362 else
18363 op1 = force_reg (op_mode, op1);
18367 /* Try to rearrange the comparison to make it cheaper. */
18368 if (ix86_fp_comparison_cost (code)
18369 > ix86_fp_comparison_cost (swap_condition (code))
18370 && (REG_P (op1) || can_create_pseudo_p ()))
18372 rtx tmp;
18373 tmp = op0, op0 = op1, op1 = tmp;
18374 code = swap_condition (code);
18375 if (!REG_P (op0))
18376 op0 = force_reg (op_mode, op0);
18379 *pop0 = op0;
18380 *pop1 = op1;
18381 return code;
18384 /* Convert the comparison codes we use to represent FP comparisons to the
18385 integer code that will result in a proper branch. Return UNKNOWN if no
18386 such code is available. */
18388 enum rtx_code
18389 ix86_fp_compare_code_to_integer (enum rtx_code code)
18391 switch (code)
18393 case GT:
18394 return GTU;
18395 case GE:
18396 return GEU;
18397 case ORDERED:
18398 case UNORDERED:
18399 return code;
18400 break;
18401 case UNEQ:
18402 return EQ;
18403 break;
18404 case UNLT:
18405 return LTU;
18406 break;
18407 case UNLE:
18408 return LEU;
18409 break;
18410 case LTGT:
18411 return NE;
18412 break;
18413 default:
18414 return UNKNOWN;
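/* Background for the mapping above (illustration): after a
   comis[sd]/fcomi compare the FP relation is encoded in ZF/PF/CF exactly
   like an unsigned integer compare, so GT becomes GTU (ja), GE becomes
   GEU (jae), UNLT becomes LTU (jb), UNLE becomes LEU (jbe), UNEQ becomes
   EQ (je) and LTGT becomes NE (jne), while ORDERED/UNORDERED test PF
   directly.  */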
18418 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18420 static rtx
18421 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18423 enum machine_mode fpcmp_mode, intcmp_mode;
18424 rtx tmp, tmp2;
18426 fpcmp_mode = ix86_fp_compare_mode (code);
18427 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18429 /* Do fcomi/sahf based test when profitable. */
18430 switch (ix86_fp_comparison_strategy (code))
18432 case IX86_FPCMP_COMI:
18433 intcmp_mode = fpcmp_mode;
18434 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18435 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18436 tmp);
18437 emit_insn (tmp);
18438 break;
18440 case IX86_FPCMP_SAHF:
18441 intcmp_mode = fpcmp_mode;
18442 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18443 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18444 tmp);
18446 if (!scratch)
18447 scratch = gen_reg_rtx (HImode);
18448 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18449 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18450 break;
18452 case IX86_FPCMP_ARITH:
18453 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18454 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18455 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18456 if (!scratch)
18457 scratch = gen_reg_rtx (HImode);
18458 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18460 /* In the unordered case, we have to check C2 for NaN's, which
18461 doesn't happen to work out to anything nice combination-wise.
18462 So do some bit twiddling on the value we've got in AH to come
18463 up with an appropriate set of condition codes. */
18465 intcmp_mode = CCNOmode;
18466 switch (code)
18468 case GT:
18469 case UNGT:
18470 if (code == GT || !TARGET_IEEE_FP)
18472 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18473 code = EQ;
18475 else
18477 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18478 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18479 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18480 intcmp_mode = CCmode;
18481 code = GEU;
18483 break;
18484 case LT:
18485 case UNLT:
18486 if (code == LT && TARGET_IEEE_FP)
18488 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18489 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18490 intcmp_mode = CCmode;
18491 code = EQ;
18493 else
18495 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18496 code = NE;
18498 break;
18499 case GE:
18500 case UNGE:
18501 if (code == GE || !TARGET_IEEE_FP)
18503 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18504 code = EQ;
18506 else
18508 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18509 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18510 code = NE;
18512 break;
18513 case LE:
18514 case UNLE:
18515 if (code == LE && TARGET_IEEE_FP)
18517 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18518 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18519 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18520 intcmp_mode = CCmode;
18521 code = LTU;
18523 else
18525 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18526 code = NE;
18528 break;
18529 case EQ:
18530 case UNEQ:
18531 if (code == EQ && TARGET_IEEE_FP)
18533 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18534 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18535 intcmp_mode = CCmode;
18536 code = EQ;
18538 else
18540 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18541 code = NE;
18543 break;
18544 case NE:
18545 case LTGT:
18546 if (code == NE && TARGET_IEEE_FP)
18548 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18549 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18550 GEN_INT (0x40)));
18551 code = NE;
18553 else
18555 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18556 code = EQ;
18558 break;
18560 case UNORDERED:
18561 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18562 code = NE;
18563 break;
18564 case ORDERED:
18565 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18566 code = EQ;
18567 break;
18569 default:
18570 gcc_unreachable ();
18572 break;
18574 default:
18575 gcc_unreachable ();
18578 /* Return the test that should be put into the flags user, i.e.
18579 the bcc, scc, or cmov instruction. */
18580 return gen_rtx_fmt_ee (code, VOIDmode,
18581 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18582 const0_rtx);
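/* Key to the magic constants in the IX86_FPCMP_ARITH case above
   (illustration): after fnstsw the FPU condition bits end up in %ah as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, and fcom sets them to

     op0 >  op1    C3=0 C2=0 C0=0
     op0 <  op1    C3=0 C2=0 C0=1
     op0 == op1    C3=1 C2=0 C0=0
     unordered     C3=1 C2=1 C0=1

   so for instance "test $0x45, %ah" is zero exactly when op0 > op1,
   "test $0x05, %ah" is zero when op0 >= op1, and "test $0x04, %ah"
   isolates the unordered (NaN) case.  */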
18585 static rtx
18586 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18588 rtx ret;
18590 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18591 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18593 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18595 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18596 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18598 else
18599 ret = ix86_expand_int_compare (code, op0, op1);
18601 return ret;
18604 void
18605 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18607 enum machine_mode mode = GET_MODE (op0);
18608 rtx tmp;
18610 switch (mode)
18612 case SFmode:
18613 case DFmode:
18614 case XFmode:
18615 case QImode:
18616 case HImode:
18617 case SImode:
18618 simple:
18619 tmp = ix86_expand_compare (code, op0, op1);
18620 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18621 gen_rtx_LABEL_REF (VOIDmode, label),
18622 pc_rtx);
18623 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18624 return;
18626 case DImode:
18627 if (TARGET_64BIT)
18628 goto simple;
18629 case TImode:
18630 /* Expand DImode branch into multiple compare+branch. */
18632 rtx lo[2], hi[2], label2;
18633 enum rtx_code code1, code2, code3;
18634 enum machine_mode submode;
18636 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18638 tmp = op0, op0 = op1, op1 = tmp;
18639 code = swap_condition (code);
18642 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18643 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18645 submode = mode == DImode ? SImode : DImode;
18647 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18648 avoid two branches. This costs one extra insn, so disable when
18649 optimizing for size. */
18651 if ((code == EQ || code == NE)
18652 && (!optimize_insn_for_size_p ()
18653 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18655 rtx xor0, xor1;
18657 xor1 = hi[0];
18658 if (hi[1] != const0_rtx)
18659 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18660 NULL_RTX, 0, OPTAB_WIDEN);
18662 xor0 = lo[0];
18663 if (lo[1] != const0_rtx)
18664 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18665 NULL_RTX, 0, OPTAB_WIDEN);
18667 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18668 NULL_RTX, 0, OPTAB_WIDEN);
18670 ix86_expand_branch (code, tmp, const0_rtx, label);
18671 return;
18674 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18675 op1 is a constant and the low word is zero, then we can just
18676 examine the high word. Similarly for low word -1 and
18677 less-or-equal-than or greater-than. */
18679 if (CONST_INT_P (hi[1]))
18680 switch (code)
18682 case LT: case LTU: case GE: case GEU:
18683 if (lo[1] == const0_rtx)
18685 ix86_expand_branch (code, hi[0], hi[1], label);
18686 return;
18688 break;
18689 case LE: case LEU: case GT: case GTU:
18690 if (lo[1] == constm1_rtx)
18692 ix86_expand_branch (code, hi[0], hi[1], label);
18693 return;
18695 break;
18696 default:
18697 break;
18700 /* Otherwise, we need two or three jumps. */
18702 label2 = gen_label_rtx ();
18704 code1 = code;
18705 code2 = swap_condition (code);
18706 code3 = unsigned_condition (code);
18708 switch (code)
18710 case LT: case GT: case LTU: case GTU:
18711 break;
18713 case LE: code1 = LT; code2 = GT; break;
18714 case GE: code1 = GT; code2 = LT; break;
18715 case LEU: code1 = LTU; code2 = GTU; break;
18716 case GEU: code1 = GTU; code2 = LTU; break;
18718 case EQ: code1 = UNKNOWN; code2 = NE; break;
18719 case NE: code2 = UNKNOWN; break;
18721 default:
18722 gcc_unreachable ();
18726 * a < b =>
18727 * if (hi(a) < hi(b)) goto true;
18728 * if (hi(a) > hi(b)) goto false;
18729 * if (lo(a) < lo(b)) goto true;
18730 * false:
18733 if (code1 != UNKNOWN)
18734 ix86_expand_branch (code1, hi[0], hi[1], label);
18735 if (code2 != UNKNOWN)
18736 ix86_expand_branch (code2, hi[0], hi[1], label2);
18738 ix86_expand_branch (code3, lo[0], lo[1], label);
18740 if (code2 != UNKNOWN)
18741 emit_label (label2);
18742 return;
18745 default:
18746 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18747 goto simple;
18751 /* Split branch based on floating point condition. */
18752 void
18753 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18754 rtx target1, rtx target2, rtx tmp, rtx pushed)
18756 rtx condition;
18757 rtx i;
18759 if (target2 != pc_rtx)
18761 rtx tmp = target2;
18762 code = reverse_condition_maybe_unordered (code);
18763 target2 = target1;
18764 target1 = tmp;
18767 condition = ix86_expand_fp_compare (code, op1, op2,
18768 tmp);
18770 /* Remove pushed operand from stack. */
18771 if (pushed)
18772 ix86_free_from_memory (GET_MODE (pushed));
18774 i = emit_jump_insn (gen_rtx_SET
18775 (VOIDmode, pc_rtx,
18776 gen_rtx_IF_THEN_ELSE (VOIDmode,
18777 condition, target1, target2)));
18778 if (split_branch_probability >= 0)
18779 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18782 void
18783 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18785 rtx ret;
18787 gcc_assert (GET_MODE (dest) == QImode);
18789 ret = ix86_expand_compare (code, op0, op1);
18790 PUT_MODE (ret, QImode);
18791 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18794 /* Expand a comparison setting or clearing the carry flag. Return true
18795 when successful and set *POP to the comparison operation. */
18796 static bool
18797 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18799 enum machine_mode mode =
18800 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18802 /* Do not handle double-mode compares that go through special path. */
18803 if (mode == (TARGET_64BIT ? TImode : DImode))
18804 return false;
18806 if (SCALAR_FLOAT_MODE_P (mode))
18808 rtx compare_op, compare_seq;
18810 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18812 /* Shortcut: following common codes never translate
18813 into carry flag compares. */
18814 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18815 || code == ORDERED || code == UNORDERED)
18816 return false;
18818 /* These comparisons require zero flag; swap operands so they won't. */
18819 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18820 && !TARGET_IEEE_FP)
18822 rtx tmp = op0;
18823 op0 = op1;
18824 op1 = tmp;
18825 code = swap_condition (code);
18828 /* Try to expand the comparison and verify that we end up with a
18829 carry flag based comparison. This fails to be true only when
18830 we decide to expand the comparison using arithmetic, which is
18831 not a common scenario. */
18832 start_sequence ();
18833 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18834 compare_seq = get_insns ();
18835 end_sequence ();
18837 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18838 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18839 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18840 else
18841 code = GET_CODE (compare_op);
18843 if (code != LTU && code != GEU)
18844 return false;
18846 emit_insn (compare_seq);
18847 *pop = compare_op;
18848 return true;
18851 if (!INTEGRAL_MODE_P (mode))
18852 return false;
18854 switch (code)
18856 case LTU:
18857 case GEU:
18858 break;
18860 /* Convert a==0 into (unsigned)a<1. */
18861 case EQ:
18862 case NE:
18863 if (op1 != const0_rtx)
18864 return false;
18865 op1 = const1_rtx;
18866 code = (code == EQ ? LTU : GEU);
18867 break;
18869 /* Convert a>b into b<a or a>=b+1. */
18870 case GTU:
18871 case LEU:
18872 if (CONST_INT_P (op1))
18874 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18875 /* Bail out on overflow. We could still swap operands, but that
18876 would force loading the constant into a register. */
18877 if (op1 == const0_rtx
18878 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18879 return false;
18880 code = (code == GTU ? GEU : LTU);
18882 else
18884 rtx tmp = op1;
18885 op1 = op0;
18886 op0 = tmp;
18887 code = (code == GTU ? LTU : GEU);
18889 break;
18891 /* Convert a>=0 into (unsigned)a<0x80000000. */
18892 case LT:
18893 case GE:
18894 if (mode == DImode || op1 != const0_rtx)
18895 return false;
18896 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18897 code = (code == LT ? GEU : LTU);
18898 break;
18899 case LE:
18900 case GT:
18901 if (mode == DImode || op1 != constm1_rtx)
18902 return false;
18903 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18904 code = (code == LE ? GEU : LTU);
18905 break;
18907 default:
18908 return false;
18910 /* Swapping operands may cause constant to appear as first operand. */
18911 if (!nonimmediate_operand (op0, VOIDmode))
18913 if (!can_create_pseudo_p ())
18914 return false;
18915 op0 = force_reg (mode, op0);
18917 *pop = ix86_expand_compare (code, op0, op1);
18918 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18919 return true;
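/* Examples of the rewrites performed above (illustration):

     a == 0            ->  (unsigned) a <  1
     a >u 5            ->  (unsigned) a >= 6
     a >= 0  (signed)  ->  (unsigned) a <  0x80000000
     a <= -1 (signed)  ->  (unsigned) a >= 0x80000000

   Every accepted comparison is thus massaged into an LTU/GEU test whose
   result lives entirely in the carry flag, ready to be consumed by
   sbb/adc in the conditional-move expansion below.  */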
18922 bool
18923 ix86_expand_int_movcc (rtx operands[])
18925 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18926 rtx compare_seq, compare_op;
18927 enum machine_mode mode = GET_MODE (operands[0]);
18928 bool sign_bit_compare_p = false;
18929 rtx op0 = XEXP (operands[1], 0);
18930 rtx op1 = XEXP (operands[1], 1);
18932 if (GET_MODE (op0) == TImode
18933 || (GET_MODE (op0) == DImode
18934 && !TARGET_64BIT))
18935 return false;
18937 start_sequence ();
18938 compare_op = ix86_expand_compare (code, op0, op1);
18939 compare_seq = get_insns ();
18940 end_sequence ();
18942 compare_code = GET_CODE (compare_op);
18944 if ((op1 == const0_rtx && (code == GE || code == LT))
18945 || (op1 == constm1_rtx && (code == GT || code == LE)))
18946 sign_bit_compare_p = true;
18948 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18949 HImode insns, we'd be swallowed in word prefix ops. */
18951 if ((mode != HImode || TARGET_FAST_PREFIX)
18952 && (mode != (TARGET_64BIT ? TImode : DImode))
18953 && CONST_INT_P (operands[2])
18954 && CONST_INT_P (operands[3]))
18956 rtx out = operands[0];
18957 HOST_WIDE_INT ct = INTVAL (operands[2]);
18958 HOST_WIDE_INT cf = INTVAL (operands[3]);
18959 HOST_WIDE_INT diff;
18961 diff = ct - cf;
18962 /* Sign bit compares are better done using shifts than by using
18963 sbb. */
18964 if (sign_bit_compare_p
18965 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18967 /* Detect overlap between destination and compare sources. */
18968 rtx tmp = out;
18970 if (!sign_bit_compare_p)
18972 rtx flags;
18973 bool fpcmp = false;
18975 compare_code = GET_CODE (compare_op);
18977 flags = XEXP (compare_op, 0);
18979 if (GET_MODE (flags) == CCFPmode
18980 || GET_MODE (flags) == CCFPUmode)
18982 fpcmp = true;
18983 compare_code
18984 = ix86_fp_compare_code_to_integer (compare_code);
18987 /* To simplify rest of code, restrict to the GEU case. */
18988 if (compare_code == LTU)
18990 HOST_WIDE_INT tmp = ct;
18991 ct = cf;
18992 cf = tmp;
18993 compare_code = reverse_condition (compare_code);
18994 code = reverse_condition (code);
18996 else
18998 if (fpcmp)
18999 PUT_CODE (compare_op,
19000 reverse_condition_maybe_unordered
19001 (GET_CODE (compare_op)));
19002 else
19003 PUT_CODE (compare_op,
19004 reverse_condition (GET_CODE (compare_op)));
19006 diff = ct - cf;
19008 if (reg_overlap_mentioned_p (out, op0)
19009 || reg_overlap_mentioned_p (out, op1))
19010 tmp = gen_reg_rtx (mode);
19012 if (mode == DImode)
19013 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19014 else
19015 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19016 flags, compare_op));
19018 else
19020 if (code == GT || code == GE)
19021 code = reverse_condition (code);
19022 else
19024 HOST_WIDE_INT tmp = ct;
19025 ct = cf;
19026 cf = tmp;
19027 diff = ct - cf;
19029 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19032 if (diff == 1)
19035 * cmpl op0,op1
19036 * sbbl dest,dest
19037 * [addl dest, ct]
19039 * Size 5 - 8.
19041 if (ct)
19042 tmp = expand_simple_binop (mode, PLUS,
19043 tmp, GEN_INT (ct),
19044 copy_rtx (tmp), 1, OPTAB_DIRECT);
19046 else if (cf == -1)
19049 * cmpl op0,op1
19050 * sbbl dest,dest
19051 * orl $ct, dest
19053 * Size 8.
19055 tmp = expand_simple_binop (mode, IOR,
19056 tmp, GEN_INT (ct),
19057 copy_rtx (tmp), 1, OPTAB_DIRECT);
19059 else if (diff == -1 && ct)
19062 * cmpl op0,op1
19063 * sbbl dest,dest
19064 * notl dest
19065 * [addl dest, cf]
19067 * Size 8 - 11.
19069 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19070 if (cf)
19071 tmp = expand_simple_binop (mode, PLUS,
19072 copy_rtx (tmp), GEN_INT (cf),
19073 copy_rtx (tmp), 1, OPTAB_DIRECT);
19075 else
19078 * cmpl op0,op1
19079 * sbbl dest,dest
19080 * [notl dest]
19081 * andl cf - ct, dest
19082 * [addl dest, ct]
19084 * Size 8 - 11.
19087 if (cf == 0)
19089 cf = ct;
19090 ct = 0;
19091 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19094 tmp = expand_simple_binop (mode, AND,
19095 copy_rtx (tmp),
19096 gen_int_mode (cf - ct, mode),
19097 copy_rtx (tmp), 1, OPTAB_DIRECT);
19098 if (ct)
19099 tmp = expand_simple_binop (mode, PLUS,
19100 copy_rtx (tmp), GEN_INT (ct),
19101 copy_rtx (tmp), 1, OPTAB_DIRECT);
19104 if (!rtx_equal_p (tmp, out))
19105 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19107 return true;
19110 if (diff < 0)
19112 enum machine_mode cmp_mode = GET_MODE (op0);
19114 HOST_WIDE_INT tmp;
19115 tmp = ct, ct = cf, cf = tmp;
19116 diff = -diff;
19118 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19120 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19122 /* We may be reversing an unordered compare to a normal compare, which
19123 is not valid in general (we may convert a non-trapping condition
19124 to a trapping one); however, on i386 we currently emit all
19125 comparisons unordered. */
19126 compare_code = reverse_condition_maybe_unordered (compare_code);
19127 code = reverse_condition_maybe_unordered (code);
19129 else
19131 compare_code = reverse_condition (compare_code);
19132 code = reverse_condition (code);
19136 compare_code = UNKNOWN;
19137 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19138 && CONST_INT_P (op1))
19140 if (op1 == const0_rtx
19141 && (code == LT || code == GE))
19142 compare_code = code;
19143 else if (op1 == constm1_rtx)
19145 if (code == LE)
19146 compare_code = LT;
19147 else if (code == GT)
19148 compare_code = GE;
19152 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19153 if (compare_code != UNKNOWN
19154 && GET_MODE (op0) == GET_MODE (out)
19155 && (cf == -1 || ct == -1))
19157 /* If lea code below could be used, only optimize
19158 if it results in a 2 insn sequence. */
19160 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19161 || diff == 3 || diff == 5 || diff == 9)
19162 || (compare_code == LT && ct == -1)
19163 || (compare_code == GE && cf == -1))
19166 * notl op1 (if necessary)
19167 * sarl $31, op1
19168 * orl cf, op1
19170 if (ct != -1)
19172 cf = ct;
19173 ct = -1;
19174 code = reverse_condition (code);
19177 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19179 out = expand_simple_binop (mode, IOR,
19180 out, GEN_INT (cf),
19181 out, 1, OPTAB_DIRECT);
19182 if (out != operands[0])
19183 emit_move_insn (operands[0], out);
19185 return true;
19190 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19191 || diff == 3 || diff == 5 || diff == 9)
19192 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19193 && (mode != DImode
19194 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19197 * xorl dest,dest
19198 * cmpl op1,op2
19199 * setcc dest
19200 * lea cf(dest*(ct-cf)),dest
19202 * Size 14.
19204 * This also catches the degenerate setcc-only case.
19207 rtx tmp;
19208 int nops;
19210 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
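/* OUT now holds 0 or 1; the lea-style arithmetic built below computes
   CF + OUT * DIFF, i.e. CF + OUT * (CT - CF), which selects CF or CT
   without a branch.  */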
19212 nops = 0;
19213 /* On x86_64 the lea instruction operates on Pmode, so we need
19214 to get the arithmetic done in the proper mode to match. */
19215 if (diff == 1)
19216 tmp = copy_rtx (out);
19217 else
19219 rtx out1;
19220 out1 = copy_rtx (out);
19221 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19222 nops++;
19223 if (diff & 1)
19225 tmp = gen_rtx_PLUS (mode, tmp, out1);
19226 nops++;
19229 if (cf != 0)
19231 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19232 nops++;
19234 if (!rtx_equal_p (tmp, out))
19236 if (nops == 1)
19237 out = force_operand (tmp, copy_rtx (out));
19238 else
19239 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19241 if (!rtx_equal_p (out, operands[0]))
19242 emit_move_insn (operands[0], copy_rtx (out));
19244 return true;
19248 * General case: Jumpful:
19249 * xorl dest,dest cmpl op1, op2
19250 * cmpl op1, op2 movl ct, dest
19251 * setcc dest jcc 1f
19252 * decl dest movl cf, dest
19253 * andl (cf-ct),dest 1:
19254 * addl ct,dest
19256 * Size 20. Size 14.
19258 * This is reasonably steep, but branch mispredict costs are
19259 * high on modern cpus, so consider failing only if optimizing
19260 * for space.
19263 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19264 && BRANCH_COST (optimize_insn_for_speed_p (),
19265 false) >= 2)
19267 if (cf == 0)
19269 enum machine_mode cmp_mode = GET_MODE (op0);
19271 cf = ct;
19272 ct = 0;
19274 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19276 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19278 /* We may be reversing an unordered compare to a normal compare,
19279 which is not valid in general (we may convert a non-trapping
19280 condition to a trapping one); however, on i386 we currently
19281 emit all comparisons unordered. */
19282 code = reverse_condition_maybe_unordered (code);
19284 else
19286 code = reverse_condition (code);
19287 if (compare_code != UNKNOWN)
19288 compare_code = reverse_condition (compare_code);
19292 if (compare_code != UNKNOWN)
19294 /* notl op1 (if needed)
19295 sarl $31, op1
19296 andl (cf-ct), op1
19297 addl ct, op1
19299 For x < 0 (resp. x <= -1) there will be no notl,
19300 so if possible swap the constants to get rid of the
19301 complement.
19302 True/false will be -1/0 while code below (store flag
19303 followed by decrement) is 0/-1, so the constants need
19304 to be exchanged once more. */
19306 if (compare_code == GE || !cf)
19308 code = reverse_condition (code);
19309 compare_code = LT;
19311 else
19313 HOST_WIDE_INT tmp = cf;
19314 cf = ct;
19315 ct = tmp;
19318 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19320 else
19322 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19324 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19325 constm1_rtx,
19326 copy_rtx (out), 1, OPTAB_DIRECT);
19329 out = expand_simple_binop (mode, AND, copy_rtx (out),
19330 gen_int_mode (cf - ct, mode),
19331 copy_rtx (out), 1, OPTAB_DIRECT);
19332 if (ct)
19333 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19334 copy_rtx (out), 1, OPTAB_DIRECT);
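/* The AND with (CF - CT) followed by the optional add of CT maps the
   -1/0 value computed above to CF/CT respectively; the constants were
   swapped earlier where needed to account for this 0/-1 ordering.  */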
19335 if (!rtx_equal_p (out, operands[0]))
19336 emit_move_insn (operands[0], copy_rtx (out));
19338 return true;
19342 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19344 /* Try a few more things with specific constants and a variable. */
19346 optab op;
19347 rtx var, orig_out, out, tmp;
19349 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19350 return false;
19352 /* If one of the two operands is an interesting constant, load a
19353 constant with the above and mask it in with a logical operation. */
19355 if (CONST_INT_P (operands[2]))
19357 var = operands[3];
19358 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19359 operands[3] = constm1_rtx, op = and_optab;
19360 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19361 operands[3] = const0_rtx, op = ior_optab;
19362 else
19363 return false;
19365 else if (CONST_INT_P (operands[3]))
19367 var = operands[2];
19368 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19369 operands[2] = constm1_rtx, op = and_optab;
19370 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19371 operands[2] = const0_rtx, op = ior_optab;
19372 else
19373 return false;
19375 else
19376 return false;
19378 orig_out = operands[0];
19379 tmp = gen_reg_rtx (mode);
19380 operands[0] = tmp;
19382 /* Recurse to get the constant loaded. */
19383 if (ix86_expand_int_movcc (operands) == 0)
19384 return false;
19386 /* Mask in the interesting variable. */
19387 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19388 OPTAB_WIDEN);
19389 if (!rtx_equal_p (out, orig_out))
19390 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19392 return true;
19396 * For comparison with above,
19398 * movl cf,dest
19399 * movl ct,tmp
19400 * cmpl op1,op2
19401 * cmovcc tmp,dest
19403 * Size 15.
19406 if (! nonimmediate_operand (operands[2], mode))
19407 operands[2] = force_reg (mode, operands[2]);
19408 if (! nonimmediate_operand (operands[3], mode))
19409 operands[3] = force_reg (mode, operands[3]);
19411 if (! register_operand (operands[2], VOIDmode)
19412 && (mode == QImode
19413 || ! register_operand (operands[3], VOIDmode)))
19414 operands[2] = force_reg (mode, operands[2]);
19416 if (mode == QImode
19417 && ! register_operand (operands[3], VOIDmode))
19418 operands[3] = force_reg (mode, operands[3]);
19420 emit_insn (compare_seq);
19421 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19422 gen_rtx_IF_THEN_ELSE (mode,
19423 compare_op, operands[2],
19424 operands[3])));
19425 return true;
19428 /* Swap, force into registers, or otherwise massage the two operands
19429 to an sse comparison with a mask result. Thus we differ a bit from
19430 ix86_prepare_fp_compare_args which expects to produce a flags result.
19432 The DEST operand exists to help determine whether to commute commutative
19433 operators. The POP0/POP1 operands are updated in place. The new
19434 comparison code is returned, or UNKNOWN if not implementable. */
19436 static enum rtx_code
19437 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19438 rtx *pop0, rtx *pop1)
19440 rtx tmp;
19442 switch (code)
19444 case LTGT:
19445 case UNEQ:
19446 /* AVX supports all the needed comparisons. */
19447 if (TARGET_AVX)
19448 break;
19449 /* We have no LTGT as an operator. We could implement it with
19450 NE & ORDERED, but this requires an extra temporary. It's
19451 not clear that it's worth it. */
19452 return UNKNOWN;
19454 case LT:
19455 case LE:
19456 case UNGT:
19457 case UNGE:
19458 /* These are supported directly. */
19459 break;
19461 case EQ:
19462 case NE:
19463 case UNORDERED:
19464 case ORDERED:
19465 /* AVX has 3 operand comparisons, no need to swap anything. */
19466 if (TARGET_AVX)
19467 break;
19468 /* For commutative operators, try to canonicalize the destination
19469 operand to be first in the comparison - this helps reload to
19470 avoid extra moves. */
19471 if (!dest || !rtx_equal_p (dest, *pop1))
19472 break;
19473 /* FALLTHRU */
19475 case GE:
19476 case GT:
19477 case UNLE:
19478 case UNLT:
19479 /* These are not supported directly before AVX, and furthermore
19480 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19481 comparison operands to transform into something that is
19482 supported. */
19483 tmp = *pop0;
19484 *pop0 = *pop1;
19485 *pop1 = tmp;
19486 code = swap_condition (code);
19487 break;
19489 default:
19490 gcc_unreachable ();
19493 return code;
19496 /* Detect conditional moves that exactly match min/max operational
19497 semantics. Note that this is IEEE safe, as long as we don't
19498 interchange the operands.
19500 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19501 and TRUE if the operation is successful and instructions are emitted. */
19503 static bool
19504 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19505 rtx cmp_op1, rtx if_true, rtx if_false)
19507 enum machine_mode mode;
19508 bool is_min;
19509 rtx tmp;
19511 if (code == LT)
19513 else if (code == UNGE)
19515 tmp = if_true;
19516 if_true = if_false;
19517 if_false = tmp;
19519 else
19520 return false;
19522 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19523 is_min = true;
19524 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19525 is_min = false;
19526 else
19527 return false;
19529 mode = GET_MODE (dest);
19531 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19532 but MODE may be a vector mode and thus not appropriate. */
19533 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19535 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19536 rtvec v;
19538 if_true = force_reg (mode, if_true);
19539 v = gen_rtvec (2, if_true, if_false);
19540 tmp = gen_rtx_UNSPEC (mode, v, u);
19542 else
19544 code = is_min ? SMIN : SMAX;
19545 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19548 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19549 return true;
19552 /* Expand an sse vector comparison. Return the register with the result. */
19554 static rtx
19555 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19556 rtx op_true, rtx op_false)
19558 enum machine_mode mode = GET_MODE (dest);
19559 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19560 rtx x;
19562 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19563 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19564 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19566 if (optimize
19567 || reg_overlap_mentioned_p (dest, op_true)
19568 || reg_overlap_mentioned_p (dest, op_false))
19569 dest = gen_reg_rtx (mode);
19571 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19572 if (cmp_mode != mode)
19574 x = force_reg (cmp_mode, x);
19575 convert_move (dest, x, false);
19577 else
19578 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19580 return dest;
19583 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19584 operations. This is used for both scalar and vector conditional moves. */
19586 static void
19587 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19589 enum machine_mode mode = GET_MODE (dest);
19590 rtx t2, t3, x;
19592 if (vector_all_ones_operand (op_true, mode)
19593 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19595 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19597 else if (op_false == CONST0_RTX (mode))
19599 op_true = force_reg (mode, op_true);
19600 x = gen_rtx_AND (mode, cmp, op_true);
19601 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19603 else if (op_true == CONST0_RTX (mode))
19605 op_false = force_reg (mode, op_false);
19606 x = gen_rtx_NOT (mode, cmp);
19607 x = gen_rtx_AND (mode, x, op_false);
19608 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19610 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19612 op_false = force_reg (mode, op_false);
19613 x = gen_rtx_IOR (mode, cmp, op_false);
19614 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19616 else if (TARGET_XOP)
19618 op_true = force_reg (mode, op_true);
19620 if (!nonimmediate_operand (op_false, mode))
19621 op_false = force_reg (mode, op_false);
19623 emit_insn (gen_rtx_SET (mode, dest,
19624 gen_rtx_IF_THEN_ELSE (mode, cmp,
19625 op_true,
19626 op_false)));
19628 else
19630 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19632 if (!nonimmediate_operand (op_true, mode))
19633 op_true = force_reg (mode, op_true);
19635 op_false = force_reg (mode, op_false);
19637 switch (mode)
19639 case V4SFmode:
19640 if (TARGET_SSE4_1)
19641 gen = gen_sse4_1_blendvps;
19642 break;
19643 case V2DFmode:
19644 if (TARGET_SSE4_1)
19645 gen = gen_sse4_1_blendvpd;
19646 break;
19647 case V16QImode:
19648 case V8HImode:
19649 case V4SImode:
19650 case V2DImode:
19651 if (TARGET_SSE4_1)
19653 gen = gen_sse4_1_pblendvb;
19654 dest = gen_lowpart (V16QImode, dest);
19655 op_false = gen_lowpart (V16QImode, op_false);
19656 op_true = gen_lowpart (V16QImode, op_true);
19657 cmp = gen_lowpart (V16QImode, cmp);
19659 break;
19660 case V8SFmode:
19661 if (TARGET_AVX)
19662 gen = gen_avx_blendvps256;
19663 break;
19664 case V4DFmode:
19665 if (TARGET_AVX)
19666 gen = gen_avx_blendvpd256;
19667 break;
19668 case V32QImode:
19669 case V16HImode:
19670 case V8SImode:
19671 case V4DImode:
19672 if (TARGET_AVX2)
19674 gen = gen_avx2_pblendvb;
19675 dest = gen_lowpart (V32QImode, dest);
19676 op_false = gen_lowpart (V32QImode, op_false);
19677 op_true = gen_lowpart (V32QImode, op_true);
19678 cmp = gen_lowpart (V32QImode, cmp);
19680 break;
19681 default:
19682 break;
19685 if (gen != NULL)
19686 emit_insn (gen (dest, op_false, op_true, cmp));
19687 else
19689 op_true = force_reg (mode, op_true);
19691 t2 = gen_reg_rtx (mode);
19692 if (optimize)
19693 t3 = gen_reg_rtx (mode);
19694 else
19695 t3 = dest;
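/* No blend instruction is available; fall back to the classic mask
   blend DEST = (OP_TRUE & CMP) | (OP_FALSE & ~CMP).  */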
19697 x = gen_rtx_AND (mode, op_true, cmp);
19698 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19700 x = gen_rtx_NOT (mode, cmp);
19701 x = gen_rtx_AND (mode, x, op_false);
19702 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19704 x = gen_rtx_IOR (mode, t3, t2);
19705 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19710 /* Expand a floating-point conditional move. Return true if successful. */
19712 bool
19713 ix86_expand_fp_movcc (rtx operands[])
19715 enum machine_mode mode = GET_MODE (operands[0]);
19716 enum rtx_code code = GET_CODE (operands[1]);
19717 rtx tmp, compare_op;
19718 rtx op0 = XEXP (operands[1], 0);
19719 rtx op1 = XEXP (operands[1], 1);
19721 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19723 enum machine_mode cmode;
19725 /* Since we have no cmove for SSE registers, don't force bad register
19726 allocation just to gain access to it. Deny the movcc when the
19727 comparison mode doesn't match the move mode. */
19728 cmode = GET_MODE (op0);
19729 if (cmode == VOIDmode)
19730 cmode = GET_MODE (op1);
19731 if (cmode != mode)
19732 return false;
19734 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19735 if (code == UNKNOWN)
19736 return false;
19738 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19739 operands[2], operands[3]))
19740 return true;
19742 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19743 operands[2], operands[3]);
19744 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19745 return true;
19748 /* The floating point conditional move instructions don't directly
19749 support conditions resulting from a signed integer comparison. */
19751 compare_op = ix86_expand_compare (code, op0, op1);
19752 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19754 tmp = gen_reg_rtx (QImode);
19755 ix86_expand_setcc (tmp, code, op0, op1);
19757 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19760 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19761 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19762 operands[2], operands[3])));
19764 return true;
19767 /* Expand a floating-point vector conditional move; a vcond operation
19768 rather than a movcc operation. */
19770 bool
19771 ix86_expand_fp_vcond (rtx operands[])
19773 enum rtx_code code = GET_CODE (operands[3]);
19774 rtx cmp;
19776 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19777 &operands[4], &operands[5]);
19778 if (code == UNKNOWN)
19780 rtx temp;
19781 switch (GET_CODE (operands[3]))
19783 case LTGT:
19784 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19785 operands[5], operands[0], operands[0]);
19786 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19787 operands[5], operands[1], operands[2]);
19788 code = AND;
19789 break;
19790 case UNEQ:
19791 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19792 operands[5], operands[0], operands[0]);
19793 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19794 operands[5], operands[1], operands[2]);
19795 code = IOR;
19796 break;
19797 default:
19798 gcc_unreachable ();
19800 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19801 OPTAB_DIRECT);
19802 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19803 return true;
19806 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19807 operands[5], operands[1], operands[2]))
19808 return true;
19810 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19811 operands[1], operands[2]);
19812 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19813 return true;
19816 /* Expand a signed/unsigned integral vector conditional move. */
19818 bool
19819 ix86_expand_int_vcond (rtx operands[])
19821 enum machine_mode data_mode = GET_MODE (operands[0]);
19822 enum machine_mode mode = GET_MODE (operands[4]);
19823 enum rtx_code code = GET_CODE (operands[3]);
19824 bool negate = false;
19825 rtx x, cop0, cop1;
19827 cop0 = operands[4];
19828 cop1 = operands[5];
19830 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19831 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19832 if ((code == LT || code == GE)
19833 && data_mode == mode
19834 && cop1 == CONST0_RTX (mode)
19835 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19836 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19837 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19838 && (GET_MODE_SIZE (data_mode) == 16
19839 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19841 rtx negop = operands[2 - (code == LT)];
19842 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19843 if (negop == CONST1_RTX (data_mode))
19845 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19846 operands[0], 1, OPTAB_DIRECT);
19847 if (res != operands[0])
19848 emit_move_insn (operands[0], res);
19849 return true;
19851 else if (GET_MODE_INNER (data_mode) != DImode
19852 && vector_all_ones_operand (negop, data_mode))
19854 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19855 operands[0], 0, OPTAB_DIRECT);
19856 if (res != operands[0])
19857 emit_move_insn (operands[0], res);
19858 return true;
19862 if (!nonimmediate_operand (cop1, mode))
19863 cop1 = force_reg (mode, cop1);
19864 if (!general_operand (operands[1], data_mode))
19865 operands[1] = force_reg (data_mode, operands[1]);
19866 if (!general_operand (operands[2], data_mode))
19867 operands[2] = force_reg (data_mode, operands[2]);
19869 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19870 if (TARGET_XOP
19871 && (mode == V16QImode || mode == V8HImode
19872 || mode == V4SImode || mode == V2DImode))
19874 else
19876 /* Canonicalize the comparison to EQ, GT, GTU. */
19877 switch (code)
19879 case EQ:
19880 case GT:
19881 case GTU:
19882 break;
19884 case NE:
19885 case LE:
19886 case LEU:
19887 code = reverse_condition (code);
19888 negate = true;
19889 break;
19891 case GE:
19892 case GEU:
19893 code = reverse_condition (code);
19894 negate = true;
19895 /* FALLTHRU */
19897 case LT:
19898 case LTU:
19899 code = swap_condition (code);
19900 x = cop0, cop0 = cop1, cop1 = x;
19901 break;
19903 default:
19904 gcc_unreachable ();
19907 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19908 if (mode == V2DImode)
19910 switch (code)
19912 case EQ:
19913 /* SSE4.1 supports EQ. */
19914 if (!TARGET_SSE4_1)
19915 return false;
19916 break;
19918 case GT:
19919 case GTU:
19920 /* SSE4.2 supports GT/GTU. */
19921 if (!TARGET_SSE4_2)
19922 return false;
19923 break;
19925 default:
19926 gcc_unreachable ();
19930 /* Unsigned parallel compare is not supported by the hardware.
19931 Play some tricks to turn this into a signed comparison
19932 against 0. */
19933 if (code == GTU)
19935 cop0 = force_reg (mode, cop0);
19937 switch (mode)
19939 case V8SImode:
19940 case V4DImode:
19941 case V4SImode:
19942 case V2DImode:
19944 rtx t1, t2, mask;
19945 rtx (*gen_sub3) (rtx, rtx, rtx);
19947 switch (mode)
19949 case V8SImode: gen_sub3 = gen_subv8si3; break;
19950 case V4DImode: gen_sub3 = gen_subv4di3; break;
19951 case V4SImode: gen_sub3 = gen_subv4si3; break;
19952 case V2DImode: gen_sub3 = gen_subv2di3; break;
19953 default:
19954 gcc_unreachable ();
19956 /* Subtract (-(INT MAX) - 1) from both operands to make
19957 them signed. */
19958 mask = ix86_build_signbit_mask (mode, true, false);
19959 t1 = gen_reg_rtx (mode);
19960 emit_insn (gen_sub3 (t1, cop0, mask));
19962 t2 = gen_reg_rtx (mode);
19963 emit_insn (gen_sub3 (t2, cop1, mask));
19965 cop0 = t1;
19966 cop1 = t2;
19967 code = GT;
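/* Once both operands are biased by the sign-bit constant, unsigned
   order coincides with signed order, so the GTU test can be carried
   out as a signed GT on the adjusted values.  */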
19969 break;
19971 case V32QImode:
19972 case V16HImode:
19973 case V16QImode:
19974 case V8HImode:
19975 /* Perform a parallel unsigned saturating subtraction. */
19976 x = gen_reg_rtx (mode);
19977 emit_insn (gen_rtx_SET (VOIDmode, x,
19978 gen_rtx_US_MINUS (mode, cop0, cop1)));
19980 cop0 = x;
19981 cop1 = CONST0_RTX (mode);
19982 code = EQ;
19983 negate = !negate;
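/* cop0 minus cop1 with unsigned saturation is zero exactly when
   cop0 <= cop1 (unsigned), so testing the subtraction result for
   equality with zero and flipping NEGATE implements the GTU test.  */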
19984 break;
19986 default:
19987 gcc_unreachable ();
19992 /* Allow the comparison to be done in one mode, but the movcc to
19993 happen in another mode. */
19994 if (data_mode == mode)
19996 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19997 operands[1+negate], operands[2-negate]);
19999 else
20001 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20002 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20003 code, cop0, cop1,
20004 operands[1+negate], operands[2-negate]);
20005 x = gen_lowpart (data_mode, x);
20008 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20009 operands[2-negate]);
20010 return true;
20013 /* Expand a variable vector permutation. */
20015 void
20016 ix86_expand_vec_perm (rtx operands[])
20018 rtx target = operands[0];
20019 rtx op0 = operands[1];
20020 rtx op1 = operands[2];
20021 rtx mask = operands[3];
20022 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20023 enum machine_mode mode = GET_MODE (op0);
20024 enum machine_mode maskmode = GET_MODE (mask);
20025 int w, e, i;
20026 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20028 /* Number of elements in the vector. */
20029 w = GET_MODE_NUNITS (mode);
20030 e = GET_MODE_UNIT_SIZE (mode);
20031 gcc_assert (w <= 32);
20033 if (TARGET_AVX2)
20035 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20037 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20038 a constant shuffle operand. With a tiny bit of effort we can
20039 use VPERMD instead. A re-interpretation stall for V4DFmode is
20040 unfortunate but there's no avoiding it.
20041 Similarly, for V16HImode we don't have instructions for variable
20042 shuffling, while for V32QImode we can, after preparing suitable
20043 masks, use vpshufb; vpshufb; vpermq; vpor. */
20045 if (mode == V16HImode)
20047 maskmode = mode = V32QImode;
20048 w = 32;
20049 e = 1;
20051 else
20053 maskmode = mode = V8SImode;
20054 w = 8;
20055 e = 4;
20057 t1 = gen_reg_rtx (maskmode);
20059 /* Replicate the low bits of the V4DImode mask into V8SImode:
20060 mask = { A B C D }
20061 t1 = { A A B B C C D D }. */
20062 for (i = 0; i < w / 2; ++i)
20063 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20064 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20065 vt = force_reg (maskmode, vt);
20066 mask = gen_lowpart (maskmode, mask);
20067 if (maskmode == V8SImode)
20068 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20069 else
20070 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20072 /* Multiply the shuffle indices by two. */
20073 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20074 OPTAB_DIRECT);
20076 /* Add one to the odd shuffle indices:
20077 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20078 for (i = 0; i < w / 2; ++i)
20080 vec[i * 2] = const0_rtx;
20081 vec[i * 2 + 1] = const1_rtx;
20083 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20084 vt = force_const_mem (maskmode, vt);
20085 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20086 OPTAB_DIRECT);
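/* In the V4DImode case each 64-bit element is thus permuted as a pair
   of 32-bit halves: index i has been turned into the V8SImode indices
   2*i and 2*i + 1 by the replicate, double and add-one steps above
   (the V16HImode case is analogous at byte granularity).  */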
20088 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20089 operands[3] = mask = t1;
20090 target = gen_lowpart (mode, target);
20091 op0 = gen_lowpart (mode, op0);
20092 op1 = gen_lowpart (mode, op1);
20095 switch (mode)
20097 case V8SImode:
20098 /* The VPERMD and VPERMPS instructions already properly ignore
20099 the high bits of the shuffle elements. No need for us to
20100 perform an AND ourselves. */
20101 if (one_operand_shuffle)
20102 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20103 else
20105 t1 = gen_reg_rtx (V8SImode);
20106 t2 = gen_reg_rtx (V8SImode);
20107 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20108 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20109 goto merge_two;
20111 return;
20113 case V8SFmode:
20114 mask = gen_lowpart (V8SFmode, mask);
20115 if (one_operand_shuffle)
20116 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20117 else
20119 t1 = gen_reg_rtx (V8SFmode);
20120 t2 = gen_reg_rtx (V8SFmode);
20121 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20122 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20123 goto merge_two;
20125 return;
20127 case V4SImode:
20128 /* By combining the two 128-bit input vectors into one 256-bit
20129 input vector, we can use VPERMD and VPERMPS for the full
20130 two-operand shuffle. */
20131 t1 = gen_reg_rtx (V8SImode);
20132 t2 = gen_reg_rtx (V8SImode);
20133 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20134 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20135 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20136 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20137 return;
20139 case V4SFmode:
20140 t1 = gen_reg_rtx (V8SFmode);
20141 t2 = gen_reg_rtx (V8SImode);
20142 mask = gen_lowpart (V4SImode, mask);
20143 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20144 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20145 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20146 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20147 return;
20149 case V32QImode:
20150 t1 = gen_reg_rtx (V32QImode);
20151 t2 = gen_reg_rtx (V32QImode);
20152 t3 = gen_reg_rtx (V32QImode);
20153 vt2 = GEN_INT (128);
20154 for (i = 0; i < 32; i++)
20155 vec[i] = vt2;
20156 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20157 vt = force_reg (V32QImode, vt);
20158 for (i = 0; i < 32; i++)
20159 vec[i] = i < 16 ? vt2 : const0_rtx;
20160 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20161 vt2 = force_reg (V32QImode, vt2);
20162 /* From mask create two adjusted masks, which contain the same
20163 bits as mask in the low 7 bits of each vector element.
20164 The first mask will have the most significant bit clear
20165 if it requests element from the same 128-bit lane
20166 and MSB set if it requests element from the other 128-bit lane.
20167 The second mask will have the opposite values of the MSB,
20168 and additionally will have its 128-bit lanes swapped.
20169 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20170 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20171 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20172 stands for the other 12 bytes. */
20173 /* The bit that says whether an element comes from the same lane or
20174 from the other lane is bit 4, so shift it up by 3 to the MSB position. */
20175 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20176 gen_lowpart (V4DImode, mask),
20177 GEN_INT (3)));
20178 /* Clear MSB bits from the mask just in case it had them set. */
20179 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20180 /* After this t1 will have MSB set for elements from other lane. */
20181 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20182 /* Clear bits other than MSB. */
20183 emit_insn (gen_andv32qi3 (t1, t1, vt));
20184 /* Or in the lower bits from mask into t3. */
20185 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20186 /* And invert MSB bits in t1, so MSB is set for elements from the same
20187 lane. */
20188 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20189 /* Swap 128-bit lanes in t3. */
20190 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20191 gen_lowpart (V4DImode, t3),
20192 const2_rtx, GEN_INT (3),
20193 const0_rtx, const1_rtx));
20194 /* And or in the lower bits from mask into t1. */
20195 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20196 if (one_operand_shuffle)
20198 /* Each of these shuffles will put 0s in places where
20199 element from the other 128-bit lane is needed, otherwise
20200 will shuffle in the requested value. */
20201 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20202 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20203 /* For t3 the 128-bit lanes are swapped again. */
20204 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20205 gen_lowpart (V4DImode, t3),
20206 const2_rtx, GEN_INT (3),
20207 const0_rtx, const1_rtx));
20208 /* And ORing both together yields the result. */
20209 emit_insn (gen_iorv32qi3 (target, t1, t3));
20210 return;
20213 t4 = gen_reg_rtx (V32QImode);
20214 /* Similar to the one_operand_shuffle code above,
20215 just repeated twice for each operand. The merge_two:
20216 code will merge the two results together. */
20217 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20218 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20219 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20220 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20221 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20222 gen_lowpart (V4DImode, t4),
20223 const2_rtx, GEN_INT (3),
20224 const0_rtx, const1_rtx));
20225 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20226 gen_lowpart (V4DImode, t3),
20227 const2_rtx, GEN_INT (3),
20228 const0_rtx, const1_rtx));
20229 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20230 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20231 t1 = t4;
20232 t2 = t3;
20233 goto merge_two;
20235 default:
20236 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20237 break;
20241 if (TARGET_XOP)
20243 /* The XOP VPPERM insn supports three inputs. By ignoring the
20244 one_operand_shuffle special case, we avoid creating another
20245 set of constant vectors in memory. */
20246 one_operand_shuffle = false;
20248 /* mask = mask & {2*w-1, ...} */
20249 vt = GEN_INT (2*w - 1);
20251 else
20253 /* mask = mask & {w-1, ...} */
20254 vt = GEN_INT (w - 1);
20257 for (i = 0; i < w; i++)
20258 vec[i] = vt;
20259 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20260 mask = expand_simple_binop (maskmode, AND, mask, vt,
20261 NULL_RTX, 0, OPTAB_DIRECT);
20263 /* For non-QImode operations, convert the word permutation control
20264 into a byte permutation control. */
20265 if (mode != V16QImode)
20267 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20268 GEN_INT (exact_log2 (e)),
20269 NULL_RTX, 0, OPTAB_DIRECT);
20271 /* Convert mask to vector of chars. */
20272 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20274 /* Replicate each of the input bytes into byte positions:
20275 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20276 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20277 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20278 for (i = 0; i < 16; ++i)
20279 vec[i] = GEN_INT (i/e * e);
20280 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20281 vt = force_const_mem (V16QImode, vt);
20282 if (TARGET_XOP)
20283 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20284 else
20285 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20287 /* Convert it into the byte positions by doing
20288 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20289 for (i = 0; i < 16; ++i)
20290 vec[i] = GEN_INT (i % e);
20291 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20292 vt = force_const_mem (V16QImode, vt);
20293 emit_insn (gen_addv16qi3 (mask, mask, vt));
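/* For example, with V4SImode elements (e == 4) a word index k in MASK
   has now been expanded into the byte indices 4*k, 4*k+1, 4*k+2 and
   4*k+3 that pshufb/vpperm expect.  */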
20296 /* The actual shuffle operations all operate on V16QImode. */
20297 op0 = gen_lowpart (V16QImode, op0);
20298 op1 = gen_lowpart (V16QImode, op1);
20299 target = gen_lowpart (V16QImode, target);
20301 if (TARGET_XOP)
20303 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20305 else if (one_operand_shuffle)
20307 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20309 else
20311 rtx xops[6];
20312 bool ok;
20314 /* Shuffle the two input vectors independently. */
20315 t1 = gen_reg_rtx (V16QImode);
20316 t2 = gen_reg_rtx (V16QImode);
20317 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20318 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20320 merge_two:
20321 /* Then merge them together. The key is whether any given control
20322 element contained a bit set that indicates the second word. */
20323 mask = operands[3];
20324 vt = GEN_INT (w);
20325 if (maskmode == V2DImode && !TARGET_SSE4_1)
20327 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20328 more shuffle to convert the V2DI input mask into a V4SI
20329 input mask, at which point the masking done by
20330 ix86_expand_int_vcond will work as desired. */
20331 rtx t3 = gen_reg_rtx (V4SImode);
20332 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20333 const0_rtx, const0_rtx,
20334 const2_rtx, const2_rtx));
20335 mask = t3;
20336 maskmode = V4SImode;
20337 e = w = 4;
20340 for (i = 0; i < w; i++)
20341 vec[i] = vt;
20342 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20343 vt = force_reg (maskmode, vt);
20344 mask = expand_simple_binop (maskmode, AND, mask, vt,
20345 NULL_RTX, 0, OPTAB_DIRECT);
20347 xops[0] = gen_lowpart (mode, operands[0]);
20348 xops[1] = gen_lowpart (mode, t2);
20349 xops[2] = gen_lowpart (mode, t1);
20350 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20351 xops[4] = mask;
20352 xops[5] = vt;
20353 ok = ix86_expand_int_vcond (xops);
20354 gcc_assert (ok);
20358 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
20359 true if we should do zero extension, else sign extension. HIGH_P is
20360 true if we want the N/2 high elements, else the low elements. */
20362 void
20363 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20365 enum machine_mode imode = GET_MODE (src);
20366 rtx tmp;
20368 if (TARGET_SSE4_1)
20370 rtx (*unpack)(rtx, rtx);
20371 rtx (*extract)(rtx, rtx) = NULL;
20372 enum machine_mode halfmode = BLKmode;
20374 switch (imode)
20376 case V32QImode:
20377 if (unsigned_p)
20378 unpack = gen_avx2_zero_extendv16qiv16hi2;
20379 else
20380 unpack = gen_avx2_sign_extendv16qiv16hi2;
20381 halfmode = V16QImode;
20382 extract
20383 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20384 break;
20385 case V16HImode:
20386 if (unsigned_p)
20387 unpack = gen_avx2_zero_extendv8hiv8si2;
20388 else
20389 unpack = gen_avx2_sign_extendv8hiv8si2;
20390 halfmode = V8HImode;
20391 extract
20392 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20393 break;
20394 case V8SImode:
20395 if (unsigned_p)
20396 unpack = gen_avx2_zero_extendv4siv4di2;
20397 else
20398 unpack = gen_avx2_sign_extendv4siv4di2;
20399 halfmode = V4SImode;
20400 extract
20401 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20402 break;
20403 case V16QImode:
20404 if (unsigned_p)
20405 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20406 else
20407 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20408 break;
20409 case V8HImode:
20410 if (unsigned_p)
20411 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20412 else
20413 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20414 break;
20415 case V4SImode:
20416 if (unsigned_p)
20417 unpack = gen_sse4_1_zero_extendv2siv2di2;
20418 else
20419 unpack = gen_sse4_1_sign_extendv2siv2di2;
20420 break;
20421 default:
20422 gcc_unreachable ();
20425 if (GET_MODE_SIZE (imode) == 32)
20427 tmp = gen_reg_rtx (halfmode);
20428 emit_insn (extract (tmp, src));
20430 else if (high_p)
20432 /* Shift higher 8 bytes to lower 8 bytes. */
20433 tmp = gen_reg_rtx (imode);
20434 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20435 gen_lowpart (V1TImode, src),
20436 GEN_INT (64)));
20438 else
20439 tmp = src;
20441 emit_insn (unpack (dest, tmp));
20443 else
20445 rtx (*unpack)(rtx, rtx, rtx);
20447 switch (imode)
20449 case V16QImode:
20450 if (high_p)
20451 unpack = gen_vec_interleave_highv16qi;
20452 else
20453 unpack = gen_vec_interleave_lowv16qi;
20454 break;
20455 case V8HImode:
20456 if (high_p)
20457 unpack = gen_vec_interleave_highv8hi;
20458 else
20459 unpack = gen_vec_interleave_lowv8hi;
20460 break;
20461 case V4SImode:
20462 if (high_p)
20463 unpack = gen_vec_interleave_highv4si;
20464 else
20465 unpack = gen_vec_interleave_lowv4si;
20466 break;
20467 default:
20468 gcc_unreachable ();
20471 if (unsigned_p)
20472 tmp = force_reg (imode, CONST0_RTX (imode));
20473 else
20474 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20475 src, pc_rtx, pc_rtx);
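/* For sign extension the upper half of each interleaved pair must be a
   copy of the sign; the 0 > SRC comparison above produces exactly that
   all-ones/all-zeros mask.  */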
20477 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20481 /* Expand conditional increment or decrement using adc/sbb instructions.
20482 The default case using setcc followed by the conditional move can be
20483 done by generic code. */
20484 bool
20485 ix86_expand_int_addcc (rtx operands[])
20487 enum rtx_code code = GET_CODE (operands[1]);
20488 rtx flags;
20489 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20490 rtx compare_op;
20491 rtx val = const0_rtx;
20492 bool fpcmp = false;
20493 enum machine_mode mode;
20494 rtx op0 = XEXP (operands[1], 0);
20495 rtx op1 = XEXP (operands[1], 1);
20497 if (operands[3] != const1_rtx
20498 && operands[3] != constm1_rtx)
20499 return false;
20500 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20501 return false;
20502 code = GET_CODE (compare_op);
20504 flags = XEXP (compare_op, 0);
20506 if (GET_MODE (flags) == CCFPmode
20507 || GET_MODE (flags) == CCFPUmode)
20509 fpcmp = true;
20510 code = ix86_fp_compare_code_to_integer (code);
20513 if (code != LTU)
20515 val = constm1_rtx;
20516 if (fpcmp)
20517 PUT_CODE (compare_op,
20518 reverse_condition_maybe_unordered
20519 (GET_CODE (compare_op)));
20520 else
20521 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20524 mode = GET_MODE (operands[0]);
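/* The comparison has been arranged so that the condition lives in the
   carry flag; a single adc or sbb against a 0 or -1 constant then
   performs the conditional increment or decrement without a setcc or
   branch.  */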
20526 /* Construct either adc or sbb insn. */
20527 if ((code == LTU) == (operands[3] == constm1_rtx))
20529 switch (mode)
20531 case QImode:
20532 insn = gen_subqi3_carry;
20533 break;
20534 case HImode:
20535 insn = gen_subhi3_carry;
20536 break;
20537 case SImode:
20538 insn = gen_subsi3_carry;
20539 break;
20540 case DImode:
20541 insn = gen_subdi3_carry;
20542 break;
20543 default:
20544 gcc_unreachable ();
20547 else
20549 switch (mode)
20551 case QImode:
20552 insn = gen_addqi3_carry;
20553 break;
20554 case HImode:
20555 insn = gen_addhi3_carry;
20556 break;
20557 case SImode:
20558 insn = gen_addsi3_carry;
20559 break;
20560 case DImode:
20561 insn = gen_adddi3_carry;
20562 break;
20563 default:
20564 gcc_unreachable ();
20567 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20569 return true;
20573 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20574 but works for floating point parameters and non-offsettable memories.
20575 For pushes, it returns just stack offsets; the values will be saved
20576 in the right order. At most four parts are generated. */
20578 static int
20579 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20581 int size;
20583 if (!TARGET_64BIT)
20584 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20585 else
20586 size = (GET_MODE_SIZE (mode) + 4) / 8;
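/* On 32-bit targets DImode and DFmode split into two SImode parts,
   XFmode into three and TFmode into four; on 64-bit targets XFmode and
   TFmode split into two parts.  */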
20588 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20589 gcc_assert (size >= 2 && size <= 4);
20591 /* Optimize constant pool references to immediates. This is used by fp
20592 moves, which force all constants to memory to allow combining. */
20593 if (MEM_P (operand) && MEM_READONLY_P (operand))
20595 rtx tmp = maybe_get_pool_constant (operand);
20596 if (tmp)
20597 operand = tmp;
20600 if (MEM_P (operand) && !offsettable_memref_p (operand))
20602 /* The only non-offsettable memories we handle are pushes. */
20603 int ok = push_operand (operand, VOIDmode);
20605 gcc_assert (ok);
20607 operand = copy_rtx (operand);
20608 PUT_MODE (operand, word_mode);
20609 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20610 return size;
20613 if (GET_CODE (operand) == CONST_VECTOR)
20615 enum machine_mode imode = int_mode_for_mode (mode);
20616 /* Caution: if we looked through a constant pool memory above,
20617 the operand may actually have a different mode now. That's
20618 ok, since we want to pun this all the way back to an integer. */
20619 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20620 gcc_assert (operand != NULL);
20621 mode = imode;
20624 if (!TARGET_64BIT)
20626 if (mode == DImode)
20627 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20628 else
20630 int i;
20632 if (REG_P (operand))
20634 gcc_assert (reload_completed);
20635 for (i = 0; i < size; i++)
20636 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20638 else if (offsettable_memref_p (operand))
20640 operand = adjust_address (operand, SImode, 0);
20641 parts[0] = operand;
20642 for (i = 1; i < size; i++)
20643 parts[i] = adjust_address (operand, SImode, 4 * i);
20645 else if (GET_CODE (operand) == CONST_DOUBLE)
20647 REAL_VALUE_TYPE r;
20648 long l[4];
20650 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20651 switch (mode)
20653 case TFmode:
20654 real_to_target (l, &r, mode);
20655 parts[3] = gen_int_mode (l[3], SImode);
20656 parts[2] = gen_int_mode (l[2], SImode);
20657 break;
20658 case XFmode:
20659 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20660 long double may not be 80-bit. */
20661 real_to_target (l, &r, mode);
20662 parts[2] = gen_int_mode (l[2], SImode);
20663 break;
20664 case DFmode:
20665 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20666 break;
20667 default:
20668 gcc_unreachable ();
20670 parts[1] = gen_int_mode (l[1], SImode);
20671 parts[0] = gen_int_mode (l[0], SImode);
20673 else
20674 gcc_unreachable ();
20677 else
20679 if (mode == TImode)
20680 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20681 if (mode == XFmode || mode == TFmode)
20683 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20684 if (REG_P (operand))
20686 gcc_assert (reload_completed);
20687 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20688 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20690 else if (offsettable_memref_p (operand))
20692 operand = adjust_address (operand, DImode, 0);
20693 parts[0] = operand;
20694 parts[1] = adjust_address (operand, upper_mode, 8);
20696 else if (GET_CODE (operand) == CONST_DOUBLE)
20698 REAL_VALUE_TYPE r;
20699 long l[4];
20701 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20702 real_to_target (l, &r, mode);
20704 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20705 if (HOST_BITS_PER_WIDE_INT >= 64)
20706 parts[0]
20707 = gen_int_mode
20708 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20709 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20710 DImode);
20711 else
20712 parts[0] = immed_double_const (l[0], l[1], DImode);
20714 if (upper_mode == SImode)
20715 parts[1] = gen_int_mode (l[2], SImode);
20716 else if (HOST_BITS_PER_WIDE_INT >= 64)
20717 parts[1]
20718 = gen_int_mode
20719 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20720 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20721 DImode);
20722 else
20723 parts[1] = immed_double_const (l[2], l[3], DImode);
20725 else
20726 gcc_unreachable ();
20730 return size;
20733 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20734 All required insns are emitted by this function; no separate normal
20735 moves are needed. Operands 2-4 contain the input values
20736 in the correct order; operands 5-7 contain the output values. */
20738 void
20739 ix86_split_long_move (rtx operands[])
20741 rtx part[2][4];
20742 int nparts, i, j;
20743 int push = 0;
20744 int collisions = 0;
20745 enum machine_mode mode = GET_MODE (operands[0]);
20746 bool collisionparts[4];
20748 /* The DFmode expanders may ask us to move a double.
20749 For a 64-bit target this is a single move. By hiding this fact
20750 here we simplify the i386.md splitters. */
20751 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20753 /* Optimize constant pool references to immediates. This is used by
20754 fp moves, which force all constants to memory to allow combining. */
20756 if (MEM_P (operands[1])
20757 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20758 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20759 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20760 if (push_operand (operands[0], VOIDmode))
20762 operands[0] = copy_rtx (operands[0]);
20763 PUT_MODE (operands[0], word_mode);
20765 else
20766 operands[0] = gen_lowpart (DImode, operands[0]);
20767 operands[1] = gen_lowpart (DImode, operands[1]);
20768 emit_move_insn (operands[0], operands[1]);
20769 return;
20772 /* The only non-offsettable memory we handle is a push. */
20773 if (push_operand (operands[0], VOIDmode))
20774 push = 1;
20775 else
20776 gcc_assert (!MEM_P (operands[0])
20777 || offsettable_memref_p (operands[0]));
20779 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20780 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20782 /* When emitting push, take care for source operands on the stack. */
20783 if (push && MEM_P (operands[1])
20784 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20786 rtx src_base = XEXP (part[1][nparts - 1], 0);
20788 /* Compensate for the stack decrement by 4. */
20789 if (!TARGET_64BIT && nparts == 3
20790 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20791 src_base = plus_constant (Pmode, src_base, 4);
20793 /* src_base refers to the stack pointer and is
20794 automatically decreased by the emitted pushes. */
20795 for (i = 0; i < nparts; i++)
20796 part[1][i] = change_address (part[1][i],
20797 GET_MODE (part[1][i]), src_base);
20800 /* We need to do the copy in the right order in case an address register
20801 of the source overlaps the destination. */
20802 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20804 rtx tmp;
20806 for (i = 0; i < nparts; i++)
20808 collisionparts[i]
20809 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20810 if (collisionparts[i])
20811 collisions++;
20814 /* Collision in the middle part can be handled by reordering. */
20815 if (collisions == 1 && nparts == 3 && collisionparts [1])
20817 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20818 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20820 else if (collisions == 1
20821 && nparts == 4
20822 && (collisionparts [1] || collisionparts [2]))
20824 if (collisionparts [1])
20826 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20827 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20829 else
20831 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20832 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20836 /* If there are more collisions, we can't handle it by reordering.
20837 Do an lea to the last part and use only one colliding move. */
20838 else if (collisions > 1)
20840 rtx base;
20842 collisions = 1;
20844 base = part[0][nparts - 1];
20846 /* Handle the case when the last part isn't valid for lea.
20847 Happens in 64-bit mode storing the 12-byte XFmode. */
20848 if (GET_MODE (base) != Pmode)
20849 base = gen_rtx_REG (Pmode, REGNO (base));
20851 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20852 part[1][0] = replace_equiv_address (part[1][0], base);
20853 for (i = 1; i < nparts; i++)
20855 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20856 part[1][i] = replace_equiv_address (part[1][i], tmp);
20861 if (push)
20863 if (!TARGET_64BIT)
20865 if (nparts == 3)
20867 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20868 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20869 stack_pointer_rtx, GEN_INT (-4)));
20870 emit_move_insn (part[0][2], part[1][2]);
20872 else if (nparts == 4)
20874 emit_move_insn (part[0][3], part[1][3]);
20875 emit_move_insn (part[0][2], part[1][2]);
20878 else
20880 /* In 64-bit mode we don't have a 32-bit push available. In case this is
20881 a register, that is OK - we will just use the larger counterpart. We also
20882 retype memories - these come from an attempt to avoid the REX prefix on
20883 moving the second half of a TFmode value. */
20884 if (GET_MODE (part[1][1]) == SImode)
20886 switch (GET_CODE (part[1][1]))
20888 case MEM:
20889 part[1][1] = adjust_address (part[1][1], DImode, 0);
20890 break;
20892 case REG:
20893 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20894 break;
20896 default:
20897 gcc_unreachable ();
20900 if (GET_MODE (part[1][0]) == SImode)
20901 part[1][0] = part[1][1];
20904 emit_move_insn (part[0][1], part[1][1]);
20905 emit_move_insn (part[0][0], part[1][0]);
20906 return;
20909 /* Choose correct order to not overwrite the source before it is copied. */
20910 if ((REG_P (part[0][0])
20911 && REG_P (part[1][1])
20912 && (REGNO (part[0][0]) == REGNO (part[1][1])
20913 || (nparts == 3
20914 && REGNO (part[0][0]) == REGNO (part[1][2]))
20915 || (nparts == 4
20916 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20917 || (collisions > 0
20918 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20920 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20922 operands[2 + i] = part[0][j];
20923 operands[6 + i] = part[1][j];
20926 else
20928 for (i = 0; i < nparts; i++)
20930 operands[2 + i] = part[0][i];
20931 operands[6 + i] = part[1][i];
20935 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20936 if (optimize_insn_for_size_p ())
20938 for (j = 0; j < nparts - 1; j++)
20939 if (CONST_INT_P (operands[6 + j])
20940 && operands[6 + j] != const0_rtx
20941 && REG_P (operands[2 + j]))
20942 for (i = j; i < nparts - 1; i++)
20943 if (CONST_INT_P (operands[7 + i])
20944 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20945 operands[7 + i] = operands[2 + j];
20948 for (i = 0; i < nparts; i++)
20949 emit_move_insn (operands[2 + i], operands[6 + i]);
20951 return;
20954 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20955 left shift by a constant, either using a single shift or
20956 a sequence of add instructions. */
20958 static void
20959 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20961 rtx (*insn)(rtx, rtx, rtx);
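/* Prefer a sequence of self-additions when COUNT adds are no more
   expensive than one shift by a constant and we are not optimizing
   for size; otherwise emit a single shift insn.  */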
20963 if (count == 1
20964 || (count * ix86_cost->add <= ix86_cost->shift_const
20965 && !optimize_insn_for_size_p ()))
20967 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20968 while (count-- > 0)
20969 emit_insn (insn (operand, operand, operand));
20971 else
20973 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20974 emit_insn (insn (operand, operand, GEN_INT (count)));
20978 void
20979 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20981 rtx (*gen_ashl3)(rtx, rtx, rtx);
20982 rtx (*gen_shld)(rtx, rtx, rtx);
20983 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20985 rtx low[2], high[2];
20986 int count;
20988 if (CONST_INT_P (operands[2]))
20990 split_double_mode (mode, operands, 2, low, high);
20991 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20993 if (count >= half_width)
20995 emit_move_insn (high[0], low[1]);
20996 emit_move_insn (low[0], const0_rtx);
20998 if (count > half_width)
20999 ix86_expand_ashl_const (high[0], count - half_width, mode);
21001 else
21003 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21005 if (!rtx_equal_p (operands[0], operands[1]))
21006 emit_move_insn (operands[0], operands[1]);
21008 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21009 ix86_expand_ashl_const (low[0], count, mode);
21011 return;
21014 split_double_mode (mode, operands, 1, low, high);
21016 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21018 if (operands[1] == const1_rtx)
21020 /* Assuming we've chosen QImode-capable registers, 1 << N
21021 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21022 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21024 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21026 ix86_expand_clear (low[0]);
21027 ix86_expand_clear (high[0]);
21028 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21030 d = gen_lowpart (QImode, low[0]);
21031 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21032 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21033 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21035 d = gen_lowpart (QImode, high[0]);
21036 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21037 s = gen_rtx_NE (QImode, flags, const0_rtx);
21038 emit_insn (gen_rtx_SET (VOIDmode, d, s));
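/* Exactly one of LOW/HIGH now holds 1, chosen by testing the
   HALF_WIDTH bit of the shift count; the shifts emitted further below
   use the count modulo HALF_WIDTH and so place the single set bit in
   the correct half.  */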
21041 /* Otherwise, we can get the same results by manually performing
21042 a bit extract operation on bit 5/6, and then performing the two
21043 shifts. The two methods of getting 0/1 into low/high are exactly
21044 the same size. Avoiding the shift in the bit extract case helps
21045 pentium4 a bit; no one else seems to care much either way. */
21046 else
21048 enum machine_mode half_mode;
21049 rtx (*gen_lshr3)(rtx, rtx, rtx);
21050 rtx (*gen_and3)(rtx, rtx, rtx);
21051 rtx (*gen_xor3)(rtx, rtx, rtx);
21052 HOST_WIDE_INT bits;
21053 rtx x;
21055 if (mode == DImode)
21057 half_mode = SImode;
21058 gen_lshr3 = gen_lshrsi3;
21059 gen_and3 = gen_andsi3;
21060 gen_xor3 = gen_xorsi3;
21061 bits = 5;
21063 else
21065 half_mode = DImode;
21066 gen_lshr3 = gen_lshrdi3;
21067 gen_and3 = gen_anddi3;
21068 gen_xor3 = gen_xordi3;
21069 bits = 6;
21072 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21073 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21074 else
21075 x = gen_lowpart (half_mode, operands[2]);
21076 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21078 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21079 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21080 emit_move_insn (low[0], high[0]);
21081 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21084 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21085 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21086 return;
21089 if (operands[1] == constm1_rtx)
21091 /* For -1 << N, we can avoid the shld instruction, because we
21092 know that we're shifting 0...31/63 ones into a -1. */
21093 emit_move_insn (low[0], constm1_rtx);
21094 if (optimize_insn_for_size_p ())
21095 emit_move_insn (high[0], low[0]);
21096 else
21097 emit_move_insn (high[0], constm1_rtx);
21099 else
21101 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21103 if (!rtx_equal_p (operands[0], operands[1]))
21104 emit_move_insn (operands[0], operands[1]);
21106 split_double_mode (mode, operands, 1, low, high);
21107 emit_insn (gen_shld (high[0], low[0], operands[2]));
21110 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
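/* The double-word shift sequence above is only correct for counts below
   HALF_WIDTH; the adjustment sequences below fix up the result when the
   HALF_WIDTH bit of the count is set, using cmov when a scratch register
   is available and a short branch otherwise.  */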
21112 if (TARGET_CMOVE && scratch)
21114 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21115 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21117 ix86_expand_clear (scratch);
21118 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21120 else
21122 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21123 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21125 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21129 void
21130 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21132 rtx (*gen_ashr3)(rtx, rtx, rtx)
21133 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21134 rtx (*gen_shrd)(rtx, rtx, rtx);
21135 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21137 rtx low[2], high[2];
21138 int count;
21140 if (CONST_INT_P (operands[2]))
21142 split_double_mode (mode, operands, 2, low, high);
21143 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21145 if (count == GET_MODE_BITSIZE (mode) - 1)
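/* An arithmetic right shift by the full width minus one leaves only
   copies of the sign bit, so both result halves are simply the high
   input half shifted down to its sign.  */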
21147 emit_move_insn (high[0], high[1]);
21148 emit_insn (gen_ashr3 (high[0], high[0],
21149 GEN_INT (half_width - 1)));
21150 emit_move_insn (low[0], high[0]);
21153 else if (count >= half_width)
21155 emit_move_insn (low[0], high[1]);
21156 emit_move_insn (high[0], low[0]);
21157 emit_insn (gen_ashr3 (high[0], high[0],
21158 GEN_INT (half_width - 1)));
21160 if (count > half_width)
21161 emit_insn (gen_ashr3 (low[0], low[0],
21162 GEN_INT (count - half_width)));
21164 else
21166 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21168 if (!rtx_equal_p (operands[0], operands[1]))
21169 emit_move_insn (operands[0], operands[1]);
21171 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21172 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21175 else
21177 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21179 if (!rtx_equal_p (operands[0], operands[1]))
21180 emit_move_insn (operands[0], operands[1]);
21182 split_double_mode (mode, operands, 1, low, high);
21184 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21185 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21187 if (TARGET_CMOVE && scratch)
21189 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21190 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21192 emit_move_insn (scratch, high[0]);
21193 emit_insn (gen_ashr3 (scratch, scratch,
21194 GEN_INT (half_width - 1)));
21195 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21196 scratch));
21198 else
21200 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21201 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21203 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21208 void
21209 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21211 rtx (*gen_lshr3)(rtx, rtx, rtx)
21212 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21213 rtx (*gen_shrd)(rtx, rtx, rtx);
21214 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21216 rtx low[2], high[2];
21217 int count;
21219 if (CONST_INT_P (operands[2]))
21221 split_double_mode (mode, operands, 2, low, high);
21222 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21224 if (count >= half_width)
21226 emit_move_insn (low[0], high[1]);
21227 ix86_expand_clear (high[0]);
21229 if (count > half_width)
21230 emit_insn (gen_lshr3 (low[0], low[0],
21231 GEN_INT (count - half_width)));
21233 else
21235 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21237 if (!rtx_equal_p (operands[0], operands[1]))
21238 emit_move_insn (operands[0], operands[1]);
21240 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21241 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21244 else
21246 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21248 if (!rtx_equal_p (operands[0], operands[1]))
21249 emit_move_insn (operands[0], operands[1]);
21251 split_double_mode (mode, operands, 1, low, high);
21253 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21254 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21256 if (TARGET_CMOVE && scratch)
21258 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21259 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21261 ix86_expand_clear (scratch);
21262 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21263 scratch));
21265 else
21267 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21268 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21270 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21275 /* Predict just emitted jump instruction to be taken with probability PROB. */
21276 static void
21277 predict_jump (int prob)
21279 rtx insn = get_last_insn ();
21280 gcc_assert (JUMP_P (insn));
21281 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21284 /* Helper function for the string operations below.  Test VARIABLE whether
21285 it is aligned to VALUE bytes.  If so, jump to the label. */
21286 static rtx
21287 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21289 rtx label = gen_label_rtx ();
21290 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21291 if (GET_MODE (variable) == DImode)
21292 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21293 else
21294 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21295 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21296 1, label);
21297 if (epilogue)
21298 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21299 else
21300 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21301 return label;
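/* For illustration: the sequence emitted above behaves roughly like

       if ((variable & value) == 0) goto label;

   so the code a caller places between this call and emit_label (label) runs
   only when the tested low bits of VARIABLE are set.  predict_jump annotates
   the branch as ~50% taken for epilogues and ~90% taken for prologues.  */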
21304 /* Adjust COUNTER by the VALUE. */
21305 static void
21306 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21308 rtx (*gen_add)(rtx, rtx, rtx)
21309 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21311 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21314 /* Zero extend possibly SImode EXP to Pmode register. */
21316 ix86_zero_extend_to_Pmode (rtx exp)
21318 if (GET_MODE (exp) != Pmode)
21319 exp = convert_to_mode (Pmode, exp, 1);
21320 return force_reg (Pmode, exp);
21323 /* Divide COUNTREG by SCALE. */
21324 static rtx
21325 scale_counter (rtx countreg, int scale)
21327 rtx sc;
21329 if (scale == 1)
21330 return countreg;
21331 if (CONST_INT_P (countreg))
21332 return GEN_INT (INTVAL (countreg) / scale);
21333 gcc_assert (REG_P (countreg));
21335 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21336 GEN_INT (exact_log2 (scale)),
21337 NULL, 1, OPTAB_DIRECT);
21338 return sc;
21341 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21342 DImode for constant loop counts. */
21344 static enum machine_mode
21345 counter_mode (rtx count_exp)
21347 if (GET_MODE (count_exp) != VOIDmode)
21348 return GET_MODE (count_exp);
21349 if (!CONST_INT_P (count_exp))
21350 return Pmode;
21351 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21352 return DImode;
21353 return SImode;
21356 /* When SRCPTR is non-NULL, output a simple loop to move memory
21357 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21358 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
21359 equivalent loop to set memory to VALUE (supposed to be in MODE).
21361 The size is rounded down to a whole number of chunks moved at once.
21362 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21365 static void
21366 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21367 rtx destptr, rtx srcptr, rtx value,
21368 rtx count, enum machine_mode mode, int unroll,
21369 int expected_size)
21371 rtx out_label, top_label, iter, tmp;
21372 enum machine_mode iter_mode = counter_mode (count);
21373 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21374 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21375 rtx size;
21376 rtx x_addr;
21377 rtx y_addr;
21378 int i;
21380 top_label = gen_label_rtx ();
21381 out_label = gen_label_rtx ();
21382 iter = gen_reg_rtx (iter_mode);
21384 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21385 NULL, 1, OPTAB_DIRECT);
21386 /* Those two should combine. */
21387 if (piece_size == const1_rtx)
21389 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21390 true, out_label);
21391 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21393 emit_move_insn (iter, const0_rtx);
21395 emit_label (top_label);
21397 tmp = convert_modes (Pmode, iter_mode, iter, true);
21398 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21399 destmem = change_address (destmem, mode, x_addr);
21401 if (srcmem)
21403 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21404 srcmem = change_address (srcmem, mode, y_addr);
21406 /* When unrolling for chips that reorder memory reads and writes,
21407 we can save registers by using a single temporary.
21408 Also, using 4 temporaries is overkill in 32-bit mode. */
21409 if (!TARGET_64BIT && 0)
21411 for (i = 0; i < unroll; i++)
21413 if (i)
21415 destmem =
21416 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21417 srcmem =
21418 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21420 emit_move_insn (destmem, srcmem);
21423 else
21425 rtx tmpreg[4];
21426 gcc_assert (unroll <= 4);
21427 for (i = 0; i < unroll; i++)
21429 tmpreg[i] = gen_reg_rtx (mode);
21430 if (i)
21432 srcmem =
21433 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21435 emit_move_insn (tmpreg[i], srcmem);
21437 for (i = 0; i < unroll; i++)
21439 if (i)
21441 destmem =
21442 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21444 emit_move_insn (destmem, tmpreg[i]);
21448 else
21449 for (i = 0; i < unroll; i++)
21451 if (i)
21452 destmem =
21453 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21454 emit_move_insn (destmem, value);
21457 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21458 true, OPTAB_LIB_WIDEN);
21459 if (tmp != iter)
21460 emit_move_insn (iter, tmp);
21462 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21463 true, top_label);
21464 if (expected_size != -1)
21466 expected_size /= GET_MODE_SIZE (mode) * unroll;
21467 if (expected_size == 0)
21468 predict_jump (0);
21469 else if (expected_size > REG_BR_PROB_BASE)
21470 predict_jump (REG_BR_PROB_BASE - 1);
21471 else
21472 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21474 else
21475 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21476 iter = ix86_zero_extend_to_Pmode (iter);
21477 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21478 true, OPTAB_LIB_WIDEN);
21479 if (tmp != destptr)
21480 emit_move_insn (destptr, tmp);
21481 if (srcptr)
21483 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21484 true, OPTAB_LIB_WIDEN);
21485 if (tmp != srcptr)
21486 emit_move_insn (srcptr, tmp);
21488 emit_label (out_label);
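/* Illustrative sketch (plain C, hypothetical helper; the expander above
   emits RTL, not C): shape of the generated loop for a copy with a
   pointer-wide MODE and UNROLL == 2.  The tail bytes left over after
   rounding COUNT down to whole pieces are handled by the epilogue code,
   not here.  */
static void
copy_loop_sketch (char *dest, const char *src, unsigned long count)
{
  unsigned long piece = 2 * sizeof (long);    /* GET_MODE_SIZE (mode) * unroll */
  unsigned long size = count & ~(piece - 1);  /* round down to whole pieces */
  unsigned long iter;

  for (iter = 0; iter < size; iter += piece)
    {
      long tmp0, tmp1;
      /* Load both chunks first, then store both, mirroring the tmpreg[]
         scheduling used above for chips that reorder reads and writes.  */
      __builtin_memcpy (&tmp0, src + iter, sizeof (long));
      __builtin_memcpy (&tmp1, src + iter + sizeof (long), sizeof (long));
      __builtin_memcpy (dest + iter, &tmp0, sizeof (long));
      __builtin_memcpy (dest + iter + sizeof (long), &tmp1, sizeof (long));
    }
  /* The real expansion then advances DESTPTR and SRCPTR by SIZE.  */
}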
21491 /* Output "rep; mov" instruction.
21492 Arguments have same meaning as for previous function */
21493 static void
21494 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21495 rtx destptr, rtx srcptr,
21496 rtx count,
21497 enum machine_mode mode)
21499 rtx destexp;
21500 rtx srcexp;
21501 rtx countreg;
21502 HOST_WIDE_INT rounded_count;
21504 /* If the size is known, it is shorter to use rep movs. */
21505 if (mode == QImode && CONST_INT_P (count)
21506 && !(INTVAL (count) & 3))
21507 mode = SImode;
21509 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21510 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21511 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21512 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21513 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21514 if (mode != QImode)
21516 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21517 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21518 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21519 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21520 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21521 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21523 else
21525 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21526 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21528 if (CONST_INT_P (count))
21530 rounded_count = (INTVAL (count)
21531 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21532 destmem = shallow_copy_rtx (destmem);
21533 srcmem = shallow_copy_rtx (srcmem);
21534 set_mem_size (destmem, rounded_count);
21535 set_mem_size (srcmem, rounded_count);
21537 else
21539 if (MEM_SIZE_KNOWN_P (destmem))
21540 clear_mem_size (destmem);
21541 if (MEM_SIZE_KNOWN_P (srcmem))
21542 clear_mem_size (srcmem);
21544 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21545 destexp, srcexp));
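/* Illustrative sketch (hypothetical helper, GNU C inline asm): for
   mode == DImode on a 64-bit target the insn emitted above amounts to the
   following, with the byte count pre-scaled to quadwords as scale_counter
   does.  */
static void
rep_movsq_sketch (void *dest, const void *src, unsigned long count_bytes)
{
  unsigned long qwords = count_bytes >> 3;   /* scale_counter (count, 8) */

  __asm__ volatile ("rep movsq"
                    : "+D" (dest), "+S" (src), "+c" (qwords)
                    :
                    : "memory");
}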
21548 /* Output "rep; stos" instruction.
21549 Arguments have same meaning as for previous function */
21550 static void
21551 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21552 rtx count, enum machine_mode mode,
21553 rtx orig_value)
21555 rtx destexp;
21556 rtx countreg;
21557 HOST_WIDE_INT rounded_count;
21559 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21560 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21561 value = force_reg (mode, gen_lowpart (mode, value));
21562 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21563 if (mode != QImode)
21565 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21566 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21567 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21569 else
21570 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21571 if (orig_value == const0_rtx && CONST_INT_P (count))
21573 rounded_count = (INTVAL (count)
21574 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21575 destmem = shallow_copy_rtx (destmem);
21576 set_mem_size (destmem, rounded_count);
21578 else if (MEM_SIZE_KNOWN_P (destmem))
21579 clear_mem_size (destmem);
21580 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
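/* Illustrative sketch (hypothetical helper, GNU C inline asm): the DImode
   case of the insn emitted above, assuming VALUE has already been promoted
   so that every byte of the quadword holds the fill byte.  */
static void
rep_stosq_sketch (void *dest, unsigned long long value, unsigned long count_bytes)
{
  unsigned long qwords = count_bytes >> 3;

  __asm__ volatile ("rep stosq"
                    : "+D" (dest), "+c" (qwords)
                    : "a" (value)
                    : "memory");
}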
21583 static void
21584 emit_strmov (rtx destmem, rtx srcmem,
21585 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21587 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21588 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21589 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21592 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21593 static void
21594 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21595 rtx destptr, rtx srcptr, rtx count, int max_size)
21597 rtx src, dest;
21598 if (CONST_INT_P (count))
21600 HOST_WIDE_INT countval = INTVAL (count);
21601 int offset = 0;
21603 if ((countval & 0x10) && max_size > 16)
21605 if (TARGET_64BIT)
21607 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21608 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21610 else
21611 gcc_unreachable ();
21612 offset += 16;
21614 if ((countval & 0x08) && max_size > 8)
21616 if (TARGET_64BIT)
21617 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21618 else
21620 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21621 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21623 offset += 8;
21625 if ((countval & 0x04) && max_size > 4)
21627 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21628 offset += 4;
21630 if ((countval & 0x02) && max_size > 2)
21632 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21633 offset += 2;
21635 if ((countval & 0x01) && max_size > 1)
21637 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21638 offset += 1;
21640 return;
21642 if (max_size > 8)
21644 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21645 count, 1, OPTAB_DIRECT);
21646 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21647 count, QImode, 1, 4);
21648 return;
21651 /* When single-instruction stringops are available (TARGET_SINGLE_STRINGOP), we can
21652 cheaply advance the dest and src pointers. Otherwise we save code size by maintaining
21653 an offset (zero is readily available from the preceding rep operation) and using x86 addressing modes.
21655 if (TARGET_SINGLE_STRINGOP)
21657 if (max_size > 4)
21659 rtx label = ix86_expand_aligntest (count, 4, true);
21660 src = change_address (srcmem, SImode, srcptr);
21661 dest = change_address (destmem, SImode, destptr);
21662 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21663 emit_label (label);
21664 LABEL_NUSES (label) = 1;
21666 if (max_size > 2)
21668 rtx label = ix86_expand_aligntest (count, 2, true);
21669 src = change_address (srcmem, HImode, srcptr);
21670 dest = change_address (destmem, HImode, destptr);
21671 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21672 emit_label (label);
21673 LABEL_NUSES (label) = 1;
21675 if (max_size > 1)
21677 rtx label = ix86_expand_aligntest (count, 1, true);
21678 src = change_address (srcmem, QImode, srcptr);
21679 dest = change_address (destmem, QImode, destptr);
21680 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21681 emit_label (label);
21682 LABEL_NUSES (label) = 1;
21685 else
21687 rtx offset = force_reg (Pmode, const0_rtx);
21688 rtx tmp;
21690 if (max_size > 4)
21692 rtx label = ix86_expand_aligntest (count, 4, true);
21693 src = change_address (srcmem, SImode, srcptr);
21694 dest = change_address (destmem, SImode, destptr);
21695 emit_move_insn (dest, src);
21696 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21697 true, OPTAB_LIB_WIDEN);
21698 if (tmp != offset)
21699 emit_move_insn (offset, tmp);
21700 emit_label (label);
21701 LABEL_NUSES (label) = 1;
21703 if (max_size > 2)
21705 rtx label = ix86_expand_aligntest (count, 2, true);
21706 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21707 src = change_address (srcmem, HImode, tmp);
21708 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21709 dest = change_address (destmem, HImode, tmp);
21710 emit_move_insn (dest, src);
21711 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21712 true, OPTAB_LIB_WIDEN);
21713 if (tmp != offset)
21714 emit_move_insn (offset, tmp);
21715 emit_label (label);
21716 LABEL_NUSES (label) = 1;
21718 if (max_size > 1)
21720 rtx label = ix86_expand_aligntest (count, 1, true);
21721 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21722 src = change_address (srcmem, QImode, tmp);
21723 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21724 dest = change_address (destmem, QImode, tmp);
21725 emit_move_insn (dest, src);
21726 emit_label (label);
21727 LABEL_NUSES (label) = 1;
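/* Worked example (illustrative): for a constant residual count of 23
   (binary 10111) with max_size > 16 on a 64-bit target, the constant branch
   above emits two DImode moves (16 bytes), then one SImode, one HImode and
   one QImode move, covering 16 + 4 + 2 + 1 = 23 bytes with no branches.  */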
21732 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
21733 static void
21734 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21735 rtx count, int max_size)
21737 count =
21738 expand_simple_binop (counter_mode (count), AND, count,
21739 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21740 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21741 gen_lowpart (QImode, value), count, QImode,
21742 1, max_size / 2);
21745 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
21746 static void
21747 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21749 rtx dest;
21751 if (CONST_INT_P (count))
21753 HOST_WIDE_INT countval = INTVAL (count);
21754 int offset = 0;
21756 if ((countval & 0x10) && max_size > 16)
21758 if (TARGET_64BIT)
21760 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21761 emit_insn (gen_strset (destptr, dest, value));
21762 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21763 emit_insn (gen_strset (destptr, dest, value));
21765 else
21766 gcc_unreachable ();
21767 offset += 16;
21769 if ((countval & 0x08) && max_size > 8)
21771 if (TARGET_64BIT)
21773 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21774 emit_insn (gen_strset (destptr, dest, value));
21776 else
21778 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21779 emit_insn (gen_strset (destptr, dest, value));
21780 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21781 emit_insn (gen_strset (destptr, dest, value));
21783 offset += 8;
21785 if ((countval & 0x04) && max_size > 4)
21787 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21788 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21789 offset += 4;
21791 if ((countval & 0x02) && max_size > 2)
21793 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21794 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21795 offset += 2;
21797 if ((countval & 0x01) && max_size > 1)
21799 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21800 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21801 offset += 1;
21803 return;
21805 if (max_size > 32)
21807 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21808 return;
21810 if (max_size > 16)
21812 rtx label = ix86_expand_aligntest (count, 16, true);
21813 if (TARGET_64BIT)
21815 dest = change_address (destmem, DImode, destptr);
21816 emit_insn (gen_strset (destptr, dest, value));
21817 emit_insn (gen_strset (destptr, dest, value));
21819 else
21821 dest = change_address (destmem, SImode, destptr);
21822 emit_insn (gen_strset (destptr, dest, value));
21823 emit_insn (gen_strset (destptr, dest, value));
21824 emit_insn (gen_strset (destptr, dest, value));
21825 emit_insn (gen_strset (destptr, dest, value));
21827 emit_label (label);
21828 LABEL_NUSES (label) = 1;
21830 if (max_size > 8)
21832 rtx label = ix86_expand_aligntest (count, 8, true);
21833 if (TARGET_64BIT)
21835 dest = change_address (destmem, DImode, destptr);
21836 emit_insn (gen_strset (destptr, dest, value));
21838 else
21840 dest = change_address (destmem, SImode, destptr);
21841 emit_insn (gen_strset (destptr, dest, value));
21842 emit_insn (gen_strset (destptr, dest, value));
21844 emit_label (label);
21845 LABEL_NUSES (label) = 1;
21847 if (max_size > 4)
21849 rtx label = ix86_expand_aligntest (count, 4, true);
21850 dest = change_address (destmem, SImode, destptr);
21851 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21852 emit_label (label);
21853 LABEL_NUSES (label) = 1;
21855 if (max_size > 2)
21857 rtx label = ix86_expand_aligntest (count, 2, true);
21858 dest = change_address (destmem, HImode, destptr);
21859 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21860 emit_label (label);
21861 LABEL_NUSES (label) = 1;
21863 if (max_size > 1)
21865 rtx label = ix86_expand_aligntest (count, 1, true);
21866 dest = change_address (destmem, QImode, destptr);
21867 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21868 emit_label (label);
21869 LABEL_NUSES (label) = 1;
21873 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned by ALIGN,
21874 to DESIRED_ALIGNMENT. */
21875 static void
21876 expand_movmem_prologue (rtx destmem, rtx srcmem,
21877 rtx destptr, rtx srcptr, rtx count,
21878 int align, int desired_alignment)
21880 if (align <= 1 && desired_alignment > 1)
21882 rtx label = ix86_expand_aligntest (destptr, 1, false);
21883 srcmem = change_address (srcmem, QImode, srcptr);
21884 destmem = change_address (destmem, QImode, destptr);
21885 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21886 ix86_adjust_counter (count, 1);
21887 emit_label (label);
21888 LABEL_NUSES (label) = 1;
21890 if (align <= 2 && desired_alignment > 2)
21892 rtx label = ix86_expand_aligntest (destptr, 2, false);
21893 srcmem = change_address (srcmem, HImode, srcptr);
21894 destmem = change_address (destmem, HImode, destptr);
21895 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21896 ix86_adjust_counter (count, 2);
21897 emit_label (label);
21898 LABEL_NUSES (label) = 1;
21900 if (align <= 4 && desired_alignment > 4)
21902 rtx label = ix86_expand_aligntest (destptr, 4, false);
21903 srcmem = change_address (srcmem, SImode, srcptr);
21904 destmem = change_address (destmem, SImode, destptr);
21905 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21906 ix86_adjust_counter (count, 4);
21907 emit_label (label);
21908 LABEL_NUSES (label) = 1;
21910 gcc_assert (desired_alignment <= 8);
21913 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21914 ALIGN_BYTES is how many bytes need to be copied. */
21915 static rtx
21916 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21917 int desired_align, int align_bytes)
21919 rtx src = *srcp;
21920 rtx orig_dst = dst;
21921 rtx orig_src = src;
21922 int off = 0;
21923 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21924 if (src_align_bytes >= 0)
21925 src_align_bytes = desired_align - src_align_bytes;
21926 if (align_bytes & 1)
21928 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21929 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21930 off = 1;
21931 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21933 if (align_bytes & 2)
21935 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21936 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21937 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21938 set_mem_align (dst, 2 * BITS_PER_UNIT);
21939 if (src_align_bytes >= 0
21940 && (src_align_bytes & 1) == (align_bytes & 1)
21941 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21942 set_mem_align (src, 2 * BITS_PER_UNIT);
21943 off = 2;
21944 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21946 if (align_bytes & 4)
21948 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21949 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21950 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21951 set_mem_align (dst, 4 * BITS_PER_UNIT);
21952 if (src_align_bytes >= 0)
21954 unsigned int src_align = 0;
21955 if ((src_align_bytes & 3) == (align_bytes & 3))
21956 src_align = 4;
21957 else if ((src_align_bytes & 1) == (align_bytes & 1))
21958 src_align = 2;
21959 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21960 set_mem_align (src, src_align * BITS_PER_UNIT);
21962 off = 4;
21963 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21965 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21966 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21967 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21968 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21969 if (src_align_bytes >= 0)
21971 unsigned int src_align = 0;
21972 if ((src_align_bytes & 7) == (align_bytes & 7))
21973 src_align = 8;
21974 else if ((src_align_bytes & 3) == (align_bytes & 3))
21975 src_align = 4;
21976 else if ((src_align_bytes & 1) == (align_bytes & 1))
21977 src_align = 2;
21978 if (src_align > (unsigned int) desired_align)
21979 src_align = desired_align;
21980 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21981 set_mem_align (src, src_align * BITS_PER_UNIT);
21983 if (MEM_SIZE_KNOWN_P (orig_dst))
21984 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21985 if (MEM_SIZE_KNOWN_P (orig_src))
21986 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21987 *srcp = src;
21988 return dst;
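/* Worked example (illustrative): with desired_align == 8 and align_bytes == 5
   (binary 101), the code above emits one QImode and one SImode string move,
   copying the 1 + 4 = 5 bytes needed to make DST 8-byte aligned, and then
   records the improved alignment and reduced size on the BLKmode operands.  */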
21991 /* Store enough bytes at DEST to align it, known to be aligned by ALIGN,
21992 to DESIRED_ALIGNMENT. */
21993 static void
21994 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21995 int align, int desired_alignment)
21997 if (align <= 1 && desired_alignment > 1)
21999 rtx label = ix86_expand_aligntest (destptr, 1, false);
22000 destmem = change_address (destmem, QImode, destptr);
22001 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22002 ix86_adjust_counter (count, 1);
22003 emit_label (label);
22004 LABEL_NUSES (label) = 1;
22006 if (align <= 2 && desired_alignment > 2)
22008 rtx label = ix86_expand_aligntest (destptr, 2, false);
22009 destmem = change_address (destmem, HImode, destptr);
22010 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22011 ix86_adjust_counter (count, 2);
22012 emit_label (label);
22013 LABEL_NUSES (label) = 1;
22015 if (align <= 4 && desired_alignment > 4)
22017 rtx label = ix86_expand_aligntest (destptr, 4, false);
22018 destmem = change_address (destmem, SImode, destptr);
22019 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22020 ix86_adjust_counter (count, 4);
22021 emit_label (label);
22022 LABEL_NUSES (label) = 1;
22024 gcc_assert (desired_alignment <= 8);
22027 /* Store enough bytes at DST, known to be aligned by ALIGN, to align it to
22028 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22029 static rtx
22030 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22031 int desired_align, int align_bytes)
22033 int off = 0;
22034 rtx orig_dst = dst;
22035 if (align_bytes & 1)
22037 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22038 off = 1;
22039 emit_insn (gen_strset (destreg, dst,
22040 gen_lowpart (QImode, value)));
22042 if (align_bytes & 2)
22044 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22045 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22046 set_mem_align (dst, 2 * BITS_PER_UNIT);
22047 off = 2;
22048 emit_insn (gen_strset (destreg, dst,
22049 gen_lowpart (HImode, value)));
22051 if (align_bytes & 4)
22053 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22054 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22055 set_mem_align (dst, 4 * BITS_PER_UNIT);
22056 off = 4;
22057 emit_insn (gen_strset (destreg, dst,
22058 gen_lowpart (SImode, value)));
22060 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22061 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22062 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22063 if (MEM_SIZE_KNOWN_P (orig_dst))
22064 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22065 return dst;
22068 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22069 static enum stringop_alg
22070 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22071 int *dynamic_check)
22073 const struct stringop_algs * algs;
22074 bool optimize_for_speed;
22075 /* Algorithms using the rep prefix want at least edi and ecx;
22076 additionally, memset wants eax and memcpy wants esi. Don't
22077 consider such algorithms if the user has appropriated those
22078 registers for their own purposes. */
22079 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22080 || (memset
22081 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22083 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22084 || (alg != rep_prefix_1_byte \
22085 && alg != rep_prefix_4_byte \
22086 && alg != rep_prefix_8_byte))
22087 const struct processor_costs *cost;
22089 /* Even if the string operation call is cold, we still might spend a lot
22090 of time processing large blocks. */
22091 if (optimize_function_for_size_p (cfun)
22092 || (optimize_insn_for_size_p ()
22093 && expected_size != -1 && expected_size < 256))
22094 optimize_for_speed = false;
22095 else
22096 optimize_for_speed = true;
22098 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22100 *dynamic_check = -1;
22101 if (memset)
22102 algs = &cost->memset[TARGET_64BIT != 0];
22103 else
22104 algs = &cost->memcpy[TARGET_64BIT != 0];
22105 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22106 return ix86_stringop_alg;
22107 /* rep; movq or rep; movl is the smallest variant. */
22108 else if (!optimize_for_speed)
22110 if (!count || (count & 3))
22111 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22112 else
22113 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22115 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22117 else if (expected_size != -1 && expected_size < 4)
22118 return loop_1_byte;
22119 else if (expected_size != -1)
22121 unsigned int i;
22122 enum stringop_alg alg = libcall;
22123 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22125 /* We get here if the algorithms that were not libcall-based
22126 were rep-prefix based and we are unable to use rep prefixes
22127 based on global register usage. Break out of the loop and
22128 use the heuristic below. */
22129 if (algs->size[i].max == 0)
22130 break;
22131 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22133 enum stringop_alg candidate = algs->size[i].alg;
22135 if (candidate != libcall && ALG_USABLE_P (candidate))
22136 alg = candidate;
22137 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22138 last non-libcall inline algorithm. */
22139 if (TARGET_INLINE_ALL_STRINGOPS)
22141 /* When the current size is best to be copied by a libcall,
22142 but we are still forced to inline, run the heuristic below
22143 that will pick code for medium sized blocks. */
22144 if (alg != libcall)
22145 return alg;
22146 break;
22148 else if (ALG_USABLE_P (candidate))
22149 return candidate;
22152 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22154 /* When asked to inline the call anyway, try to pick a meaningful choice.
22155 We look for the maximal size of block that is faster to copy by hand and
22156 handle blocks of at most that size, guessing that the average size will
22157 be roughly half of the maximum.
22159 If this turns out to be bad, we might simply specify the preferred
22160 choice in ix86_costs. */
22161 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22162 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22164 int max = -1;
22165 enum stringop_alg alg;
22166 int i;
22167 bool any_alg_usable_p = true;
22169 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22171 enum stringop_alg candidate = algs->size[i].alg;
22172 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22174 if (candidate != libcall && candidate
22175 && ALG_USABLE_P (candidate))
22176 max = algs->size[i].max;
22178 /* If there aren't any usable algorithms, then recursing on
22179 smaller sizes isn't going to find anything. Just return the
22180 simple byte-at-a-time copy loop. */
22181 if (!any_alg_usable_p)
22183 /* Pick something reasonable. */
22184 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22185 *dynamic_check = 128;
22186 return loop_1_byte;
22188 if (max == -1)
22189 max = 4096;
22190 alg = decide_alg (count, max / 2, memset, dynamic_check);
22191 gcc_assert (*dynamic_check == -1);
22192 gcc_assert (alg != libcall);
22193 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22194 *dynamic_check = max;
22195 return alg;
22197 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22198 #undef ALG_USABLE_P
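/* Worked example (hypothetical numbers): if the cost table for the current
   tuning had memcpy entries {{256, loop}, {8192, rep_prefix_4_byte},
   {-1, libcall}} and expected_size were 1000, the loop above would skip the
   first entry (256 < 1000) and accept the second (8192 >= 1000), returning
   rep_prefix_4_byte when rep prefixes are usable; otherwise the scan falls
   through to the TARGET_INLINE_ALL_STRINGOPS heuristic or the libcall
   default.  */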
22201 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22202 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22203 static int
22204 decide_alignment (int align,
22205 enum stringop_alg alg,
22206 int expected_size)
22208 int desired_align = 0;
22209 switch (alg)
22211 case no_stringop:
22212 gcc_unreachable ();
22213 case loop:
22214 case unrolled_loop:
22215 desired_align = GET_MODE_SIZE (Pmode);
22216 break;
22217 case rep_prefix_8_byte:
22218 desired_align = 8;
22219 break;
22220 case rep_prefix_4_byte:
22221 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22222 copying a whole cacheline at once. */
22223 if (TARGET_PENTIUMPRO)
22224 desired_align = 8;
22225 else
22226 desired_align = 4;
22227 break;
22228 case rep_prefix_1_byte:
22229 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22230 copying a whole cacheline at once. */
22231 if (TARGET_PENTIUMPRO)
22232 desired_align = 8;
22233 else
22234 desired_align = 1;
22235 break;
22236 case loop_1_byte:
22237 desired_align = 1;
22238 break;
22239 case libcall:
22240 return 0;
22243 if (optimize_size)
22244 desired_align = 1;
22245 if (desired_align < align)
22246 desired_align = align;
22247 if (expected_size != -1 && expected_size < 4)
22248 desired_align = align;
22249 return desired_align;
22252 /* Return the smallest power of 2 greater than VAL. */
22253 static int
22254 smallest_pow2_greater_than (int val)
22256 int ret = 1;
22257 while (ret <= val)
22258 ret <<= 1;
22259 return ret;
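/* Examples: smallest_pow2_greater_than (0) == 1, (7) == 8 and, because the
   loop uses <=, (8) == 16; the result is always strictly greater than VAL.  */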
22262 /* Expand string move (memcpy) operation. Use i386 string operations
22263 when profitable. expand_setmem contains similar code. The code
22264 depends upon architecture, block size and alignment, but always has
22265 the same overall structure:
22267 1) Prologue guard: Conditional that jumps to the epilogue for small
22268 blocks that can be handled by the epilogue alone. This is faster,
22269 but also needed for correctness, since the prologue assumes the block
22270 is larger than the desired alignment.
22272 Optional dynamic check for size and libcall for large
22273 blocks is emitted here too, with -minline-stringops-dynamically.
22275 2) Prologue: copy first few bytes in order to get destination
22276 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22277 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22278 copied. We emit either a jump tree on power of two sized
22279 blocks, or a byte loop.
22281 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22282 with specified algorithm.
22284 4) Epilogue: code copying tail of the block that is too small to be
22285 handled by main body (or up to size guarded by prologue guard). */
22287 bool
22288 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22289 rtx expected_align_exp, rtx expected_size_exp)
22291 rtx destreg;
22292 rtx srcreg;
22293 rtx label = NULL;
22294 rtx tmp;
22295 rtx jump_around_label = NULL;
22296 HOST_WIDE_INT align = 1;
22297 unsigned HOST_WIDE_INT count = 0;
22298 HOST_WIDE_INT expected_size = -1;
22299 int size_needed = 0, epilogue_size_needed;
22300 int desired_align = 0, align_bytes = 0;
22301 enum stringop_alg alg;
22302 int dynamic_check;
22303 bool need_zero_guard = false;
22305 if (CONST_INT_P (align_exp))
22306 align = INTVAL (align_exp);
22307 /* i386 can do misaligned accesses at a reasonably increased cost. */
22308 if (CONST_INT_P (expected_align_exp)
22309 && INTVAL (expected_align_exp) > align)
22310 align = INTVAL (expected_align_exp);
22311 /* ALIGN is the minimum of destination and source alignment, but we care here
22312 just about destination alignment. */
22313 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22314 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22316 if (CONST_INT_P (count_exp))
22317 count = expected_size = INTVAL (count_exp);
22318 if (CONST_INT_P (expected_size_exp) && count == 0)
22319 expected_size = INTVAL (expected_size_exp);
22321 /* Make sure we don't need to care about overflow later on. */
22322 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22323 return false;
22325 /* Step 0: Decide on preferred algorithm, desired alignment and
22326 size of chunks to be copied by main loop. */
22328 alg = decide_alg (count, expected_size, false, &dynamic_check);
22329 desired_align = decide_alignment (align, alg, expected_size);
22331 if (!TARGET_ALIGN_STRINGOPS)
22332 align = desired_align;
22334 if (alg == libcall)
22335 return false;
22336 gcc_assert (alg != no_stringop);
22337 if (!count)
22338 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22339 destreg = copy_addr_to_reg (XEXP (dst, 0));
22340 srcreg = copy_addr_to_reg (XEXP (src, 0));
22341 switch (alg)
22343 case libcall:
22344 case no_stringop:
22345 gcc_unreachable ();
22346 case loop:
22347 need_zero_guard = true;
22348 size_needed = GET_MODE_SIZE (word_mode);
22349 break;
22350 case unrolled_loop:
22351 need_zero_guard = true;
22352 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22353 break;
22354 case rep_prefix_8_byte:
22355 size_needed = 8;
22356 break;
22357 case rep_prefix_4_byte:
22358 size_needed = 4;
22359 break;
22360 case rep_prefix_1_byte:
22361 size_needed = 1;
22362 break;
22363 case loop_1_byte:
22364 need_zero_guard = true;
22365 size_needed = 1;
22366 break;
22369 epilogue_size_needed = size_needed;
22371 /* Step 1: Prologue guard. */
22373 /* Alignment code needs count to be in register. */
22374 if (CONST_INT_P (count_exp) && desired_align > align)
22376 if (INTVAL (count_exp) > desired_align
22377 && INTVAL (count_exp) > size_needed)
22379 align_bytes
22380 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22381 if (align_bytes <= 0)
22382 align_bytes = 0;
22383 else
22384 align_bytes = desired_align - align_bytes;
22386 if (align_bytes == 0)
22387 count_exp = force_reg (counter_mode (count_exp), count_exp);
22389 gcc_assert (desired_align >= 1 && align >= 1);
22391 /* Ensure that alignment prologue won't copy past end of block. */
22392 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22394 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22395 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22396 Make sure it is power of 2. */
22397 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22399 if (count)
22401 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22403 /* If main algorithm works on QImode, no epilogue is needed.
22404 For small sizes just don't align anything. */
22405 if (size_needed == 1)
22406 desired_align = align;
22407 else
22408 goto epilogue;
22411 else
22413 label = gen_label_rtx ();
22414 emit_cmp_and_jump_insns (count_exp,
22415 GEN_INT (epilogue_size_needed),
22416 LTU, 0, counter_mode (count_exp), 1, label);
22417 if (expected_size == -1 || expected_size < epilogue_size_needed)
22418 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22419 else
22420 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22424 /* Emit code to decide on runtime whether library call or inline should be
22425 used. */
22426 if (dynamic_check != -1)
22428 if (CONST_INT_P (count_exp))
22430 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22432 emit_block_move_via_libcall (dst, src, count_exp, false);
22433 count_exp = const0_rtx;
22434 goto epilogue;
22437 else
22439 rtx hot_label = gen_label_rtx ();
22440 jump_around_label = gen_label_rtx ();
22441 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22442 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22443 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22444 emit_block_move_via_libcall (dst, src, count_exp, false);
22445 emit_jump (jump_around_label);
22446 emit_label (hot_label);
22450 /* Step 2: Alignment prologue. */
22452 if (desired_align > align)
22454 if (align_bytes == 0)
22456 /* Except for the first move in the epilogue, we no longer know
22457 the constant offset in the aliasing info. It doesn't seem worth
22458 the pain to maintain it for the first move, so throw away
22459 the info early. */
22460 src = change_address (src, BLKmode, srcreg);
22461 dst = change_address (dst, BLKmode, destreg);
22462 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22463 desired_align);
22465 else
22467 /* If we know how many bytes need to be stored before dst is
22468 sufficiently aligned, maintain aliasing info accurately. */
22469 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22470 desired_align, align_bytes);
22471 count_exp = plus_constant (counter_mode (count_exp),
22472 count_exp, -align_bytes);
22473 count -= align_bytes;
22475 if (need_zero_guard
22476 && (count < (unsigned HOST_WIDE_INT) size_needed
22477 || (align_bytes == 0
22478 && count < ((unsigned HOST_WIDE_INT) size_needed
22479 + desired_align - align))))
22481 /* It is possible that we copied enough so the main loop will not
22482 execute. */
22483 gcc_assert (size_needed > 1);
22484 if (label == NULL_RTX)
22485 label = gen_label_rtx ();
22486 emit_cmp_and_jump_insns (count_exp,
22487 GEN_INT (size_needed),
22488 LTU, 0, counter_mode (count_exp), 1, label);
22489 if (expected_size == -1
22490 || expected_size < (desired_align - align) / 2 + size_needed)
22491 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22492 else
22493 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22496 if (label && size_needed == 1)
22498 emit_label (label);
22499 LABEL_NUSES (label) = 1;
22500 label = NULL;
22501 epilogue_size_needed = 1;
22503 else if (label == NULL_RTX)
22504 epilogue_size_needed = size_needed;
22506 /* Step 3: Main loop. */
22508 switch (alg)
22510 case libcall:
22511 case no_stringop:
22512 gcc_unreachable ();
22513 case loop_1_byte:
22514 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22515 count_exp, QImode, 1, expected_size);
22516 break;
22517 case loop:
22518 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22519 count_exp, word_mode, 1, expected_size);
22520 break;
22521 case unrolled_loop:
22522 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22523 registers for 4 temporaries anyway. */
22524 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22525 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22526 expected_size);
22527 break;
22528 case rep_prefix_8_byte:
22529 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22530 DImode);
22531 break;
22532 case rep_prefix_4_byte:
22533 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22534 SImode);
22535 break;
22536 case rep_prefix_1_byte:
22537 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22538 QImode);
22539 break;
22541 /* Properly adjust the offset of the src and dest memory for aliasing. */
22542 if (CONST_INT_P (count_exp))
22544 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22545 (count / size_needed) * size_needed);
22546 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22547 (count / size_needed) * size_needed);
22549 else
22551 src = change_address (src, BLKmode, srcreg);
22552 dst = change_address (dst, BLKmode, destreg);
22555 /* Step 4: Epilogue to copy the remaining bytes. */
22556 epilogue:
22557 if (label)
22559 /* When the main loop is done, COUNT_EXP might hold original count,
22560 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22561 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22562 bytes. Compensate if needed. */
22564 if (size_needed < epilogue_size_needed)
22566 tmp =
22567 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22568 GEN_INT (size_needed - 1), count_exp, 1,
22569 OPTAB_DIRECT);
22570 if (tmp != count_exp)
22571 emit_move_insn (count_exp, tmp);
22573 emit_label (label);
22574 LABEL_NUSES (label) = 1;
22577 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22578 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22579 epilogue_size_needed);
22580 if (jump_around_label)
22581 emit_label (jump_around_label);
22582 return true;
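/* Illustrative sketch (plain C, hypothetical; the expander above emits RTL):
   the four-step structure described before ix86_expand_movmem, for a
   variable-size copy whose main body moves 8-byte chunks.  */
static void
movmem_shape_sketch (char *dst, const char *src, unsigned long n)
{
  unsigned long chunks;

  if (n < 8)                                /* 1) prologue guard */
    goto epilogue;
  while (((unsigned long) dst & 7) != 0)    /* 2) alignment prologue */
    {
      *dst++ = *src++;
      n--;
    }
  for (chunks = n >> 3; chunks; chunks--)   /* 3) main body */
    {
      __builtin_memcpy (dst, src, 8);       /* stands in for one 8-byte move */
      dst += 8;
      src += 8;
    }
  n &= 7;
 epilogue:                                  /* 4) epilogue copies the tail */
  while (n--)
    *dst++ = *src++;
}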
22585 /* Helper function for memset. For a QImode value 0xXY produce
22586 0xXYXYXYXY of the width specified by MODE. This is essentially
22587 a * 0x10101010, but we can do slightly better than
22588 synth_mult by unwinding the sequence by hand on CPUs with
22589 slow multiply. */
22590 static rtx
22591 promote_duplicated_reg (enum machine_mode mode, rtx val)
22593 enum machine_mode valmode = GET_MODE (val);
22594 rtx tmp;
22595 int nops = mode == DImode ? 3 : 2;
22597 gcc_assert (mode == SImode || mode == DImode);
22598 if (val == const0_rtx)
22599 return copy_to_mode_reg (mode, const0_rtx);
22600 if (CONST_INT_P (val))
22602 HOST_WIDE_INT v = INTVAL (val) & 255;
22604 v |= v << 8;
22605 v |= v << 16;
22606 if (mode == DImode)
22607 v |= (v << 16) << 16;
22608 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22611 if (valmode == VOIDmode)
22612 valmode = QImode;
22613 if (valmode != QImode)
22614 val = gen_lowpart (QImode, val);
22615 if (mode == QImode)
22616 return val;
22617 if (!TARGET_PARTIAL_REG_STALL)
22618 nops--;
22619 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22620 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22621 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22622 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22624 rtx reg = convert_modes (mode, QImode, val, true);
22625 tmp = promote_duplicated_reg (mode, const1_rtx);
22626 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22627 OPTAB_DIRECT);
22629 else
22631 rtx reg = convert_modes (mode, QImode, val, true);
22633 if (!TARGET_PARTIAL_REG_STALL)
22634 if (mode == SImode)
22635 emit_insn (gen_movsi_insv_1 (reg, reg));
22636 else
22637 emit_insn (gen_movdi_insv_1 (reg, reg));
22638 else
22640 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22641 NULL, 1, OPTAB_DIRECT);
22642 reg =
22643 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22645 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22646 NULL, 1, OPTAB_DIRECT);
22647 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22648 if (mode == SImode)
22649 return reg;
22650 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22651 NULL, 1, OPTAB_DIRECT);
22652 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22653 return reg;
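/* Illustrative sketch (hypothetical helper): the byte-broadcast arithmetic
   used above, shown in plain C for a byte known at compile time.  */
static unsigned long long
broadcast_byte_sketch (unsigned char byte, int eight_bytes_wide)
{
  unsigned long long v = byte;  /* 0x00000000000000XY */

  v |= v << 8;                  /* 0x000000000000XYXY */
  v |= v << 16;                 /* 0x00000000XYXYXYXY */
  if (eight_bytes_wide)
    v |= v << 32;               /* 0xXYXYXYXYXYXYXYXY */
  return v;
}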
22657 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22658 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22659 alignment from ALIGN to DESIRED_ALIGN. */
22660 static rtx
22661 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22663 rtx promoted_val;
22665 if (TARGET_64BIT
22666 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22667 promoted_val = promote_duplicated_reg (DImode, val);
22668 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22669 promoted_val = promote_duplicated_reg (SImode, val);
22670 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22671 promoted_val = promote_duplicated_reg (HImode, val);
22672 else
22673 promoted_val = val;
22675 return promoted_val;
22678 /* Expand string clear operation (bzero). Use i386 string operations when
22679 profitable. See expand_movmem comment for explanation of individual
22680 steps performed. */
22681 bool
22682 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22683 rtx expected_align_exp, rtx expected_size_exp)
22685 rtx destreg;
22686 rtx label = NULL;
22687 rtx tmp;
22688 rtx jump_around_label = NULL;
22689 HOST_WIDE_INT align = 1;
22690 unsigned HOST_WIDE_INT count = 0;
22691 HOST_WIDE_INT expected_size = -1;
22692 int size_needed = 0, epilogue_size_needed;
22693 int desired_align = 0, align_bytes = 0;
22694 enum stringop_alg alg;
22695 rtx promoted_val = NULL;
22696 bool force_loopy_epilogue = false;
22697 int dynamic_check;
22698 bool need_zero_guard = false;
22700 if (CONST_INT_P (align_exp))
22701 align = INTVAL (align_exp);
22702 /* i386 can do misaligned accesses at a reasonably increased cost. */
22703 if (CONST_INT_P (expected_align_exp)
22704 && INTVAL (expected_align_exp) > align)
22705 align = INTVAL (expected_align_exp);
22706 if (CONST_INT_P (count_exp))
22707 count = expected_size = INTVAL (count_exp);
22708 if (CONST_INT_P (expected_size_exp) && count == 0)
22709 expected_size = INTVAL (expected_size_exp);
22711 /* Make sure we don't need to care about overflow later on. */
22712 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22713 return false;
22715 /* Step 0: Decide on preferred algorithm, desired alignment and
22716 size of chunks to be copied by main loop. */
22718 alg = decide_alg (count, expected_size, true, &dynamic_check);
22719 desired_align = decide_alignment (align, alg, expected_size);
22721 if (!TARGET_ALIGN_STRINGOPS)
22722 align = desired_align;
22724 if (alg == libcall)
22725 return false;
22726 gcc_assert (alg != no_stringop);
22727 if (!count)
22728 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22729 destreg = copy_addr_to_reg (XEXP (dst, 0));
22730 switch (alg)
22732 case libcall:
22733 case no_stringop:
22734 gcc_unreachable ();
22735 case loop:
22736 need_zero_guard = true;
22737 size_needed = GET_MODE_SIZE (word_mode);
22738 break;
22739 case unrolled_loop:
22740 need_zero_guard = true;
22741 size_needed = GET_MODE_SIZE (word_mode) * 4;
22742 break;
22743 case rep_prefix_8_byte:
22744 size_needed = 8;
22745 break;
22746 case rep_prefix_4_byte:
22747 size_needed = 4;
22748 break;
22749 case rep_prefix_1_byte:
22750 size_needed = 1;
22751 break;
22752 case loop_1_byte:
22753 need_zero_guard = true;
22754 size_needed = 1;
22755 break;
22757 epilogue_size_needed = size_needed;
22759 /* Step 1: Prologue guard. */
22761 /* Alignment code needs count to be in register. */
22762 if (CONST_INT_P (count_exp) && desired_align > align)
22764 if (INTVAL (count_exp) > desired_align
22765 && INTVAL (count_exp) > size_needed)
22767 align_bytes
22768 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22769 if (align_bytes <= 0)
22770 align_bytes = 0;
22771 else
22772 align_bytes = desired_align - align_bytes;
22774 if (align_bytes == 0)
22776 enum machine_mode mode = SImode;
22777 if (TARGET_64BIT && (count & ~0xffffffff))
22778 mode = DImode;
22779 count_exp = force_reg (mode, count_exp);
22782 /* Do the cheap promotion to allow better CSE across the
22783 main loop and epilogue (i.e. one load of the big constant in
22784 front of all the code). */
22785 if (CONST_INT_P (val_exp))
22786 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22787 desired_align, align);
22788 /* Ensure that alignment prologue won't copy past end of block. */
22789 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22791 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22792 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22793 Make sure it is power of 2. */
22794 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22796 /* To improve performance of small blocks, we jump around the code that
22797 promotes VAL. This means that if the promoted VAL is not constant,
22798 we might not be able to use it in the epilogue and have to fall back to
22799 the byte loop variant. */
22800 if (epilogue_size_needed > 2 && !promoted_val)
22801 force_loopy_epilogue = true;
22802 if (count)
22804 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22806 /* If main algorithm works on QImode, no epilogue is needed.
22807 For small sizes just don't align anything. */
22808 if (size_needed == 1)
22809 desired_align = align;
22810 else
22811 goto epilogue;
22814 else
22816 label = gen_label_rtx ();
22817 emit_cmp_and_jump_insns (count_exp,
22818 GEN_INT (epilogue_size_needed),
22819 LTU, 0, counter_mode (count_exp), 1, label);
22820 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22821 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22822 else
22823 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22826 if (dynamic_check != -1)
22828 rtx hot_label = gen_label_rtx ();
22829 jump_around_label = gen_label_rtx ();
22830 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22831 LEU, 0, counter_mode (count_exp), 1, hot_label);
22832 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22833 set_storage_via_libcall (dst, count_exp, val_exp, false);
22834 emit_jump (jump_around_label);
22835 emit_label (hot_label);
22838 /* Step 2: Alignment prologue. */
22840 /* Do the expensive promotion once we branched off the small blocks. */
22841 if (!promoted_val)
22842 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22843 desired_align, align);
22844 gcc_assert (desired_align >= 1 && align >= 1);
22846 if (desired_align > align)
22848 if (align_bytes == 0)
22850 /* Except for the first move in the epilogue, we no longer know
22851 the constant offset in the aliasing info. It doesn't seem worth
22852 the pain to maintain it for the first move, so throw away
22853 the info early. */
22854 dst = change_address (dst, BLKmode, destreg);
22855 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22856 desired_align);
22858 else
22860 /* If we know how many bytes need to be stored before dst is
22861 sufficiently aligned, maintain aliasing info accurately. */
22862 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22863 desired_align, align_bytes);
22864 count_exp = plus_constant (counter_mode (count_exp),
22865 count_exp, -align_bytes);
22866 count -= align_bytes;
22868 if (need_zero_guard
22869 && (count < (unsigned HOST_WIDE_INT) size_needed
22870 || (align_bytes == 0
22871 && count < ((unsigned HOST_WIDE_INT) size_needed
22872 + desired_align - align))))
22874 /* It is possible that we copied enough so the main loop will not
22875 execute. */
22876 gcc_assert (size_needed > 1);
22877 if (label == NULL_RTX)
22878 label = gen_label_rtx ();
22879 emit_cmp_and_jump_insns (count_exp,
22880 GEN_INT (size_needed),
22881 LTU, 0, counter_mode (count_exp), 1, label);
22882 if (expected_size == -1
22883 || expected_size < (desired_align - align) / 2 + size_needed)
22884 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22885 else
22886 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22889 if (label && size_needed == 1)
22891 emit_label (label);
22892 LABEL_NUSES (label) = 1;
22893 label = NULL;
22894 promoted_val = val_exp;
22895 epilogue_size_needed = 1;
22897 else if (label == NULL_RTX)
22898 epilogue_size_needed = size_needed;
22900 /* Step 3: Main loop. */
22902 switch (alg)
22904 case libcall:
22905 case no_stringop:
22906 gcc_unreachable ();
22907 case loop_1_byte:
22908 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22909 count_exp, QImode, 1, expected_size);
22910 break;
22911 case loop:
22912 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22913 count_exp, word_mode, 1, expected_size);
22914 break;
22915 case unrolled_loop:
22916 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22917 count_exp, word_mode, 4, expected_size);
22918 break;
22919 case rep_prefix_8_byte:
22920 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22921 DImode, val_exp);
22922 break;
22923 case rep_prefix_4_byte:
22924 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22925 SImode, val_exp);
22926 break;
22927 case rep_prefix_1_byte:
22928 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22929 QImode, val_exp);
22930 break;
22932 /* Properly adjust the offset of the src and dest memory for aliasing. */
22933 if (CONST_INT_P (count_exp))
22934 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22935 (count / size_needed) * size_needed);
22936 else
22937 dst = change_address (dst, BLKmode, destreg);
22939 /* Step 4: Epilogue to copy the remaining bytes. */
22941 if (label)
22943 /* When the main loop is done, COUNT_EXP might hold original count,
22944 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
22945 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
22946 bytes. Compensate if needed. */
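/* Illustrative example: with count == 37 and size_needed == 16, the main
   loop stores 32 bytes and the epilogue must handle 37 & 15 == 5 bytes;
   the AND with size_needed - 1 below reduces COUNT_EXP to that remainder
   when compensation is needed.  */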
22948 if (size_needed < epilogue_size_needed)
22950 tmp =
22951 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22952 GEN_INT (size_needed - 1), count_exp, 1,
22953 OPTAB_DIRECT);
22954 if (tmp != count_exp)
22955 emit_move_insn (count_exp, tmp);
22957 emit_label (label);
22958 LABEL_NUSES (label) = 1;
22960 epilogue:
22961 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22963 if (force_loopy_epilogue)
22964 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22965 epilogue_size_needed);
22966 else
22967 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22968 epilogue_size_needed);
22970 if (jump_around_label)
22971 emit_label (jump_around_label);
22972 return true;
22975 /* Expand the appropriate insns for doing strlen if not just doing
22976 repnz; scasb
22978 out = result, initialized with the start address
22979 align_rtx = alignment of the address.
22980 scratch = scratch register, initialized with the start address when
22981 not aligned, otherwise undefined
22983 This is just the body. It needs the initializations mentioned above and
22984 some address computing at the end. These things are done in i386.md. */
22986 static void
22987 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22989 int align;
22990 rtx tmp;
22991 rtx align_2_label = NULL_RTX;
22992 rtx align_3_label = NULL_RTX;
22993 rtx align_4_label = gen_label_rtx ();
22994 rtx end_0_label = gen_label_rtx ();
22995 rtx mem;
22996 rtx tmpreg = gen_reg_rtx (SImode);
22997 rtx scratch = gen_reg_rtx (SImode);
22998 rtx cmp;
23000 align = 0;
23001 if (CONST_INT_P (align_rtx))
23002 align = INTVAL (align_rtx);
23004 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23006 /* Is there a known alignment and is it less than 4? */
23007 if (align < 4)
23009 rtx scratch1 = gen_reg_rtx (Pmode);
23010 emit_move_insn (scratch1, out);
23011 /* Is there a known alignment and is it not 2? */
23012 if (align != 2)
23014 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23015 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23017 /* Leave just the 3 lower bits. */
23018 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23019 NULL_RTX, 0, OPTAB_WIDEN);
23021 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23022 Pmode, 1, align_4_label);
23023 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23024 Pmode, 1, align_2_label);
23025 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23026 Pmode, 1, align_3_label);
23028 else
23030 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23031 check whether it is aligned to a 4-byte boundary. */
23033 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23034 NULL_RTX, 0, OPTAB_WIDEN);
23036 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23037 Pmode, 1, align_4_label);
23040 mem = change_address (src, QImode, out);
23042 /* Now compare the bytes. */
23044 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23045 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23046 QImode, 1, end_0_label);
23048 /* Increment the address. */
23049 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23051 /* Not needed with an alignment of 2 */
23052 if (align != 2)
23054 emit_label (align_2_label);
23056 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23057 end_0_label);
23059 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23061 emit_label (align_3_label);
23064 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23065 end_0_label);
23067 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23070 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23071 align this loop: it only bloats the code and does not help
23072 speed. */
23073 emit_label (align_4_label);
23075 mem = change_address (src, SImode, out);
23076 emit_move_insn (scratch, mem);
23077 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23079 /* This formula yields a nonzero result iff one of the bytes is zero.
23080 This saves three branches inside the loop and many cycles. */
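/* Worked example (illustrative values): for scratch == 0x12003456, which
   contains a zero byte,
     (0x12003456 - 0x01010101) & ~0x12003456 & 0x80808080
       == 0x10ff3355 & 0xedffcba9 & 0x80808080 == 0x00800000 != 0,
   while a word with no zero byte, e.g. 0x01010101, yields 0.  */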
23082 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23083 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23084 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23085 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23086 gen_int_mode (0x80808080, SImode)));
23087 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23088 align_4_label);
23090 if (TARGET_CMOVE)
23092 rtx reg = gen_reg_rtx (SImode);
23093 rtx reg2 = gen_reg_rtx (Pmode);
23094 emit_move_insn (reg, tmpreg);
23095 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23097 /* If zero is not in the first two bytes, move two bytes forward. */
23098 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23099 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23100 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23101 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23102 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23103 reg,
23104 tmpreg)));
23105 /* Emit the lea manually to avoid clobbering the flags. */
23106 emit_insn (gen_rtx_SET (SImode, reg2,
23107 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23109 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23110 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23111 emit_insn (gen_rtx_SET (VOIDmode, out,
23112 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23113 reg2,
23114 out)));
23116 else
23118 rtx end_2_label = gen_label_rtx ();
23119 /* Is zero in the first two bytes? */
23121 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23122 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23123 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23124 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23125 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23126 pc_rtx);
23127 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23128 JUMP_LABEL (tmp) = end_2_label;
23130 /* Not in the first two. Move two bytes forward. */
23131 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23132 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23134 emit_label (end_2_label);
23138 /* Avoid a branch when fixing up the byte position. */
23139 tmpreg = gen_lowpart (QImode, tmpreg);
23140 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23141 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23142 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23143 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23145 emit_label (end_0_label);
23148 /* Expand strlen. */
23150 bool
23151 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23153 rtx addr, scratch1, scratch2, scratch3, scratch4;
23155 /* The generic case of the strlen expander is long. Avoid its
23156 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
23158 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23159 && !TARGET_INLINE_ALL_STRINGOPS
23160 && !optimize_insn_for_size_p ()
23161 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23162 return false;
23164 addr = force_reg (Pmode, XEXP (src, 0));
23165 scratch1 = gen_reg_rtx (Pmode);
23167 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23168 && !optimize_insn_for_size_p ())
23170 /* Well, it seems that some optimizer does not combine a call like
23171 foo(strlen(bar), strlen(bar));
23172 when the move and the subtraction are done here. It does calculate
23173 the length just once when these instructions are done inside of
23174 output_strlen_unroll(). But since &bar[strlen(bar)] is
23175 often used and this uses one fewer register for the lifetime of
23176 output_strlen_unroll(), this is better. */
23178 emit_move_insn (out, addr);
23180 ix86_expand_strlensi_unroll_1 (out, src, align);
23182 /* strlensi_unroll_1 returns the address of the zero at the end of
23183 the string, like memchr(), so compute the length by subtracting
23184 the start address. */
23185 emit_insn (ix86_gen_sub3 (out, out, addr));
23187 else
23189 rtx unspec;
23191 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23192 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23193 return false;
23195 scratch2 = gen_reg_rtx (Pmode);
23196 scratch3 = gen_reg_rtx (Pmode);
23197 scratch4 = force_reg (Pmode, constm1_rtx);
23199 emit_move_insn (scratch3, addr);
23200 eoschar = force_reg (QImode, eoschar);
23202 src = replace_equiv_address_nv (src, scratch3);
23204 /* If .md starts supporting :P, this can be done in .md. */
23205 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23206 scratch4), UNSPEC_SCAS);
23207 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23208 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23209 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23211 return true;
23214 /* For a given symbol (function), construct code to compute the address of its
23215 PLT entry in the large x86-64 PIC model. */
23216 static rtx
23217 construct_plt_address (rtx symbol)
23219 rtx tmp, unspec;
23221 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23222 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23223 gcc_assert (Pmode == DImode);
23225 tmp = gen_reg_rtx (Pmode);
23226 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23228 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23229 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23230 return tmp;
23234 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23235 rtx callarg2,
23236 rtx pop, bool sibcall)
23238 /* We need to represent that SI and DI registers are clobbered
23239 by SYSV calls. */
23240 static int clobbered_registers[] = {
23241 XMM6_REG, XMM7_REG, XMM8_REG,
23242 XMM9_REG, XMM10_REG, XMM11_REG,
23243 XMM12_REG, XMM13_REG, XMM14_REG,
23244 XMM15_REG, SI_REG, DI_REG
23246 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23247 rtx use = NULL, call;
23248 unsigned int vec_len;
23250 if (pop == const0_rtx)
23251 pop = NULL;
23252 gcc_assert (!TARGET_64BIT || !pop);
23254 if (TARGET_MACHO && !TARGET_64BIT)
23256 #if TARGET_MACHO
23257 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23258 fnaddr = machopic_indirect_call_target (fnaddr);
23259 #endif
23261 else
23263 /* Static functions and indirect calls don't need the pic register. */
23264 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23265 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23266 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23267 use_reg (&use, pic_offset_table_rtx);
23270 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23272 rtx al = gen_rtx_REG (QImode, AX_REG);
23273 emit_move_insn (al, callarg2);
23274 use_reg (&use, al);
23277 if (ix86_cmodel == CM_LARGE_PIC
23278 && MEM_P (fnaddr)
23279 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23280 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23281 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23282 else if (sibcall
23283 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23284 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23286 fnaddr = XEXP (fnaddr, 0);
23287 if (GET_MODE (fnaddr) != word_mode)
23288 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23289 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23292 vec_len = 0;
23293 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23294 if (retval)
23295 call = gen_rtx_SET (VOIDmode, retval, call);
23296 vec[vec_len++] = call;
23298 if (pop)
23300 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23301 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23302 vec[vec_len++] = pop;
23305 if (TARGET_64BIT_MS_ABI
23306 && (!callarg2 || INTVAL (callarg2) != -2))
23308 unsigned i;
23310 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23311 UNSPEC_MS_TO_SYSV_CALL);
23313 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23314 vec[vec_len++]
23315 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23316 ? TImode : DImode,
23317 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23318 ? TImode : DImode,
23319 clobbered_registers[i]));
23322 if (vec_len > 1)
23323 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23324 call = emit_call_insn (call);
23325 if (use)
23326 CALL_INSN_FUNCTION_USAGE (call) = use;
23328 return call;
23331 /* Output the assembly for a call instruction. */
23333 const char *
23334 ix86_output_call_insn (rtx insn, rtx call_op)
23336 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23337 bool seh_nop_p = false;
23338 const char *xasm;
23340 if (SIBLING_CALL_P (insn))
23342 if (direct_p)
23343 xasm = "jmp\t%P0";
23344 /* SEH epilogue detection requires the indirect branch case
23345 to include REX.W. */
23346 else if (TARGET_SEH)
23347 xasm = "rex.W jmp %A0";
23348 else
23349 xasm = "jmp\t%A0";
23351 output_asm_insn (xasm, &call_op);
23352 return "";
23355 /* SEH unwinding can require an extra nop to be emitted in several
23356 circumstances. Determine if we have one of those. */
23357 if (TARGET_SEH)
23359 rtx i;
23361 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23363 /* If we get to another real insn, we don't need the nop. */
23364 if (INSN_P (i))
23365 break;
23367 /* If we get to the epilogue note, prevent a catch region from
23368 being adjacent to the standard epilogue sequence. If non-
23369 call-exceptions, we'll have done this during epilogue emission. */
23370 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23371 && !flag_non_call_exceptions
23372 && !can_throw_internal (insn))
23374 seh_nop_p = true;
23375 break;
23379 /* If we didn't find a real insn following the call, prevent the
23380 unwinder from looking into the next function. */
23381 if (i == NULL)
23382 seh_nop_p = true;
23385 if (direct_p)
23386 xasm = "call\t%P0";
23387 else
23388 xasm = "call\t%A0";
23390 output_asm_insn (xasm, &call_op);
23392 if (seh_nop_p)
23393 return "nop";
23395 return "";
23398 /* Clear stack slot assignments remembered from previous functions.
23399 This is called from INIT_EXPANDERS once before RTL is emitted for each
23400 function. */
23402 static struct machine_function *
23403 ix86_init_machine_status (void)
23405 struct machine_function *f;
23407 f = ggc_alloc_cleared_machine_function ();
23408 f->use_fast_prologue_epilogue_nregs = -1;
23409 f->call_abi = ix86_abi;
23410 f->optimize_mode_switching[AVX_U128] = TARGET_VZEROUPPER;
23412 return f;
23415 /* Return a MEM corresponding to a stack slot with mode MODE.
23416 Allocate a new slot if necessary.
23418 The RTL for a function can have several slots available: N is
23419 which slot to use. */
23422 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23424 struct stack_local_entry *s;
23426 gcc_assert (n < MAX_386_STACK_LOCALS);
23428 for (s = ix86_stack_locals; s; s = s->next)
23429 if (s->mode == mode && s->n == n)
23430 return validize_mem (copy_rtx (s->rtl));
23432 s = ggc_alloc_stack_local_entry ();
23433 s->n = n;
23434 s->mode = mode;
23435 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23437 s->next = ix86_stack_locals;
23438 ix86_stack_locals = s;
23439 return validize_mem (s->rtl);
23442 static void
23443 ix86_instantiate_decls (void)
23445 struct stack_local_entry *s;
23447 for (s = ix86_stack_locals; s; s = s->next)
23448 if (s->rtl != NULL_RTX)
23449 instantiate_decl_rtl (s->rtl);
23452 /* Calculate the length of the memory address in the instruction encoding.
23453 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23454 or other prefixes. We never generate addr32 prefix for LEA insn. */
23457 memory_address_length (rtx addr, bool lea)
23459 struct ix86_address parts;
23460 rtx base, index, disp;
23461 int len;
23462 int ok;
23464 if (GET_CODE (addr) == PRE_DEC
23465 || GET_CODE (addr) == POST_INC
23466 || GET_CODE (addr) == PRE_MODIFY
23467 || GET_CODE (addr) == POST_MODIFY)
23468 return 0;
23470 ok = ix86_decompose_address (addr, &parts);
23471 gcc_assert (ok);
23473 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23475 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23476 if (TARGET_64BIT && !lea
23477 && (SImode_address_operand (addr, VOIDmode)
23478 || (parts.base && GET_MODE (parts.base) == SImode)
23479 || (parts.index && GET_MODE (parts.index) == SImode)))
23480 len++;
23482 base = parts.base;
23483 index = parts.index;
23484 disp = parts.disp;
23486 if (base && GET_CODE (base) == SUBREG)
23487 base = SUBREG_REG (base);
23488 if (index && GET_CODE (index) == SUBREG)
23489 index = SUBREG_REG (index);
23491 gcc_assert (base == NULL_RTX || REG_P (base));
23492 gcc_assert (index == NULL_RTX || REG_P (index));
23494 /* Rule of thumb:
23495 - esp as the base always wants an index,
23496 - ebp as the base always wants a displacement,
23497 - r12 as the base always wants an index,
23498 - r13 as the base always wants a displacement. */
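/* Illustrative encodings: (%eax) needs only the one-byte modrm, while
   (%esp) additionally needs a SIB byte and (%ebp) a zero 8-bit
   displacement, so each of the latter costs one extra byte; %r12 and
   %r13 behave like %esp and %ebp respectively in 64-bit code.  */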
23500 /* Register Indirect. */
23501 if (base && !index && !disp)
23503 /* esp (for its index) and ebp (for its displacement) need
23504 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23505 code. */
23506 if (base == arg_pointer_rtx
23507 || base == frame_pointer_rtx
23508 || REGNO (base) == SP_REG
23509 || REGNO (base) == BP_REG
23510 || REGNO (base) == R12_REG
23511 || REGNO (base) == R13_REG)
23512 len++;
23515 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23516 is not disp32, but disp32(%rip), so for plain disp32 a
23517 SIB byte is needed, unless print_operand_address
23518 optimizes it into disp32(%rip) or (%rip) is implied
23519 by UNSPEC. */
23520 else if (disp && !base && !index)
23522 len += 4;
23523 if (TARGET_64BIT)
23525 rtx symbol = disp;
23527 if (GET_CODE (disp) == CONST)
23528 symbol = XEXP (disp, 0);
23529 if (GET_CODE (symbol) == PLUS
23530 && CONST_INT_P (XEXP (symbol, 1)))
23531 symbol = XEXP (symbol, 0);
23533 if (GET_CODE (symbol) != LABEL_REF
23534 && (GET_CODE (symbol) != SYMBOL_REF
23535 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23536 && (GET_CODE (symbol) != UNSPEC
23537 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23538 && XINT (symbol, 1) != UNSPEC_PCREL
23539 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23540 len++;
23543 else
23545 /* Find the length of the displacement constant. */
23546 if (disp)
23548 if (base && satisfies_constraint_K (disp))
23549 len += 1;
23550 else
23551 len += 4;
23553 /* ebp always wants a displacement. Similarly r13. */
23554 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23555 len++;
23557 /* An index requires the two-byte modrm form.... */
23558 if (index
23559 /* ...like esp (or r12), which always wants an index. */
23560 || base == arg_pointer_rtx
23561 || base == frame_pointer_rtx
23562 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23563 len++;
23566 return len;
23569 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
23570 is set, expect that the insn has an 8-bit immediate alternative. */
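/* Illustrative example: "add $100, %eax" can use the sign-extended 8-bit
   immediate form, giving length_immediate 1, while "add $1000, %eax"
   needs a full 32-bit immediate and gets length_immediate 4.  */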
23572 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23574 int len = 0;
23575 int i;
23576 extract_insn_cached (insn);
23577 for (i = recog_data.n_operands - 1; i >= 0; --i)
23578 if (CONSTANT_P (recog_data.operand[i]))
23580 enum attr_mode mode = get_attr_mode (insn);
23582 gcc_assert (!len);
23583 if (shortform && CONST_INT_P (recog_data.operand[i]))
23585 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23586 switch (mode)
23588 case MODE_QI:
23589 len = 1;
23590 continue;
23591 case MODE_HI:
23592 ival = trunc_int_for_mode (ival, HImode);
23593 break;
23594 case MODE_SI:
23595 ival = trunc_int_for_mode (ival, SImode);
23596 break;
23597 default:
23598 break;
23600 if (IN_RANGE (ival, -128, 127))
23602 len = 1;
23603 continue;
23606 switch (mode)
23608 case MODE_QI:
23609 len = 1;
23610 break;
23611 case MODE_HI:
23612 len = 2;
23613 break;
23614 case MODE_SI:
23615 len = 4;
23616 break;
23617 /* Immediates for DImode instructions are encoded
23618 as 32-bit sign-extended values. */
23619 case MODE_DI:
23620 len = 4;
23621 break;
23622 default:
23623 fatal_insn ("unknown insn mode", insn);
23626 return len;
23629 /* Compute default value for "length_address" attribute. */
23631 ix86_attr_length_address_default (rtx insn)
23633 int i;
23635 if (get_attr_type (insn) == TYPE_LEA)
23637 rtx set = PATTERN (insn), addr;
23639 if (GET_CODE (set) == PARALLEL)
23640 set = XVECEXP (set, 0, 0);
23642 gcc_assert (GET_CODE (set) == SET);
23644 addr = SET_SRC (set);
23646 return memory_address_length (addr, true);
23649 extract_insn_cached (insn);
23650 for (i = recog_data.n_operands - 1; i >= 0; --i)
23651 if (MEM_P (recog_data.operand[i]))
23653 constrain_operands_cached (reload_completed);
23654 if (which_alternative != -1)
23656 const char *constraints = recog_data.constraints[i];
23657 int alt = which_alternative;
23659 while (*constraints == '=' || *constraints == '+')
23660 constraints++;
23661 while (alt-- > 0)
23662 while (*constraints++ != ',')
23664 /* Skip ignored operands. */
23665 if (*constraints == 'X')
23666 continue;
23668 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23670 return 0;
23673 /* Compute default value for "length_vex" attribute. It includes
23674 2 or 3 byte VEX prefix and 1 opcode byte. */
23677 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23679 int i;
23681 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX W bit
23682 requires the 3-byte VEX prefix. */
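/* Illustrative example: "vaddps %xmm2, %xmm1, %xmm0" fits the 2-byte (C5)
   prefix, whereas setting VEX.W or referencing %xmm8-%xmm15 through the
   REX.X/REX.B bits forces the 3-byte (C4) form, since the 2-byte prefix
   only encodes the R, vvvv, L and pp fields.  */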
23683 if (!has_0f_opcode || has_vex_w)
23684 return 3 + 1;
23686 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
23687 if (!TARGET_64BIT)
23688 return 2 + 1;
23690 extract_insn_cached (insn);
23692 for (i = recog_data.n_operands - 1; i >= 0; --i)
23693 if (REG_P (recog_data.operand[i]))
23695 /* REX.W bit uses 3 byte VEX prefix. */
23696 if (GET_MODE (recog_data.operand[i]) == DImode
23697 && GENERAL_REG_P (recog_data.operand[i]))
23698 return 3 + 1;
23700 else
23702 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23703 if (MEM_P (recog_data.operand[i])
23704 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23705 return 3 + 1;
23708 return 2 + 1;
23711 /* Return the maximum number of instructions a cpu can issue. */
23713 static int
23714 ix86_issue_rate (void)
23716 switch (ix86_tune)
23718 case PROCESSOR_PENTIUM:
23719 case PROCESSOR_ATOM:
23720 case PROCESSOR_K6:
23721 case PROCESSOR_BTVER2:
23722 return 2;
23724 case PROCESSOR_PENTIUMPRO:
23725 case PROCESSOR_PENTIUM4:
23726 case PROCESSOR_CORE2_32:
23727 case PROCESSOR_CORE2_64:
23728 case PROCESSOR_COREI7_32:
23729 case PROCESSOR_COREI7_64:
23730 case PROCESSOR_ATHLON:
23731 case PROCESSOR_K8:
23732 case PROCESSOR_AMDFAM10:
23733 case PROCESSOR_NOCONA:
23734 case PROCESSOR_GENERIC32:
23735 case PROCESSOR_GENERIC64:
23736 case PROCESSOR_BDVER1:
23737 case PROCESSOR_BDVER2:
23738 case PROCESSOR_BTVER1:
23739 return 3;
23741 default:
23742 return 1;
23746 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23747 by DEP_INSN and nothing else set by DEP_INSN. */
23749 static bool
23750 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23752 rtx set, set2;
23754 /* Simplify the test for uninteresting insns. */
23755 if (insn_type != TYPE_SETCC
23756 && insn_type != TYPE_ICMOV
23757 && insn_type != TYPE_FCMOV
23758 && insn_type != TYPE_IBR)
23759 return false;
23761 if ((set = single_set (dep_insn)) != 0)
23763 set = SET_DEST (set);
23764 set2 = NULL_RTX;
23766 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23767 && XVECLEN (PATTERN (dep_insn), 0) == 2
23768 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23769 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23771 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23772 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23774 else
23775 return false;
23777 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23778 return false;
23780 /* This test is true if the dependent insn reads the flags but
23781 not any other potentially set register. */
23782 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23783 return false;
23785 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23786 return false;
23788 return true;
23791 /* Return true iff USE_INSN has a memory address with operands set by
23792 SET_INSN. */
23794 bool
23795 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23797 int i;
23798 extract_insn_cached (use_insn);
23799 for (i = recog_data.n_operands - 1; i >= 0; --i)
23800 if (MEM_P (recog_data.operand[i]))
23802 rtx addr = XEXP (recog_data.operand[i], 0);
23803 return modified_in_p (addr, set_insn) != 0;
23805 return false;
23808 static int
23809 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23811 enum attr_type insn_type, dep_insn_type;
23812 enum attr_memory memory;
23813 rtx set, set2;
23814 int dep_insn_code_number;
23816 /* Anti and output dependencies have zero cost on all CPUs. */
23817 if (REG_NOTE_KIND (link) != 0)
23818 return 0;
23820 dep_insn_code_number = recog_memoized (dep_insn);
23822 /* If we can't recognize the insns, we can't really do anything. */
23823 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23824 return cost;
23826 insn_type = get_attr_type (insn);
23827 dep_insn_type = get_attr_type (dep_insn);
23829 switch (ix86_tune)
23831 case PROCESSOR_PENTIUM:
23832 /* Address Generation Interlock adds a cycle of latency. */
23833 if (insn_type == TYPE_LEA)
23835 rtx addr = PATTERN (insn);
23837 if (GET_CODE (addr) == PARALLEL)
23838 addr = XVECEXP (addr, 0, 0);
23840 gcc_assert (GET_CODE (addr) == SET);
23842 addr = SET_SRC (addr);
23843 if (modified_in_p (addr, dep_insn))
23844 cost += 1;
23846 else if (ix86_agi_dependent (dep_insn, insn))
23847 cost += 1;
23849 /* ??? Compares pair with jump/setcc. */
23850 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23851 cost = 0;
23853 /* Floating point stores require value to be ready one cycle earlier. */
23854 if (insn_type == TYPE_FMOV
23855 && get_attr_memory (insn) == MEMORY_STORE
23856 && !ix86_agi_dependent (dep_insn, insn))
23857 cost += 1;
23858 break;
23860 case PROCESSOR_PENTIUMPRO:
23861 memory = get_attr_memory (insn);
23863 /* INT->FP conversion is expensive. */
23864 if (get_attr_fp_int_src (dep_insn))
23865 cost += 5;
23867 /* There is one cycle extra latency between an FP op and a store. */
23868 if (insn_type == TYPE_FMOV
23869 && (set = single_set (dep_insn)) != NULL_RTX
23870 && (set2 = single_set (insn)) != NULL_RTX
23871 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23872 && MEM_P (SET_DEST (set2)))
23873 cost += 1;
23875 /* Show the ability of the reorder buffer to hide the latency of a load
23876 by executing it in parallel with the previous instruction, in case
23877 the previous instruction is not needed to compute the address. */
23878 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23879 && !ix86_agi_dependent (dep_insn, insn))
23881 /* Claim moves to take one cycle, as the core can issue one load
23882 at a time and the next load can start a cycle later. */
23883 if (dep_insn_type == TYPE_IMOV
23884 || dep_insn_type == TYPE_FMOV)
23885 cost = 1;
23886 else if (cost > 1)
23887 cost--;
23889 break;
23891 case PROCESSOR_K6:
23892 memory = get_attr_memory (insn);
23894 /* The esp dependency is resolved before the instruction is really
23895 finished. */
23896 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23897 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23898 return 1;
23900 /* INT->FP conversion is expensive. */
23901 if (get_attr_fp_int_src (dep_insn))
23902 cost += 5;
23904 /* Show the ability of the reorder buffer to hide the latency of a load
23905 by executing it in parallel with the previous instruction, in case
23906 the previous instruction is not needed to compute the address. */
23907 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23908 && !ix86_agi_dependent (dep_insn, insn))
23910 /* Claim moves to take one cycle, as the core can issue one load
23911 at a time and the next load can start a cycle later. */
23912 if (dep_insn_type == TYPE_IMOV
23913 || dep_insn_type == TYPE_FMOV)
23914 cost = 1;
23915 else if (cost > 2)
23916 cost -= 2;
23917 else
23918 cost = 1;
23920 break;
23922 case PROCESSOR_ATHLON:
23923 case PROCESSOR_K8:
23924 case PROCESSOR_AMDFAM10:
23925 case PROCESSOR_BDVER1:
23926 case PROCESSOR_BDVER2:
23927 case PROCESSOR_BTVER1:
23928 case PROCESSOR_BTVER2:
23929 case PROCESSOR_ATOM:
23930 case PROCESSOR_GENERIC32:
23931 case PROCESSOR_GENERIC64:
23932 memory = get_attr_memory (insn);
23934 /* Show the ability of the reorder buffer to hide the latency of a load
23935 by executing it in parallel with the previous instruction, in case
23936 the previous instruction is not needed to compute the address. */
23937 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23938 && !ix86_agi_dependent (dep_insn, insn))
23940 enum attr_unit unit = get_attr_unit (insn);
23941 int loadcost = 3;
23943 /* Because of the difference between the length of the integer and
23944 floating unit pipeline preparation stages, the memory operands
23945 for floating point are cheaper.
23947 ??? For Athlon the difference is most probably 2. */
23948 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23949 loadcost = 3;
23950 else
23951 loadcost = TARGET_ATHLON ? 2 : 0;
23953 if (cost >= loadcost)
23954 cost -= loadcost;
23955 else
23956 cost = 0;
23959 default:
23960 break;
23963 return cost;
23966 /* How many alternative schedules to try. This should be as wide as the
23967 scheduling freedom in the DFA, but no wider. Making this value too
23968 large results in extra work for the scheduler. */
23970 static int
23971 ia32_multipass_dfa_lookahead (void)
23973 switch (ix86_tune)
23975 case PROCESSOR_PENTIUM:
23976 return 2;
23978 case PROCESSOR_PENTIUMPRO:
23979 case PROCESSOR_K6:
23980 return 1;
23982 case PROCESSOR_CORE2_32:
23983 case PROCESSOR_CORE2_64:
23984 case PROCESSOR_COREI7_32:
23985 case PROCESSOR_COREI7_64:
23986 case PROCESSOR_ATOM:
23987 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23988 as the number of instructions that can be executed in a cycle, i.e.,
23989 issue_rate. I wonder why tuning for many CPUs does not do this. */
23990 if (reload_completed)
23991 return ix86_issue_rate ();
23992 /* Don't use lookahead for pre-reload schedule to save compile time. */
23993 return 0;
23995 default:
23996 return 0;
24000 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24001 execution. It is applied if
24002 (1) an IMUL instruction is at the top of the list, and
24003 (2) the ready list contains exactly one producer of an independent IMUL
24004 instruction;
24005 if so, (3) put the found producer at the top of the ready list.
24006 Returns the issue rate. */
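/* Illustrative example: if the top of the ready list is an IMUL and the
   list also contains the single producer feeding another, independent
   IMUL, that producer is moved to the top so the second IMUL becomes
   ready in time to keep the pipelined IMUL unit busy.  */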
24008 static int
24009 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24010 int clock_var ATTRIBUTE_UNUSED)
24012 static int issue_rate = -1;
24013 int n_ready = *pn_ready;
24014 rtx insn, insn1, insn2;
24015 int i;
24016 sd_iterator_def sd_it;
24017 dep_t dep;
24018 int index = -1;
24020 /* Set up issue rate. */
24021 issue_rate = ix86_issue_rate();
24023 /* Do reordering for Atom only. */
24024 if (ix86_tune != PROCESSOR_ATOM)
24025 return issue_rate;
24026 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24027 if (!reload_completed)
24028 return issue_rate;
24029 /* Nothing to do if ready list contains only 1 instruction. */
24030 if (n_ready <= 1)
24031 return issue_rate;
24033 /* Check that IMUL instruction is on the top of ready list. */
24034 insn = ready[n_ready - 1];
24035 if (!NONDEBUG_INSN_P (insn))
24036 return issue_rate;
24037 insn = PATTERN (insn);
24038 if (GET_CODE (insn) == PARALLEL)
24039 insn = XVECEXP (insn, 0, 0);
24040 if (GET_CODE (insn) != SET)
24041 return issue_rate;
24042 if (!(GET_CODE (SET_SRC (insn)) == MULT
24043 && GET_MODE (SET_SRC (insn)) == SImode))
24044 return issue_rate;
24046 /* Search for producer of independent IMUL instruction. */
24047 for (i = n_ready - 2; i>= 0; i--)
24049 insn = ready[i];
24050 if (!NONDEBUG_INSN_P (insn))
24051 continue;
24052 /* Skip IMUL instruction. */
24053 insn2 = PATTERN (insn);
24054 if (GET_CODE (insn2) == PARALLEL)
24055 insn2 = XVECEXP (insn2, 0, 0);
24056 if (GET_CODE (insn2) == SET
24057 && GET_CODE (SET_SRC (insn2)) == MULT
24058 && GET_MODE (SET_SRC (insn2)) == SImode)
24059 continue;
24061 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24063 rtx con;
24064 con = DEP_CON (dep);
24065 if (!NONDEBUG_INSN_P (con))
24066 continue;
24067 insn1 = PATTERN (con);
24068 if (GET_CODE (insn1) == PARALLEL)
24069 insn1 = XVECEXP (insn1, 0, 0);
24071 if (GET_CODE (insn1) == SET
24072 && GET_CODE (SET_SRC (insn1)) == MULT
24073 && GET_MODE (SET_SRC (insn1)) == SImode)
24075 sd_iterator_def sd_it1;
24076 dep_t dep1;
24077 /* Check if there is no other dependee for IMUL. */
24078 index = i;
24079 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24081 rtx pro;
24082 pro = DEP_PRO (dep1);
24083 if (!NONDEBUG_INSN_P (pro))
24084 continue;
24085 if (pro != insn)
24086 index = -1;
24088 if (index >= 0)
24089 break;
24092 if (index >= 0)
24093 break;
24095 if (index < 0)
24096 return issue_rate; /* Didn't find IMUL producer. */
24098 if (sched_verbose > 1)
24099 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24100 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24102 /* Put IMUL producer (ready[index]) at the top of ready list. */
24103 insn1 = ready[index];
24104 for (i = index; i < n_ready - 1; i++)
24105 ready[i] = ready[i + 1];
24106 ready[n_ready - 1] = insn1;
24108 return issue_rate;
24111 static bool
24112 ix86_class_likely_spilled_p (reg_class_t);
24114 /* Return true if the lhs of INSN is a HW function argument register; set
24115 IS_SPILLED to true if it is a likely-spilled HW register. */
24116 static bool
24117 insn_is_function_arg (rtx insn, bool* is_spilled)
24119 rtx dst;
24121 if (!NONDEBUG_INSN_P (insn))
24122 return false;
24123 /* Call instructions are not movable; ignore them. */
24124 if (CALL_P (insn))
24125 return false;
24126 insn = PATTERN (insn);
24127 if (GET_CODE (insn) == PARALLEL)
24128 insn = XVECEXP (insn, 0, 0);
24129 if (GET_CODE (insn) != SET)
24130 return false;
24131 dst = SET_DEST (insn);
24132 if (REG_P (dst) && HARD_REGISTER_P (dst)
24133 && ix86_function_arg_regno_p (REGNO (dst)))
24135 /* Is it likely spilled HW register? */
24136 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24137 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24138 *is_spilled = true;
24139 return true;
24141 return false;
24144 /* Add output dependencies for a chain of adjacent function-argument moves,
24145 but only if there is a move to a likely-spilled HW register. Return the first
24146 argument if at least one dependence was added, or NULL otherwise. */
24147 static rtx
24148 add_parameter_dependencies (rtx call, rtx head)
24150 rtx insn;
24151 rtx last = call;
24152 rtx first_arg = NULL;
24153 bool is_spilled = false;
24155 head = PREV_INSN (head);
24157 /* Find the argument-passing instruction nearest to the call. */
24158 while (true)
24160 last = PREV_INSN (last);
24161 if (last == head)
24162 return NULL;
24163 if (!NONDEBUG_INSN_P (last))
24164 continue;
24165 if (insn_is_function_arg (last, &is_spilled))
24166 break;
24167 return NULL;
24170 first_arg = last;
24171 while (true)
24173 insn = PREV_INSN (last);
24174 if (!INSN_P (insn))
24175 break;
24176 if (insn == head)
24177 break;
24178 if (!NONDEBUG_INSN_P (insn))
24180 last = insn;
24181 continue;
24183 if (insn_is_function_arg (insn, &is_spilled))
24185 /* Add an output dependence between two function arguments if the chain
24186 of output arguments contains likely-spilled HW registers. */
24187 if (is_spilled)
24188 add_dependence (last, insn, REG_DEP_OUTPUT);
24189 first_arg = last = insn;
24191 else
24192 break;
24194 if (!is_spilled)
24195 return NULL;
24196 return first_arg;
24199 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
24200 code motion. */
24201 static void
24202 avoid_func_arg_motion (rtx first_arg, rtx insn)
24204 rtx set;
24205 rtx tmp;
24207 set = single_set (insn);
24208 if (!set)
24209 return;
24210 tmp = SET_DEST (set);
24211 if (REG_P (tmp))
24213 /* Add output dependency to the first function argument. */
24214 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24215 return;
24217 /* Add anti dependency. */
24218 add_dependence (first_arg, insn, REG_DEP_ANTI);
24221 /* Avoid cross-block motion of a function argument by adding a dependency
24222 from the first non-jump instruction in BB. */
24223 static void
24224 add_dependee_for_func_arg (rtx arg, basic_block bb)
24226 rtx insn = BB_END (bb);
24228 while (insn)
24230 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24232 rtx set = single_set (insn);
24233 if (set)
24235 avoid_func_arg_motion (arg, insn);
24236 return;
24239 if (insn == BB_HEAD (bb))
24240 return;
24241 insn = PREV_INSN (insn);
24245 /* Hook for pre-reload schedule - avoid motion of function arguments
24246 passed in likely spilled HW registers. */
24247 static void
24248 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24250 rtx insn;
24251 rtx first_arg = NULL;
24252 if (reload_completed)
24253 return;
24254 while (head != tail && DEBUG_INSN_P (head))
24255 head = NEXT_INSN (head);
24256 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24257 if (INSN_P (insn) && CALL_P (insn))
24259 first_arg = add_parameter_dependencies (insn, head);
24260 if (first_arg)
24262 /* Add a dependee for the first argument to predecessors, but only if the
24263 region contains more than one block. */
24264 basic_block bb = BLOCK_FOR_INSN (insn);
24265 int rgn = CONTAINING_RGN (bb->index);
24266 int nr_blks = RGN_NR_BLOCKS (rgn);
24267 /* Skip trivial regions and region head blocks that can have
24268 predecessors outside of region. */
24269 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24271 edge e;
24272 edge_iterator ei;
24273 /* Assume that region is SCC, i.e. all immediate predecessors
24274 of non-head block are in the same region. */
24275 FOR_EACH_EDGE (e, ei, bb->preds)
24277 /* Avoid creating loop-carried dependencies by
24278 using the topological ordering in the region. */
24279 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24280 add_dependee_for_func_arg (first_arg, e->src);
24283 insn = first_arg;
24284 if (insn == head)
24285 break;
24288 else if (first_arg)
24289 avoid_func_arg_motion (first_arg, insn);
24292 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
24293 HW registers to the maximum, to schedule them as soon as possible. These are
24294 moves from function argument registers at the top of the function entry
24295 and moves from function return value registers after a call. */
24296 static int
24297 ix86_adjust_priority (rtx insn, int priority)
24299 rtx set;
24301 if (reload_completed)
24302 return priority;
24304 if (!NONDEBUG_INSN_P (insn))
24305 return priority;
24307 set = single_set (insn);
24308 if (set)
24310 rtx tmp = SET_SRC (set);
24311 if (REG_P (tmp)
24312 && HARD_REGISTER_P (tmp)
24313 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24314 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24315 return current_sched_info->sched_max_insns_priority;
24318 return priority;
24321 /* Model decoder of Core 2/i7.
24322 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24323 track the instruction fetch block boundaries and make sure that long
24324 (9+ bytes) instructions are assigned to D0. */
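/* Illustrative example (using the parameters set below): with a 16-byte
   ifetch block and at most 6 insns decoded per cycle, an 11-byte insn
   exceeds the 8-byte secondary-decoder limit, so the filter masks it out
   of ready_try unless it would be the first insn issued in the cycle.  */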
24326 /* Maximum length of an insn that can be handled by
24327 a secondary decoder unit. '8' for Core 2/i7. */
24328 static int core2i7_secondary_decoder_max_insn_size;
24330 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24331 '16' for Core 2/i7. */
24332 static int core2i7_ifetch_block_size;
24334 /* Maximum number of instructions decoder can handle per cycle.
24335 '6' for Core 2/i7. */
24336 static int core2i7_ifetch_block_max_insns;
24338 typedef struct ix86_first_cycle_multipass_data_ *
24339 ix86_first_cycle_multipass_data_t;
24340 typedef const struct ix86_first_cycle_multipass_data_ *
24341 const_ix86_first_cycle_multipass_data_t;
24343 /* A variable to store target state across calls to max_issue within
24344 one cycle. */
24345 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24346 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24348 /* Initialize DATA. */
24349 static void
24350 core2i7_first_cycle_multipass_init (void *_data)
24352 ix86_first_cycle_multipass_data_t data
24353 = (ix86_first_cycle_multipass_data_t) _data;
24355 data->ifetch_block_len = 0;
24356 data->ifetch_block_n_insns = 0;
24357 data->ready_try_change = NULL;
24358 data->ready_try_change_size = 0;
24361 /* Advancing the cycle; reset ifetch block counts. */
24362 static void
24363 core2i7_dfa_post_advance_cycle (void)
24365 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24367 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24369 data->ifetch_block_len = 0;
24370 data->ifetch_block_n_insns = 0;
24373 static int min_insn_size (rtx);
24375 /* Filter out insns from ready_try that the core will not be able to issue
24376 on current cycle due to decoder. */
24377 static void
24378 core2i7_first_cycle_multipass_filter_ready_try
24379 (const_ix86_first_cycle_multipass_data_t data,
24380 char *ready_try, int n_ready, bool first_cycle_insn_p)
24382 while (n_ready--)
24384 rtx insn;
24385 int insn_size;
24387 if (ready_try[n_ready])
24388 continue;
24390 insn = get_ready_element (n_ready);
24391 insn_size = min_insn_size (insn);
24393 if (/* If this is too long an insn for a secondary decoder ... */
24394 (!first_cycle_insn_p
24395 && insn_size > core2i7_secondary_decoder_max_insn_size)
24396 /* ... or it would not fit into the ifetch block ... */
24397 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24398 /* ... or the decoder is full already ... */
24399 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24400 /* ... mask the insn out. */
24402 ready_try[n_ready] = 1;
24404 if (data->ready_try_change)
24405 bitmap_set_bit (data->ready_try_change, n_ready);
24410 /* Prepare for a new round of multipass lookahead scheduling. */
24411 static void
24412 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24413 bool first_cycle_insn_p)
24415 ix86_first_cycle_multipass_data_t data
24416 = (ix86_first_cycle_multipass_data_t) _data;
24417 const_ix86_first_cycle_multipass_data_t prev_data
24418 = ix86_first_cycle_multipass_data;
24420 /* Restore the state from the end of the previous round. */
24421 data->ifetch_block_len = prev_data->ifetch_block_len;
24422 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24424 /* Filter instructions that cannot be issued on current cycle due to
24425 decoder restrictions. */
24426 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24427 first_cycle_insn_p);
24430 /* INSN is being issued in current solution. Account for its impact on
24431 the decoder model. */
24432 static void
24433 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24434 rtx insn, const void *_prev_data)
24436 ix86_first_cycle_multipass_data_t data
24437 = (ix86_first_cycle_multipass_data_t) _data;
24438 const_ix86_first_cycle_multipass_data_t prev_data
24439 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24441 int insn_size = min_insn_size (insn);
24443 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24444 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24445 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24446 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24448 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24449 if (!data->ready_try_change)
24451 data->ready_try_change = sbitmap_alloc (n_ready);
24452 data->ready_try_change_size = n_ready;
24454 else if (data->ready_try_change_size < n_ready)
24456 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24457 n_ready, 0);
24458 data->ready_try_change_size = n_ready;
24460 bitmap_clear (data->ready_try_change);
24462 /* Filter out insns from ready_try that the core will not be able to issue
24463 on current cycle due to decoder. */
24464 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24465 false);
24468 /* Revert the effect on ready_try. */
24469 static void
24470 core2i7_first_cycle_multipass_backtrack (const void *_data,
24471 char *ready_try,
24472 int n_ready ATTRIBUTE_UNUSED)
24474 const_ix86_first_cycle_multipass_data_t data
24475 = (const_ix86_first_cycle_multipass_data_t) _data;
24476 unsigned int i = 0;
24477 sbitmap_iterator sbi;
24479 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24480 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24482 ready_try[i] = 0;
24486 /* Save the result of multipass lookahead scheduling for the next round. */
24487 static void
24488 core2i7_first_cycle_multipass_end (const void *_data)
24490 const_ix86_first_cycle_multipass_data_t data
24491 = (const_ix86_first_cycle_multipass_data_t) _data;
24492 ix86_first_cycle_multipass_data_t next_data
24493 = ix86_first_cycle_multipass_data;
24495 if (data != NULL)
24497 next_data->ifetch_block_len = data->ifetch_block_len;
24498 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24502 /* Deallocate target data. */
24503 static void
24504 core2i7_first_cycle_multipass_fini (void *_data)
24506 ix86_first_cycle_multipass_data_t data
24507 = (ix86_first_cycle_multipass_data_t) _data;
24509 if (data->ready_try_change)
24511 sbitmap_free (data->ready_try_change);
24512 data->ready_try_change = NULL;
24513 data->ready_try_change_size = 0;
24517 /* Prepare for scheduling pass. */
24518 static void
24519 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24520 int verbose ATTRIBUTE_UNUSED,
24521 int max_uid ATTRIBUTE_UNUSED)
24523 /* Install scheduling hooks for current CPU. Some of these hooks are used
24524 in time-critical parts of the scheduler, so we only set them up when
24525 they are actually used. */
24526 switch (ix86_tune)
24528 case PROCESSOR_CORE2_32:
24529 case PROCESSOR_CORE2_64:
24530 case PROCESSOR_COREI7_32:
24531 case PROCESSOR_COREI7_64:
24532 /* Do not perform multipass scheduling for pre-reload schedule
24533 to save compile time. */
24534 if (reload_completed)
24536 targetm.sched.dfa_post_advance_cycle
24537 = core2i7_dfa_post_advance_cycle;
24538 targetm.sched.first_cycle_multipass_init
24539 = core2i7_first_cycle_multipass_init;
24540 targetm.sched.first_cycle_multipass_begin
24541 = core2i7_first_cycle_multipass_begin;
24542 targetm.sched.first_cycle_multipass_issue
24543 = core2i7_first_cycle_multipass_issue;
24544 targetm.sched.first_cycle_multipass_backtrack
24545 = core2i7_first_cycle_multipass_backtrack;
24546 targetm.sched.first_cycle_multipass_end
24547 = core2i7_first_cycle_multipass_end;
24548 targetm.sched.first_cycle_multipass_fini
24549 = core2i7_first_cycle_multipass_fini;
24551 /* Set decoder parameters. */
24552 core2i7_secondary_decoder_max_insn_size = 8;
24553 core2i7_ifetch_block_size = 16;
24554 core2i7_ifetch_block_max_insns = 6;
24555 break;
24557 /* ... Fall through ... */
24558 default:
24559 targetm.sched.dfa_post_advance_cycle = NULL;
24560 targetm.sched.first_cycle_multipass_init = NULL;
24561 targetm.sched.first_cycle_multipass_begin = NULL;
24562 targetm.sched.first_cycle_multipass_issue = NULL;
24563 targetm.sched.first_cycle_multipass_backtrack = NULL;
24564 targetm.sched.first_cycle_multipass_end = NULL;
24565 targetm.sched.first_cycle_multipass_fini = NULL;
24566 break;
24571 /* Compute the alignment given to a constant that is being placed in memory.
24572 EXP is the constant and ALIGN is the alignment that the object would
24573 ordinarily have.
24574 The value of this function is used instead of that alignment to align
24575 the object. */
24578 ix86_constant_alignment (tree exp, int align)
24580 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24581 || TREE_CODE (exp) == INTEGER_CST)
24583 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24584 return 64;
24585 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24586 return 128;
24588 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24589 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24590 return BITS_PER_WORD;
24592 return align;
24595 /* Compute the alignment for a static variable.
24596 TYPE is the data type, and ALIGN is the alignment that
24597 the object would ordinarily have. The value of this function is used
24598 instead of that alignment to align the object. */
24601 ix86_data_alignment (tree type, int align)
24603 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24605 if (AGGREGATE_TYPE_P (type)
24606 && TYPE_SIZE (type)
24607 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24608 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24609 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24610 && align < max_align)
24611 align = max_align;
24613 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24614 to a 16-byte boundary. */
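/* Illustrative example: a 32-byte "static double a[4]" therefore gets at
   least 16-byte alignment on x86-64, allowing aligned SSE accesses.  */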
24615 if (TARGET_64BIT)
24617 if (AGGREGATE_TYPE_P (type)
24618 && TYPE_SIZE (type)
24619 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24620 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24621 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24622 return 128;
24625 if (TREE_CODE (type) == ARRAY_TYPE)
24627 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24628 return 64;
24629 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24630 return 128;
24632 else if (TREE_CODE (type) == COMPLEX_TYPE)
24635 if (TYPE_MODE (type) == DCmode && align < 64)
24636 return 64;
24637 if ((TYPE_MODE (type) == XCmode
24638 || TYPE_MODE (type) == TCmode) && align < 128)
24639 return 128;
24641 else if ((TREE_CODE (type) == RECORD_TYPE
24642 || TREE_CODE (type) == UNION_TYPE
24643 || TREE_CODE (type) == QUAL_UNION_TYPE)
24644 && TYPE_FIELDS (type))
24646 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24647 return 64;
24648 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24649 return 128;
24651 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24652 || TREE_CODE (type) == INTEGER_TYPE)
24654 if (TYPE_MODE (type) == DFmode && align < 64)
24655 return 64;
24656 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24657 return 128;
24660 return align;
24663 /* Compute the alignment for a local variable or a stack slot. EXP is
24664 the data type or decl itself, MODE is the widest mode available and
24665 ALIGN is the alignment that the object would ordinarily have. The
24666 value of this macro is used instead of that alignment to align the
24667 object. */
24669 unsigned int
24670 ix86_local_alignment (tree exp, enum machine_mode mode,
24671 unsigned int align)
24673 tree type, decl;
24675 if (exp && DECL_P (exp))
24677 type = TREE_TYPE (exp);
24678 decl = exp;
24680 else
24682 type = exp;
24683 decl = NULL;
24686 /* Don't do dynamic stack realignment for long long objects with
24687 -mpreferred-stack-boundary=2. */
24688 if (!TARGET_64BIT
24689 && align == 64
24690 && ix86_preferred_stack_boundary < 64
24691 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24692 && (!type || !TYPE_USER_ALIGN (type))
24693 && (!decl || !DECL_USER_ALIGN (decl)))
24694 align = 32;
24696 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24697 register in MODE. We will return the largest alignment of XF
24698 and DF. */
24699 if (!type)
24701 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24702 align = GET_MODE_ALIGNMENT (DFmode);
24703 return align;
24706 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24707 to a 16-byte boundary. The exact wording is:
24709 An array uses the same alignment as its elements, except that a local or
24710 global array variable of length at least 16 bytes or
24711 a C99 variable-length array variable always has alignment of at least 16 bytes.
24713 This was added to allow the use of aligned SSE instructions on arrays. The
24714 rule is meant for static storage (where the compiler cannot do the analysis
24715 by itself). We follow it for automatic variables only when convenient:
24716 we fully control everything in the function being compiled, and functions from
24717 other units cannot rely on the alignment.
24719 Exclude the va_list type. It is the common case of a local array where
24720 we cannot benefit from the alignment. */
24721 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24722 && TARGET_SSE)
24724 if (AGGREGATE_TYPE_P (type)
24725 && (va_list_type_node == NULL_TREE
24726 || (TYPE_MAIN_VARIANT (type)
24727 != TYPE_MAIN_VARIANT (va_list_type_node)))
24728 && TYPE_SIZE (type)
24729 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24730 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24731 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24732 return 128;
24734 if (TREE_CODE (type) == ARRAY_TYPE)
24736 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24737 return 64;
24738 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24739 return 128;
24741 else if (TREE_CODE (type) == COMPLEX_TYPE)
24743 if (TYPE_MODE (type) == DCmode && align < 64)
24744 return 64;
24745 if ((TYPE_MODE (type) == XCmode
24746 || TYPE_MODE (type) == TCmode) && align < 128)
24747 return 128;
24749 else if ((TREE_CODE (type) == RECORD_TYPE
24750 || TREE_CODE (type) == UNION_TYPE
24751 || TREE_CODE (type) == QUAL_UNION_TYPE)
24752 && TYPE_FIELDS (type))
24754 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24755 return 64;
24756 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24757 return 128;
24759 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24760 || TREE_CODE (type) == INTEGER_TYPE)
24763 if (TYPE_MODE (type) == DFmode && align < 64)
24764 return 64;
24765 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24766 return 128;
24768 return align;
24771 /* Compute the minimum required alignment for dynamic stack realignment
24772 purposes for a local variable, parameter or a stack slot. EXP is
24773 the data type or decl itself, MODE is its mode and ALIGN is the
24774 alignment that the object would ordinarily have. */
24776 unsigned int
24777 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24778 unsigned int align)
24780 tree type, decl;
24782 if (exp && DECL_P (exp))
24784 type = TREE_TYPE (exp);
24785 decl = exp;
24787 else
24789 type = exp;
24790 decl = NULL;
24793 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24794 return align;
24796 /* Don't do dynamic stack realignment for long long objects with
24797 -mpreferred-stack-boundary=2. */
24798 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24799 && (!type || !TYPE_USER_ALIGN (type))
24800 && (!decl || !DECL_USER_ALIGN (decl)))
24801 return 32;
24803 return align;
24806 /* Find a location for the static chain incoming to a nested function.
24807 This is a register, unless all free registers are used by arguments. */
24809 static rtx
24810 ix86_static_chain (const_tree fndecl, bool incoming_p)
24812 unsigned regno;
24814 if (!DECL_STATIC_CHAIN (fndecl))
24815 return NULL;
24817 if (TARGET_64BIT)
24819 /* We always use R10 in 64-bit mode. */
24820 regno = R10_REG;
24822 else
24824 tree fntype;
24825 unsigned int ccvt;
24827 /* By default in 32-bit mode we use ECX to pass the static chain. */
24828 regno = CX_REG;
24830 fntype = TREE_TYPE (fndecl);
24831 ccvt = ix86_get_callcvt (fntype);
24832 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24834 /* Fastcall functions use ecx/edx for arguments, which leaves
24835 us with EAX for the static chain.
24836 Thiscall functions use ecx for arguments, which also
24837 leaves us with EAX for the static chain. */
24838 regno = AX_REG;
24840 else if (ix86_function_regparm (fntype, fndecl) == 3)
24842 /* For regparm 3, we have no free call-clobbered registers in
24843 which to store the static chain. In order to implement this,
24844 we have the trampoline push the static chain to the stack.
24845 However, we can't push a value below the return address when
24846 we call the nested function directly, so we have to use an
24847 alternate entry point. For this we use ESI, and have the
24848 alternate entry point push ESI, so that things appear the
24849 same once we're executing the nested function. */
24850 if (incoming_p)
24852 if (fndecl == current_function_decl)
24853 ix86_static_chain_on_stack = true;
24854 return gen_frame_mem (SImode,
24855 plus_constant (Pmode,
24856 arg_pointer_rtx, -8));
24858 regno = SI_REG;
24862 return gen_rtx_REG (Pmode, regno);
24865 /* Emit RTL insns to initialize the variable parts of a trampoline.
24866 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24867 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24868 to be passed to the target function. */
24870 static void
24871 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24873 rtx mem, fnaddr;
24874 int opcode;
24875 int offset = 0;
24877 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24879 if (TARGET_64BIT)
24881 int size;
24883 /* Load the function address into r11.  Try to load the address using
24884 the shorter movl instead of movabs.  We may want to support
24885 movq for kernel mode, but the kernel does not use trampolines at
24886 the moment.  FNADDR is a 32-bit address and may not be in
24887 DImode when ptr_mode == SImode.  Always use movl in this
24888 case. */
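/* The 64-bit trampoline emitted below is therefore, at most, 24 bytes:

     49 bb <imm64>  movabs $fnaddr, %r11  (or 41 bb <imm32>, movl to %r11d)
     49 ba <imm64>  movabs $chain,  %r10  (or 41 ba <imm32>, movl to %r10d)
     49 ff e3       jmp    *%r11
     90             nop  (padding for the final 32-bit store)  */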
24889 if (ptr_mode == SImode
24890 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24892 fnaddr = copy_addr_to_reg (fnaddr);
24894 mem = adjust_address (m_tramp, HImode, offset);
24895 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24897 mem = adjust_address (m_tramp, SImode, offset + 2);
24898 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24899 offset += 6;
24901 else
24903 mem = adjust_address (m_tramp, HImode, offset);
24904 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24906 mem = adjust_address (m_tramp, DImode, offset + 2);
24907 emit_move_insn (mem, fnaddr);
24908 offset += 10;
24911 /* Load the static chain into r10 using movabs.  Use the shorter movl
24912 instead of movabs when ptr_mode == SImode. */
24913 if (ptr_mode == SImode)
24915 opcode = 0xba41;
24916 size = 6;
24918 else
24920 opcode = 0xba49;
24921 size = 10;
24924 mem = adjust_address (m_tramp, HImode, offset);
24925 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24927 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24928 emit_move_insn (mem, chain_value);
24929 offset += size;
24931 /* Jump to r11; the last (unused) byte is a nop, only there to
24932 pad the write out to a single 32-bit store. */
24933 mem = adjust_address (m_tramp, SImode, offset);
24934 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24935 offset += 4;
24937 else
24939 rtx disp, chain;
24941 /* Depending on the static chain location, either load a register
24942 with a constant, or push the constant to the stack. All of the
24943 instructions are the same size. */
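/* The 32-bit trampoline emitted below is always 10 bytes:

     b8/b9 <imm32>  mov  $chain, %eax/%ecx  (or 68 <imm32>, push $chain)
     e9 <rel32>     jmp  <nested function>  (entry + 1 when the chain was
                                             pushed, to skip the 1-byte push
                                             of the call-saved register)  */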
24944 chain = ix86_static_chain (fndecl, true);
24945 if (REG_P (chain))
24947 switch (REGNO (chain))
24949 case AX_REG:
24950 opcode = 0xb8; break;
24951 case CX_REG:
24952 opcode = 0xb9; break;
24953 default:
24954 gcc_unreachable ();
24957 else
24958 opcode = 0x68;
24960 mem = adjust_address (m_tramp, QImode, offset);
24961 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24963 mem = adjust_address (m_tramp, SImode, offset + 1);
24964 emit_move_insn (mem, chain_value);
24965 offset += 5;
24967 mem = adjust_address (m_tramp, QImode, offset);
24968 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24970 mem = adjust_address (m_tramp, SImode, offset + 1);
24972 /* Compute the offset from the end of the jmp to the target function.
24973 When the trampoline stores the static chain on
24974 the stack, we need to skip the function's first insn, which pushes the
24975 (call-saved) static chain register; this push is 1 byte. */
24976 offset += 5;
24977 disp = expand_binop (SImode, sub_optab, fnaddr,
24978 plus_constant (Pmode, XEXP (m_tramp, 0),
24979 offset - (MEM_P (chain) ? 1 : 0)),
24980 NULL_RTX, 1, OPTAB_DIRECT);
24981 emit_move_insn (mem, disp);
24984 gcc_assert (offset <= TRAMPOLINE_SIZE);
24986 #ifdef HAVE_ENABLE_EXECUTE_STACK
24987 #ifdef CHECK_EXECUTE_STACK_ENABLED
24988 if (CHECK_EXECUTE_STACK_ENABLED)
24989 #endif
24990 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24991 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24992 #endif
24995 /* The following file contains several enumerations and data structures
24996 built from the definitions in i386-builtin-types.def. */
24998 #include "i386-builtin-types.inc"
25000 /* Table for the ix86 builtin non-function types. */
25001 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25003 /* Retrieve an element from the above table, building some of
25004 the types lazily. */
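/* E.g. a vector entry such as V4SF is created on first use by applying
   build_vector_type_for_mode to its scalar base type (float) and its
   vector mode (V4SFmode).  */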
25006 static tree
25007 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25009 unsigned int index;
25010 tree type, itype;
25012 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25014 type = ix86_builtin_type_tab[(int) tcode];
25015 if (type != NULL)
25016 return type;
25018 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25019 if (tcode <= IX86_BT_LAST_VECT)
25021 enum machine_mode mode;
25023 index = tcode - IX86_BT_LAST_PRIM - 1;
25024 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25025 mode = ix86_builtin_type_vect_mode[index];
25027 type = build_vector_type_for_mode (itype, mode);
25029 else
25031 int quals;
25033 index = tcode - IX86_BT_LAST_VECT - 1;
25034 if (tcode <= IX86_BT_LAST_PTR)
25035 quals = TYPE_UNQUALIFIED;
25036 else
25037 quals = TYPE_QUAL_CONST;
25039 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25040 if (quals != TYPE_UNQUALIFIED)
25041 itype = build_qualified_type (itype, quals);
25043 type = build_pointer_type (itype);
25046 ix86_builtin_type_tab[(int) tcode] = type;
25047 return type;
25050 /* Table for the ix86 builtin function types. */
25051 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25053 /* Retrieve an element from the above table, building some of
25054 the types lazily. */
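/* Entries up to IX86_BT_LAST_FUNC are built from the return-type and
   argument lists in ix86_builtin_func_args; entries past it are aliases
   that simply reuse the tree of the function type they alias.  */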
25056 static tree
25057 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25059 tree type;
25061 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25063 type = ix86_builtin_func_type_tab[(int) tcode];
25064 if (type != NULL)
25065 return type;
25067 if (tcode <= IX86_BT_LAST_FUNC)
25069 unsigned start = ix86_builtin_func_start[(int) tcode];
25070 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25071 tree rtype, atype, args = void_list_node;
25072 unsigned i;
25074 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25075 for (i = after - 1; i > start; --i)
25077 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25078 args = tree_cons (NULL, atype, args);
25081 type = build_function_type (rtype, args);
25083 else
25085 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25086 enum ix86_builtin_func_type icode;
25088 icode = ix86_builtin_func_alias_base[index];
25089 type = ix86_get_builtin_func_type (icode);
25092 ix86_builtin_func_type_tab[(int) tcode] = type;
25093 return type;
25097 /* Codes for all the SSE/MMX builtins. */
25098 enum ix86_builtins
25100 IX86_BUILTIN_ADDPS,
25101 IX86_BUILTIN_ADDSS,
25102 IX86_BUILTIN_DIVPS,
25103 IX86_BUILTIN_DIVSS,
25104 IX86_BUILTIN_MULPS,
25105 IX86_BUILTIN_MULSS,
25106 IX86_BUILTIN_SUBPS,
25107 IX86_BUILTIN_SUBSS,
25109 IX86_BUILTIN_CMPEQPS,
25110 IX86_BUILTIN_CMPLTPS,
25111 IX86_BUILTIN_CMPLEPS,
25112 IX86_BUILTIN_CMPGTPS,
25113 IX86_BUILTIN_CMPGEPS,
25114 IX86_BUILTIN_CMPNEQPS,
25115 IX86_BUILTIN_CMPNLTPS,
25116 IX86_BUILTIN_CMPNLEPS,
25117 IX86_BUILTIN_CMPNGTPS,
25118 IX86_BUILTIN_CMPNGEPS,
25119 IX86_BUILTIN_CMPORDPS,
25120 IX86_BUILTIN_CMPUNORDPS,
25121 IX86_BUILTIN_CMPEQSS,
25122 IX86_BUILTIN_CMPLTSS,
25123 IX86_BUILTIN_CMPLESS,
25124 IX86_BUILTIN_CMPNEQSS,
25125 IX86_BUILTIN_CMPNLTSS,
25126 IX86_BUILTIN_CMPNLESS,
25127 IX86_BUILTIN_CMPNGTSS,
25128 IX86_BUILTIN_CMPNGESS,
25129 IX86_BUILTIN_CMPORDSS,
25130 IX86_BUILTIN_CMPUNORDSS,
25132 IX86_BUILTIN_COMIEQSS,
25133 IX86_BUILTIN_COMILTSS,
25134 IX86_BUILTIN_COMILESS,
25135 IX86_BUILTIN_COMIGTSS,
25136 IX86_BUILTIN_COMIGESS,
25137 IX86_BUILTIN_COMINEQSS,
25138 IX86_BUILTIN_UCOMIEQSS,
25139 IX86_BUILTIN_UCOMILTSS,
25140 IX86_BUILTIN_UCOMILESS,
25141 IX86_BUILTIN_UCOMIGTSS,
25142 IX86_BUILTIN_UCOMIGESS,
25143 IX86_BUILTIN_UCOMINEQSS,
25145 IX86_BUILTIN_CVTPI2PS,
25146 IX86_BUILTIN_CVTPS2PI,
25147 IX86_BUILTIN_CVTSI2SS,
25148 IX86_BUILTIN_CVTSI642SS,
25149 IX86_BUILTIN_CVTSS2SI,
25150 IX86_BUILTIN_CVTSS2SI64,
25151 IX86_BUILTIN_CVTTPS2PI,
25152 IX86_BUILTIN_CVTTSS2SI,
25153 IX86_BUILTIN_CVTTSS2SI64,
25155 IX86_BUILTIN_MAXPS,
25156 IX86_BUILTIN_MAXSS,
25157 IX86_BUILTIN_MINPS,
25158 IX86_BUILTIN_MINSS,
25160 IX86_BUILTIN_LOADUPS,
25161 IX86_BUILTIN_STOREUPS,
25162 IX86_BUILTIN_MOVSS,
25164 IX86_BUILTIN_MOVHLPS,
25165 IX86_BUILTIN_MOVLHPS,
25166 IX86_BUILTIN_LOADHPS,
25167 IX86_BUILTIN_LOADLPS,
25168 IX86_BUILTIN_STOREHPS,
25169 IX86_BUILTIN_STORELPS,
25171 IX86_BUILTIN_MASKMOVQ,
25172 IX86_BUILTIN_MOVMSKPS,
25173 IX86_BUILTIN_PMOVMSKB,
25175 IX86_BUILTIN_MOVNTPS,
25176 IX86_BUILTIN_MOVNTQ,
25178 IX86_BUILTIN_LOADDQU,
25179 IX86_BUILTIN_STOREDQU,
25181 IX86_BUILTIN_PACKSSWB,
25182 IX86_BUILTIN_PACKSSDW,
25183 IX86_BUILTIN_PACKUSWB,
25185 IX86_BUILTIN_PADDB,
25186 IX86_BUILTIN_PADDW,
25187 IX86_BUILTIN_PADDD,
25188 IX86_BUILTIN_PADDQ,
25189 IX86_BUILTIN_PADDSB,
25190 IX86_BUILTIN_PADDSW,
25191 IX86_BUILTIN_PADDUSB,
25192 IX86_BUILTIN_PADDUSW,
25193 IX86_BUILTIN_PSUBB,
25194 IX86_BUILTIN_PSUBW,
25195 IX86_BUILTIN_PSUBD,
25196 IX86_BUILTIN_PSUBQ,
25197 IX86_BUILTIN_PSUBSB,
25198 IX86_BUILTIN_PSUBSW,
25199 IX86_BUILTIN_PSUBUSB,
25200 IX86_BUILTIN_PSUBUSW,
25202 IX86_BUILTIN_PAND,
25203 IX86_BUILTIN_PANDN,
25204 IX86_BUILTIN_POR,
25205 IX86_BUILTIN_PXOR,
25207 IX86_BUILTIN_PAVGB,
25208 IX86_BUILTIN_PAVGW,
25210 IX86_BUILTIN_PCMPEQB,
25211 IX86_BUILTIN_PCMPEQW,
25212 IX86_BUILTIN_PCMPEQD,
25213 IX86_BUILTIN_PCMPGTB,
25214 IX86_BUILTIN_PCMPGTW,
25215 IX86_BUILTIN_PCMPGTD,
25217 IX86_BUILTIN_PMADDWD,
25219 IX86_BUILTIN_PMAXSW,
25220 IX86_BUILTIN_PMAXUB,
25221 IX86_BUILTIN_PMINSW,
25222 IX86_BUILTIN_PMINUB,
25224 IX86_BUILTIN_PMULHUW,
25225 IX86_BUILTIN_PMULHW,
25226 IX86_BUILTIN_PMULLW,
25228 IX86_BUILTIN_PSADBW,
25229 IX86_BUILTIN_PSHUFW,
25231 IX86_BUILTIN_PSLLW,
25232 IX86_BUILTIN_PSLLD,
25233 IX86_BUILTIN_PSLLQ,
25234 IX86_BUILTIN_PSRAW,
25235 IX86_BUILTIN_PSRAD,
25236 IX86_BUILTIN_PSRLW,
25237 IX86_BUILTIN_PSRLD,
25238 IX86_BUILTIN_PSRLQ,
25239 IX86_BUILTIN_PSLLWI,
25240 IX86_BUILTIN_PSLLDI,
25241 IX86_BUILTIN_PSLLQI,
25242 IX86_BUILTIN_PSRAWI,
25243 IX86_BUILTIN_PSRADI,
25244 IX86_BUILTIN_PSRLWI,
25245 IX86_BUILTIN_PSRLDI,
25246 IX86_BUILTIN_PSRLQI,
25248 IX86_BUILTIN_PUNPCKHBW,
25249 IX86_BUILTIN_PUNPCKHWD,
25250 IX86_BUILTIN_PUNPCKHDQ,
25251 IX86_BUILTIN_PUNPCKLBW,
25252 IX86_BUILTIN_PUNPCKLWD,
25253 IX86_BUILTIN_PUNPCKLDQ,
25255 IX86_BUILTIN_SHUFPS,
25257 IX86_BUILTIN_RCPPS,
25258 IX86_BUILTIN_RCPSS,
25259 IX86_BUILTIN_RSQRTPS,
25260 IX86_BUILTIN_RSQRTPS_NR,
25261 IX86_BUILTIN_RSQRTSS,
25262 IX86_BUILTIN_RSQRTF,
25263 IX86_BUILTIN_SQRTPS,
25264 IX86_BUILTIN_SQRTPS_NR,
25265 IX86_BUILTIN_SQRTSS,
25267 IX86_BUILTIN_UNPCKHPS,
25268 IX86_BUILTIN_UNPCKLPS,
25270 IX86_BUILTIN_ANDPS,
25271 IX86_BUILTIN_ANDNPS,
25272 IX86_BUILTIN_ORPS,
25273 IX86_BUILTIN_XORPS,
25275 IX86_BUILTIN_EMMS,
25276 IX86_BUILTIN_LDMXCSR,
25277 IX86_BUILTIN_STMXCSR,
25278 IX86_BUILTIN_SFENCE,
25280 IX86_BUILTIN_FXSAVE,
25281 IX86_BUILTIN_FXRSTOR,
25282 IX86_BUILTIN_FXSAVE64,
25283 IX86_BUILTIN_FXRSTOR64,
25285 IX86_BUILTIN_XSAVE,
25286 IX86_BUILTIN_XRSTOR,
25287 IX86_BUILTIN_XSAVE64,
25288 IX86_BUILTIN_XRSTOR64,
25290 IX86_BUILTIN_XSAVEOPT,
25291 IX86_BUILTIN_XSAVEOPT64,
25293 /* 3DNow! Original */
25294 IX86_BUILTIN_FEMMS,
25295 IX86_BUILTIN_PAVGUSB,
25296 IX86_BUILTIN_PF2ID,
25297 IX86_BUILTIN_PFACC,
25298 IX86_BUILTIN_PFADD,
25299 IX86_BUILTIN_PFCMPEQ,
25300 IX86_BUILTIN_PFCMPGE,
25301 IX86_BUILTIN_PFCMPGT,
25302 IX86_BUILTIN_PFMAX,
25303 IX86_BUILTIN_PFMIN,
25304 IX86_BUILTIN_PFMUL,
25305 IX86_BUILTIN_PFRCP,
25306 IX86_BUILTIN_PFRCPIT1,
25307 IX86_BUILTIN_PFRCPIT2,
25308 IX86_BUILTIN_PFRSQIT1,
25309 IX86_BUILTIN_PFRSQRT,
25310 IX86_BUILTIN_PFSUB,
25311 IX86_BUILTIN_PFSUBR,
25312 IX86_BUILTIN_PI2FD,
25313 IX86_BUILTIN_PMULHRW,
25315 /* 3DNow! Athlon Extensions */
25316 IX86_BUILTIN_PF2IW,
25317 IX86_BUILTIN_PFNACC,
25318 IX86_BUILTIN_PFPNACC,
25319 IX86_BUILTIN_PI2FW,
25320 IX86_BUILTIN_PSWAPDSI,
25321 IX86_BUILTIN_PSWAPDSF,
25323 /* SSE2 */
25324 IX86_BUILTIN_ADDPD,
25325 IX86_BUILTIN_ADDSD,
25326 IX86_BUILTIN_DIVPD,
25327 IX86_BUILTIN_DIVSD,
25328 IX86_BUILTIN_MULPD,
25329 IX86_BUILTIN_MULSD,
25330 IX86_BUILTIN_SUBPD,
25331 IX86_BUILTIN_SUBSD,
25333 IX86_BUILTIN_CMPEQPD,
25334 IX86_BUILTIN_CMPLTPD,
25335 IX86_BUILTIN_CMPLEPD,
25336 IX86_BUILTIN_CMPGTPD,
25337 IX86_BUILTIN_CMPGEPD,
25338 IX86_BUILTIN_CMPNEQPD,
25339 IX86_BUILTIN_CMPNLTPD,
25340 IX86_BUILTIN_CMPNLEPD,
25341 IX86_BUILTIN_CMPNGTPD,
25342 IX86_BUILTIN_CMPNGEPD,
25343 IX86_BUILTIN_CMPORDPD,
25344 IX86_BUILTIN_CMPUNORDPD,
25345 IX86_BUILTIN_CMPEQSD,
25346 IX86_BUILTIN_CMPLTSD,
25347 IX86_BUILTIN_CMPLESD,
25348 IX86_BUILTIN_CMPNEQSD,
25349 IX86_BUILTIN_CMPNLTSD,
25350 IX86_BUILTIN_CMPNLESD,
25351 IX86_BUILTIN_CMPORDSD,
25352 IX86_BUILTIN_CMPUNORDSD,
25354 IX86_BUILTIN_COMIEQSD,
25355 IX86_BUILTIN_COMILTSD,
25356 IX86_BUILTIN_COMILESD,
25357 IX86_BUILTIN_COMIGTSD,
25358 IX86_BUILTIN_COMIGESD,
25359 IX86_BUILTIN_COMINEQSD,
25360 IX86_BUILTIN_UCOMIEQSD,
25361 IX86_BUILTIN_UCOMILTSD,
25362 IX86_BUILTIN_UCOMILESD,
25363 IX86_BUILTIN_UCOMIGTSD,
25364 IX86_BUILTIN_UCOMIGESD,
25365 IX86_BUILTIN_UCOMINEQSD,
25367 IX86_BUILTIN_MAXPD,
25368 IX86_BUILTIN_MAXSD,
25369 IX86_BUILTIN_MINPD,
25370 IX86_BUILTIN_MINSD,
25372 IX86_BUILTIN_ANDPD,
25373 IX86_BUILTIN_ANDNPD,
25374 IX86_BUILTIN_ORPD,
25375 IX86_BUILTIN_XORPD,
25377 IX86_BUILTIN_SQRTPD,
25378 IX86_BUILTIN_SQRTSD,
25380 IX86_BUILTIN_UNPCKHPD,
25381 IX86_BUILTIN_UNPCKLPD,
25383 IX86_BUILTIN_SHUFPD,
25385 IX86_BUILTIN_LOADUPD,
25386 IX86_BUILTIN_STOREUPD,
25387 IX86_BUILTIN_MOVSD,
25389 IX86_BUILTIN_LOADHPD,
25390 IX86_BUILTIN_LOADLPD,
25392 IX86_BUILTIN_CVTDQ2PD,
25393 IX86_BUILTIN_CVTDQ2PS,
25395 IX86_BUILTIN_CVTPD2DQ,
25396 IX86_BUILTIN_CVTPD2PI,
25397 IX86_BUILTIN_CVTPD2PS,
25398 IX86_BUILTIN_CVTTPD2DQ,
25399 IX86_BUILTIN_CVTTPD2PI,
25401 IX86_BUILTIN_CVTPI2PD,
25402 IX86_BUILTIN_CVTSI2SD,
25403 IX86_BUILTIN_CVTSI642SD,
25405 IX86_BUILTIN_CVTSD2SI,
25406 IX86_BUILTIN_CVTSD2SI64,
25407 IX86_BUILTIN_CVTSD2SS,
25408 IX86_BUILTIN_CVTSS2SD,
25409 IX86_BUILTIN_CVTTSD2SI,
25410 IX86_BUILTIN_CVTTSD2SI64,
25412 IX86_BUILTIN_CVTPS2DQ,
25413 IX86_BUILTIN_CVTPS2PD,
25414 IX86_BUILTIN_CVTTPS2DQ,
25416 IX86_BUILTIN_MOVNTI,
25417 IX86_BUILTIN_MOVNTI64,
25418 IX86_BUILTIN_MOVNTPD,
25419 IX86_BUILTIN_MOVNTDQ,
25421 IX86_BUILTIN_MOVQ128,
25423 /* SSE2 MMX */
25424 IX86_BUILTIN_MASKMOVDQU,
25425 IX86_BUILTIN_MOVMSKPD,
25426 IX86_BUILTIN_PMOVMSKB128,
25428 IX86_BUILTIN_PACKSSWB128,
25429 IX86_BUILTIN_PACKSSDW128,
25430 IX86_BUILTIN_PACKUSWB128,
25432 IX86_BUILTIN_PADDB128,
25433 IX86_BUILTIN_PADDW128,
25434 IX86_BUILTIN_PADDD128,
25435 IX86_BUILTIN_PADDQ128,
25436 IX86_BUILTIN_PADDSB128,
25437 IX86_BUILTIN_PADDSW128,
25438 IX86_BUILTIN_PADDUSB128,
25439 IX86_BUILTIN_PADDUSW128,
25440 IX86_BUILTIN_PSUBB128,
25441 IX86_BUILTIN_PSUBW128,
25442 IX86_BUILTIN_PSUBD128,
25443 IX86_BUILTIN_PSUBQ128,
25444 IX86_BUILTIN_PSUBSB128,
25445 IX86_BUILTIN_PSUBSW128,
25446 IX86_BUILTIN_PSUBUSB128,
25447 IX86_BUILTIN_PSUBUSW128,
25449 IX86_BUILTIN_PAND128,
25450 IX86_BUILTIN_PANDN128,
25451 IX86_BUILTIN_POR128,
25452 IX86_BUILTIN_PXOR128,
25454 IX86_BUILTIN_PAVGB128,
25455 IX86_BUILTIN_PAVGW128,
25457 IX86_BUILTIN_PCMPEQB128,
25458 IX86_BUILTIN_PCMPEQW128,
25459 IX86_BUILTIN_PCMPEQD128,
25460 IX86_BUILTIN_PCMPGTB128,
25461 IX86_BUILTIN_PCMPGTW128,
25462 IX86_BUILTIN_PCMPGTD128,
25464 IX86_BUILTIN_PMADDWD128,
25466 IX86_BUILTIN_PMAXSW128,
25467 IX86_BUILTIN_PMAXUB128,
25468 IX86_BUILTIN_PMINSW128,
25469 IX86_BUILTIN_PMINUB128,
25471 IX86_BUILTIN_PMULUDQ,
25472 IX86_BUILTIN_PMULUDQ128,
25473 IX86_BUILTIN_PMULHUW128,
25474 IX86_BUILTIN_PMULHW128,
25475 IX86_BUILTIN_PMULLW128,
25477 IX86_BUILTIN_PSADBW128,
25478 IX86_BUILTIN_PSHUFHW,
25479 IX86_BUILTIN_PSHUFLW,
25480 IX86_BUILTIN_PSHUFD,
25482 IX86_BUILTIN_PSLLDQI128,
25483 IX86_BUILTIN_PSLLWI128,
25484 IX86_BUILTIN_PSLLDI128,
25485 IX86_BUILTIN_PSLLQI128,
25486 IX86_BUILTIN_PSRAWI128,
25487 IX86_BUILTIN_PSRADI128,
25488 IX86_BUILTIN_PSRLDQI128,
25489 IX86_BUILTIN_PSRLWI128,
25490 IX86_BUILTIN_PSRLDI128,
25491 IX86_BUILTIN_PSRLQI128,
25493 IX86_BUILTIN_PSLLDQ128,
25494 IX86_BUILTIN_PSLLW128,
25495 IX86_BUILTIN_PSLLD128,
25496 IX86_BUILTIN_PSLLQ128,
25497 IX86_BUILTIN_PSRAW128,
25498 IX86_BUILTIN_PSRAD128,
25499 IX86_BUILTIN_PSRLW128,
25500 IX86_BUILTIN_PSRLD128,
25501 IX86_BUILTIN_PSRLQ128,
25503 IX86_BUILTIN_PUNPCKHBW128,
25504 IX86_BUILTIN_PUNPCKHWD128,
25505 IX86_BUILTIN_PUNPCKHDQ128,
25506 IX86_BUILTIN_PUNPCKHQDQ128,
25507 IX86_BUILTIN_PUNPCKLBW128,
25508 IX86_BUILTIN_PUNPCKLWD128,
25509 IX86_BUILTIN_PUNPCKLDQ128,
25510 IX86_BUILTIN_PUNPCKLQDQ128,
25512 IX86_BUILTIN_CLFLUSH,
25513 IX86_BUILTIN_MFENCE,
25514 IX86_BUILTIN_LFENCE,
25515 IX86_BUILTIN_PAUSE,
25517 IX86_BUILTIN_BSRSI,
25518 IX86_BUILTIN_BSRDI,
25519 IX86_BUILTIN_RDPMC,
25520 IX86_BUILTIN_RDTSC,
25521 IX86_BUILTIN_RDTSCP,
25522 IX86_BUILTIN_ROLQI,
25523 IX86_BUILTIN_ROLHI,
25524 IX86_BUILTIN_RORQI,
25525 IX86_BUILTIN_RORHI,
25527 /* SSE3. */
25528 IX86_BUILTIN_ADDSUBPS,
25529 IX86_BUILTIN_HADDPS,
25530 IX86_BUILTIN_HSUBPS,
25531 IX86_BUILTIN_MOVSHDUP,
25532 IX86_BUILTIN_MOVSLDUP,
25533 IX86_BUILTIN_ADDSUBPD,
25534 IX86_BUILTIN_HADDPD,
25535 IX86_BUILTIN_HSUBPD,
25536 IX86_BUILTIN_LDDQU,
25538 IX86_BUILTIN_MONITOR,
25539 IX86_BUILTIN_MWAIT,
25541 /* SSSE3. */
25542 IX86_BUILTIN_PHADDW,
25543 IX86_BUILTIN_PHADDD,
25544 IX86_BUILTIN_PHADDSW,
25545 IX86_BUILTIN_PHSUBW,
25546 IX86_BUILTIN_PHSUBD,
25547 IX86_BUILTIN_PHSUBSW,
25548 IX86_BUILTIN_PMADDUBSW,
25549 IX86_BUILTIN_PMULHRSW,
25550 IX86_BUILTIN_PSHUFB,
25551 IX86_BUILTIN_PSIGNB,
25552 IX86_BUILTIN_PSIGNW,
25553 IX86_BUILTIN_PSIGND,
25554 IX86_BUILTIN_PALIGNR,
25555 IX86_BUILTIN_PABSB,
25556 IX86_BUILTIN_PABSW,
25557 IX86_BUILTIN_PABSD,
25559 IX86_BUILTIN_PHADDW128,
25560 IX86_BUILTIN_PHADDD128,
25561 IX86_BUILTIN_PHADDSW128,
25562 IX86_BUILTIN_PHSUBW128,
25563 IX86_BUILTIN_PHSUBD128,
25564 IX86_BUILTIN_PHSUBSW128,
25565 IX86_BUILTIN_PMADDUBSW128,
25566 IX86_BUILTIN_PMULHRSW128,
25567 IX86_BUILTIN_PSHUFB128,
25568 IX86_BUILTIN_PSIGNB128,
25569 IX86_BUILTIN_PSIGNW128,
25570 IX86_BUILTIN_PSIGND128,
25571 IX86_BUILTIN_PALIGNR128,
25572 IX86_BUILTIN_PABSB128,
25573 IX86_BUILTIN_PABSW128,
25574 IX86_BUILTIN_PABSD128,
25576 /* AMDFAM10 - SSE4A New Instructions. */
25577 IX86_BUILTIN_MOVNTSD,
25578 IX86_BUILTIN_MOVNTSS,
25579 IX86_BUILTIN_EXTRQI,
25580 IX86_BUILTIN_EXTRQ,
25581 IX86_BUILTIN_INSERTQI,
25582 IX86_BUILTIN_INSERTQ,
25584 /* SSE4.1. */
25585 IX86_BUILTIN_BLENDPD,
25586 IX86_BUILTIN_BLENDPS,
25587 IX86_BUILTIN_BLENDVPD,
25588 IX86_BUILTIN_BLENDVPS,
25589 IX86_BUILTIN_PBLENDVB128,
25590 IX86_BUILTIN_PBLENDW128,
25592 IX86_BUILTIN_DPPD,
25593 IX86_BUILTIN_DPPS,
25595 IX86_BUILTIN_INSERTPS128,
25597 IX86_BUILTIN_MOVNTDQA,
25598 IX86_BUILTIN_MPSADBW128,
25599 IX86_BUILTIN_PACKUSDW128,
25600 IX86_BUILTIN_PCMPEQQ,
25601 IX86_BUILTIN_PHMINPOSUW128,
25603 IX86_BUILTIN_PMAXSB128,
25604 IX86_BUILTIN_PMAXSD128,
25605 IX86_BUILTIN_PMAXUD128,
25606 IX86_BUILTIN_PMAXUW128,
25608 IX86_BUILTIN_PMINSB128,
25609 IX86_BUILTIN_PMINSD128,
25610 IX86_BUILTIN_PMINUD128,
25611 IX86_BUILTIN_PMINUW128,
25613 IX86_BUILTIN_PMOVSXBW128,
25614 IX86_BUILTIN_PMOVSXBD128,
25615 IX86_BUILTIN_PMOVSXBQ128,
25616 IX86_BUILTIN_PMOVSXWD128,
25617 IX86_BUILTIN_PMOVSXWQ128,
25618 IX86_BUILTIN_PMOVSXDQ128,
25620 IX86_BUILTIN_PMOVZXBW128,
25621 IX86_BUILTIN_PMOVZXBD128,
25622 IX86_BUILTIN_PMOVZXBQ128,
25623 IX86_BUILTIN_PMOVZXWD128,
25624 IX86_BUILTIN_PMOVZXWQ128,
25625 IX86_BUILTIN_PMOVZXDQ128,
25627 IX86_BUILTIN_PMULDQ128,
25628 IX86_BUILTIN_PMULLD128,
25630 IX86_BUILTIN_ROUNDSD,
25631 IX86_BUILTIN_ROUNDSS,
25633 IX86_BUILTIN_ROUNDPD,
25634 IX86_BUILTIN_ROUNDPS,
25636 IX86_BUILTIN_FLOORPD,
25637 IX86_BUILTIN_CEILPD,
25638 IX86_BUILTIN_TRUNCPD,
25639 IX86_BUILTIN_RINTPD,
25640 IX86_BUILTIN_ROUNDPD_AZ,
25642 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25643 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25644 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25646 IX86_BUILTIN_FLOORPS,
25647 IX86_BUILTIN_CEILPS,
25648 IX86_BUILTIN_TRUNCPS,
25649 IX86_BUILTIN_RINTPS,
25650 IX86_BUILTIN_ROUNDPS_AZ,
25652 IX86_BUILTIN_FLOORPS_SFIX,
25653 IX86_BUILTIN_CEILPS_SFIX,
25654 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25656 IX86_BUILTIN_PTESTZ,
25657 IX86_BUILTIN_PTESTC,
25658 IX86_BUILTIN_PTESTNZC,
25660 IX86_BUILTIN_VEC_INIT_V2SI,
25661 IX86_BUILTIN_VEC_INIT_V4HI,
25662 IX86_BUILTIN_VEC_INIT_V8QI,
25663 IX86_BUILTIN_VEC_EXT_V2DF,
25664 IX86_BUILTIN_VEC_EXT_V2DI,
25665 IX86_BUILTIN_VEC_EXT_V4SF,
25666 IX86_BUILTIN_VEC_EXT_V4SI,
25667 IX86_BUILTIN_VEC_EXT_V8HI,
25668 IX86_BUILTIN_VEC_EXT_V2SI,
25669 IX86_BUILTIN_VEC_EXT_V4HI,
25670 IX86_BUILTIN_VEC_EXT_V16QI,
25671 IX86_BUILTIN_VEC_SET_V2DI,
25672 IX86_BUILTIN_VEC_SET_V4SF,
25673 IX86_BUILTIN_VEC_SET_V4SI,
25674 IX86_BUILTIN_VEC_SET_V8HI,
25675 IX86_BUILTIN_VEC_SET_V4HI,
25676 IX86_BUILTIN_VEC_SET_V16QI,
25678 IX86_BUILTIN_VEC_PACK_SFIX,
25679 IX86_BUILTIN_VEC_PACK_SFIX256,
25681 /* SSE4.2. */
25682 IX86_BUILTIN_CRC32QI,
25683 IX86_BUILTIN_CRC32HI,
25684 IX86_BUILTIN_CRC32SI,
25685 IX86_BUILTIN_CRC32DI,
25687 IX86_BUILTIN_PCMPESTRI128,
25688 IX86_BUILTIN_PCMPESTRM128,
25689 IX86_BUILTIN_PCMPESTRA128,
25690 IX86_BUILTIN_PCMPESTRC128,
25691 IX86_BUILTIN_PCMPESTRO128,
25692 IX86_BUILTIN_PCMPESTRS128,
25693 IX86_BUILTIN_PCMPESTRZ128,
25694 IX86_BUILTIN_PCMPISTRI128,
25695 IX86_BUILTIN_PCMPISTRM128,
25696 IX86_BUILTIN_PCMPISTRA128,
25697 IX86_BUILTIN_PCMPISTRC128,
25698 IX86_BUILTIN_PCMPISTRO128,
25699 IX86_BUILTIN_PCMPISTRS128,
25700 IX86_BUILTIN_PCMPISTRZ128,
25702 IX86_BUILTIN_PCMPGTQ,
25704 /* AES instructions */
25705 IX86_BUILTIN_AESENC128,
25706 IX86_BUILTIN_AESENCLAST128,
25707 IX86_BUILTIN_AESDEC128,
25708 IX86_BUILTIN_AESDECLAST128,
25709 IX86_BUILTIN_AESIMC128,
25710 IX86_BUILTIN_AESKEYGENASSIST128,
25712 /* PCLMUL instruction */
25713 IX86_BUILTIN_PCLMULQDQ128,
25715 /* AVX */
25716 IX86_BUILTIN_ADDPD256,
25717 IX86_BUILTIN_ADDPS256,
25718 IX86_BUILTIN_ADDSUBPD256,
25719 IX86_BUILTIN_ADDSUBPS256,
25720 IX86_BUILTIN_ANDPD256,
25721 IX86_BUILTIN_ANDPS256,
25722 IX86_BUILTIN_ANDNPD256,
25723 IX86_BUILTIN_ANDNPS256,
25724 IX86_BUILTIN_BLENDPD256,
25725 IX86_BUILTIN_BLENDPS256,
25726 IX86_BUILTIN_BLENDVPD256,
25727 IX86_BUILTIN_BLENDVPS256,
25728 IX86_BUILTIN_DIVPD256,
25729 IX86_BUILTIN_DIVPS256,
25730 IX86_BUILTIN_DPPS256,
25731 IX86_BUILTIN_HADDPD256,
25732 IX86_BUILTIN_HADDPS256,
25733 IX86_BUILTIN_HSUBPD256,
25734 IX86_BUILTIN_HSUBPS256,
25735 IX86_BUILTIN_MAXPD256,
25736 IX86_BUILTIN_MAXPS256,
25737 IX86_BUILTIN_MINPD256,
25738 IX86_BUILTIN_MINPS256,
25739 IX86_BUILTIN_MULPD256,
25740 IX86_BUILTIN_MULPS256,
25741 IX86_BUILTIN_ORPD256,
25742 IX86_BUILTIN_ORPS256,
25743 IX86_BUILTIN_SHUFPD256,
25744 IX86_BUILTIN_SHUFPS256,
25745 IX86_BUILTIN_SUBPD256,
25746 IX86_BUILTIN_SUBPS256,
25747 IX86_BUILTIN_XORPD256,
25748 IX86_BUILTIN_XORPS256,
25749 IX86_BUILTIN_CMPSD,
25750 IX86_BUILTIN_CMPSS,
25751 IX86_BUILTIN_CMPPD,
25752 IX86_BUILTIN_CMPPS,
25753 IX86_BUILTIN_CMPPD256,
25754 IX86_BUILTIN_CMPPS256,
25755 IX86_BUILTIN_CVTDQ2PD256,
25756 IX86_BUILTIN_CVTDQ2PS256,
25757 IX86_BUILTIN_CVTPD2PS256,
25758 IX86_BUILTIN_CVTPS2DQ256,
25759 IX86_BUILTIN_CVTPS2PD256,
25760 IX86_BUILTIN_CVTTPD2DQ256,
25761 IX86_BUILTIN_CVTPD2DQ256,
25762 IX86_BUILTIN_CVTTPS2DQ256,
25763 IX86_BUILTIN_EXTRACTF128PD256,
25764 IX86_BUILTIN_EXTRACTF128PS256,
25765 IX86_BUILTIN_EXTRACTF128SI256,
25766 IX86_BUILTIN_VZEROALL,
25767 IX86_BUILTIN_VZEROUPPER,
25768 IX86_BUILTIN_VPERMILVARPD,
25769 IX86_BUILTIN_VPERMILVARPS,
25770 IX86_BUILTIN_VPERMILVARPD256,
25771 IX86_BUILTIN_VPERMILVARPS256,
25772 IX86_BUILTIN_VPERMILPD,
25773 IX86_BUILTIN_VPERMILPS,
25774 IX86_BUILTIN_VPERMILPD256,
25775 IX86_BUILTIN_VPERMILPS256,
25776 IX86_BUILTIN_VPERMIL2PD,
25777 IX86_BUILTIN_VPERMIL2PS,
25778 IX86_BUILTIN_VPERMIL2PD256,
25779 IX86_BUILTIN_VPERMIL2PS256,
25780 IX86_BUILTIN_VPERM2F128PD256,
25781 IX86_BUILTIN_VPERM2F128PS256,
25782 IX86_BUILTIN_VPERM2F128SI256,
25783 IX86_BUILTIN_VBROADCASTSS,
25784 IX86_BUILTIN_VBROADCASTSD256,
25785 IX86_BUILTIN_VBROADCASTSS256,
25786 IX86_BUILTIN_VBROADCASTPD256,
25787 IX86_BUILTIN_VBROADCASTPS256,
25788 IX86_BUILTIN_VINSERTF128PD256,
25789 IX86_BUILTIN_VINSERTF128PS256,
25790 IX86_BUILTIN_VINSERTF128SI256,
25791 IX86_BUILTIN_LOADUPD256,
25792 IX86_BUILTIN_LOADUPS256,
25793 IX86_BUILTIN_STOREUPD256,
25794 IX86_BUILTIN_STOREUPS256,
25795 IX86_BUILTIN_LDDQU256,
25796 IX86_BUILTIN_MOVNTDQ256,
25797 IX86_BUILTIN_MOVNTPD256,
25798 IX86_BUILTIN_MOVNTPS256,
25799 IX86_BUILTIN_LOADDQU256,
25800 IX86_BUILTIN_STOREDQU256,
25801 IX86_BUILTIN_MASKLOADPD,
25802 IX86_BUILTIN_MASKLOADPS,
25803 IX86_BUILTIN_MASKSTOREPD,
25804 IX86_BUILTIN_MASKSTOREPS,
25805 IX86_BUILTIN_MASKLOADPD256,
25806 IX86_BUILTIN_MASKLOADPS256,
25807 IX86_BUILTIN_MASKSTOREPD256,
25808 IX86_BUILTIN_MASKSTOREPS256,
25809 IX86_BUILTIN_MOVSHDUP256,
25810 IX86_BUILTIN_MOVSLDUP256,
25811 IX86_BUILTIN_MOVDDUP256,
25813 IX86_BUILTIN_SQRTPD256,
25814 IX86_BUILTIN_SQRTPS256,
25815 IX86_BUILTIN_SQRTPS_NR256,
25816 IX86_BUILTIN_RSQRTPS256,
25817 IX86_BUILTIN_RSQRTPS_NR256,
25819 IX86_BUILTIN_RCPPS256,
25821 IX86_BUILTIN_ROUNDPD256,
25822 IX86_BUILTIN_ROUNDPS256,
25824 IX86_BUILTIN_FLOORPD256,
25825 IX86_BUILTIN_CEILPD256,
25826 IX86_BUILTIN_TRUNCPD256,
25827 IX86_BUILTIN_RINTPD256,
25828 IX86_BUILTIN_ROUNDPD_AZ256,
25830 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25831 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25832 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25834 IX86_BUILTIN_FLOORPS256,
25835 IX86_BUILTIN_CEILPS256,
25836 IX86_BUILTIN_TRUNCPS256,
25837 IX86_BUILTIN_RINTPS256,
25838 IX86_BUILTIN_ROUNDPS_AZ256,
25840 IX86_BUILTIN_FLOORPS_SFIX256,
25841 IX86_BUILTIN_CEILPS_SFIX256,
25842 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25844 IX86_BUILTIN_UNPCKHPD256,
25845 IX86_BUILTIN_UNPCKLPD256,
25846 IX86_BUILTIN_UNPCKHPS256,
25847 IX86_BUILTIN_UNPCKLPS256,
25849 IX86_BUILTIN_SI256_SI,
25850 IX86_BUILTIN_PS256_PS,
25851 IX86_BUILTIN_PD256_PD,
25852 IX86_BUILTIN_SI_SI256,
25853 IX86_BUILTIN_PS_PS256,
25854 IX86_BUILTIN_PD_PD256,
25856 IX86_BUILTIN_VTESTZPD,
25857 IX86_BUILTIN_VTESTCPD,
25858 IX86_BUILTIN_VTESTNZCPD,
25859 IX86_BUILTIN_VTESTZPS,
25860 IX86_BUILTIN_VTESTCPS,
25861 IX86_BUILTIN_VTESTNZCPS,
25862 IX86_BUILTIN_VTESTZPD256,
25863 IX86_BUILTIN_VTESTCPD256,
25864 IX86_BUILTIN_VTESTNZCPD256,
25865 IX86_BUILTIN_VTESTZPS256,
25866 IX86_BUILTIN_VTESTCPS256,
25867 IX86_BUILTIN_VTESTNZCPS256,
25868 IX86_BUILTIN_PTESTZ256,
25869 IX86_BUILTIN_PTESTC256,
25870 IX86_BUILTIN_PTESTNZC256,
25872 IX86_BUILTIN_MOVMSKPD256,
25873 IX86_BUILTIN_MOVMSKPS256,
25875 /* AVX2 */
25876 IX86_BUILTIN_MPSADBW256,
25877 IX86_BUILTIN_PABSB256,
25878 IX86_BUILTIN_PABSW256,
25879 IX86_BUILTIN_PABSD256,
25880 IX86_BUILTIN_PACKSSDW256,
25881 IX86_BUILTIN_PACKSSWB256,
25882 IX86_BUILTIN_PACKUSDW256,
25883 IX86_BUILTIN_PACKUSWB256,
25884 IX86_BUILTIN_PADDB256,
25885 IX86_BUILTIN_PADDW256,
25886 IX86_BUILTIN_PADDD256,
25887 IX86_BUILTIN_PADDQ256,
25888 IX86_BUILTIN_PADDSB256,
25889 IX86_BUILTIN_PADDSW256,
25890 IX86_BUILTIN_PADDUSB256,
25891 IX86_BUILTIN_PADDUSW256,
25892 IX86_BUILTIN_PALIGNR256,
25893 IX86_BUILTIN_AND256I,
25894 IX86_BUILTIN_ANDNOT256I,
25895 IX86_BUILTIN_PAVGB256,
25896 IX86_BUILTIN_PAVGW256,
25897 IX86_BUILTIN_PBLENDVB256,
25898 IX86_BUILTIN_PBLENDVW256,
25899 IX86_BUILTIN_PCMPEQB256,
25900 IX86_BUILTIN_PCMPEQW256,
25901 IX86_BUILTIN_PCMPEQD256,
25902 IX86_BUILTIN_PCMPEQQ256,
25903 IX86_BUILTIN_PCMPGTB256,
25904 IX86_BUILTIN_PCMPGTW256,
25905 IX86_BUILTIN_PCMPGTD256,
25906 IX86_BUILTIN_PCMPGTQ256,
25907 IX86_BUILTIN_PHADDW256,
25908 IX86_BUILTIN_PHADDD256,
25909 IX86_BUILTIN_PHADDSW256,
25910 IX86_BUILTIN_PHSUBW256,
25911 IX86_BUILTIN_PHSUBD256,
25912 IX86_BUILTIN_PHSUBSW256,
25913 IX86_BUILTIN_PMADDUBSW256,
25914 IX86_BUILTIN_PMADDWD256,
25915 IX86_BUILTIN_PMAXSB256,
25916 IX86_BUILTIN_PMAXSW256,
25917 IX86_BUILTIN_PMAXSD256,
25918 IX86_BUILTIN_PMAXUB256,
25919 IX86_BUILTIN_PMAXUW256,
25920 IX86_BUILTIN_PMAXUD256,
25921 IX86_BUILTIN_PMINSB256,
25922 IX86_BUILTIN_PMINSW256,
25923 IX86_BUILTIN_PMINSD256,
25924 IX86_BUILTIN_PMINUB256,
25925 IX86_BUILTIN_PMINUW256,
25926 IX86_BUILTIN_PMINUD256,
25927 IX86_BUILTIN_PMOVMSKB256,
25928 IX86_BUILTIN_PMOVSXBW256,
25929 IX86_BUILTIN_PMOVSXBD256,
25930 IX86_BUILTIN_PMOVSXBQ256,
25931 IX86_BUILTIN_PMOVSXWD256,
25932 IX86_BUILTIN_PMOVSXWQ256,
25933 IX86_BUILTIN_PMOVSXDQ256,
25934 IX86_BUILTIN_PMOVZXBW256,
25935 IX86_BUILTIN_PMOVZXBD256,
25936 IX86_BUILTIN_PMOVZXBQ256,
25937 IX86_BUILTIN_PMOVZXWD256,
25938 IX86_BUILTIN_PMOVZXWQ256,
25939 IX86_BUILTIN_PMOVZXDQ256,
25940 IX86_BUILTIN_PMULDQ256,
25941 IX86_BUILTIN_PMULHRSW256,
25942 IX86_BUILTIN_PMULHUW256,
25943 IX86_BUILTIN_PMULHW256,
25944 IX86_BUILTIN_PMULLW256,
25945 IX86_BUILTIN_PMULLD256,
25946 IX86_BUILTIN_PMULUDQ256,
25947 IX86_BUILTIN_POR256,
25948 IX86_BUILTIN_PSADBW256,
25949 IX86_BUILTIN_PSHUFB256,
25950 IX86_BUILTIN_PSHUFD256,
25951 IX86_BUILTIN_PSHUFHW256,
25952 IX86_BUILTIN_PSHUFLW256,
25953 IX86_BUILTIN_PSIGNB256,
25954 IX86_BUILTIN_PSIGNW256,
25955 IX86_BUILTIN_PSIGND256,
25956 IX86_BUILTIN_PSLLDQI256,
25957 IX86_BUILTIN_PSLLWI256,
25958 IX86_BUILTIN_PSLLW256,
25959 IX86_BUILTIN_PSLLDI256,
25960 IX86_BUILTIN_PSLLD256,
25961 IX86_BUILTIN_PSLLQI256,
25962 IX86_BUILTIN_PSLLQ256,
25963 IX86_BUILTIN_PSRAWI256,
25964 IX86_BUILTIN_PSRAW256,
25965 IX86_BUILTIN_PSRADI256,
25966 IX86_BUILTIN_PSRAD256,
25967 IX86_BUILTIN_PSRLDQI256,
25968 IX86_BUILTIN_PSRLWI256,
25969 IX86_BUILTIN_PSRLW256,
25970 IX86_BUILTIN_PSRLDI256,
25971 IX86_BUILTIN_PSRLD256,
25972 IX86_BUILTIN_PSRLQI256,
25973 IX86_BUILTIN_PSRLQ256,
25974 IX86_BUILTIN_PSUBB256,
25975 IX86_BUILTIN_PSUBW256,
25976 IX86_BUILTIN_PSUBD256,
25977 IX86_BUILTIN_PSUBQ256,
25978 IX86_BUILTIN_PSUBSB256,
25979 IX86_BUILTIN_PSUBSW256,
25980 IX86_BUILTIN_PSUBUSB256,
25981 IX86_BUILTIN_PSUBUSW256,
25982 IX86_BUILTIN_PUNPCKHBW256,
25983 IX86_BUILTIN_PUNPCKHWD256,
25984 IX86_BUILTIN_PUNPCKHDQ256,
25985 IX86_BUILTIN_PUNPCKHQDQ256,
25986 IX86_BUILTIN_PUNPCKLBW256,
25987 IX86_BUILTIN_PUNPCKLWD256,
25988 IX86_BUILTIN_PUNPCKLDQ256,
25989 IX86_BUILTIN_PUNPCKLQDQ256,
25990 IX86_BUILTIN_PXOR256,
25991 IX86_BUILTIN_MOVNTDQA256,
25992 IX86_BUILTIN_VBROADCASTSS_PS,
25993 IX86_BUILTIN_VBROADCASTSS_PS256,
25994 IX86_BUILTIN_VBROADCASTSD_PD256,
25995 IX86_BUILTIN_VBROADCASTSI256,
25996 IX86_BUILTIN_PBLENDD256,
25997 IX86_BUILTIN_PBLENDD128,
25998 IX86_BUILTIN_PBROADCASTB256,
25999 IX86_BUILTIN_PBROADCASTW256,
26000 IX86_BUILTIN_PBROADCASTD256,
26001 IX86_BUILTIN_PBROADCASTQ256,
26002 IX86_BUILTIN_PBROADCASTB128,
26003 IX86_BUILTIN_PBROADCASTW128,
26004 IX86_BUILTIN_PBROADCASTD128,
26005 IX86_BUILTIN_PBROADCASTQ128,
26006 IX86_BUILTIN_VPERMVARSI256,
26007 IX86_BUILTIN_VPERMDF256,
26008 IX86_BUILTIN_VPERMVARSF256,
26009 IX86_BUILTIN_VPERMDI256,
26010 IX86_BUILTIN_VPERMTI256,
26011 IX86_BUILTIN_VEXTRACT128I256,
26012 IX86_BUILTIN_VINSERT128I256,
26013 IX86_BUILTIN_MASKLOADD,
26014 IX86_BUILTIN_MASKLOADQ,
26015 IX86_BUILTIN_MASKLOADD256,
26016 IX86_BUILTIN_MASKLOADQ256,
26017 IX86_BUILTIN_MASKSTORED,
26018 IX86_BUILTIN_MASKSTOREQ,
26019 IX86_BUILTIN_MASKSTORED256,
26020 IX86_BUILTIN_MASKSTOREQ256,
26021 IX86_BUILTIN_PSLLVV4DI,
26022 IX86_BUILTIN_PSLLVV2DI,
26023 IX86_BUILTIN_PSLLVV8SI,
26024 IX86_BUILTIN_PSLLVV4SI,
26025 IX86_BUILTIN_PSRAVV8SI,
26026 IX86_BUILTIN_PSRAVV4SI,
26027 IX86_BUILTIN_PSRLVV4DI,
26028 IX86_BUILTIN_PSRLVV2DI,
26029 IX86_BUILTIN_PSRLVV8SI,
26030 IX86_BUILTIN_PSRLVV4SI,
26032 IX86_BUILTIN_GATHERSIV2DF,
26033 IX86_BUILTIN_GATHERSIV4DF,
26034 IX86_BUILTIN_GATHERDIV2DF,
26035 IX86_BUILTIN_GATHERDIV4DF,
26036 IX86_BUILTIN_GATHERSIV4SF,
26037 IX86_BUILTIN_GATHERSIV8SF,
26038 IX86_BUILTIN_GATHERDIV4SF,
26039 IX86_BUILTIN_GATHERDIV8SF,
26040 IX86_BUILTIN_GATHERSIV2DI,
26041 IX86_BUILTIN_GATHERSIV4DI,
26042 IX86_BUILTIN_GATHERDIV2DI,
26043 IX86_BUILTIN_GATHERDIV4DI,
26044 IX86_BUILTIN_GATHERSIV4SI,
26045 IX86_BUILTIN_GATHERSIV8SI,
26046 IX86_BUILTIN_GATHERDIV4SI,
26047 IX86_BUILTIN_GATHERDIV8SI,
26049 /* Alternate 4-element gather for the vectorizer, where
26050 all operands are 32 bytes wide. */
26051 IX86_BUILTIN_GATHERALTSIV4DF,
26052 IX86_BUILTIN_GATHERALTDIV8SF,
26053 IX86_BUILTIN_GATHERALTSIV4DI,
26054 IX86_BUILTIN_GATHERALTDIV8SI,
26056 /* TFmode support builtins. */
26057 IX86_BUILTIN_INFQ,
26058 IX86_BUILTIN_HUGE_VALQ,
26059 IX86_BUILTIN_FABSQ,
26060 IX86_BUILTIN_COPYSIGNQ,
26062 /* Vectorizer support builtins. */
26063 IX86_BUILTIN_CPYSGNPS,
26064 IX86_BUILTIN_CPYSGNPD,
26065 IX86_BUILTIN_CPYSGNPS256,
26066 IX86_BUILTIN_CPYSGNPD256,
26068 /* FMA4 instructions. */
26069 IX86_BUILTIN_VFMADDSS,
26070 IX86_BUILTIN_VFMADDSD,
26071 IX86_BUILTIN_VFMADDPS,
26072 IX86_BUILTIN_VFMADDPD,
26073 IX86_BUILTIN_VFMADDPS256,
26074 IX86_BUILTIN_VFMADDPD256,
26075 IX86_BUILTIN_VFMADDSUBPS,
26076 IX86_BUILTIN_VFMADDSUBPD,
26077 IX86_BUILTIN_VFMADDSUBPS256,
26078 IX86_BUILTIN_VFMADDSUBPD256,
26080 /* FMA3 instructions. */
26081 IX86_BUILTIN_VFMADDSS3,
26082 IX86_BUILTIN_VFMADDSD3,
26084 /* XOP instructions. */
26085 IX86_BUILTIN_VPCMOV,
26086 IX86_BUILTIN_VPCMOV_V2DI,
26087 IX86_BUILTIN_VPCMOV_V4SI,
26088 IX86_BUILTIN_VPCMOV_V8HI,
26089 IX86_BUILTIN_VPCMOV_V16QI,
26090 IX86_BUILTIN_VPCMOV_V4SF,
26091 IX86_BUILTIN_VPCMOV_V2DF,
26092 IX86_BUILTIN_VPCMOV256,
26093 IX86_BUILTIN_VPCMOV_V4DI256,
26094 IX86_BUILTIN_VPCMOV_V8SI256,
26095 IX86_BUILTIN_VPCMOV_V16HI256,
26096 IX86_BUILTIN_VPCMOV_V32QI256,
26097 IX86_BUILTIN_VPCMOV_V8SF256,
26098 IX86_BUILTIN_VPCMOV_V4DF256,
26100 IX86_BUILTIN_VPPERM,
26102 IX86_BUILTIN_VPMACSSWW,
26103 IX86_BUILTIN_VPMACSWW,
26104 IX86_BUILTIN_VPMACSSWD,
26105 IX86_BUILTIN_VPMACSWD,
26106 IX86_BUILTIN_VPMACSSDD,
26107 IX86_BUILTIN_VPMACSDD,
26108 IX86_BUILTIN_VPMACSSDQL,
26109 IX86_BUILTIN_VPMACSSDQH,
26110 IX86_BUILTIN_VPMACSDQL,
26111 IX86_BUILTIN_VPMACSDQH,
26112 IX86_BUILTIN_VPMADCSSWD,
26113 IX86_BUILTIN_VPMADCSWD,
26115 IX86_BUILTIN_VPHADDBW,
26116 IX86_BUILTIN_VPHADDBD,
26117 IX86_BUILTIN_VPHADDBQ,
26118 IX86_BUILTIN_VPHADDWD,
26119 IX86_BUILTIN_VPHADDWQ,
26120 IX86_BUILTIN_VPHADDDQ,
26121 IX86_BUILTIN_VPHADDUBW,
26122 IX86_BUILTIN_VPHADDUBD,
26123 IX86_BUILTIN_VPHADDUBQ,
26124 IX86_BUILTIN_VPHADDUWD,
26125 IX86_BUILTIN_VPHADDUWQ,
26126 IX86_BUILTIN_VPHADDUDQ,
26127 IX86_BUILTIN_VPHSUBBW,
26128 IX86_BUILTIN_VPHSUBWD,
26129 IX86_BUILTIN_VPHSUBDQ,
26131 IX86_BUILTIN_VPROTB,
26132 IX86_BUILTIN_VPROTW,
26133 IX86_BUILTIN_VPROTD,
26134 IX86_BUILTIN_VPROTQ,
26135 IX86_BUILTIN_VPROTB_IMM,
26136 IX86_BUILTIN_VPROTW_IMM,
26137 IX86_BUILTIN_VPROTD_IMM,
26138 IX86_BUILTIN_VPROTQ_IMM,
26140 IX86_BUILTIN_VPSHLB,
26141 IX86_BUILTIN_VPSHLW,
26142 IX86_BUILTIN_VPSHLD,
26143 IX86_BUILTIN_VPSHLQ,
26144 IX86_BUILTIN_VPSHAB,
26145 IX86_BUILTIN_VPSHAW,
26146 IX86_BUILTIN_VPSHAD,
26147 IX86_BUILTIN_VPSHAQ,
26149 IX86_BUILTIN_VFRCZSS,
26150 IX86_BUILTIN_VFRCZSD,
26151 IX86_BUILTIN_VFRCZPS,
26152 IX86_BUILTIN_VFRCZPD,
26153 IX86_BUILTIN_VFRCZPS256,
26154 IX86_BUILTIN_VFRCZPD256,
26156 IX86_BUILTIN_VPCOMEQUB,
26157 IX86_BUILTIN_VPCOMNEUB,
26158 IX86_BUILTIN_VPCOMLTUB,
26159 IX86_BUILTIN_VPCOMLEUB,
26160 IX86_BUILTIN_VPCOMGTUB,
26161 IX86_BUILTIN_VPCOMGEUB,
26162 IX86_BUILTIN_VPCOMFALSEUB,
26163 IX86_BUILTIN_VPCOMTRUEUB,
26165 IX86_BUILTIN_VPCOMEQUW,
26166 IX86_BUILTIN_VPCOMNEUW,
26167 IX86_BUILTIN_VPCOMLTUW,
26168 IX86_BUILTIN_VPCOMLEUW,
26169 IX86_BUILTIN_VPCOMGTUW,
26170 IX86_BUILTIN_VPCOMGEUW,
26171 IX86_BUILTIN_VPCOMFALSEUW,
26172 IX86_BUILTIN_VPCOMTRUEUW,
26174 IX86_BUILTIN_VPCOMEQUD,
26175 IX86_BUILTIN_VPCOMNEUD,
26176 IX86_BUILTIN_VPCOMLTUD,
26177 IX86_BUILTIN_VPCOMLEUD,
26178 IX86_BUILTIN_VPCOMGTUD,
26179 IX86_BUILTIN_VPCOMGEUD,
26180 IX86_BUILTIN_VPCOMFALSEUD,
26181 IX86_BUILTIN_VPCOMTRUEUD,
26183 IX86_BUILTIN_VPCOMEQUQ,
26184 IX86_BUILTIN_VPCOMNEUQ,
26185 IX86_BUILTIN_VPCOMLTUQ,
26186 IX86_BUILTIN_VPCOMLEUQ,
26187 IX86_BUILTIN_VPCOMGTUQ,
26188 IX86_BUILTIN_VPCOMGEUQ,
26189 IX86_BUILTIN_VPCOMFALSEUQ,
26190 IX86_BUILTIN_VPCOMTRUEUQ,
26192 IX86_BUILTIN_VPCOMEQB,
26193 IX86_BUILTIN_VPCOMNEB,
26194 IX86_BUILTIN_VPCOMLTB,
26195 IX86_BUILTIN_VPCOMLEB,
26196 IX86_BUILTIN_VPCOMGTB,
26197 IX86_BUILTIN_VPCOMGEB,
26198 IX86_BUILTIN_VPCOMFALSEB,
26199 IX86_BUILTIN_VPCOMTRUEB,
26201 IX86_BUILTIN_VPCOMEQW,
26202 IX86_BUILTIN_VPCOMNEW,
26203 IX86_BUILTIN_VPCOMLTW,
26204 IX86_BUILTIN_VPCOMLEW,
26205 IX86_BUILTIN_VPCOMGTW,
26206 IX86_BUILTIN_VPCOMGEW,
26207 IX86_BUILTIN_VPCOMFALSEW,
26208 IX86_BUILTIN_VPCOMTRUEW,
26210 IX86_BUILTIN_VPCOMEQD,
26211 IX86_BUILTIN_VPCOMNED,
26212 IX86_BUILTIN_VPCOMLTD,
26213 IX86_BUILTIN_VPCOMLED,
26214 IX86_BUILTIN_VPCOMGTD,
26215 IX86_BUILTIN_VPCOMGED,
26216 IX86_BUILTIN_VPCOMFALSED,
26217 IX86_BUILTIN_VPCOMTRUED,
26219 IX86_BUILTIN_VPCOMEQQ,
26220 IX86_BUILTIN_VPCOMNEQ,
26221 IX86_BUILTIN_VPCOMLTQ,
26222 IX86_BUILTIN_VPCOMLEQ,
26223 IX86_BUILTIN_VPCOMGTQ,
26224 IX86_BUILTIN_VPCOMGEQ,
26225 IX86_BUILTIN_VPCOMFALSEQ,
26226 IX86_BUILTIN_VPCOMTRUEQ,
26228 /* LWP instructions. */
26229 IX86_BUILTIN_LLWPCB,
26230 IX86_BUILTIN_SLWPCB,
26231 IX86_BUILTIN_LWPVAL32,
26232 IX86_BUILTIN_LWPVAL64,
26233 IX86_BUILTIN_LWPINS32,
26234 IX86_BUILTIN_LWPINS64,
26236 IX86_BUILTIN_CLZS,
26238 /* RTM */
26239 IX86_BUILTIN_XBEGIN,
26240 IX86_BUILTIN_XEND,
26241 IX86_BUILTIN_XABORT,
26242 IX86_BUILTIN_XTEST,
26244 /* BMI instructions. */
26245 IX86_BUILTIN_BEXTR32,
26246 IX86_BUILTIN_BEXTR64,
26247 IX86_BUILTIN_CTZS,
26249 /* TBM instructions. */
26250 IX86_BUILTIN_BEXTRI32,
26251 IX86_BUILTIN_BEXTRI64,
26253 /* BMI2 instructions. */
26254 IX86_BUILTIN_BZHI32,
26255 IX86_BUILTIN_BZHI64,
26256 IX86_BUILTIN_PDEP32,
26257 IX86_BUILTIN_PDEP64,
26258 IX86_BUILTIN_PEXT32,
26259 IX86_BUILTIN_PEXT64,
26261 /* ADX instructions. */
26262 IX86_BUILTIN_ADDCARRYX32,
26263 IX86_BUILTIN_ADDCARRYX64,
26265 /* FSGSBASE instructions. */
26266 IX86_BUILTIN_RDFSBASE32,
26267 IX86_BUILTIN_RDFSBASE64,
26268 IX86_BUILTIN_RDGSBASE32,
26269 IX86_BUILTIN_RDGSBASE64,
26270 IX86_BUILTIN_WRFSBASE32,
26271 IX86_BUILTIN_WRFSBASE64,
26272 IX86_BUILTIN_WRGSBASE32,
26273 IX86_BUILTIN_WRGSBASE64,
26275 /* RDRND instructions. */
26276 IX86_BUILTIN_RDRAND16_STEP,
26277 IX86_BUILTIN_RDRAND32_STEP,
26278 IX86_BUILTIN_RDRAND64_STEP,
26280 /* RDSEED instructions. */
26281 IX86_BUILTIN_RDSEED16_STEP,
26282 IX86_BUILTIN_RDSEED32_STEP,
26283 IX86_BUILTIN_RDSEED64_STEP,
26285 /* F16C instructions. */
26286 IX86_BUILTIN_CVTPH2PS,
26287 IX86_BUILTIN_CVTPH2PS256,
26288 IX86_BUILTIN_CVTPS2PH,
26289 IX86_BUILTIN_CVTPS2PH256,
26291 /* CFString built-in for Darwin. */
26292 IX86_BUILTIN_CFSTRING,
26294 /* Builtins to get CPU type and supported features. */
26295 IX86_BUILTIN_CPU_INIT,
26296 IX86_BUILTIN_CPU_IS,
26297 IX86_BUILTIN_CPU_SUPPORTS,
26299 IX86_BUILTIN_MAX
26302 /* Table for the ix86 builtin decls. */
26303 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26305 /* Table of all the builtin functions that are possible with different ISAs,
26306 but that are not built until a function is declared to use that
26307 ISA. */
26308 struct builtin_isa {
26309 const char *name; /* function name */
26310 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26311 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26312 bool const_p; /* true if the declaration is constant */
26313 bool set_and_not_built_p;
26316 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26319 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Record MASK,
26320 the isa_flags this builtin requires, in the ix86_builtins_isa array.  Store the
26321 function decl in the ix86_builtins array.  Return the function decl, or
26322 NULL_TREE if the builtin was not added.
26324 If the front end has a special hook for builtin functions, delay adding
26325 builtin functions that aren't in the current ISA until the ISA is changed
26326 with function-specific optimization.  Doing so can save about 300K for the
26327 default compiler.  When the builtin is expanded, check at that time whether
26328 it is valid.
26330 If the front end doesn't have a special hook, record all builtins, even those
26331 that aren't in the current ISA, in case the user uses function-specific
26332 options for a different ISA, so that we don't get scope
26333 errors if a builtin is added in the middle of a function scope. */
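/* A call of roughly this shape (illustrative; the real calls appear
   further down in this file) registers one builtin:

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
		  VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

   If SSE is not enabled in the selected ISA, only the table entry is
   recorded and the actual declaration is deferred.  */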
26335 static inline tree
26336 def_builtin (HOST_WIDE_INT mask, const char *name,
26337 enum ix86_builtin_func_type tcode,
26338 enum ix86_builtins code)
26340 tree decl = NULL_TREE;
26342 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26344 ix86_builtins_isa[(int) code].isa = mask;
26346 mask &= ~OPTION_MASK_ISA_64BIT;
26347 if (mask == 0
26348 || (mask & ix86_isa_flags) != 0
26349 || (lang_hooks.builtin_function
26350 == lang_hooks.builtin_function_ext_scope))
26353 tree type = ix86_get_builtin_func_type (tcode);
26354 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26355 NULL, NULL_TREE);
26356 ix86_builtins[(int) code] = decl;
26357 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26359 else
26361 ix86_builtins[(int) code] = NULL_TREE;
26362 ix86_builtins_isa[(int) code].tcode = tcode;
26363 ix86_builtins_isa[(int) code].name = name;
26364 ix86_builtins_isa[(int) code].const_p = false;
26365 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26369 return decl;
26372 /* Like def_builtin, but also marks the function decl "const". */
26374 static inline tree
26375 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26376 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26378 tree decl = def_builtin (mask, name, tcode, code);
26379 if (decl)
26380 TREE_READONLY (decl) = 1;
26381 else
26382 ix86_builtins_isa[(int) code].const_p = true;
26384 return decl;
26387 /* Add any new builtin functions for a given ISA that may not have been
26388 declared yet.  This saves a bit of space compared to adding all of the
26389 declarations to the tree, even when they are never used. */
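/* Concretely: walk ix86_builtins_isa and declare, at external scope,
   every builtin that def_builtin recorded but deferred and whose ISA
   bits are now covered by ISA.  */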
26391 static void
26392 ix86_add_new_builtins (HOST_WIDE_INT isa)
26394 int i;
26396 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26398 if ((ix86_builtins_isa[i].isa & isa) != 0
26399 && ix86_builtins_isa[i].set_and_not_built_p)
26401 tree decl, type;
26403 /* Don't define the builtin again. */
26404 ix86_builtins_isa[i].set_and_not_built_p = false;
26406 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26407 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26408 type, i, BUILT_IN_MD, NULL,
26409 NULL_TREE);
26411 ix86_builtins[i] = decl;
26412 if (ix86_builtins_isa[i].const_p)
26413 TREE_READONLY (decl) = 1;
26418 /* Bits for builtin_description.flag. */
26420 /* Set when we don't support the comparison natively, and should
26421 swap the comparison operands in order to support it. */
26422 #define BUILTIN_DESC_SWAP_OPERANDS 1
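/* In the tables below, FLAG carries a table-specific payload: the
   ix86_builtin_func_type for the argument tables, the CC mode for the
   string-compare tables, or flag bits such as
   BUILTIN_DESC_SWAP_OPERANDS.  */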
26424 struct builtin_description
26426 const HOST_WIDE_INT mask;
26427 const enum insn_code icode;
26428 const char *const name;
26429 const enum ix86_builtins code;
26430 const enum rtx_code comparison;
26431 const int flag;
26434 static const struct builtin_description bdesc_comi[] =
26436 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26437 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26438 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26439 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26440 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26441 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26442 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26443 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26444 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26445 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26446 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26447 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26450 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26453 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26456 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26457 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26458 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26459 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26462 static const struct builtin_description bdesc_pcmpestr[] =
26464 /* SSE4.2 */
26465 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26466 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26467 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26468 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26469 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26470 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26471 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26474 static const struct builtin_description bdesc_pcmpistr[] =
26476 /* SSE4.2 */
26477 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26478 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26479 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26480 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26481 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26482 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26483 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26486 /* Special builtins with a variable number of arguments. */
26487 static const struct builtin_description bdesc_special_args[] =
26489 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26490 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26491 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26493 /* MMX */
26494 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26496 /* 3DNow! */
26497 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26499 /* FXSR, XSAVE and XSAVEOPT */
26500 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26501 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26502 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26503 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26504 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26506 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26507 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26508 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26509 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26510 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26512 /* SSE */
26513 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26514 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26515 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26517 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26518 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26519 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26520 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26522 /* SSE or 3DNow!A */
26523 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26524 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26526 /* SSE2 */
26527 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26532 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26533 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26534 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26535 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26538 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26539 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26541 /* SSE3 */
26542 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26544 /* SSE4.1 */
26545 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26547 /* SSE4A */
26548 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26549 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26551 /* AVX */
26552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26555 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26556 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26557 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26582 /* AVX2 */
26583 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26584 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26585 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26586 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26587 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26588 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26589 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26590 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26591 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26593 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26594 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26595 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26596 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26597 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26598 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26600 /* FSGSBASE */
26601 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26602 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26603 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26604 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26605 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26606 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26607 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26608 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26610 /* RTM */
26611 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26612 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26613 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
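/* Rough usage sketch for the RTM entries above, which back the _xbegin,
   _xend and _xtest intrinsics in <rtmintrin.h>.  The control flow below is
   illustrative only, and handle_abort is a hypothetical caller-supplied
   fallback, not something defined in this file:

     unsigned status = __builtin_ia32_xbegin ();
     if (status == ~0u)
       {
         ... transactional region ...
         __builtin_ia32_xend ();
       }
     else
       handle_abort (status);

   ~0u is the value <rtmintrin.h> exposes as _XBEGIN_STARTED.  */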
26614 };
26616 /* Builtins with variable number of arguments. */
26617 static const struct builtin_description bdesc_args[] =
26618 {
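/* Each initializer in this table follows struct builtin_description: the
   ISA option mask that must be enabled for the builtin, the insn code of
   the pattern it expands to, the builtin's user-visible name, its
   IX86_BUILTIN_* enumerator, an rtx comparison code (UNKNOWN when the
   expander does not need one), and the function prototype enumerator cast
   to int.  */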
26619 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26620 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26621 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26622 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26623 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26624 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26625 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26627 /* MMX */
26628 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26629 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26630 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26631 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26632 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26633 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26635 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26636 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26637 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26638 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26639 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26640 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26641 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26642 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26644 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26645 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26647 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26648 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26649 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26650 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26652 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26653 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26654 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26655 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26656 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26657 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26659 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26660 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26661 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26662 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26663 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26664 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26666 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26667 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26668 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26670 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26672 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26673 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26674 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26675 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26676 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26677 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26679 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26680 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26681 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26682 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26683 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26684 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26686 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26687 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26688 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26689 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
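/* For the MMX shift entries above, the *_SI_COUNT prototypes take the
   shift count as a scalar integer (the psllwi/pslldi/psllqi forms), while
   the *_V4HI_COUNT, *_V2SI_COUNT and *_V1DI_COUNT prototypes take the
   count in an MMX register (the psllw/pslld/psllq forms).  */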
26691 /* 3DNow! */
26692 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26693 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26694 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26695 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26697 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26698 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26699 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26700 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26701 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26702 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26703 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26704 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26705 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26706 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26707 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26708 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26709 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26710 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26711 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26713 /* 3DNow!A */
26714 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26715 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26716 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26717 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26718 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26719 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26721 /* SSE */
26722 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26723 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26724 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26725 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26726 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26727 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26728 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26729 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26730 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26731 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26732 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26733 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26735 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26737 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26738 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26739 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26740 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26741 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26742 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26743 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26744 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
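/* A minimal sketch of how a two-operand entry in this table is reached
   from user code; the names v4sf and addps_example are illustrative and
   not defined in this file:

     typedef float v4sf __attribute__ ((vector_size (16)));

     v4sf
     addps_example (v4sf a, v4sf b)
     {
       return __builtin_ia32_addps (a, b);
     }

   Compiled with -msse, the call expands through CODE_FOR_addv4sf3 as
   recorded in the __builtin_ia32_addps entry above.  */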
26746 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26747 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26748 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26749 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26750 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26751 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26752 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26753 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26754 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26755 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26756 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26757 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26758 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26759 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26760 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26761 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26767 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
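/* In the comparison entries above, a *_SWAP prototype means the operands
   are exchanged before expansion, so cmpgt/cmpge reuse the LT/LE patterns
   with swapped arguments.  The negated forms (cmpnlt, cmpnle, cmpngt,
   cmpnge) use the unordered codes UNGE/UNGT, so a comparison involving a
   NaN yields an all-ones mask, matching the cmpps/cmpss predicates.  */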
26769 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26770 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26774 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26775 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26776 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26777 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26779 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26781 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26782 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26783 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26784 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26785 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26787 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26788 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26789 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26791 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26793 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26794 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26795 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26797 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26798 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26800 /* SSE MMX or 3DNow!A */
26801 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26802 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26803 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26805 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26806 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26807 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26808 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26810 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26811 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26813 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26815 /* SSE2 */
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26819 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26821 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26822 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26824 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26828 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26834 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26835 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26838 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26839 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26841 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26842 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26843 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26844 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26845 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26847 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26848 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26851 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26852 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26853 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26854 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26860 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26862 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26866 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26868 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26869 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26871 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26872 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26873 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26874 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26876 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26877 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26878 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26879 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26881 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26883 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26884 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26885 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26887 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26889 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26890 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26891 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26892 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26893 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26894 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26895 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26896 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26898 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26899 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26900 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26907 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26908 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26910 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26912 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26913 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26918 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26919 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26920 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26921 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26922 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26923 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26925 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26926 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26927 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26928 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26930 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26931 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26932 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26933 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26934 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26935 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26936 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26937 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26939 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26940 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26941 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26943 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26944 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26946 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26947 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26949 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26951 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26952 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26953 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26954 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26956 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26957 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26958 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26959 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26960 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26961 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26962 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26964 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26965 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26966 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26967 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26968 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26969 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26970 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26972 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26973 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26974 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26975 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26977 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26978 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26979 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26981 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26983 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26985 /* SSE2 MMX */
26986 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26987 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26989 /* SSE3 */
26990 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26991 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26993 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26994 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26995 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26996 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26997 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26998 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27000 /* SSSE3 */
27001 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27002 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27003 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27004 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27005 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27006 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27008 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27009 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27010 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27011 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27012 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27013 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27014 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27015 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27016 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27017 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27018 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27019 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27020 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27021 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27022 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27023 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27024 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27025 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27026 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27027 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27028 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27029 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27030 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27031 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27033 /* SSSE3. */
27034 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27035 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27037 /* SSE4.1 */
27038 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27039 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27040 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27041 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27042 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27043 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27044 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27045 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27046 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27047 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27049 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27050 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27051 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27052 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27053 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27054 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27055 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27056 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27057 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27058 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27059 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27060 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27061 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27063 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27064 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27065 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27066 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27067 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27068 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27069 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27070 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27071 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27072 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27073 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27074 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27076 /* SSE4.1 round and ptest builtins.  */
27077 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27078 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27079 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27080 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
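   /* In the floor/ceil/trunc/rint wrappers below the comparison field
      carries the rounding-mode immediate (ROUND_FLOOR, ROUND_CEIL,
      ROUND_TRUNC, ROUND_MXCSR); the *_ROUND function types tell the
      expander to emit it as the round insn's immediate operand.  */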
27082 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27083 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27084 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27085 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27087 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27088 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27090 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27091 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27093 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27094 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27095 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27096 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27098 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27099 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27101 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27102 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
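   /* For the ptest (and later vtest) entries the comparison code selects
      the condition read back from the flags the instruction sets:
      EQ -> ZF (testz), LTU -> CF (testc), GTU -> neither flag set
      (testnzc).  */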
27104 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27105 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27106 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27108 /* SSE4.2 */
27109 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27110 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27111 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27112 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27113 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27115 /* SSE4A */
27116 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27117 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27118 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27119 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27121 /* AES */
27122 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27123 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27125 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27126 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27127 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27128 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27130 /* PCLMUL */
27131 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27133 /* AVX */
27134 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27135 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27138 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27139 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27140 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27142 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27146 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27148 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27149 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27150 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27151 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27152 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27153 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27154 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27155 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27156 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27157 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27158 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27159 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27162 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27163 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27164 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27166 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27169 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27170 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27172 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27173 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27174 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27175 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27177 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27178 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27179 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27181 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27182 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27183 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27184 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27185 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27186 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27187 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27188 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27189 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27190 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27191 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27192 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27193 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27194 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27195 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27196 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27197 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27198 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27199 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27201 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27202 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27203 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27205 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27206 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27207 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27208 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27209 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27211 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27213 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27214 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27216 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27217 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27218 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27219 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27221 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27222 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27224 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27225 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27227 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27228 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27229 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27230 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27232 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27235 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27236 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27238 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27239 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27241 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27243 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27244 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27245 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27246 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27247 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27248 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27250 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27251 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27252 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27253 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27254 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27255 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27256 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27257 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27258 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27259 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27260 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27261 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27262 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27263 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27264 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27266 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27267 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27269 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27270 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27272 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27274 /* AVX2 */
27275 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27276 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27277 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27278 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27283 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27284 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27285 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27286 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27290 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27291 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27292 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27293 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27294 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27295 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27296 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27297 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27298 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27299 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27300 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27301 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27302 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27303 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27304 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27305 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27306 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27307 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27308 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27309 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27310 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27311 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27312 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27313 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27314 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27315 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27316 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27317 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27318 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27319 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27320 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27321 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27322 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27323 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27324 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27325 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27326 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27327 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27328 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27329 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27330 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27331 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27332 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27333 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27334 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27335 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27336 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27337 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27338 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27339 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27340 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27341 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27342 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27343 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27344 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27345 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27346 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27347 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27348 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27349 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27350 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27351 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27352 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27353 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27354 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
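   /* The AVX2 shifts come in pairs: the ...i builtins take a scalar count
      (..._SI_COUNT / ..._INT_COUNT types) while the others take the count
      from an XMM operand (..._V8HI_COUNT etc.); the _CONVERT types mark
      builtins whose prototype modes differ from the insn's modes
      (e.g. V4DI vs. V2TI for pslldqi256) and are converted on expansion.  */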
27355 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27356 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27357 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27358 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27359 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27360 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27361 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27362 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27363 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27364 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27365 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27366 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27367 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27368 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27369 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27370 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27371 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27372 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27373 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27374 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27375 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27376 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27377 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27378 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27379 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27380 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27381 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27382 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27383 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27384 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27385 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27386 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27387 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27388 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27389 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27390 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27391 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27392 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27393 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27394 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27395 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27396 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27397 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27398 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27399 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27400 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27401 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27402 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27403 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27404 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27405 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27406 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27407 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27408 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27409 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27410 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27411 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27412 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27413 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27414 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27415 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27416 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27417 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27418 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27419 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27420 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
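   /* LZCNT */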
27422 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27424 /* BMI */
27425 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27426 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27427 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27429 /* TBM */
27430 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27431 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27433 /* F16C */
27434 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27435 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27436 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27437 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27439 /* BMI2 */
27440 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27441 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27442 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27443 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27444 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27445 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27446 };
27448 /* FMA4 and XOP. */
27449 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27450 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27451 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27452 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27453 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27454 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27455 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27456 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27457 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27458 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27459 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27460 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27461 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27462 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27463 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27464 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27465 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27466 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27467 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27468 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27469 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27470 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27471 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27472 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27473 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27474 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27475 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27476 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27477 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27478 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27479 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27480 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27481 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27482 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27483 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27484 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27485 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27486 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27487 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27488 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27489 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27490 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27491 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27492 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27493 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27494 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27495 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27496 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27497 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27498 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27499 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27500 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
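/* The MULTI_ARG_* macros above are shorthand for the ix86_builtin_func_type
   values used in the FMA4/XOP table below, keeping the table rows compact.  */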
27502 static const struct builtin_description bdesc_multi_arg[] =
27503 {
27504 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27505 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27506 UNKNOWN, (int)MULTI_ARG_3_SF },
27507 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27508 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27509 UNKNOWN, (int)MULTI_ARG_3_DF },
27511 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27512 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27513 UNKNOWN, (int)MULTI_ARG_3_SF },
27514 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27515 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27516 UNKNOWN, (int)MULTI_ARG_3_DF },
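   /* The entries below list both FMA and FMA4 in their ISA mask; the mask
      is matched bit-wise, so these builtins are enabled when either ISA is
      available.  */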
27518 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27519 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27520 UNKNOWN, (int)MULTI_ARG_3_SF },
27521 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27522 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27523 UNKNOWN, (int)MULTI_ARG_3_DF },
27524 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27525 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27526 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27527 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27528 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27529 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27531 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27532 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27533 UNKNOWN, (int)MULTI_ARG_3_SF },
27534 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27535 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27536 UNKNOWN, (int)MULTI_ARG_3_DF },
27537 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27538 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27539 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27540 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27541 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27542 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27556 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27564 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27567 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27570 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27571 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27572 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27573 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27575 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27576 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27577 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27578 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27579 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27580 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27581 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27582 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27583 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27584 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27585 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27586 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27587 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27588 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27589 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27590 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27592 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27593 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27594 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27595 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27596 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27597 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27599 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27600 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27601 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27602 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27603 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27604 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27605 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27606 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27607 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27608 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27609 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27610 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27611 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27612 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27613 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27615 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27616 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27617 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27618 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27619 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27620 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27621 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27623 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27624 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27625 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27626 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27627 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27628 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27629 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27631 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27632 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27633 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27634 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27635 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27636 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27637 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27639 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27640 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27641 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27642 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27643 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27644 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27645 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27647 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27648 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27649 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27650 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27651 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27652 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27653 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27655 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27656 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27657 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27658 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27659 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27660 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27661 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27663 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27664 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27665 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27666 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27667 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27668 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27669 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27671 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27672 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27673 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27674 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27675 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27676 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27677 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27679 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27680 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27681 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27682 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27683   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,    "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
27684   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,     "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
27685   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,     "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
27686   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,     "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
27688 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27689 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27690 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27691 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27692 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27693 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27694 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27695 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27697 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27698 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27699 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27700 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
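/* Illustrative note (not a table entry): each row above only describes a
   builtin -- the ISA mask gates when it is registered, the insn code picks
   the expander, and the MULTI_ARG_* flag selects the function type.  As a
   rough sketch, the FMA4 row using CODE_FOR_fma4i_fmadd_v4sf corresponds
   to a builtin usable as

       __v4sf r = __builtin_ia32_vfmaddps (a, b, c);

   which computes r[i] = a[i] * b[i] + c[i], with the V4SF x V4SF x V4SF
   -> V4SF signature implied by MULTI_ARG_3_SF.  */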
27704 /* TM vector builtins. */
27706 /* Reuse the existing x86-specific `struct builtin_description' because
27707    we're lazy.  Add casts to make the fields fit.  */
27708 static const struct builtin_description bdesc_tm[] =
27710 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27711 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27712 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27713 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27714 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27715 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27716 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27718 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27719 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27720 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27721 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27722 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27723 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27724 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27726 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27727 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27728 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27729 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27730 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27731 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27732 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27734 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27735 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27736 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
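/* Illustrative note (not a table entry): the rows above bind the x86
   vector TM builtins to the generic BUILT_IN_TM_* codes.  The SSE row for
   "__builtin__ITM_WM128", for instance, uses VOID_FTYPE_PV4SF_V4SF, i.e.
   a transactional 128-bit vector store of the rough shape

       void _ITM_WM128 (v4sf_type *ptr, v4sf_type val);

   where v4sf_type stands in for the 4 x float vector type; the builtin is
   registered below by ix86_init_tm_builtins, which also records the bare
   "_ITM_WM128" spelling for direct calls.  */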
27739 /* TM callbacks. */
27741 /* Return the builtin decl needed to load a vector of TYPE. */
27743 static tree
27744 ix86_builtin_tm_load (tree type)
27746 if (TREE_CODE (type) == VECTOR_TYPE)
27748 switch (tree_low_cst (TYPE_SIZE (type), 1))
27750 case 64:
27751 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27752 case 128:
27753 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27754 case 256:
27755 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27758 return NULL_TREE;
27761 /* Return the builtin decl needed to store a vector of TYPE. */
27763 static tree
27764 ix86_builtin_tm_store (tree type)
27766 if (TREE_CODE (type) == VECTOR_TYPE)
27768 switch (tree_low_cst (TYPE_SIZE (type), 1))
27770 case 64:
27771 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27772 case 128:
27773 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27774 case 256:
27775 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27778 return NULL_TREE;
27781 /* Initialize the transactional memory vector load/store builtins. */
27783 static void
27784 ix86_init_tm_builtins (void)
27786 enum ix86_builtin_func_type ftype;
27787 const struct builtin_description *d;
27788 size_t i;
27789 tree decl;
27790 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27791 tree attrs_log, attrs_type_log;
27793 if (!flag_tm)
27794 return;
27796 /* If there are no builtins defined, we must be compiling in a
27797 language without trans-mem support. */
27798 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27799 return;
27801 /* Use whatever attributes a normal TM load has. */
27802 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27803 attrs_load = DECL_ATTRIBUTES (decl);
27804 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27805 /* Use whatever attributes a normal TM store has. */
27806 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27807 attrs_store = DECL_ATTRIBUTES (decl);
27808 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27809 /* Use whatever attributes a normal TM log has. */
27810 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27811 attrs_log = DECL_ATTRIBUTES (decl);
27812 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27814 for (i = 0, d = bdesc_tm;
27815 i < ARRAY_SIZE (bdesc_tm);
27816 i++, d++)
27818 if ((d->mask & ix86_isa_flags) != 0
27819 || (lang_hooks.builtin_function
27820 == lang_hooks.builtin_function_ext_scope))
27822 tree type, attrs, attrs_type;
27823 enum built_in_function code = (enum built_in_function) d->code;
27825 ftype = (enum ix86_builtin_func_type) d->flag;
27826 type = ix86_get_builtin_func_type (ftype);
27828 if (BUILTIN_TM_LOAD_P (code))
27830 attrs = attrs_load;
27831 attrs_type = attrs_type_load;
27833 else if (BUILTIN_TM_STORE_P (code))
27835 attrs = attrs_store;
27836 attrs_type = attrs_type_store;
27838 else
27840 attrs = attrs_log;
27841 attrs_type = attrs_type_log;
27843 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27844 /* The builtin without the prefix for
27845 calling it directly. */
27846 d->name + strlen ("__builtin_"),
27847 attrs);
27848 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27849 set the TYPE_ATTRIBUTES. */
27850 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27852 set_builtin_decl (code, decl, false);
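/* Worked example (illustrative): for the bdesc_tm entry
   "__builtin__ITM_RM128" the loop above builds the function type from
   V4SF_FTYPE_PCV4SF, registers the builtin under that name, passes
   d->name + strlen ("__builtin_") -- i.e. "_ITM_RM128" -- as the name
   usable for direct calls, and copies the normal TM load attributes onto
   both the new decl and its type.  */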
27857 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
27858    in the current target ISA, to allow the user to compile particular modules
27859    with target-specific options that differ from the command-line
27860    options.  */
27861 static void
27862 ix86_init_mmx_sse_builtins (void)
27864 const struct builtin_description * d;
27865 enum ix86_builtin_func_type ftype;
27866 size_t i;
27868 /* Add all special builtins with variable number of operands. */
27869 for (i = 0, d = bdesc_special_args;
27870 i < ARRAY_SIZE (bdesc_special_args);
27871 i++, d++)
27873 if (d->name == 0)
27874 continue;
27876 ftype = (enum ix86_builtin_func_type) d->flag;
27877 def_builtin (d->mask, d->name, ftype, d->code);
27880 /* Add all builtins with variable number of operands. */
27881 for (i = 0, d = bdesc_args;
27882 i < ARRAY_SIZE (bdesc_args);
27883 i++, d++)
27885 if (d->name == 0)
27886 continue;
27888 ftype = (enum ix86_builtin_func_type) d->flag;
27889 def_builtin_const (d->mask, d->name, ftype, d->code);
27892 /* pcmpestr[im] insns. */
27893 for (i = 0, d = bdesc_pcmpestr;
27894 i < ARRAY_SIZE (bdesc_pcmpestr);
27895 i++, d++)
27897 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27898 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27899 else
27900 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27901 def_builtin_const (d->mask, d->name, ftype, d->code);
27904 /* pcmpistr[im] insns. */
27905 for (i = 0, d = bdesc_pcmpistr;
27906 i < ARRAY_SIZE (bdesc_pcmpistr);
27907 i++, d++)
27909 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27910 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27911 else
27912 ftype = INT_FTYPE_V16QI_V16QI_INT;
27913 def_builtin_const (d->mask, d->name, ftype, d->code);
27916 /* comi/ucomi insns. */
27917 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27919 if (d->mask == OPTION_MASK_ISA_SSE2)
27920 ftype = INT_FTYPE_V2DF_V2DF;
27921 else
27922 ftype = INT_FTYPE_V4SF_V4SF;
27923 def_builtin_const (d->mask, d->name, ftype, d->code);
27926 /* SSE */
27927 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27928 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27929 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27930 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27932 /* SSE or 3DNow!A */
27933 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27934 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27935 IX86_BUILTIN_MASKMOVQ);
27937 /* SSE2 */
27938 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27939 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27941 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27942 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27943 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27944 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27946 /* SSE3. */
27947 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27948 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27949 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27950 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27952 /* AES */
27953 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27954 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27955 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27956 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27957 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27958 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27959 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27960 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27961 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27962 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27963 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27964 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27966 /* PCLMUL */
27967 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27968 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27970 /* RDRND */
27971 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27972 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27973 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27974 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27975 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27976 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27977 IX86_BUILTIN_RDRAND64_STEP);
27979 /* AVX2 */
27980 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27981 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27982 IX86_BUILTIN_GATHERSIV2DF);
27984 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27985 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27986 IX86_BUILTIN_GATHERSIV4DF);
27988 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27989 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27990 IX86_BUILTIN_GATHERDIV2DF);
27992 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27993 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27994 IX86_BUILTIN_GATHERDIV4DF);
27996 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27997 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27998 IX86_BUILTIN_GATHERSIV4SF);
28000 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28001 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28002 IX86_BUILTIN_GATHERSIV8SF);
28004 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28005 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28006 IX86_BUILTIN_GATHERDIV4SF);
28008 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28009 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28010 IX86_BUILTIN_GATHERDIV8SF);
28012 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28013 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28014 IX86_BUILTIN_GATHERSIV2DI);
28016 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28017 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28018 IX86_BUILTIN_GATHERSIV4DI);
28020 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28021 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28022 IX86_BUILTIN_GATHERDIV2DI);
28024 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28025 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28026 IX86_BUILTIN_GATHERDIV4DI);
28028 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28029 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28030 IX86_BUILTIN_GATHERSIV4SI);
28032 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28033 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28034 IX86_BUILTIN_GATHERSIV8SI);
28036 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28037 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28038 IX86_BUILTIN_GATHERDIV4SI);
28040 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28041 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28042 IX86_BUILTIN_GATHERDIV8SI);
28044   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28045 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28046 IX86_BUILTIN_GATHERALTSIV4DF);
28048   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28049 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28050 IX86_BUILTIN_GATHERALTDIV8SF);
28052   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28053 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28054 IX86_BUILTIN_GATHERALTSIV4DI);
28056   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28057 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28058 IX86_BUILTIN_GATHERALTDIV8SI);
28060 /* RTM. */
28061 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28062 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28064 /* MMX access to the vec_init patterns. */
28065 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28066 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28068 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28069 V4HI_FTYPE_HI_HI_HI_HI,
28070 IX86_BUILTIN_VEC_INIT_V4HI);
28072 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28073 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28074 IX86_BUILTIN_VEC_INIT_V8QI);
28076 /* Access to the vec_extract patterns. */
28077 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28078 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28079 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28080 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28081 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28082 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28083 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28084 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28085 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28086 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28088 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28089 "__builtin_ia32_vec_ext_v4hi",
28090 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28092 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28093 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28095 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28096 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28098 /* Access to the vec_set patterns. */
28099 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28100 "__builtin_ia32_vec_set_v2di",
28101 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28103 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28104 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28106 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28107 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28109 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28110 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28112 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28113 "__builtin_ia32_vec_set_v4hi",
28114 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28116 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28117 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28119 /* RDSEED */
28120 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28121 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28122 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28123 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28124 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28125 "__builtin_ia32_rdseed_di_step",
28126 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28128 /* ADCX */
28129 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28130 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28131 def_builtin (OPTION_MASK_ISA_64BIT,
28132 "__builtin_ia32_addcarryx_u64",
28133 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28134 IX86_BUILTIN_ADDCARRYX64);
28136 /* Add FMA4 multi-arg argument instructions */
28137 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28139 if (d->name == 0)
28140 continue;
28142 ftype = (enum ix86_builtin_func_type) d->flag;
28143 def_builtin_const (d->mask, d->name, ftype, d->code);
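/* Note (illustrative): the ISA mask passed to def_builtin and
   def_builtin_const controls availability.  The AVX2 gather builtins
   registered above, for example, are tied to OPTION_MASK_ISA_AVX2 and
   only become usable once AVX2 is enabled on the command line or via a
   target attribute, whereas a mask of 0, as used for
   __builtin_ia32_addcarryx_u32, places no ISA requirement on the
   builtin.  */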
28147 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28148 to return a pointer to VERSION_DECL if the outcome of the expression
28149 formed by PREDICATE_CHAIN is true. This function will be called during
28150 version dispatch to decide which function version to execute. It returns
28151 the basic block at the end, to which more conditions can be added. */
28153 static basic_block
28154 add_condition_to_bb (tree function_decl, tree version_decl,
28155 tree predicate_chain, basic_block new_bb)
28157 gimple return_stmt;
28158 tree convert_expr, result_var;
28159 gimple convert_stmt;
28160 gimple call_cond_stmt;
28161 gimple if_else_stmt;
28163 basic_block bb1, bb2, bb3;
28164 edge e12, e23;
28166 tree cond_var, and_expr_var = NULL_TREE;
28167 gimple_seq gseq;
28169 tree predicate_decl, predicate_arg;
28171 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28173 gcc_assert (new_bb != NULL);
28174 gseq = bb_seq (new_bb);
28177 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28178 build_fold_addr_expr (version_decl));
28179 result_var = create_tmp_var (ptr_type_node, NULL);
28180 convert_stmt = gimple_build_assign (result_var, convert_expr);
28181 return_stmt = gimple_build_return (result_var);
28183 if (predicate_chain == NULL_TREE)
28185 gimple_seq_add_stmt (&gseq, convert_stmt);
28186 gimple_seq_add_stmt (&gseq, return_stmt);
28187 set_bb_seq (new_bb, gseq);
28188 gimple_set_bb (convert_stmt, new_bb);
28189 gimple_set_bb (return_stmt, new_bb);
28190 pop_cfun ();
28191 return new_bb;
28194 while (predicate_chain != NULL)
28196 cond_var = create_tmp_var (integer_type_node, NULL);
28197 predicate_decl = TREE_PURPOSE (predicate_chain);
28198 predicate_arg = TREE_VALUE (predicate_chain);
28199 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28200 gimple_call_set_lhs (call_cond_stmt, cond_var);
28202 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28203 gimple_set_bb (call_cond_stmt, new_bb);
28204 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28206 predicate_chain = TREE_CHAIN (predicate_chain);
28208 if (and_expr_var == NULL)
28209 and_expr_var = cond_var;
28210 else
28212 gimple assign_stmt;
28213 	  /* Use MIN_EXPR to check whether any of the integers is zero:
28214 	     and_expr_var = min_expr <cond_var, and_expr_var>.  */
28215 assign_stmt = gimple_build_assign (and_expr_var,
28216 build2 (MIN_EXPR, integer_type_node,
28217 cond_var, and_expr_var));
28219 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28220 gimple_set_bb (assign_stmt, new_bb);
28221 gimple_seq_add_stmt (&gseq, assign_stmt);
28225 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28226 integer_zero_node,
28227 NULL_TREE, NULL_TREE);
28228 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28229 gimple_set_bb (if_else_stmt, new_bb);
28230 gimple_seq_add_stmt (&gseq, if_else_stmt);
28232 gimple_seq_add_stmt (&gseq, convert_stmt);
28233 gimple_seq_add_stmt (&gseq, return_stmt);
28234 set_bb_seq (new_bb, gseq);
28236 bb1 = new_bb;
28237 e12 = split_block (bb1, if_else_stmt);
28238 bb2 = e12->dest;
28239 e12->flags &= ~EDGE_FALLTHRU;
28240 e12->flags |= EDGE_TRUE_VALUE;
28242 e23 = split_block (bb2, return_stmt);
28244 gimple_set_bb (convert_stmt, bb2);
28245 gimple_set_bb (return_stmt, bb2);
28247 bb3 = e23->dest;
28248 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28250 remove_edge (e23);
28251 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28253 pop_cfun ();
28255 return bb3;
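/* Sketch of what the above builds for a single predicate (illustrative
   pseudo-GIMPLE; the names are made up):

       cond_1 = __builtin_cpu_supports ("avx");   <- call_cond_stmt
       if (cond_1 > 0)                            <- if_else_stmt
         {
           result_2 = (void *) &foo.avx;          <- convert_stmt
           return result_2;                       <- return_stmt
         }
       ... fall through to the next version's checks in BB3 ...

   With several predicates, the MIN_EXPR chain makes the branch taken
   only when every predicate call returned a non-zero value.  */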
28258 /* This parses the attribute arguments to target in DECL and determines
28259 the right builtin to use to match the platform specification.
28260 It returns the priority value for this version decl. If PREDICATE_LIST
28261 is not NULL, it stores the list of cpu features that need to be checked
28262 before dispatching this function. */
28264 static unsigned int
28265 get_builtin_code_for_version (tree decl, tree *predicate_list)
28267 tree attrs;
28268 struct cl_target_option cur_target;
28269 tree target_node;
28270 struct cl_target_option *new_target;
28271 const char *arg_str = NULL;
28272 const char *attrs_str = NULL;
28273 char *tok_str = NULL;
28274 char *token;
28276 /* Priority of i386 features, greater value is higher priority. This is
28277 used to decide the order in which function dispatch must happen. For
28278 instance, a version specialized for SSE4.2 should be checked for dispatch
28279 before a version for SSE3, as SSE4.2 implies SSE3. */
28280 enum feature_priority
28282 P_ZERO = 0,
28283 P_MMX,
28284 P_SSE,
28285 P_SSE2,
28286 P_SSE3,
28287 P_SSSE3,
28288 P_PROC_SSSE3,
28289 P_SSE4_a,
28290 P_PROC_SSE4_a,
28291 P_SSE4_1,
28292 P_SSE4_2,
28293 P_PROC_SSE4_2,
28294 P_POPCNT,
28295 P_AVX,
28296 P_AVX2,
28297 P_FMA,
28298 P_PROC_FMA
28301 enum feature_priority priority = P_ZERO;
28303 /* These are the target attribute strings for which a dispatcher is
28304 available, from fold_builtin_cpu. */
28306 static struct _feature_list
28308 const char *const name;
28309 const enum feature_priority priority;
28311 const feature_list[] =
28313 {"mmx", P_MMX},
28314 {"sse", P_SSE},
28315 {"sse2", P_SSE2},
28316 {"sse3", P_SSE3},
28317 {"ssse3", P_SSSE3},
28318 {"sse4.1", P_SSE4_1},
28319 {"sse4.2", P_SSE4_2},
28320 {"popcnt", P_POPCNT},
28321 {"avx", P_AVX},
28322 {"avx2", P_AVX2}
28326 static unsigned int NUM_FEATURES
28327 = sizeof (feature_list) / sizeof (struct _feature_list);
28329 unsigned int i;
28331 tree predicate_chain = NULL_TREE;
28332 tree predicate_decl, predicate_arg;
28334 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28335 gcc_assert (attrs != NULL);
28337 attrs = TREE_VALUE (TREE_VALUE (attrs));
28339 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28340 attrs_str = TREE_STRING_POINTER (attrs);
28343 /* Handle arch= if specified. For priority, set it to be 1 more than
28344 the best instruction set the processor can handle. For instance, if
28345 there is a version for atom and a version for ssse3 (the highest ISA
28346 priority for atom), the atom version must be checked for dispatch
28347 before the ssse3 version. */
28348 if (strstr (attrs_str, "arch=") != NULL)
28350 cl_target_option_save (&cur_target, &global_options);
28351 target_node = ix86_valid_target_attribute_tree (attrs);
28353 gcc_assert (target_node);
28354 new_target = TREE_TARGET_OPTION (target_node);
28355 gcc_assert (new_target);
28357 if (new_target->arch_specified && new_target->arch > 0)
28359 switch (new_target->arch)
28361 case PROCESSOR_CORE2_32:
28362 case PROCESSOR_CORE2_64:
28363 arg_str = "core2";
28364 priority = P_PROC_SSSE3;
28365 break;
28366 case PROCESSOR_COREI7_32:
28367 case PROCESSOR_COREI7_64:
28368 arg_str = "corei7";
28369 priority = P_PROC_SSE4_2;
28370 break;
28371 case PROCESSOR_ATOM:
28372 arg_str = "atom";
28373 priority = P_PROC_SSSE3;
28374 break;
28375 case PROCESSOR_AMDFAM10:
28376 arg_str = "amdfam10h";
28377 priority = P_PROC_SSE4_a;
28378 break;
28379 case PROCESSOR_BDVER1:
28380 arg_str = "bdver1";
28381 priority = P_PROC_FMA;
28382 break;
28383 case PROCESSOR_BDVER2:
28384 arg_str = "bdver2";
28385 priority = P_PROC_FMA;
28386 break;
28390 cl_target_option_restore (&global_options, &cur_target);
28392 if (predicate_list && arg_str == NULL)
28394 error_at (DECL_SOURCE_LOCATION (decl),
28395 "No dispatcher found for the versioning attributes");
28396 return 0;
28399 if (predicate_list)
28401 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28402 /* For a C string literal the length includes the trailing NULL. */
28403 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28404 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28405 predicate_chain);
28409 /* Process feature name. */
28410 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28411 strcpy (tok_str, attrs_str);
28412 token = strtok (tok_str, ",");
28413 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28415 while (token != NULL)
28417 /* Do not process "arch=" */
28418 if (strncmp (token, "arch=", 5) == 0)
28420 token = strtok (NULL, ",");
28421 continue;
28423 for (i = 0; i < NUM_FEATURES; ++i)
28425 if (strcmp (token, feature_list[i].name) == 0)
28427 if (predicate_list)
28429 predicate_arg = build_string_literal (
28430 strlen (feature_list[i].name) + 1,
28431 feature_list[i].name);
28432 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28433 predicate_chain);
28435 /* Find the maximum priority feature. */
28436 if (feature_list[i].priority > priority)
28437 priority = feature_list[i].priority;
28439 break;
28442 if (predicate_list && i == NUM_FEATURES)
28444 error_at (DECL_SOURCE_LOCATION (decl),
28445 "No dispatcher found for %s", token);
28446 return 0;
28448 token = strtok (NULL, ",");
28450 free (tok_str);
28452 if (predicate_list && predicate_chain == NULL_TREE)
28454 error_at (DECL_SOURCE_LOCATION (decl),
28455 "No dispatcher found for the versioning attributes : %s",
28456 attrs_str);
28457 return 0;
28459 else if (predicate_list)
28461 predicate_chain = nreverse (predicate_chain);
28462 *predicate_list = predicate_chain;
28465 return priority;
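/* Worked example (illustrative): for a version declared with
   __attribute__ ((target ("arch=corei7"))) the code above selects
   arg_str "corei7", adds __builtin_cpu_is ("corei7") to *PREDICATE_LIST
   and returns P_PROC_SSE4_2.  For target ("avx,popcnt") it instead adds
   __builtin_cpu_supports calls for both feature names and returns the
   higher of their priorities, here P_AVX.  */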
28468 /* This compares the priority of target features in function DECL1
28469 and DECL2. It returns positive value if DECL1 is higher priority,
28470 negative value if DECL2 is higher priority and 0 if they are the
28471 same. */
28473 static int
28474 ix86_compare_version_priority (tree decl1, tree decl2)
28476 unsigned int priority1 = 0;
28477 unsigned int priority2 = 0;
28479 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28480 priority1 = get_builtin_code_for_version (decl1, NULL);
28482 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28483 priority2 = get_builtin_code_for_version (decl2, NULL);
28485 return (int)priority1 - (int)priority2;
28488 /* V1 and V2 point to function versions with different priorities
28489 based on the target ISA. This function compares their priorities. */
28491 static int
28492 feature_compare (const void *v1, const void *v2)
28494 typedef struct _function_version_info
28496 tree version_decl;
28497 tree predicate_chain;
28498 unsigned int dispatch_priority;
28499 } function_version_info;
28501 const function_version_info c1 = *(const function_version_info *)v1;
28502 const function_version_info c2 = *(const function_version_info *)v2;
28503 return (c2.dispatch_priority - c1.dispatch_priority);
28506 /* This function generates the dispatch function for
28507 multi-versioned functions. DISPATCH_DECL is the function which will
28508 contain the dispatch logic. FNDECLS are the function choices for
28509 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28510 in DISPATCH_DECL in which the dispatch code is generated. */
28512 static int
28513 dispatch_function_versions (tree dispatch_decl,
28514 void *fndecls_p,
28515 basic_block *empty_bb)
28517 tree default_decl;
28518 gimple ifunc_cpu_init_stmt;
28519 gimple_seq gseq;
28520 int ix;
28521 tree ele;
28522 VEC (tree, heap) *fndecls;
28523 unsigned int num_versions = 0;
28524 unsigned int actual_versions = 0;
28525 unsigned int i;
28527 struct _function_version_info
28529 tree version_decl;
28530 tree predicate_chain;
28531 unsigned int dispatch_priority;
28532 }*function_version_info;
28534 gcc_assert (dispatch_decl != NULL
28535 && fndecls_p != NULL
28536 && empty_bb != NULL);
28538   /* fndecls_p is actually a vector.  */
28539 fndecls = (VEC (tree, heap) *)fndecls_p;
28541 /* At least one more version other than the default. */
28542 num_versions = VEC_length (tree, fndecls);
28543 gcc_assert (num_versions >= 2);
28545 function_version_info = (struct _function_version_info *)
28546 XNEWVEC (struct _function_version_info, (num_versions - 1));
28548 /* The first version in the vector is the default decl. */
28549 default_decl = VEC_index (tree, fndecls, 0);
28551 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28553 gseq = bb_seq (*empty_bb);
28554 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28555      constructors, so explicitly call __builtin_cpu_init here.  */
28556 ifunc_cpu_init_stmt = gimple_build_call_vec (
28557 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], NULL);
28558 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28559 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28560 set_bb_seq (*empty_bb, gseq);
28562 pop_cfun ();
28565 for (ix = 1; VEC_iterate (tree, fndecls, ix, ele); ++ix)
28567 tree version_decl = ele;
28568 tree predicate_chain = NULL_TREE;
28569 unsigned int priority;
28570 /* Get attribute string, parse it and find the right predicate decl.
28571 The predicate function could be a lengthy combination of many
28572 features, like arch-type and various isa-variants. */
28573 priority = get_builtin_code_for_version (version_decl,
28574 &predicate_chain);
28576 if (predicate_chain == NULL_TREE)
28577 continue;
28579 actual_versions++;
28580 function_version_info [ix - 1].version_decl = version_decl;
28581 function_version_info [ix - 1].predicate_chain = predicate_chain;
28582 function_version_info [ix - 1].dispatch_priority = priority;
28585 /* Sort the versions according to descending order of dispatch priority. The
28586 priority is based on the ISA. This is not a perfect solution. There
28587 could still be ambiguity. If more than one function version is suitable
28588 to execute, which one should be dispatched? In future, allow the user
28589 to specify a dispatch priority next to the version. */
28590 qsort (function_version_info, actual_versions,
28591 sizeof (struct _function_version_info), feature_compare);
28593 for (i = 0; i < actual_versions; ++i)
28594 *empty_bb = add_condition_to_bb (dispatch_decl,
28595 function_version_info[i].version_decl,
28596 function_version_info[i].predicate_chain,
28597 *empty_bb);
28599   /* Dispatch the default version at the end.  */
28600 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28601 NULL, *empty_bb);
28603 free (function_version_info);
28604 return 0;
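/* User-level sketch of what the dispatcher built above serves
   (illustrative; the function name is made up):

       __attribute__ ((target ("default")))      int foo (void) { return 0; }
       __attribute__ ((target ("sse4.2")))       int foo (void) { return 1; }
       __attribute__ ((target ("arch=corei7")))  int foo (void) { return 2; }

   The generated resolver first calls __builtin_cpu_init, then tests the
   versions in decreasing dispatch priority -- the "arch=corei7" version
   before the plain "sse4.2" one -- and falls back to the default version
   last.  */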
28607 /* This function returns true if FN1 and FN2 are versions of the same function,
28608 that is, the targets of the function decls are different. This assumes
28609 that FN1 and FN2 have the same signature. */
28611 static bool
28612 ix86_function_versions (tree fn1, tree fn2)
28614 tree attr1, attr2;
28615 struct cl_target_option *target1, *target2;
28617 if (TREE_CODE (fn1) != FUNCTION_DECL
28618 || TREE_CODE (fn2) != FUNCTION_DECL)
28619 return false;
28621 attr1 = DECL_FUNCTION_SPECIFIC_TARGET (fn1);
28622 attr2 = DECL_FUNCTION_SPECIFIC_TARGET (fn2);
28624   /* At least one function decl should have the target attribute specified.  */
28625 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
28626 return false;
28628 if (attr1 == NULL_TREE)
28629 attr1 = target_option_default_node;
28630 else if (attr2 == NULL_TREE)
28631 attr2 = target_option_default_node;
28633 target1 = TREE_TARGET_OPTION (attr1);
28634 target2 = TREE_TARGET_OPTION (attr2);
28636 /* target1 and target2 must be different in some way. */
28637 if (target1->x_ix86_isa_flags == target2->x_ix86_isa_flags
28638 && target1->x_target_flags == target2->x_target_flags
28639 && target1->arch == target2->arch
28640 && target1->tune == target2->tune
28641 && target1->x_ix86_fpmath == target2->x_ix86_fpmath
28642 && target1->branch_cost == target2->branch_cost)
28643 return false;
28645 return true;
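/* Example (illustrative): two declarations of the same function, one
   carrying __attribute__ ((target ("avx"))) and the other
   __attribute__ ((target ("sse4.2"))), differ in x_ix86_isa_flags and are
   therefore treated as distinct versions; declarations whose saved target
   options compare equal in every field checked above are not.  */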
28648 /* Comparator function to be used in qsort routine to sort attribute
28649 specification strings to "target". */
28651 static int
28652 attr_strcmp (const void *v1, const void *v2)
28654 const char *c1 = *(char *const*)v1;
28655 const char *c2 = *(char *const*)v2;
28656 return strcmp (c1, c2);
28659 /* STR is the argument to target attribute. This function tokenizes
28660 the comma separated arguments, sorts them and returns a string which
28661 is a unique identifier for the comma separated arguments. It also
28662 replaces non-identifier characters "=,-" with "_". */
28664 static char *
28665 sorted_attr_string (const char *str)
28667 char **args = NULL;
28668 char *attr_str, *ret_str;
28669 char *attr = NULL;
28670 unsigned int argnum = 1;
28671 unsigned int i;
28673 for (i = 0; i < strlen (str); i++)
28674 if (str[i] == ',')
28675 argnum++;
28677 attr_str = (char *)xmalloc (strlen (str) + 1);
28678 strcpy (attr_str, str);
28680 /* Replace "=,-" with "_". */
28681 for (i = 0; i < strlen (attr_str); i++)
28682 if (attr_str[i] == '=' || attr_str[i]== '-')
28683 attr_str[i] = '_';
28685 if (argnum == 1)
28686 return attr_str;
28688 args = XNEWVEC (char *, argnum);
28690 i = 0;
28691 attr = strtok (attr_str, ",");
28692 while (attr != NULL)
28694 args[i] = attr;
28695 i++;
28696 attr = strtok (NULL, ",");
28699 qsort (args, argnum, sizeof (char*), attr_strcmp);
28701 ret_str = (char *)xmalloc (strlen (str) + 1);
28702 strcpy (ret_str, args[0]);
28703 for (i = 1; i < argnum; i++)
28705 strcat (ret_str, "_");
28706 strcat (ret_str, args[i]);
28709 free (args);
28710 free (attr_str);
28711 return ret_str;
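/* Worked example (illustrative): for the attribute string
   "avx,arch=corei7" the '=' is first rewritten to '_', the two tokens are
   sorted, and the result is "arch_corei7_avx".  A single-token string
   such as "sse4.2" is returned unchanged apart from the '='/'-'
   rewriting.  */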
28714 /* This function changes the assembler name for functions that are
28715 versions. If DECL is a function version and has a "target"
28716 attribute, it appends the attribute string to its assembler name. */
28718 static tree
28719 ix86_mangle_function_version_assembler_name (tree decl, tree id)
28721 tree version_attr;
28722 const char *orig_name, *version_string, *attr_str;
28723 char *assembler_name;
28725 if (DECL_DECLARED_INLINE_P (decl)
28726 && lookup_attribute ("gnu_inline",
28727 DECL_ATTRIBUTES (decl)))
28728 error_at (DECL_SOURCE_LOCATION (decl),
28729 "Function versions cannot be marked as gnu_inline,"
28730 " bodies have to be generated");
28732 if (DECL_VIRTUAL_P (decl)
28733 || DECL_VINDEX (decl))
28734 error_at (DECL_SOURCE_LOCATION (decl),
28735 "Virtual function versioning not supported\n");
28737 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28739 /* target attribute string is NULL for default functions. */
28740 if (version_attr == NULL_TREE)
28741 return id;
28743 orig_name = IDENTIFIER_POINTER (id);
28744 version_string
28745 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
28747 attr_str = sorted_attr_string (version_string);
28748 assembler_name = (char *) xmalloc (strlen (orig_name)
28749 + strlen (attr_str) + 2);
28751 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
28753 /* Allow assembler name to be modified if already set. */
28754 if (DECL_ASSEMBLER_NAME_SET_P (decl))
28755 SET_DECL_RTL (decl, NULL);
28757 return get_identifier (assembler_name);
28760 static tree
28761 ix86_mangle_decl_assembler_name (tree decl, tree id)
28763 /* For function version, add the target suffix to the assembler name. */
28764 if (TREE_CODE (decl) == FUNCTION_DECL
28765 && DECL_FUNCTION_VERSIONED (decl))
28766 return ix86_mangle_function_version_assembler_name (decl, id);
28768 return id;
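/* Example (illustrative, using a C-style name for brevity): a version of
   foo declared with __attribute__ ((target ("avx,popcnt"))) gets the
   assembler name "foo.avx_popcnt", while the default version keeps its
   original assembler name.  */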
28771 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
28772 is true, append the full path name of the source file. */
28774 static char *
28775 make_name (tree decl, const char *suffix, bool make_unique)
28777 char *global_var_name;
28778 int name_len;
28779 const char *name;
28780 const char *unique_name = NULL;
28782 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
28784 /* Get a unique name that can be used globally without any chances
28785 of collision at link time. */
28786 if (make_unique)
28787 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
28789 name_len = strlen (name) + strlen (suffix) + 2;
28791 if (make_unique)
28792 name_len += strlen (unique_name) + 1;
28793 global_var_name = XNEWVEC (char, name_len);
28795 /* Use '.' to concatenate names as it is demangler friendly. */
28796 if (make_unique)
28797 snprintf (global_var_name, name_len, "%s.%s.%s", name,
28798 unique_name, suffix);
28799 else
28800 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
28802 return global_var_name;
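/* Example (illustrative): make_name (foo_decl, "resolver", false) yields
   "foo.resolver"; with MAKE_UNIQUE true, a file-unique string from
   get_file_function_name is spliced in, giving "foo.<unique>.resolver".
   make_dispatcher_decl below uses the "ifunc" and "resolver" suffixes.  */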
28805 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
28807 /* Make a dispatcher declaration for the multi-versioned function DECL.
28808 Calls to DECL function will be replaced with calls to the dispatcher
28809 by the front-end. Return the decl created. */
28811 static tree
28812 make_dispatcher_decl (const tree decl)
28814 tree func_decl;
28815 char *func_name, *resolver_name;
28816 tree fn_type, func_type;
28817 bool is_uniq = false;
28819 if (TREE_PUBLIC (decl) == 0)
28820 is_uniq = true;
28822 func_name = make_name (decl, "ifunc", is_uniq);
28823 resolver_name = make_name (decl, "resolver", is_uniq);
28824 gcc_assert (resolver_name);
28826 fn_type = TREE_TYPE (decl);
28827 func_type = build_function_type (TREE_TYPE (fn_type),
28828 TYPE_ARG_TYPES (fn_type));
28830 func_decl = build_fn_decl (func_name, func_type);
28831 TREE_USED (func_decl) = 1;
28832 DECL_CONTEXT (func_decl) = NULL_TREE;
28833 DECL_INITIAL (func_decl) = error_mark_node;
28834 DECL_ARTIFICIAL (func_decl) = 1;
28835   /* Mark this function as external; the resolver will flip it again if
28836      it gets generated.  */
28837 DECL_EXTERNAL (func_decl) = 1;
28838   /* IFUNCs have to be externally visible.  */
28839 TREE_PUBLIC (func_decl) = 1;
28841 return func_decl;
28844 #endif
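/* Illustrative sketch of the dispatcher built above for a versioned
   function "foo" (hypothetical name): a public, artificial, external
   declaration with foo's type, named "foo.ifunc".  make_resolver_func
   later tags it with __attribute__ ((ifunc ("foo.resolver"))) so the
   dynamic loader resolves calls to the selected version at load time.  */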
28846 /* Returns true if DECL is multi-versioned and is the default function,
28847    that is, it is not tagged with target-specific optimization.  */
28849 static bool
28850 is_function_default_version (const tree decl)
28852 return (TREE_CODE (decl) == FUNCTION_DECL
28853 && DECL_FUNCTION_VERSIONED (decl)
28854 && DECL_FUNCTION_SPECIFIC_TARGET (decl) == NULL_TREE);
28857 /* Make a dispatcher declaration for the multi-versioned function DECL.
28858 Calls to DECL function will be replaced with calls to the dispatcher
28859 by the front-end. Returns the decl of the dispatcher function. */
28861 static tree
28862 ix86_get_function_versions_dispatcher (void *decl)
28864 tree fn = (tree) decl;
28865 struct cgraph_node *node = NULL;
28866 struct cgraph_node *default_node = NULL;
28867 struct cgraph_function_version_info *node_v = NULL;
28868 struct cgraph_function_version_info *it_v = NULL;
28869 struct cgraph_function_version_info *first_v = NULL;
28871 tree dispatch_decl = NULL;
28872 struct cgraph_node *dispatcher_node = NULL;
28873 struct cgraph_function_version_info *dispatcher_version_info = NULL;
28875 struct cgraph_function_version_info *default_version_info = NULL;
28877 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
28879 node = cgraph_get_node (fn);
28880 gcc_assert (node != NULL);
28882 node_v = get_cgraph_node_version (node);
28883 gcc_assert (node_v != NULL);
28885 if (node_v->dispatcher_resolver != NULL)
28886 return node_v->dispatcher_resolver;
28888 /* Find the default version and make it the first node. */
28889 first_v = node_v;
28890   /* Go to the beginning of the chain.  */
28891 while (first_v->prev != NULL)
28892 first_v = first_v->prev;
28893 default_version_info = first_v;
28894 while (default_version_info != NULL)
28896 if (is_function_default_version
28897 (default_version_info->this_node->symbol.decl))
28898 break;
28899 default_version_info = default_version_info->next;
28902 /* If there is no default node, just return NULL. */
28903 if (default_version_info == NULL)
28904 return NULL;
28906 /* Make default info the first node. */
28907 if (first_v != default_version_info)
28909 default_version_info->prev->next = default_version_info->next;
28910 if (default_version_info->next)
28911 default_version_info->next->prev = default_version_info->prev;
28912 first_v->prev = default_version_info;
28913 default_version_info->next = first_v;
28914 default_version_info->prev = NULL;
28917 default_node = default_version_info->this_node;
28919 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
28920 /* Right now, the dispatching is done via ifunc. */
28921 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
28922 #else
28923 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
28924 "Multiversioning needs ifunc which is not supported "
28925 "in this configuration");
28926 #endif
28928 dispatcher_node = cgraph_get_create_node (dispatch_decl);
28929 gcc_assert (dispatcher_node != NULL);
28930 dispatcher_node->dispatcher_function = 1;
28931 dispatcher_version_info
28932 = insert_new_cgraph_node_version (dispatcher_node);
28933 dispatcher_version_info->next = default_version_info;
28934 dispatcher_node->local.finalized = 1;
28936 /* Set the dispatcher for all the versions. */
28937 it_v = default_version_info;
28938 while (it_v->next != NULL)
28940 it_v->dispatcher_resolver = dispatch_decl;
28941 it_v = it_v->next;
28944 return dispatch_decl;
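/* For example (illustrative), if the version chain is

       foo.avx <-> foo <-> foo.sse4.2

   the loop above moves the default version to the front:

       foo <-> foo.avx <-> foo.sse4.2

   and the dispatcher's version info is then linked ahead of "foo".  */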
28947 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
28948 it to CHAIN. */
28950 static tree
28951 make_attribute (const char *name, const char *arg_name, tree chain)
28953 tree attr_name;
28954 tree attr_arg_name;
28955 tree attr_args;
28956 tree attr;
28958 attr_name = get_identifier (name);
28959 attr_arg_name = build_string (strlen (arg_name), arg_name);
28960 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
28961 attr = tree_cons (attr_name, attr_args, chain);
28962 return attr;
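/* Minimal usage sketch: make_attribute ("ifunc", "foo.resolver", chain)
   builds the tree form of __attribute__ ((ifunc ("foo.resolver"))) and
   chains it in front of CHAIN ("foo.resolver" is a hypothetical name).  */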
28965 /* Make the resolver function decl to dispatch the versions of
28966 a multi-versioned function, DEFAULT_DECL. Create an
28967 empty basic block in the resolver and store the pointer in
28968 EMPTY_BB. Return the decl of the resolver function. */
28970 static tree
28971 make_resolver_func (const tree default_decl,
28972 const tree dispatch_decl,
28973 basic_block *empty_bb)
28975 char *resolver_name;
28976 tree decl, type, decl_name, t;
28977 bool is_uniq = false;
28979   /* IFUNCs have to be globally visible.  So, if the default_decl is
28980      not, then the name of the IFUNC should be made unique.  */
28981 if (TREE_PUBLIC (default_decl) == 0)
28982 is_uniq = true;
28984 /* Append the filename to the resolver function if the versions are
28985 not externally visible. This is because the resolver function has
28986 to be externally visible for the loader to find it. So, appending
28987 the filename will prevent conflicts with a resolver function from
28988 another module which is based on the same version name. */
28989 resolver_name = make_name (default_decl, "resolver", is_uniq);
28991 /* The resolver function should return a (void *). */
28992 type = build_function_type_list (ptr_type_node, NULL_TREE);
28994 decl = build_fn_decl (resolver_name, type);
28995 decl_name = get_identifier (resolver_name);
28996 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
28998 DECL_NAME (decl) = decl_name;
28999 TREE_USED (decl) = 1;
29000 DECL_ARTIFICIAL (decl) = 1;
29001 DECL_IGNORED_P (decl) = 0;
29002 /* IFUNC resolvers have to be externally visible. */
29003 TREE_PUBLIC (decl) = 1;
29004 DECL_UNINLINABLE (decl) = 0;
29006 /* Resolver is not external, body is generated. */
29007 DECL_EXTERNAL (decl) = 0;
29008 DECL_EXTERNAL (dispatch_decl) = 0;
29010 DECL_CONTEXT (decl) = NULL_TREE;
29011 DECL_INITIAL (decl) = make_node (BLOCK);
29012 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29014 if (DECL_COMDAT_GROUP (default_decl)
29015 || TREE_PUBLIC (default_decl))
29017 /* In this case, each translation unit with a call to this
29018 versioned function will put out a resolver. Ensure it
29019 is comdat to keep just one copy. */
29020 DECL_COMDAT (decl) = 1;
29021 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29023 /* Build result decl and add to function_decl. */
29024 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29025 DECL_ARTIFICIAL (t) = 1;
29026 DECL_IGNORED_P (t) = 1;
29027 DECL_RESULT (decl) = t;
29029 gimplify_function_tree (decl);
29030 push_cfun (DECL_STRUCT_FUNCTION (decl));
29031 *empty_bb = init_lowered_empty_function (decl, false);
29033 cgraph_add_new_function (decl, true);
29034 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29036 pop_cfun ();
29038 gcc_assert (dispatch_decl != NULL);
29039 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29040 DECL_ATTRIBUTES (dispatch_decl)
29041 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29043 /* Create the alias for dispatch to resolver here. */
29044 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29045 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29046 return decl;
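/* Roughly, the resolver created here ends up behaving like the following
   C sketch once dispatch_function_versions fills in its body (names are
   hypothetical and the body shown is only an approximation):

       void *foo.resolver (void)
       {
         __builtin_cpu_init ();
         if (<cpu matches the most specific version>)
           return <that version>;
         ...
         return <default version>;
       }
*/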
29049 /* Generate the dispatching code body to dispatch multi-versioned function
29050 DECL. The target hook is called to process the "target" attributes and
29051 provide the code to dispatch the right function at run-time. NODE points
29052 to the dispatcher decl whose body will be created. */
29054 static tree
29055 ix86_generate_version_dispatcher_body (void *node_p)
29057 tree resolver_decl;
29058 basic_block empty_bb;
29059 VEC (tree, heap) *fn_ver_vec = NULL;
29060 tree default_ver_decl;
29061 struct cgraph_node *versn;
29062 struct cgraph_node *node;
29064 struct cgraph_function_version_info *node_version_info = NULL;
29065 struct cgraph_function_version_info *versn_info = NULL;
29067 node = (cgraph_node *)node_p;
29069 node_version_info = get_cgraph_node_version (node);
29070 gcc_assert (node->dispatcher_function
29071 && node_version_info != NULL);
29073 if (node_version_info->dispatcher_resolver)
29074 return node_version_info->dispatcher_resolver;
29076 /* The first version in the chain corresponds to the default version. */
29077 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29079 /* node is going to be an alias, so remove the finalized bit. */
29080 node->local.finalized = false;
29082 resolver_decl = make_resolver_func (default_ver_decl,
29083 node->symbol.decl, &empty_bb);
29085 node_version_info->dispatcher_resolver = resolver_decl;
29087 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29089 fn_ver_vec = VEC_alloc (tree, heap, 2);
29091 for (versn_info = node_version_info->next; versn_info;
29092 versn_info = versn_info->next)
29094 versn = versn_info->this_node;
29095 /* Check for virtual functions here again, as by this time it should
29096 have been determined if this function needs a vtable index or
29097 not. This happens for methods in derived classes that override
29098 virtual methods in base classes but are not explicitly marked as
29099 virtual. */
29100 if (DECL_VINDEX (versn->symbol.decl))
29101 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29102 "Virtual function multiversioning not supported");
29103 VEC_safe_push (tree, heap, fn_ver_vec, versn->symbol.decl);
29106 dispatch_function_versions (resolver_decl, fn_ver_vec, &empty_bb);
29108 rebuild_cgraph_edges ();
29109 pop_cfun ();
29110 return resolver_decl;
29112 /* This builds the processor_model struct type defined in
29113 libgcc/config/i386/cpuinfo.c */
29115 static tree
29116 build_processor_model_struct (void)
29118 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29119 "__cpu_features"};
29120 tree field = NULL_TREE, field_chain = NULL_TREE;
29121 int i;
29122 tree type = make_node (RECORD_TYPE);
29124 /* The first 3 fields are unsigned int. */
29125 for (i = 0; i < 3; ++i)
29127 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29128 get_identifier (field_name[i]), unsigned_type_node);
29129 if (field_chain != NULL_TREE)
29130 DECL_CHAIN (field) = field_chain;
29131 field_chain = field;
29134 /* The last field is an array of unsigned integers of size one. */
29135 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29136 get_identifier (field_name[3]),
29137 build_array_type (unsigned_type_node,
29138 build_index_type (size_one_node)));
29139 if (field_chain != NULL_TREE)
29140 DECL_CHAIN (field) = field_chain;
29141 field_chain = field;
29143 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29144 return type;
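/* The record built above is intended to mirror the definition in
   libgcc/config/i386/cpuinfo.c, roughly:

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };
*/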
29147 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29149 static tree
29150 make_var_decl (tree type, const char *name)
29152 tree new_decl;
29154 new_decl = build_decl (UNKNOWN_LOCATION,
29155 VAR_DECL,
29156 get_identifier(name),
29157 type);
29159 DECL_EXTERNAL (new_decl) = 1;
29160 TREE_STATIC (new_decl) = 1;
29161 TREE_PUBLIC (new_decl) = 1;
29162 DECL_INITIAL (new_decl) = 0;
29163 DECL_ARTIFICIAL (new_decl) = 0;
29164 DECL_PRESERVE_P (new_decl) = 1;
29166 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29167 assemble_variable (new_decl, 0, 0, 0);
29169 return new_decl;
29172 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
29173    folded into a check against data defined in libgcc/config/i386/cpuinfo.c.  */
29175 static tree
29176 fold_builtin_cpu (tree fndecl, tree *args)
29178 unsigned int i;
29179 enum ix86_builtins fn_code = (enum ix86_builtins)
29180 DECL_FUNCTION_CODE (fndecl);
29181 tree param_string_cst = NULL;
29183 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29184 enum processor_features
29186 F_CMOV = 0,
29187 F_MMX,
29188 F_POPCNT,
29189 F_SSE,
29190 F_SSE2,
29191 F_SSE3,
29192 F_SSSE3,
29193 F_SSE4_1,
29194 F_SSE4_2,
29195 F_AVX,
29196 F_AVX2,
29197 F_MAX
29200   /* These are the values for vendor types and CPU types and subtypes
29201      in cpuinfo.c.  The corresponding start value should be subtracted
29202      from CPU types and subtypes.  */
29203 enum processor_model
29205 M_INTEL = 1,
29206 M_AMD,
29207 M_CPU_TYPE_START,
29208 M_INTEL_ATOM,
29209 M_INTEL_CORE2,
29210 M_INTEL_COREI7,
29211 M_AMDFAM10H,
29212 M_AMDFAM15H,
29213 M_CPU_SUBTYPE_START,
29214 M_INTEL_COREI7_NEHALEM,
29215 M_INTEL_COREI7_WESTMERE,
29216 M_INTEL_COREI7_SANDYBRIDGE,
29217 M_AMDFAM10H_BARCELONA,
29218 M_AMDFAM10H_SHANGHAI,
29219 M_AMDFAM10H_ISTANBUL,
29220 M_AMDFAM15H_BDVER1,
29221 M_AMDFAM15H_BDVER2
29224 static struct _arch_names_table
29226 const char *const name;
29227 const enum processor_model model;
29229 const arch_names_table[] =
29231 {"amd", M_AMD},
29232 {"intel", M_INTEL},
29233 {"atom", M_INTEL_ATOM},
29234 {"core2", M_INTEL_CORE2},
29235 {"corei7", M_INTEL_COREI7},
29236 {"nehalem", M_INTEL_COREI7_NEHALEM},
29237 {"westmere", M_INTEL_COREI7_WESTMERE},
29238 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29239 {"amdfam10h", M_AMDFAM10H},
29240 {"barcelona", M_AMDFAM10H_BARCELONA},
29241 {"shanghai", M_AMDFAM10H_SHANGHAI},
29242 {"istanbul", M_AMDFAM10H_ISTANBUL},
29243 {"amdfam15h", M_AMDFAM15H},
29244 {"bdver1", M_AMDFAM15H_BDVER1},
29245 {"bdver2", M_AMDFAM15H_BDVER2},
29248 static struct _isa_names_table
29250 const char *const name;
29251 const enum processor_features feature;
29253 const isa_names_table[] =
29255 {"cmov", F_CMOV},
29256 {"mmx", F_MMX},
29257 {"popcnt", F_POPCNT},
29258 {"sse", F_SSE},
29259 {"sse2", F_SSE2},
29260 {"sse3", F_SSE3},
29261 {"ssse3", F_SSSE3},
29262 {"sse4.1", F_SSE4_1},
29263 {"sse4.2", F_SSE4_2},
29264 {"avx", F_AVX},
29265 {"avx2", F_AVX2}
29268 static tree __processor_model_type = NULL_TREE;
29269 static tree __cpu_model_var = NULL_TREE;
29271 if (__processor_model_type == NULL_TREE)
29272 __processor_model_type = build_processor_model_struct ();
29274 if (__cpu_model_var == NULL_TREE)
29275 __cpu_model_var = make_var_decl (__processor_model_type,
29276 "__cpu_model");
29278 gcc_assert ((args != NULL) && (*args != NULL));
29280 param_string_cst = *args;
29281 while (param_string_cst
29282 && TREE_CODE (param_string_cst) != STRING_CST)
29284       /* *args must be an expr that can contain other EXPRs leading to a
29285 STRING_CST. */
29286 if (!EXPR_P (param_string_cst))
29288 error ("Parameter to builtin must be a string constant or literal");
29289 return integer_zero_node;
29291 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29294 gcc_assert (param_string_cst);
29296 if (fn_code == IX86_BUILTIN_CPU_IS)
29298 tree ref;
29299 tree field;
29300 tree final;
29302 unsigned int field_val = 0;
29303 unsigned int NUM_ARCH_NAMES
29304 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29306 for (i = 0; i < NUM_ARCH_NAMES; i++)
29307 if (strcmp (arch_names_table[i].name,
29308 TREE_STRING_POINTER (param_string_cst)) == 0)
29309 break;
29311 if (i == NUM_ARCH_NAMES)
29313 error ("Parameter to builtin not valid: %s",
29314 TREE_STRING_POINTER (param_string_cst));
29315 return integer_zero_node;
29318 field = TYPE_FIELDS (__processor_model_type);
29319 field_val = arch_names_table[i].model;
29321 /* CPU types are stored in the next field. */
29322 if (field_val > M_CPU_TYPE_START
29323 && field_val < M_CPU_SUBTYPE_START)
29325 field = DECL_CHAIN (field);
29326 field_val -= M_CPU_TYPE_START;
29329 /* CPU subtypes are stored in the next field. */
29330 if (field_val > M_CPU_SUBTYPE_START)
29332 field = DECL_CHAIN ( DECL_CHAIN (field));
29333 field_val -= M_CPU_SUBTYPE_START;
29336 /* Get the appropriate field in __cpu_model. */
29337 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29338 field, NULL_TREE);
29340 /* Check the value. */
29341 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29342 build_int_cstu (unsigned_type_node, field_val));
29343 return build1 (CONVERT_EXPR, integer_type_node, final);
29345 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29347 tree ref;
29348 tree array_elt;
29349 tree field;
29350 tree final;
29352 unsigned int field_val = 0;
29353 unsigned int NUM_ISA_NAMES
29354 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29356 for (i = 0; i < NUM_ISA_NAMES; i++)
29357 if (strcmp (isa_names_table[i].name,
29358 TREE_STRING_POINTER (param_string_cst)) == 0)
29359 break;
29361 if (i == NUM_ISA_NAMES)
29363 error ("Parameter to builtin not valid: %s",
29364 TREE_STRING_POINTER (param_string_cst));
29365 return integer_zero_node;
29368 field = TYPE_FIELDS (__processor_model_type);
29369 /* Get the last field, which is __cpu_features. */
29370 while (DECL_CHAIN (field))
29371 field = DECL_CHAIN (field);
29373 /* Get the appropriate field: __cpu_model.__cpu_features */
29374 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29375 field, NULL_TREE);
29377 /* Access the 0th element of __cpu_features array. */
29378 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29379 integer_zero_node, NULL_TREE, NULL_TREE);
29381 field_val = (1 << isa_names_table[i].feature);
29382 /* Return __cpu_model.__cpu_features[0] & field_val */
29383 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29384 build_int_cstu (unsigned_type_node, field_val));
29385 return build1 (CONVERT_EXPR, integer_type_node, final);
29387 gcc_unreachable ();
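/* Illustrative results of the folding above:

       __builtin_cpu_is ("amd")
         => (int) (__cpu_model.__cpu_vendor == M_AMD)

       __builtin_cpu_supports ("sse4.2")
         => (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))

   using the enum values and struct layout defined earlier in this file
   and in libgcc/config/i386/cpuinfo.c.  */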
29390 static tree
29391 ix86_fold_builtin (tree fndecl, int n_args,
29392 tree *args, bool ignore ATTRIBUTE_UNUSED)
29394 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29396 enum ix86_builtins fn_code = (enum ix86_builtins)
29397 DECL_FUNCTION_CODE (fndecl);
29398 if (fn_code == IX86_BUILTIN_CPU_IS
29399 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29401 gcc_assert (n_args == 1);
29402 return fold_builtin_cpu (fndecl, args);
29406 #ifdef SUBTARGET_FOLD_BUILTIN
29407 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29408 #endif
29410 return NULL_TREE;
29413 /* Make builtins to detect cpu type and features supported. NAME is
29414 the builtin name, CODE is the builtin code, and FTYPE is the function
29415 type of the builtin. */
29417 static void
29418 make_cpu_type_builtin (const char* name, int code,
29419 enum ix86_builtin_func_type ftype, bool is_const)
29421 tree decl;
29422 tree type;
29424 type = ix86_get_builtin_func_type (ftype);
29425 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29426 NULL, NULL_TREE);
29427 gcc_assert (decl != NULL_TREE);
29428 ix86_builtins[(int) code] = decl;
29429 TREE_READONLY (decl) = is_const;
29432 /* Make builtins to get CPU type and features supported. The created
29433 builtins are :
29435 __builtin_cpu_init (), to detect cpu type and features,
29436 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29437 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29440 static void
29441 ix86_init_platform_type_builtins (void)
29443 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29444 INT_FTYPE_VOID, false);
29445 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29446 INT_FTYPE_PCCHAR, true);
29447 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29448 INT_FTYPE_PCCHAR, true);
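/* Typical user-level use of the builtins registered above (illustrative;
   use_avx2_path, use_corei7_path and use_generic_path are hypothetical
   user functions):

       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         use_avx2_path ();
       else if (__builtin_cpu_is ("corei7"))
         use_corei7_path ();
       else
         use_generic_path ();
*/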
29451 /* Internal method for ix86_init_builtins. */
29453 static void
29454 ix86_init_builtins_va_builtins_abi (void)
29456 tree ms_va_ref, sysv_va_ref;
29457 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29458 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29459 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29460 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29462 if (!TARGET_64BIT)
29463 return;
29464 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29465 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29466 ms_va_ref = build_reference_type (ms_va_list_type_node);
29467 sysv_va_ref =
29468 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29470 fnvoid_va_end_ms =
29471 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29472 fnvoid_va_start_ms =
29473 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29474 fnvoid_va_end_sysv =
29475 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29476 fnvoid_va_start_sysv =
29477 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29478 NULL_TREE);
29479 fnvoid_va_copy_ms =
29480 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29481 NULL_TREE);
29482 fnvoid_va_copy_sysv =
29483 build_function_type_list (void_type_node, sysv_va_ref,
29484 sysv_va_ref, NULL_TREE);
29486 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29487 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29488 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29489 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29490 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29491 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29492 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29493 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29494 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29495 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29496 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29497 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
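/* Illustrative use of the ABI-specific varargs builtins registered above,
   only available for 64-bit targets (the function f is hypothetical):

       void __attribute__ ((ms_abi)) f (int n, ...)
       {
         __builtin_ms_va_list ap;
         __builtin_ms_va_start (ap, n);
         ...
         __builtin_ms_va_end (ap);
       }
*/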
29500 static void
29501 ix86_init_builtin_types (void)
29503 tree float128_type_node, float80_type_node;
29505 /* The __float80 type. */
29506 float80_type_node = long_double_type_node;
29507 if (TYPE_MODE (float80_type_node) != XFmode)
29509 /* The __float80 type. */
29510 float80_type_node = make_node (REAL_TYPE);
29512 TYPE_PRECISION (float80_type_node) = 80;
29513 layout_type (float80_type_node);
29515 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29517 /* The __float128 type. */
29518 float128_type_node = make_node (REAL_TYPE);
29519 TYPE_PRECISION (float128_type_node) = 128;
29520 layout_type (float128_type_node);
29521 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29523 /* This macro is built by i386-builtin-types.awk. */
29524 DEFINE_BUILTIN_PRIMITIVE_TYPES;
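/* After this, user code can use the registered types directly, e.g.
   (illustrative):

       __float128 q = 1.0Q;
       __float80  x = 2.0W;

   where the Q/W literal suffixes are the usual i386 extended-float
   suffixes.  */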
29527 static void
29528 ix86_init_builtins (void)
29530 tree t;
29532 ix86_init_builtin_types ();
29534 /* Builtins to get CPU type and features. */
29535 ix86_init_platform_type_builtins ();
29537 /* TFmode support builtins. */
29538 def_builtin_const (0, "__builtin_infq",
29539 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29540 def_builtin_const (0, "__builtin_huge_valq",
29541 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29543   /* We will expand them to a normal call if SSE isn't available, since
29544      they are used by libgcc. */
29545 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29546 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29547 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29548 TREE_READONLY (t) = 1;
29549 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29551 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29552 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29553 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29554 TREE_READONLY (t) = 1;
29555 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29557 ix86_init_tm_builtins ();
29558 ix86_init_mmx_sse_builtins ();
29560 if (TARGET_LP64)
29561 ix86_init_builtins_va_builtins_abi ();
29563 #ifdef SUBTARGET_INIT_BUILTINS
29564 SUBTARGET_INIT_BUILTINS;
29565 #endif
29568 /* Return the ix86 builtin for CODE. */
29570 static tree
29571 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29573 if (code >= IX86_BUILTIN_MAX)
29574 return error_mark_node;
29576 return ix86_builtins[code];
29579 /* Errors in the source file can cause expand_expr to return const0_rtx
29580 where we expect a vector. To avoid crashing, use one of the vector
29581 clear instructions. */
29582 static rtx
29583 safe_vector_operand (rtx x, enum machine_mode mode)
29585 if (x == const0_rtx)
29586 x = CONST0_RTX (mode);
29587 return x;
29590 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29592 static rtx
29593 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
29595 rtx pat;
29596 tree arg0 = CALL_EXPR_ARG (exp, 0);
29597 tree arg1 = CALL_EXPR_ARG (exp, 1);
29598 rtx op0 = expand_normal (arg0);
29599 rtx op1 = expand_normal (arg1);
29600 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29601 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29602 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
29604 if (VECTOR_MODE_P (mode0))
29605 op0 = safe_vector_operand (op0, mode0);
29606 if (VECTOR_MODE_P (mode1))
29607 op1 = safe_vector_operand (op1, mode1);
29609 if (optimize || !target
29610 || GET_MODE (target) != tmode
29611 || !insn_data[icode].operand[0].predicate (target, tmode))
29612 target = gen_reg_rtx (tmode);
29614 if (GET_MODE (op1) == SImode && mode1 == TImode)
29616 rtx x = gen_reg_rtx (V4SImode);
29617 emit_insn (gen_sse2_loadd (x, op1));
29618 op1 = gen_lowpart (TImode, x);
29621 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29622 op0 = copy_to_mode_reg (mode0, op0);
29623 if (!insn_data[icode].operand[2].predicate (op1, mode1))
29624 op1 = copy_to_mode_reg (mode1, op1);
29626 pat = GEN_FCN (icode) (target, op0, op1);
29627 if (! pat)
29628 return 0;
29630 emit_insn (pat);
29632 return target;
29635 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
29637 static rtx
29638 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
29639 enum ix86_builtin_func_type m_type,
29640 enum rtx_code sub_code)
29642 rtx pat;
29643 int i;
29644 int nargs;
29645 bool comparison_p = false;
29646 bool tf_p = false;
29647 bool last_arg_constant = false;
29648 int num_memory = 0;
29649 struct {
29650 rtx op;
29651 enum machine_mode mode;
29652 } args[4];
29654 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29656 switch (m_type)
29658 case MULTI_ARG_4_DF2_DI_I:
29659 case MULTI_ARG_4_DF2_DI_I1:
29660 case MULTI_ARG_4_SF2_SI_I:
29661 case MULTI_ARG_4_SF2_SI_I1:
29662 nargs = 4;
29663 last_arg_constant = true;
29664 break;
29666 case MULTI_ARG_3_SF:
29667 case MULTI_ARG_3_DF:
29668 case MULTI_ARG_3_SF2:
29669 case MULTI_ARG_3_DF2:
29670 case MULTI_ARG_3_DI:
29671 case MULTI_ARG_3_SI:
29672 case MULTI_ARG_3_SI_DI:
29673 case MULTI_ARG_3_HI:
29674 case MULTI_ARG_3_HI_SI:
29675 case MULTI_ARG_3_QI:
29676 case MULTI_ARG_3_DI2:
29677 case MULTI_ARG_3_SI2:
29678 case MULTI_ARG_3_HI2:
29679 case MULTI_ARG_3_QI2:
29680 nargs = 3;
29681 break;
29683 case MULTI_ARG_2_SF:
29684 case MULTI_ARG_2_DF:
29685 case MULTI_ARG_2_DI:
29686 case MULTI_ARG_2_SI:
29687 case MULTI_ARG_2_HI:
29688 case MULTI_ARG_2_QI:
29689 nargs = 2;
29690 break;
29692 case MULTI_ARG_2_DI_IMM:
29693 case MULTI_ARG_2_SI_IMM:
29694 case MULTI_ARG_2_HI_IMM:
29695 case MULTI_ARG_2_QI_IMM:
29696 nargs = 2;
29697 last_arg_constant = true;
29698 break;
29700 case MULTI_ARG_1_SF:
29701 case MULTI_ARG_1_DF:
29702 case MULTI_ARG_1_SF2:
29703 case MULTI_ARG_1_DF2:
29704 case MULTI_ARG_1_DI:
29705 case MULTI_ARG_1_SI:
29706 case MULTI_ARG_1_HI:
29707 case MULTI_ARG_1_QI:
29708 case MULTI_ARG_1_SI_DI:
29709 case MULTI_ARG_1_HI_DI:
29710 case MULTI_ARG_1_HI_SI:
29711 case MULTI_ARG_1_QI_DI:
29712 case MULTI_ARG_1_QI_SI:
29713 case MULTI_ARG_1_QI_HI:
29714 nargs = 1;
29715 break;
29717 case MULTI_ARG_2_DI_CMP:
29718 case MULTI_ARG_2_SI_CMP:
29719 case MULTI_ARG_2_HI_CMP:
29720 case MULTI_ARG_2_QI_CMP:
29721 nargs = 2;
29722 comparison_p = true;
29723 break;
29725 case MULTI_ARG_2_SF_TF:
29726 case MULTI_ARG_2_DF_TF:
29727 case MULTI_ARG_2_DI_TF:
29728 case MULTI_ARG_2_SI_TF:
29729 case MULTI_ARG_2_HI_TF:
29730 case MULTI_ARG_2_QI_TF:
29731 nargs = 2;
29732 tf_p = true;
29733 break;
29735 default:
29736 gcc_unreachable ();
29739 if (optimize || !target
29740 || GET_MODE (target) != tmode
29741 || !insn_data[icode].operand[0].predicate (target, tmode))
29742 target = gen_reg_rtx (tmode);
29744 gcc_assert (nargs <= 4);
29746 for (i = 0; i < nargs; i++)
29748 tree arg = CALL_EXPR_ARG (exp, i);
29749 rtx op = expand_normal (arg);
29750 int adjust = (comparison_p) ? 1 : 0;
29751 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
29753 if (last_arg_constant && i == nargs - 1)
29755 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
29757 enum insn_code new_icode = icode;
29758 switch (icode)
29760 case CODE_FOR_xop_vpermil2v2df3:
29761 case CODE_FOR_xop_vpermil2v4sf3:
29762 case CODE_FOR_xop_vpermil2v4df3:
29763 case CODE_FOR_xop_vpermil2v8sf3:
29764 error ("the last argument must be a 2-bit immediate");
29765 return gen_reg_rtx (tmode);
29766 case CODE_FOR_xop_rotlv2di3:
29767 new_icode = CODE_FOR_rotlv2di3;
29768 goto xop_rotl;
29769 case CODE_FOR_xop_rotlv4si3:
29770 new_icode = CODE_FOR_rotlv4si3;
29771 goto xop_rotl;
29772 case CODE_FOR_xop_rotlv8hi3:
29773 new_icode = CODE_FOR_rotlv8hi3;
29774 goto xop_rotl;
29775 case CODE_FOR_xop_rotlv16qi3:
29776 new_icode = CODE_FOR_rotlv16qi3;
29777 xop_rotl:
29778 if (CONST_INT_P (op))
29780 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
29781 op = GEN_INT (INTVAL (op) & mask);
29782 gcc_checking_assert
29783 (insn_data[icode].operand[i + 1].predicate (op, mode));
29785 else
29787 gcc_checking_assert
29788 (nargs == 2
29789 && insn_data[new_icode].operand[0].mode == tmode
29790 && insn_data[new_icode].operand[1].mode == tmode
29791 && insn_data[new_icode].operand[2].mode == mode
29792 && insn_data[new_icode].operand[0].predicate
29793 == insn_data[icode].operand[0].predicate
29794 && insn_data[new_icode].operand[1].predicate
29795 == insn_data[icode].operand[1].predicate);
29796 icode = new_icode;
29797 goto non_constant;
29799 break;
29800 default:
29801 gcc_unreachable ();
29805 else
29807 non_constant:
29808 if (VECTOR_MODE_P (mode))
29809 op = safe_vector_operand (op, mode);
29811 /* If we aren't optimizing, only allow one memory operand to be
29812 generated. */
29813 if (memory_operand (op, mode))
29814 num_memory++;
29816 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
29818 if (optimize
29819 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
29820 || num_memory > 1)
29821 op = force_reg (mode, op);
29824 args[i].op = op;
29825 args[i].mode = mode;
29828 switch (nargs)
29830 case 1:
29831 pat = GEN_FCN (icode) (target, args[0].op);
29832 break;
29834 case 2:
29835 if (tf_p)
29836 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
29837 GEN_INT ((int)sub_code));
29838 else if (! comparison_p)
29839 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29840 else
29842 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
29843 args[0].op,
29844 args[1].op);
29846 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
29848 break;
29850 case 3:
29851 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29852 break;
29854 case 4:
29855 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
29856 break;
29858 default:
29859 gcc_unreachable ();
29862 if (! pat)
29863 return 0;
29865 emit_insn (pat);
29866 return target;
29869 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
29870 insns with vec_merge. */
29872 static rtx
29873 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
29874 rtx target)
29876 rtx pat;
29877 tree arg0 = CALL_EXPR_ARG (exp, 0);
29878 rtx op1, op0 = expand_normal (arg0);
29879 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29880 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29882 if (optimize || !target
29883 || GET_MODE (target) != tmode
29884 || !insn_data[icode].operand[0].predicate (target, tmode))
29885 target = gen_reg_rtx (tmode);
29887 if (VECTOR_MODE_P (mode0))
29888 op0 = safe_vector_operand (op0, mode0);
29890 if ((optimize && !register_operand (op0, mode0))
29891 || !insn_data[icode].operand[1].predicate (op0, mode0))
29892 op0 = copy_to_mode_reg (mode0, op0);
29894 op1 = op0;
29895 if (!insn_data[icode].operand[2].predicate (op1, mode0))
29896 op1 = copy_to_mode_reg (mode0, op1);
29898 pat = GEN_FCN (icode) (target, op0, op1);
29899 if (! pat)
29900 return 0;
29901 emit_insn (pat);
29902 return target;
29905 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
29907 static rtx
29908 ix86_expand_sse_compare (const struct builtin_description *d,
29909 tree exp, rtx target, bool swap)
29911 rtx pat;
29912 tree arg0 = CALL_EXPR_ARG (exp, 0);
29913 tree arg1 = CALL_EXPR_ARG (exp, 1);
29914 rtx op0 = expand_normal (arg0);
29915 rtx op1 = expand_normal (arg1);
29916 rtx op2;
29917 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29918 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29919 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29920 enum rtx_code comparison = d->comparison;
29922 if (VECTOR_MODE_P (mode0))
29923 op0 = safe_vector_operand (op0, mode0);
29924 if (VECTOR_MODE_P (mode1))
29925 op1 = safe_vector_operand (op1, mode1);
29927 /* Swap operands if we have a comparison that isn't available in
29928 hardware. */
29929 if (swap)
29931 rtx tmp = gen_reg_rtx (mode1);
29932 emit_move_insn (tmp, op1);
29933 op1 = op0;
29934 op0 = tmp;
29937 if (optimize || !target
29938 || GET_MODE (target) != tmode
29939 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29940 target = gen_reg_rtx (tmode);
29942 if ((optimize && !register_operand (op0, mode0))
29943 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
29944 op0 = copy_to_mode_reg (mode0, op0);
29945 if ((optimize && !register_operand (op1, mode1))
29946 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
29947 op1 = copy_to_mode_reg (mode1, op1);
29949 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
29950 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29951 if (! pat)
29952 return 0;
29953 emit_insn (pat);
29954 return target;
29957 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
29959 static rtx
29960 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
29961 rtx target)
29963 rtx pat;
29964 tree arg0 = CALL_EXPR_ARG (exp, 0);
29965 tree arg1 = CALL_EXPR_ARG (exp, 1);
29966 rtx op0 = expand_normal (arg0);
29967 rtx op1 = expand_normal (arg1);
29968 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29969 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29970 enum rtx_code comparison = d->comparison;
29972 if (VECTOR_MODE_P (mode0))
29973 op0 = safe_vector_operand (op0, mode0);
29974 if (VECTOR_MODE_P (mode1))
29975 op1 = safe_vector_operand (op1, mode1);
29977 /* Swap operands if we have a comparison that isn't available in
29978 hardware. */
29979 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
29981 rtx tmp = op1;
29982 op1 = op0;
29983 op0 = tmp;
29986 target = gen_reg_rtx (SImode);
29987 emit_move_insn (target, const0_rtx);
29988 target = gen_rtx_SUBREG (QImode, target, 0);
29990 if ((optimize && !register_operand (op0, mode0))
29991 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29992 op0 = copy_to_mode_reg (mode0, op0);
29993 if ((optimize && !register_operand (op1, mode1))
29994 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29995 op1 = copy_to_mode_reg (mode1, op1);
29997 pat = GEN_FCN (d->icode) (op0, op1);
29998 if (! pat)
29999 return 0;
30000 emit_insn (pat);
30001 emit_insn (gen_rtx_SET (VOIDmode,
30002 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30003 gen_rtx_fmt_ee (comparison, QImode,
30004 SET_DEST (pat),
30005 const0_rtx)));
30007 return SUBREG_REG (target);
30010 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30012 static rtx
30013 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30014 rtx target)
30016 rtx pat;
30017 tree arg0 = CALL_EXPR_ARG (exp, 0);
30018 rtx op1, op0 = expand_normal (arg0);
30019 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30020 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30022 if (optimize || target == 0
30023 || GET_MODE (target) != tmode
30024 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30025 target = gen_reg_rtx (tmode);
30027 if (VECTOR_MODE_P (mode0))
30028 op0 = safe_vector_operand (op0, mode0);
30030 if ((optimize && !register_operand (op0, mode0))
30031 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30032 op0 = copy_to_mode_reg (mode0, op0);
30034 op1 = GEN_INT (d->comparison);
30036 pat = GEN_FCN (d->icode) (target, op0, op1);
30037 if (! pat)
30038 return 0;
30039 emit_insn (pat);
30040 return target;
30043 static rtx
30044 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30045 tree exp, rtx target)
30047 rtx pat;
30048 tree arg0 = CALL_EXPR_ARG (exp, 0);
30049 tree arg1 = CALL_EXPR_ARG (exp, 1);
30050 rtx op0 = expand_normal (arg0);
30051 rtx op1 = expand_normal (arg1);
30052 rtx op2;
30053 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30054 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30055 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30057 if (optimize || target == 0
30058 || GET_MODE (target) != tmode
30059 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30060 target = gen_reg_rtx (tmode);
30062 op0 = safe_vector_operand (op0, mode0);
30063 op1 = safe_vector_operand (op1, mode1);
30065 if ((optimize && !register_operand (op0, mode0))
30066 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30067 op0 = copy_to_mode_reg (mode0, op0);
30068 if ((optimize && !register_operand (op1, mode1))
30069 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30070 op1 = copy_to_mode_reg (mode1, op1);
30072 op2 = GEN_INT (d->comparison);
30074 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30075 if (! pat)
30076 return 0;
30077 emit_insn (pat);
30078 return target;
30081 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30083 static rtx
30084 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30085 rtx target)
30087 rtx pat;
30088 tree arg0 = CALL_EXPR_ARG (exp, 0);
30089 tree arg1 = CALL_EXPR_ARG (exp, 1);
30090 rtx op0 = expand_normal (arg0);
30091 rtx op1 = expand_normal (arg1);
30092 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30093 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30094 enum rtx_code comparison = d->comparison;
30096 if (VECTOR_MODE_P (mode0))
30097 op0 = safe_vector_operand (op0, mode0);
30098 if (VECTOR_MODE_P (mode1))
30099 op1 = safe_vector_operand (op1, mode1);
30101 target = gen_reg_rtx (SImode);
30102 emit_move_insn (target, const0_rtx);
30103 target = gen_rtx_SUBREG (QImode, target, 0);
30105 if ((optimize && !register_operand (op0, mode0))
30106 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30107 op0 = copy_to_mode_reg (mode0, op0);
30108 if ((optimize && !register_operand (op1, mode1))
30109 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30110 op1 = copy_to_mode_reg (mode1, op1);
30112 pat = GEN_FCN (d->icode) (op0, op1);
30113 if (! pat)
30114 return 0;
30115 emit_insn (pat);
30116 emit_insn (gen_rtx_SET (VOIDmode,
30117 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30118 gen_rtx_fmt_ee (comparison, QImode,
30119 SET_DEST (pat),
30120 const0_rtx)));
30122 return SUBREG_REG (target);
30125 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30127 static rtx
30128 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30129 tree exp, rtx target)
30131 rtx pat;
30132 tree arg0 = CALL_EXPR_ARG (exp, 0);
30133 tree arg1 = CALL_EXPR_ARG (exp, 1);
30134 tree arg2 = CALL_EXPR_ARG (exp, 2);
30135 tree arg3 = CALL_EXPR_ARG (exp, 3);
30136 tree arg4 = CALL_EXPR_ARG (exp, 4);
30137 rtx scratch0, scratch1;
30138 rtx op0 = expand_normal (arg0);
30139 rtx op1 = expand_normal (arg1);
30140 rtx op2 = expand_normal (arg2);
30141 rtx op3 = expand_normal (arg3);
30142 rtx op4 = expand_normal (arg4);
30143 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30145 tmode0 = insn_data[d->icode].operand[0].mode;
30146 tmode1 = insn_data[d->icode].operand[1].mode;
30147 modev2 = insn_data[d->icode].operand[2].mode;
30148 modei3 = insn_data[d->icode].operand[3].mode;
30149 modev4 = insn_data[d->icode].operand[4].mode;
30150 modei5 = insn_data[d->icode].operand[5].mode;
30151 modeimm = insn_data[d->icode].operand[6].mode;
30153 if (VECTOR_MODE_P (modev2))
30154 op0 = safe_vector_operand (op0, modev2);
30155 if (VECTOR_MODE_P (modev4))
30156 op2 = safe_vector_operand (op2, modev4);
30158 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30159 op0 = copy_to_mode_reg (modev2, op0);
30160 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30161 op1 = copy_to_mode_reg (modei3, op1);
30162 if ((optimize && !register_operand (op2, modev4))
30163 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30164 op2 = copy_to_mode_reg (modev4, op2);
30165 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30166 op3 = copy_to_mode_reg (modei5, op3);
30168 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30170 error ("the fifth argument must be an 8-bit immediate");
30171 return const0_rtx;
30174 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30176 if (optimize || !target
30177 || GET_MODE (target) != tmode0
30178 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30179 target = gen_reg_rtx (tmode0);
30181 scratch1 = gen_reg_rtx (tmode1);
30183 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30185 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30187 if (optimize || !target
30188 || GET_MODE (target) != tmode1
30189 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30190 target = gen_reg_rtx (tmode1);
30192 scratch0 = gen_reg_rtx (tmode0);
30194 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30196 else
30198 gcc_assert (d->flag);
30200 scratch0 = gen_reg_rtx (tmode0);
30201 scratch1 = gen_reg_rtx (tmode1);
30203 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30206 if (! pat)
30207 return 0;
30209 emit_insn (pat);
30211 if (d->flag)
30213 target = gen_reg_rtx (SImode);
30214 emit_move_insn (target, const0_rtx);
30215 target = gen_rtx_SUBREG (QImode, target, 0);
30217 emit_insn
30218 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30219 gen_rtx_fmt_ee (EQ, QImode,
30220 gen_rtx_REG ((enum machine_mode) d->flag,
30221 FLAGS_REG),
30222 const0_rtx)));
30223 return SUBREG_REG (target);
30225 else
30226 return target;
30230 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30232 static rtx
30233 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30234 tree exp, rtx target)
30236 rtx pat;
30237 tree arg0 = CALL_EXPR_ARG (exp, 0);
30238 tree arg1 = CALL_EXPR_ARG (exp, 1);
30239 tree arg2 = CALL_EXPR_ARG (exp, 2);
30240 rtx scratch0, scratch1;
30241 rtx op0 = expand_normal (arg0);
30242 rtx op1 = expand_normal (arg1);
30243 rtx op2 = expand_normal (arg2);
30244 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30246 tmode0 = insn_data[d->icode].operand[0].mode;
30247 tmode1 = insn_data[d->icode].operand[1].mode;
30248 modev2 = insn_data[d->icode].operand[2].mode;
30249 modev3 = insn_data[d->icode].operand[3].mode;
30250 modeimm = insn_data[d->icode].operand[4].mode;
30252 if (VECTOR_MODE_P (modev2))
30253 op0 = safe_vector_operand (op0, modev2);
30254 if (VECTOR_MODE_P (modev3))
30255 op1 = safe_vector_operand (op1, modev3);
30257 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30258 op0 = copy_to_mode_reg (modev2, op0);
30259 if ((optimize && !register_operand (op1, modev3))
30260 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30261 op1 = copy_to_mode_reg (modev3, op1);
30263 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30265 error ("the third argument must be an 8-bit immediate");
30266 return const0_rtx;
30269 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30271 if (optimize || !target
30272 || GET_MODE (target) != tmode0
30273 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30274 target = gen_reg_rtx (tmode0);
30276 scratch1 = gen_reg_rtx (tmode1);
30278 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30280 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30282 if (optimize || !target
30283 || GET_MODE (target) != tmode1
30284 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30285 target = gen_reg_rtx (tmode1);
30287 scratch0 = gen_reg_rtx (tmode0);
30289 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30291 else
30293 gcc_assert (d->flag);
30295 scratch0 = gen_reg_rtx (tmode0);
30296 scratch1 = gen_reg_rtx (tmode1);
30298 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30301 if (! pat)
30302 return 0;
30304 emit_insn (pat);
30306 if (d->flag)
30308 target = gen_reg_rtx (SImode);
30309 emit_move_insn (target, const0_rtx);
30310 target = gen_rtx_SUBREG (QImode, target, 0);
30312 emit_insn
30313 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30314 gen_rtx_fmt_ee (EQ, QImode,
30315 gen_rtx_REG ((enum machine_mode) d->flag,
30316 FLAGS_REG),
30317 const0_rtx)));
30318 return SUBREG_REG (target);
30320 else
30321 return target;
30324 /* Subroutine of ix86_expand_builtin to take care of insns with
30325 variable number of operands. */
30327 static rtx
30328 ix86_expand_args_builtin (const struct builtin_description *d,
30329 tree exp, rtx target)
30331 rtx pat, real_target;
30332 unsigned int i, nargs;
30333 unsigned int nargs_constant = 0;
30334 int num_memory = 0;
30335 struct
30337 rtx op;
30338 enum machine_mode mode;
30339 } args[4];
30340 bool last_arg_count = false;
30341 enum insn_code icode = d->icode;
30342 const struct insn_data_d *insn_p = &insn_data[icode];
30343 enum machine_mode tmode = insn_p->operand[0].mode;
30344 enum machine_mode rmode = VOIDmode;
30345 bool swap = false;
30346 enum rtx_code comparison = d->comparison;
30348 switch ((enum ix86_builtin_func_type) d->flag)
30350 case V2DF_FTYPE_V2DF_ROUND:
30351 case V4DF_FTYPE_V4DF_ROUND:
30352 case V4SF_FTYPE_V4SF_ROUND:
30353 case V8SF_FTYPE_V8SF_ROUND:
30354 case V4SI_FTYPE_V4SF_ROUND:
30355 case V8SI_FTYPE_V8SF_ROUND:
30356 return ix86_expand_sse_round (d, exp, target);
30357 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30358 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30359 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30360 case INT_FTYPE_V8SF_V8SF_PTEST:
30361 case INT_FTYPE_V4DI_V4DI_PTEST:
30362 case INT_FTYPE_V4DF_V4DF_PTEST:
30363 case INT_FTYPE_V4SF_V4SF_PTEST:
30364 case INT_FTYPE_V2DI_V2DI_PTEST:
30365 case INT_FTYPE_V2DF_V2DF_PTEST:
30366 return ix86_expand_sse_ptest (d, exp, target);
30367 case FLOAT128_FTYPE_FLOAT128:
30368 case FLOAT_FTYPE_FLOAT:
30369 case INT_FTYPE_INT:
30370 case UINT64_FTYPE_INT:
30371 case UINT16_FTYPE_UINT16:
30372 case INT64_FTYPE_INT64:
30373 case INT64_FTYPE_V4SF:
30374 case INT64_FTYPE_V2DF:
30375 case INT_FTYPE_V16QI:
30376 case INT_FTYPE_V8QI:
30377 case INT_FTYPE_V8SF:
30378 case INT_FTYPE_V4DF:
30379 case INT_FTYPE_V4SF:
30380 case INT_FTYPE_V2DF:
30381 case INT_FTYPE_V32QI:
30382 case V16QI_FTYPE_V16QI:
30383 case V8SI_FTYPE_V8SF:
30384 case V8SI_FTYPE_V4SI:
30385 case V8HI_FTYPE_V8HI:
30386 case V8HI_FTYPE_V16QI:
30387 case V8QI_FTYPE_V8QI:
30388 case V8SF_FTYPE_V8SF:
30389 case V8SF_FTYPE_V8SI:
30390 case V8SF_FTYPE_V4SF:
30391 case V8SF_FTYPE_V8HI:
30392 case V4SI_FTYPE_V4SI:
30393 case V4SI_FTYPE_V16QI:
30394 case V4SI_FTYPE_V4SF:
30395 case V4SI_FTYPE_V8SI:
30396 case V4SI_FTYPE_V8HI:
30397 case V4SI_FTYPE_V4DF:
30398 case V4SI_FTYPE_V2DF:
30399 case V4HI_FTYPE_V4HI:
30400 case V4DF_FTYPE_V4DF:
30401 case V4DF_FTYPE_V4SI:
30402 case V4DF_FTYPE_V4SF:
30403 case V4DF_FTYPE_V2DF:
30404 case V4SF_FTYPE_V4SF:
30405 case V4SF_FTYPE_V4SI:
30406 case V4SF_FTYPE_V8SF:
30407 case V4SF_FTYPE_V4DF:
30408 case V4SF_FTYPE_V8HI:
30409 case V4SF_FTYPE_V2DF:
30410 case V2DI_FTYPE_V2DI:
30411 case V2DI_FTYPE_V16QI:
30412 case V2DI_FTYPE_V8HI:
30413 case V2DI_FTYPE_V4SI:
30414 case V2DF_FTYPE_V2DF:
30415 case V2DF_FTYPE_V4SI:
30416 case V2DF_FTYPE_V4DF:
30417 case V2DF_FTYPE_V4SF:
30418 case V2DF_FTYPE_V2SI:
30419 case V2SI_FTYPE_V2SI:
30420 case V2SI_FTYPE_V4SF:
30421 case V2SI_FTYPE_V2SF:
30422 case V2SI_FTYPE_V2DF:
30423 case V2SF_FTYPE_V2SF:
30424 case V2SF_FTYPE_V2SI:
30425 case V32QI_FTYPE_V32QI:
30426 case V32QI_FTYPE_V16QI:
30427 case V16HI_FTYPE_V16HI:
30428 case V16HI_FTYPE_V8HI:
30429 case V8SI_FTYPE_V8SI:
30430 case V16HI_FTYPE_V16QI:
30431 case V8SI_FTYPE_V16QI:
30432 case V4DI_FTYPE_V16QI:
30433 case V8SI_FTYPE_V8HI:
30434 case V4DI_FTYPE_V8HI:
30435 case V4DI_FTYPE_V4SI:
30436 case V4DI_FTYPE_V2DI:
30437 nargs = 1;
30438 break;
30439 case V4SF_FTYPE_V4SF_VEC_MERGE:
30440 case V2DF_FTYPE_V2DF_VEC_MERGE:
30441 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30442 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30443 case V16QI_FTYPE_V16QI_V16QI:
30444 case V16QI_FTYPE_V8HI_V8HI:
30445 case V8QI_FTYPE_V8QI_V8QI:
30446 case V8QI_FTYPE_V4HI_V4HI:
30447 case V8HI_FTYPE_V8HI_V8HI:
30448 case V8HI_FTYPE_V16QI_V16QI:
30449 case V8HI_FTYPE_V4SI_V4SI:
30450 case V8SF_FTYPE_V8SF_V8SF:
30451 case V8SF_FTYPE_V8SF_V8SI:
30452 case V4SI_FTYPE_V4SI_V4SI:
30453 case V4SI_FTYPE_V8HI_V8HI:
30454 case V4SI_FTYPE_V4SF_V4SF:
30455 case V4SI_FTYPE_V2DF_V2DF:
30456 case V4HI_FTYPE_V4HI_V4HI:
30457 case V4HI_FTYPE_V8QI_V8QI:
30458 case V4HI_FTYPE_V2SI_V2SI:
30459 case V4DF_FTYPE_V4DF_V4DF:
30460 case V4DF_FTYPE_V4DF_V4DI:
30461 case V4SF_FTYPE_V4SF_V4SF:
30462 case V4SF_FTYPE_V4SF_V4SI:
30463 case V4SF_FTYPE_V4SF_V2SI:
30464 case V4SF_FTYPE_V4SF_V2DF:
30465 case V4SF_FTYPE_V4SF_DI:
30466 case V4SF_FTYPE_V4SF_SI:
30467 case V2DI_FTYPE_V2DI_V2DI:
30468 case V2DI_FTYPE_V16QI_V16QI:
30469 case V2DI_FTYPE_V4SI_V4SI:
30470 case V2UDI_FTYPE_V4USI_V4USI:
30471 case V2DI_FTYPE_V2DI_V16QI:
30472 case V2DI_FTYPE_V2DF_V2DF:
30473 case V2SI_FTYPE_V2SI_V2SI:
30474 case V2SI_FTYPE_V4HI_V4HI:
30475 case V2SI_FTYPE_V2SF_V2SF:
30476 case V2DF_FTYPE_V2DF_V2DF:
30477 case V2DF_FTYPE_V2DF_V4SF:
30478 case V2DF_FTYPE_V2DF_V2DI:
30479 case V2DF_FTYPE_V2DF_DI:
30480 case V2DF_FTYPE_V2DF_SI:
30481 case V2SF_FTYPE_V2SF_V2SF:
30482 case V1DI_FTYPE_V1DI_V1DI:
30483 case V1DI_FTYPE_V8QI_V8QI:
30484 case V1DI_FTYPE_V2SI_V2SI:
30485 case V32QI_FTYPE_V16HI_V16HI:
30486 case V16HI_FTYPE_V8SI_V8SI:
30487 case V32QI_FTYPE_V32QI_V32QI:
30488 case V16HI_FTYPE_V32QI_V32QI:
30489 case V16HI_FTYPE_V16HI_V16HI:
30490 case V8SI_FTYPE_V4DF_V4DF:
30491 case V8SI_FTYPE_V8SI_V8SI:
30492 case V8SI_FTYPE_V16HI_V16HI:
30493 case V4DI_FTYPE_V4DI_V4DI:
30494 case V4DI_FTYPE_V8SI_V8SI:
30495 case V4UDI_FTYPE_V8USI_V8USI:
30496 if (comparison == UNKNOWN)
30497 return ix86_expand_binop_builtin (icode, exp, target);
30498 nargs = 2;
30499 break;
30500 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30501 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30502 gcc_assert (comparison != UNKNOWN);
30503 nargs = 2;
30504 swap = true;
30505 break;
30506 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30507 case V16HI_FTYPE_V16HI_SI_COUNT:
30508 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30509 case V8SI_FTYPE_V8SI_SI_COUNT:
30510 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30511 case V4DI_FTYPE_V4DI_INT_COUNT:
30512 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30513 case V8HI_FTYPE_V8HI_SI_COUNT:
30514 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30515 case V4SI_FTYPE_V4SI_SI_COUNT:
30516 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30517 case V4HI_FTYPE_V4HI_SI_COUNT:
30518 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30519 case V2DI_FTYPE_V2DI_SI_COUNT:
30520 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30521 case V2SI_FTYPE_V2SI_SI_COUNT:
30522 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30523 case V1DI_FTYPE_V1DI_SI_COUNT:
30524 nargs = 2;
30525 last_arg_count = true;
30526 break;
30527 case UINT64_FTYPE_UINT64_UINT64:
30528 case UINT_FTYPE_UINT_UINT:
30529 case UINT_FTYPE_UINT_USHORT:
30530 case UINT_FTYPE_UINT_UCHAR:
30531 case UINT16_FTYPE_UINT16_INT:
30532 case UINT8_FTYPE_UINT8_INT:
30533 nargs = 2;
30534 break;
30535 case V2DI_FTYPE_V2DI_INT_CONVERT:
30536 nargs = 2;
30537 rmode = V1TImode;
30538 nargs_constant = 1;
30539 break;
30540 case V4DI_FTYPE_V4DI_INT_CONVERT:
30541 nargs = 2;
30542 rmode = V2TImode;
30543 nargs_constant = 1;
30544 break;
30545 case V8HI_FTYPE_V8HI_INT:
30546 case V8HI_FTYPE_V8SF_INT:
30547 case V8HI_FTYPE_V4SF_INT:
30548 case V8SF_FTYPE_V8SF_INT:
30549 case V4SI_FTYPE_V4SI_INT:
30550 case V4SI_FTYPE_V8SI_INT:
30551 case V4HI_FTYPE_V4HI_INT:
30552 case V4DF_FTYPE_V4DF_INT:
30553 case V4SF_FTYPE_V4SF_INT:
30554 case V4SF_FTYPE_V8SF_INT:
30555 case V2DI_FTYPE_V2DI_INT:
30556 case V2DF_FTYPE_V2DF_INT:
30557 case V2DF_FTYPE_V4DF_INT:
30558 case V16HI_FTYPE_V16HI_INT:
30559 case V8SI_FTYPE_V8SI_INT:
30560 case V4DI_FTYPE_V4DI_INT:
30561 case V2DI_FTYPE_V4DI_INT:
30562 nargs = 2;
30563 nargs_constant = 1;
30564 break;
30565 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30566 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30567 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30568 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30569 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30570 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30571 nargs = 3;
30572 break;
30573 case V32QI_FTYPE_V32QI_V32QI_INT:
30574 case V16HI_FTYPE_V16HI_V16HI_INT:
30575 case V16QI_FTYPE_V16QI_V16QI_INT:
30576 case V4DI_FTYPE_V4DI_V4DI_INT:
30577 case V8HI_FTYPE_V8HI_V8HI_INT:
30578 case V8SI_FTYPE_V8SI_V8SI_INT:
30579 case V8SI_FTYPE_V8SI_V4SI_INT:
30580 case V8SF_FTYPE_V8SF_V8SF_INT:
30581 case V8SF_FTYPE_V8SF_V4SF_INT:
30582 case V4SI_FTYPE_V4SI_V4SI_INT:
30583 case V4DF_FTYPE_V4DF_V4DF_INT:
30584 case V4DF_FTYPE_V4DF_V2DF_INT:
30585 case V4SF_FTYPE_V4SF_V4SF_INT:
30586 case V2DI_FTYPE_V2DI_V2DI_INT:
30587 case V4DI_FTYPE_V4DI_V2DI_INT:
30588 case V2DF_FTYPE_V2DF_V2DF_INT:
30589 nargs = 3;
30590 nargs_constant = 1;
30591 break;
30592 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
30593 nargs = 3;
30594 rmode = V4DImode;
30595 nargs_constant = 1;
30596 break;
30597 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
30598 nargs = 3;
30599 rmode = V2DImode;
30600 nargs_constant = 1;
30601 break;
30602 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
30603 nargs = 3;
30604 rmode = DImode;
30605 nargs_constant = 1;
30606 break;
30607 case V2DI_FTYPE_V2DI_UINT_UINT:
30608 nargs = 3;
30609 nargs_constant = 2;
30610 break;
30611 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
30612 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
30613 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
30614 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
30615 nargs = 4;
30616 nargs_constant = 1;
30617 break;
30618 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
30619 nargs = 4;
30620 nargs_constant = 2;
30621 break;
30622 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
30623 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
30624 nargs = 4;
30625 break;
30626 default:
30627 gcc_unreachable ();
30630 gcc_assert (nargs <= ARRAY_SIZE (args));
30632 if (comparison != UNKNOWN)
30634 gcc_assert (nargs == 2);
30635 return ix86_expand_sse_compare (d, exp, target, swap);
30638 if (rmode == VOIDmode || rmode == tmode)
30640 if (optimize
30641 || target == 0
30642 || GET_MODE (target) != tmode
30643 || !insn_p->operand[0].predicate (target, tmode))
30644 target = gen_reg_rtx (tmode);
30645 real_target = target;
30647 else
30649 target = gen_reg_rtx (rmode);
30650 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30653 for (i = 0; i < nargs; i++)
30655 tree arg = CALL_EXPR_ARG (exp, i);
30656 rtx op = expand_normal (arg);
30657 enum machine_mode mode = insn_p->operand[i + 1].mode;
30658 bool match = insn_p->operand[i + 1].predicate (op, mode);
30660 if (last_arg_count && (i + 1) == nargs)
30662 /* SIMD shift insns take either an 8-bit immediate or
30663 register as count. But builtin functions take int as
30664 count. If count doesn't match, we put it in register. */
30665 if (!match)
30667 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
30668 if (!insn_p->operand[i + 1].predicate (op, mode))
30669 op = copy_to_reg (op);
30672 else if ((nargs - i) <= nargs_constant)
30674 if (!match)
30675 switch (icode)
30677 case CODE_FOR_avx2_inserti128:
30678 case CODE_FOR_avx2_extracti128:
30679 error ("the last argument must be a 1-bit immediate");
30680 return const0_rtx;
30682 case CODE_FOR_sse4_1_roundsd:
30683 case CODE_FOR_sse4_1_roundss:
30685 case CODE_FOR_sse4_1_roundpd:
30686 case CODE_FOR_sse4_1_roundps:
30687 case CODE_FOR_avx_roundpd256:
30688 case CODE_FOR_avx_roundps256:
30690 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
30691 case CODE_FOR_sse4_1_roundps_sfix:
30692 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
30693 case CODE_FOR_avx_roundps_sfix256:
30695 case CODE_FOR_sse4_1_blendps:
30696 case CODE_FOR_avx_blendpd256:
30697 case CODE_FOR_avx_vpermilv4df:
30698 error ("the last argument must be a 4-bit immediate");
30699 return const0_rtx;
30701 case CODE_FOR_sse4_1_blendpd:
30702 case CODE_FOR_avx_vpermilv2df:
30703 case CODE_FOR_xop_vpermil2v2df3:
30704 case CODE_FOR_xop_vpermil2v4sf3:
30705 case CODE_FOR_xop_vpermil2v4df3:
30706 case CODE_FOR_xop_vpermil2v8sf3:
30707 error ("the last argument must be a 2-bit immediate");
30708 return const0_rtx;
30710 case CODE_FOR_avx_vextractf128v4df:
30711 case CODE_FOR_avx_vextractf128v8sf:
30712 case CODE_FOR_avx_vextractf128v8si:
30713 case CODE_FOR_avx_vinsertf128v4df:
30714 case CODE_FOR_avx_vinsertf128v8sf:
30715 case CODE_FOR_avx_vinsertf128v8si:
30716 error ("the last argument must be a 1-bit immediate");
30717 return const0_rtx;
30719 case CODE_FOR_avx_vmcmpv2df3:
30720 case CODE_FOR_avx_vmcmpv4sf3:
30721 case CODE_FOR_avx_cmpv2df3:
30722 case CODE_FOR_avx_cmpv4sf3:
30723 case CODE_FOR_avx_cmpv4df3:
30724 case CODE_FOR_avx_cmpv8sf3:
30725 error ("the last argument must be a 5-bit immediate");
30726 return const0_rtx;
30728 default:
30729 switch (nargs_constant)
30731 case 2:
30732 if ((nargs - i) == nargs_constant)
30734 error ("the next to last argument must be an 8-bit immediate");
30735 break;
30737 case 1:
30738 error ("the last argument must be an 8-bit immediate");
30739 break;
30740 default:
30741 gcc_unreachable ();
30743 return const0_rtx;
30746 else
30748 if (VECTOR_MODE_P (mode))
30749 op = safe_vector_operand (op, mode);
30751 /* If we aren't optimizing, only allow one memory operand to
30752 be generated. */
30753 if (memory_operand (op, mode))
30754 num_memory++;
30756 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
30758 if (optimize || !match || num_memory > 1)
30759 op = copy_to_mode_reg (mode, op);
30761 else
30763 op = copy_to_reg (op);
30764 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
30768 args[i].op = op;
30769 args[i].mode = mode;
30772 switch (nargs)
30774 case 1:
30775 pat = GEN_FCN (icode) (real_target, args[0].op);
30776 break;
30777 case 2:
30778 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
30779 break;
30780 case 3:
30781 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30782 args[2].op);
30783 break;
30784 case 4:
30785 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30786 args[2].op, args[3].op);
30787 break;
30788 default:
30789 gcc_unreachable ();
30792 if (! pat)
30793 return 0;
30795 emit_insn (pat);
30796 return target;
30799 /* Subroutine of ix86_expand_builtin to take care of special insns
30800 with variable number of operands. */
30802 static rtx
30803 ix86_expand_special_args_builtin (const struct builtin_description *d,
30804 tree exp, rtx target)
30806 tree arg;
30807 rtx pat, op;
30808 unsigned int i, nargs, arg_adjust, memory;
30809 struct
30811 rtx op;
30812 enum machine_mode mode;
30813 } args[3];
30814 enum insn_code icode = d->icode;
30815 bool last_arg_constant = false;
30816 const struct insn_data_d *insn_p = &insn_data[icode];
30817 enum machine_mode tmode = insn_p->operand[0].mode;
30818 enum { load, store } klass;
30820 switch ((enum ix86_builtin_func_type) d->flag)
30822 case VOID_FTYPE_VOID:
30823 emit_insn (GEN_FCN (icode) (target));
30824 return 0;
30825 case VOID_FTYPE_UINT64:
30826 case VOID_FTYPE_UNSIGNED:
30827 nargs = 0;
30828 klass = store;
30829 memory = 0;
30830 break;
30832 case INT_FTYPE_VOID:
30833 case UINT64_FTYPE_VOID:
30834 case UNSIGNED_FTYPE_VOID:
30835 nargs = 0;
30836 klass = load;
30837 memory = 0;
30838 break;
30839 case UINT64_FTYPE_PUNSIGNED:
30840 case V2DI_FTYPE_PV2DI:
30841 case V4DI_FTYPE_PV4DI:
30842 case V32QI_FTYPE_PCCHAR:
30843 case V16QI_FTYPE_PCCHAR:
30844 case V8SF_FTYPE_PCV4SF:
30845 case V8SF_FTYPE_PCFLOAT:
30846 case V4SF_FTYPE_PCFLOAT:
30847 case V4DF_FTYPE_PCV2DF:
30848 case V4DF_FTYPE_PCDOUBLE:
30849 case V2DF_FTYPE_PCDOUBLE:
30850 case VOID_FTYPE_PVOID:
30851 nargs = 1;
30852 klass = load;
30853 memory = 0;
30854 break;
30855 case VOID_FTYPE_PV2SF_V4SF:
30856 case VOID_FTYPE_PV4DI_V4DI:
30857 case VOID_FTYPE_PV2DI_V2DI:
30858 case VOID_FTYPE_PCHAR_V32QI:
30859 case VOID_FTYPE_PCHAR_V16QI:
30860 case VOID_FTYPE_PFLOAT_V8SF:
30861 case VOID_FTYPE_PFLOAT_V4SF:
30862 case VOID_FTYPE_PDOUBLE_V4DF:
30863 case VOID_FTYPE_PDOUBLE_V2DF:
30864 case VOID_FTYPE_PLONGLONG_LONGLONG:
30865 case VOID_FTYPE_PULONGLONG_ULONGLONG:
30866 case VOID_FTYPE_PINT_INT:
30867 nargs = 1;
30868 klass = store;
30869 /* Reserve memory operand for target. */
30870 memory = ARRAY_SIZE (args);
30871 break;
30872 case V4SF_FTYPE_V4SF_PCV2SF:
30873 case V2DF_FTYPE_V2DF_PCDOUBLE:
30874 nargs = 2;
30875 klass = load;
30876 memory = 1;
30877 break;
30878 case V8SF_FTYPE_PCV8SF_V8SI:
30879 case V4DF_FTYPE_PCV4DF_V4DI:
30880 case V4SF_FTYPE_PCV4SF_V4SI:
30881 case V2DF_FTYPE_PCV2DF_V2DI:
30882 case V8SI_FTYPE_PCV8SI_V8SI:
30883 case V4DI_FTYPE_PCV4DI_V4DI:
30884 case V4SI_FTYPE_PCV4SI_V4SI:
30885 case V2DI_FTYPE_PCV2DI_V2DI:
30886 nargs = 2;
30887 klass = load;
30888 memory = 0;
30889 break;
30890 case VOID_FTYPE_PV8SF_V8SI_V8SF:
30891 case VOID_FTYPE_PV4DF_V4DI_V4DF:
30892 case VOID_FTYPE_PV4SF_V4SI_V4SF:
30893 case VOID_FTYPE_PV2DF_V2DI_V2DF:
30894 case VOID_FTYPE_PV8SI_V8SI_V8SI:
30895 case VOID_FTYPE_PV4DI_V4DI_V4DI:
30896 case VOID_FTYPE_PV4SI_V4SI_V4SI:
30897 case VOID_FTYPE_PV2DI_V2DI_V2DI:
30898 nargs = 2;
30899 klass = store;
30900 /* Reserve memory operand for target. */
30901 memory = ARRAY_SIZE (args);
30902 break;
30903 case VOID_FTYPE_UINT_UINT_UINT:
30904 case VOID_FTYPE_UINT64_UINT_UINT:
30905 case UCHAR_FTYPE_UINT_UINT_UINT:
30906 case UCHAR_FTYPE_UINT64_UINT_UINT:
30907 nargs = 3;
30908 klass = load;
30909 memory = ARRAY_SIZE (args);
30910 last_arg_constant = true;
30911 break;
30912 default:
30913 gcc_unreachable ();
30916 gcc_assert (nargs <= ARRAY_SIZE (args));
30918 if (klass == store)
30920 arg = CALL_EXPR_ARG (exp, 0);
30921 op = expand_normal (arg);
30922 gcc_assert (target == 0);
30923 if (memory)
30925 if (GET_MODE (op) != Pmode)
30926 op = convert_to_mode (Pmode, op, 1);
30927 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
30929 else
30930 target = force_reg (tmode, op);
30931 arg_adjust = 1;
30933 else
30935 arg_adjust = 0;
30936 if (optimize
30937 || target == 0
30938 || !register_operand (target, tmode)
30939 || GET_MODE (target) != tmode)
30940 target = gen_reg_rtx (tmode);
30943 for (i = 0; i < nargs; i++)
30945 enum machine_mode mode = insn_p->operand[i + 1].mode;
30946 bool match;
30948 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
30949 op = expand_normal (arg);
30950 match = insn_p->operand[i + 1].predicate (op, mode);
30952 if (last_arg_constant && (i + 1) == nargs)
30954 if (!match)
30956 if (icode == CODE_FOR_lwp_lwpvalsi3
30957 || icode == CODE_FOR_lwp_lwpinssi3
30958 || icode == CODE_FOR_lwp_lwpvaldi3
30959 || icode == CODE_FOR_lwp_lwpinsdi3)
30960 error ("the last argument must be a 32-bit immediate");
30961 else
30962 error ("the last argument must be an 8-bit immediate");
30963 return const0_rtx;
30966 else
30968 if (i == memory)
30970 /* This must be the memory operand. */
30971 if (GET_MODE (op) != Pmode)
30972 op = convert_to_mode (Pmode, op, 1);
30973 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
30974 gcc_assert (GET_MODE (op) == mode
30975 || GET_MODE (op) == VOIDmode);
30977 else
30979 /* This must be a register. */
30980 if (VECTOR_MODE_P (mode))
30981 op = safe_vector_operand (op, mode);
30983 gcc_assert (GET_MODE (op) == mode
30984 || GET_MODE (op) == VOIDmode);
30985 op = copy_to_mode_reg (mode, op);
30989 args[i].op = op;
30990 args[i].mode = mode;
30993 switch (nargs)
30995 case 0:
30996 pat = GEN_FCN (icode) (target);
30997 break;
30998 case 1:
30999 pat = GEN_FCN (icode) (target, args[0].op);
31000 break;
31001 case 2:
31002 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31003 break;
31004 case 3:
31005 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31006 break;
31007 default:
31008 gcc_unreachable ();
31011 if (! pat)
31012 return 0;
31013 emit_insn (pat);
31014 return klass == store ? 0 : target;
31017 /* Return the integer constant in ARG. Constrain it to be in the range
31018 of the subparts of VEC_TYPE; issue an error if not. */
31020 static int
31021 get_element_number (tree vec_type, tree arg)
31023 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31025 if (!host_integerp (arg, 1)
31026 || (elt = tree_low_cst (arg, 1), elt > max))
31028 error ("selector must be an integer constant in the range 0..%wi", max);
31029 return 0;
31032 return elt;
31035 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31036 ix86_expand_vector_init. We DO have language-level syntax for this, in
31037 the form of (type){ init-list }. Except that since we can't place emms
31038 instructions from inside the compiler, we can't allow the use of MMX
31039 registers unless the user explicitly asks for it. So we do *not* define
31040 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31041 we have builtins invoked by mmintrin.h that give us license to emit
31042 these sorts of instructions. */
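/* For illustration (a hedged sketch, not part of this file): mmintrin.h
   exposes these builtins through inline wrappers along the lines of

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   so a vector initializer written by the user ultimately reaches
   ix86_expand_vec_init_builtin below.  */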
31044 static rtx
31045 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31047 enum machine_mode tmode = TYPE_MODE (type);
31048 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31049 int i, n_elt = GET_MODE_NUNITS (tmode);
31050 rtvec v = rtvec_alloc (n_elt);
31052 gcc_assert (VECTOR_MODE_P (tmode));
31053 gcc_assert (call_expr_nargs (exp) == n_elt);
31055 for (i = 0; i < n_elt; ++i)
31057 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31058 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31061 if (!target || !register_operand (target, tmode))
31062 target = gen_reg_rtx (tmode);
31064 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31065 return target;
31068 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31069 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31070 had a language-level syntax for referencing vector elements. */
31072 static rtx
31073 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31075 enum machine_mode tmode, mode0;
31076 tree arg0, arg1;
31077 int elt;
31078 rtx op0;
31080 arg0 = CALL_EXPR_ARG (exp, 0);
31081 arg1 = CALL_EXPR_ARG (exp, 1);
31083 op0 = expand_normal (arg0);
31084 elt = get_element_number (TREE_TYPE (arg0), arg1);
31086 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31087 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31088 gcc_assert (VECTOR_MODE_P (mode0));
31090 op0 = force_reg (mode0, op0);
31092 if (optimize || !target || !register_operand (target, tmode))
31093 target = gen_reg_rtx (tmode);
31095 ix86_expand_vector_extract (true, target, op0, elt);
31097 return target;
31100 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31101 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31102 a language-level syntax for referencing vector elements. */
31104 static rtx
31105 ix86_expand_vec_set_builtin (tree exp)
31107 enum machine_mode tmode, mode1;
31108 tree arg0, arg1, arg2;
31109 int elt;
31110 rtx op0, op1, target;
31112 arg0 = CALL_EXPR_ARG (exp, 0);
31113 arg1 = CALL_EXPR_ARG (exp, 1);
31114 arg2 = CALL_EXPR_ARG (exp, 2);
31116 tmode = TYPE_MODE (TREE_TYPE (arg0));
31117 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31118 gcc_assert (VECTOR_MODE_P (tmode));
31120 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31121 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31122 elt = get_element_number (TREE_TYPE (arg0), arg2);
31124 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31125 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31127 op0 = force_reg (tmode, op0);
31128 op1 = force_reg (mode1, op1);
31130 /* OP0 is the source of these builtin functions and shouldn't be
31131 modified. Create a copy, use it and return it as target. */
31132 target = gen_reg_rtx (tmode);
31133 emit_move_insn (target, op0);
31134 ix86_expand_vector_set (true, target, op1, elt);
31136 return target;
31139 /* Expand an expression EXP that calls a built-in function,
31140 with result going to TARGET if that's convenient
31141 (and in mode MODE if that's convenient).
31142 SUBTARGET may be used as the target for computing one of EXP's operands.
31143 IGNORE is nonzero if the value is to be ignored. */
31145 static rtx
31146 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31147 enum machine_mode mode ATTRIBUTE_UNUSED,
31148 int ignore ATTRIBUTE_UNUSED)
31150 const struct builtin_description *d;
31151 size_t i;
31152 enum insn_code icode;
31153 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31154 tree arg0, arg1, arg2, arg3, arg4;
31155 rtx op0, op1, op2, op3, op4, pat, insn;
31156 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31157 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31159 /* For CPU builtins that can be folded, fold first and expand the fold. */
31160 switch (fcode)
31162 case IX86_BUILTIN_CPU_INIT:
31164 /* Make it call __cpu_indicator_init in libgcc. */
31165 tree call_expr, fndecl, type;
31166 type = build_function_type_list (integer_type_node, NULL_TREE);
31167 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31168 call_expr = build_call_expr (fndecl, 0);
31169 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31171 case IX86_BUILTIN_CPU_IS:
31172 case IX86_BUILTIN_CPU_SUPPORTS:
31174 tree arg0 = CALL_EXPR_ARG (exp, 0);
31175 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31176 gcc_assert (fold_expr != NULL_TREE);
31177 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31181 /* Determine whether the builtin function is available under the current ISA.
31182 Originally the builtin was not created if it wasn't applicable to the
31183 current ISA based on the command line switches. With function specific
31184 options, we need to check in the context of the function making the call
31185 whether it is supported. */
31186 if (ix86_builtins_isa[fcode].isa
31187 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31189 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31190 NULL, (enum fpmath_unit) 0, false);
31192 if (!opts)
31193 error ("%qE needs unknown isa option", fndecl);
31194 else
31196 gcc_assert (opts != NULL);
31197 error ("%qE needs isa option %s", fndecl, opts);
31198 free (opts);
31200 return const0_rtx;
31203 switch (fcode)
31205 case IX86_BUILTIN_MASKMOVQ:
31206 case IX86_BUILTIN_MASKMOVDQU:
31207 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31208 ? CODE_FOR_mmx_maskmovq
31209 : CODE_FOR_sse2_maskmovdqu);
31210 /* Note the arg order is different from the operand order. */
31211 arg1 = CALL_EXPR_ARG (exp, 0);
31212 arg2 = CALL_EXPR_ARG (exp, 1);
31213 arg0 = CALL_EXPR_ARG (exp, 2);
31214 op0 = expand_normal (arg0);
31215 op1 = expand_normal (arg1);
31216 op2 = expand_normal (arg2);
31217 mode0 = insn_data[icode].operand[0].mode;
31218 mode1 = insn_data[icode].operand[1].mode;
31219 mode2 = insn_data[icode].operand[2].mode;
31221 if (GET_MODE (op0) != Pmode)
31222 op0 = convert_to_mode (Pmode, op0, 1);
31223 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
31225 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31226 op0 = copy_to_mode_reg (mode0, op0);
31227 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31228 op1 = copy_to_mode_reg (mode1, op1);
31229 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31230 op2 = copy_to_mode_reg (mode2, op2);
31231 pat = GEN_FCN (icode) (op0, op1, op2);
31232 if (! pat)
31233 return 0;
31234 emit_insn (pat);
31235 return 0;
31237 case IX86_BUILTIN_LDMXCSR:
31238 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31239 target = assign_386_stack_local (SImode, SLOT_TEMP);
31240 emit_move_insn (target, op0);
31241 emit_insn (gen_sse_ldmxcsr (target));
31242 return 0;
31244 case IX86_BUILTIN_STMXCSR:
31245 target = assign_386_stack_local (SImode, SLOT_TEMP);
31246 emit_insn (gen_sse_stmxcsr (target));
31247 return copy_to_mode_reg (SImode, target);
31249 case IX86_BUILTIN_CLFLUSH:
31250 arg0 = CALL_EXPR_ARG (exp, 0);
31251 op0 = expand_normal (arg0);
31252 icode = CODE_FOR_sse2_clflush;
31253 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31255 if (GET_MODE (op0) != Pmode)
31256 op0 = convert_to_mode (Pmode, op0, 1);
31257 op0 = force_reg (Pmode, op0);
31260 emit_insn (gen_sse2_clflush (op0));
31261 return 0;
31263 case IX86_BUILTIN_MONITOR:
31264 arg0 = CALL_EXPR_ARG (exp, 0);
31265 arg1 = CALL_EXPR_ARG (exp, 1);
31266 arg2 = CALL_EXPR_ARG (exp, 2);
31267 op0 = expand_normal (arg0);
31268 op1 = expand_normal (arg1);
31269 op2 = expand_normal (arg2);
31270 if (!REG_P (op0))
31272 if (GET_MODE (op0) != Pmode)
31273 op0 = convert_to_mode (Pmode, op0, 1);
31274 op0 = force_reg (Pmode, op0);
31276 if (!REG_P (op1))
31277 op1 = copy_to_mode_reg (SImode, op1);
31278 if (!REG_P (op2))
31279 op2 = copy_to_mode_reg (SImode, op2);
31280 emit_insn (ix86_gen_monitor (op0, op1, op2));
31281 return 0;
31283 case IX86_BUILTIN_MWAIT:
31284 arg0 = CALL_EXPR_ARG (exp, 0);
31285 arg1 = CALL_EXPR_ARG (exp, 1);
31286 op0 = expand_normal (arg0);
31287 op1 = expand_normal (arg1);
31288 if (!REG_P (op0))
31289 op0 = copy_to_mode_reg (SImode, op0);
31290 if (!REG_P (op1))
31291 op1 = copy_to_mode_reg (SImode, op1);
31292 emit_insn (gen_sse3_mwait (op0, op1));
31293 return 0;
31295 case IX86_BUILTIN_VEC_INIT_V2SI:
31296 case IX86_BUILTIN_VEC_INIT_V4HI:
31297 case IX86_BUILTIN_VEC_INIT_V8QI:
31298 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31300 case IX86_BUILTIN_VEC_EXT_V2DF:
31301 case IX86_BUILTIN_VEC_EXT_V2DI:
31302 case IX86_BUILTIN_VEC_EXT_V4SF:
31303 case IX86_BUILTIN_VEC_EXT_V4SI:
31304 case IX86_BUILTIN_VEC_EXT_V8HI:
31305 case IX86_BUILTIN_VEC_EXT_V2SI:
31306 case IX86_BUILTIN_VEC_EXT_V4HI:
31307 case IX86_BUILTIN_VEC_EXT_V16QI:
31308 return ix86_expand_vec_ext_builtin (exp, target);
31310 case IX86_BUILTIN_VEC_SET_V2DI:
31311 case IX86_BUILTIN_VEC_SET_V4SF:
31312 case IX86_BUILTIN_VEC_SET_V4SI:
31313 case IX86_BUILTIN_VEC_SET_V8HI:
31314 case IX86_BUILTIN_VEC_SET_V4HI:
31315 case IX86_BUILTIN_VEC_SET_V16QI:
31316 return ix86_expand_vec_set_builtin (exp);
31318 case IX86_BUILTIN_INFQ:
31319 case IX86_BUILTIN_HUGE_VALQ:
31321 REAL_VALUE_TYPE inf;
31322 rtx tmp;
31324 real_inf (&inf);
31325 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31327 tmp = validize_mem (force_const_mem (mode, tmp));
31329 if (target == 0)
31330 target = gen_reg_rtx (mode);
31332 emit_move_insn (target, tmp);
31333 return target;
31336 case IX86_BUILTIN_RDPMC:
31337 case IX86_BUILTIN_RDTSC:
31338 case IX86_BUILTIN_RDTSCP:
31340 op0 = gen_reg_rtx (DImode);
31341 op1 = gen_reg_rtx (DImode);
31343 if (fcode == IX86_BUILTIN_RDPMC)
31345 arg0 = CALL_EXPR_ARG (exp, 0);
31346 op2 = expand_normal (arg0);
31347 if (!register_operand (op2, SImode))
31348 op2 = copy_to_mode_reg (SImode, op2);
31350 insn = (TARGET_64BIT
31351 ? gen_rdpmc_rex64 (op0, op1, op2)
31352 : gen_rdpmc (op0, op2));
31353 emit_insn (insn);
31355 else if (fcode == IX86_BUILTIN_RDTSC)
31357 insn = (TARGET_64BIT
31358 ? gen_rdtsc_rex64 (op0, op1)
31359 : gen_rdtsc (op0));
31360 emit_insn (insn);
31362 else
31364 op2 = gen_reg_rtx (SImode);
31366 insn = (TARGET_64BIT
31367 ? gen_rdtscp_rex64 (op0, op1, op2)
31368 : gen_rdtscp (op0, op2));
31369 emit_insn (insn);
31371 arg0 = CALL_EXPR_ARG (exp, 0);
31372 op4 = expand_normal (arg0);
31373 if (!address_operand (op4, VOIDmode))
31375 op4 = convert_memory_address (Pmode, op4);
31376 op4 = copy_addr_to_reg (op4);
31378 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31381 if (target == 0)
31382 target = gen_reg_rtx (mode);
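/* rdtsc, rdtscp and rdpmc return the counter in EDX:EAX.  On 64-bit
   targets the two DImode halves produced above are combined below as
   op0 | (op1 << 32) to form the full 64-bit result.  */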
31384 if (TARGET_64BIT)
31386 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31387 op1, 1, OPTAB_DIRECT);
31388 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31389 op0, 1, OPTAB_DIRECT);
31392 emit_move_insn (target, op0);
31393 return target;
31395 case IX86_BUILTIN_FXSAVE:
31396 case IX86_BUILTIN_FXRSTOR:
31397 case IX86_BUILTIN_FXSAVE64:
31398 case IX86_BUILTIN_FXRSTOR64:
31399 switch (fcode)
31401 case IX86_BUILTIN_FXSAVE:
31402 icode = CODE_FOR_fxsave;
31403 break;
31404 case IX86_BUILTIN_FXRSTOR:
31405 icode = CODE_FOR_fxrstor;
31406 break;
31407 case IX86_BUILTIN_FXSAVE64:
31408 icode = CODE_FOR_fxsave64;
31409 break;
31410 case IX86_BUILTIN_FXRSTOR64:
31411 icode = CODE_FOR_fxrstor64;
31412 break;
31413 default:
31414 gcc_unreachable ();
31417 arg0 = CALL_EXPR_ARG (exp, 0);
31418 op0 = expand_normal (arg0);
31420 if (!address_operand (op0, VOIDmode))
31422 op0 = convert_memory_address (Pmode, op0);
31423 op0 = copy_addr_to_reg (op0);
31425 op0 = gen_rtx_MEM (BLKmode, op0);
31427 pat = GEN_FCN (icode) (op0);
31428 if (pat)
31429 emit_insn (pat);
31430 return 0;
31432 case IX86_BUILTIN_XSAVE:
31433 case IX86_BUILTIN_XRSTOR:
31434 case IX86_BUILTIN_XSAVE64:
31435 case IX86_BUILTIN_XRSTOR64:
31436 case IX86_BUILTIN_XSAVEOPT:
31437 case IX86_BUILTIN_XSAVEOPT64:
31438 arg0 = CALL_EXPR_ARG (exp, 0);
31439 arg1 = CALL_EXPR_ARG (exp, 1);
31440 op0 = expand_normal (arg0);
31441 op1 = expand_normal (arg1);
31443 if (!address_operand (op0, VOIDmode))
31445 op0 = convert_memory_address (Pmode, op0);
31446 op0 = copy_addr_to_reg (op0);
31448 op0 = gen_rtx_MEM (BLKmode, op0);
31450 op1 = force_reg (DImode, op1);
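/* The XSAVE family of instructions takes the requested-feature bitmap
   in EDX:EAX, so on 64-bit targets the DImode mask is split below into
   its two 32-bit halves before the insn is emitted.  */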
31452 if (TARGET_64BIT)
31454 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31455 NULL, 1, OPTAB_DIRECT);
31456 switch (fcode)
31458 case IX86_BUILTIN_XSAVE:
31459 icode = CODE_FOR_xsave_rex64;
31460 break;
31461 case IX86_BUILTIN_XRSTOR:
31462 icode = CODE_FOR_xrstor_rex64;
31463 break;
31464 case IX86_BUILTIN_XSAVE64:
31465 icode = CODE_FOR_xsave64;
31466 break;
31467 case IX86_BUILTIN_XRSTOR64:
31468 icode = CODE_FOR_xrstor64;
31469 break;
31470 case IX86_BUILTIN_XSAVEOPT:
31471 icode = CODE_FOR_xsaveopt_rex64;
31472 break;
31473 case IX86_BUILTIN_XSAVEOPT64:
31474 icode = CODE_FOR_xsaveopt64;
31475 break;
31476 default:
31477 gcc_unreachable ();
31480 op2 = gen_lowpart (SImode, op2);
31481 op1 = gen_lowpart (SImode, op1);
31482 pat = GEN_FCN (icode) (op0, op1, op2);
31484 else
31486 switch (fcode)
31488 case IX86_BUILTIN_XSAVE:
31489 icode = CODE_FOR_xsave;
31490 break;
31491 case IX86_BUILTIN_XRSTOR:
31492 icode = CODE_FOR_xrstor;
31493 break;
31494 case IX86_BUILTIN_XSAVEOPT:
31495 icode = CODE_FOR_xsaveopt;
31496 break;
31497 default:
31498 gcc_unreachable ();
31500 pat = GEN_FCN (icode) (op0, op1);
31503 if (pat)
31504 emit_insn (pat);
31505 return 0;
31507 case IX86_BUILTIN_LLWPCB:
31508 arg0 = CALL_EXPR_ARG (exp, 0);
31509 op0 = expand_normal (arg0);
31510 icode = CODE_FOR_lwp_llwpcb;
31511 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31513 if (GET_MODE (op0) != Pmode)
31514 op0 = convert_to_mode (Pmode, op0, 1);
31515 op0 = force_reg (Pmode, op0);
31517 emit_insn (gen_lwp_llwpcb (op0));
31518 return 0;
31520 case IX86_BUILTIN_SLWPCB:
31521 icode = CODE_FOR_lwp_slwpcb;
31522 if (!target
31523 || !insn_data[icode].operand[0].predicate (target, Pmode))
31524 target = gen_reg_rtx (Pmode);
31525 emit_insn (gen_lwp_slwpcb (target));
31526 return target;
31528 case IX86_BUILTIN_BEXTRI32:
31529 case IX86_BUILTIN_BEXTRI64:
31530 arg0 = CALL_EXPR_ARG (exp, 0);
31531 arg1 = CALL_EXPR_ARG (exp, 1);
31532 op0 = expand_normal (arg0);
31533 op1 = expand_normal (arg1);
31534 icode = (fcode == IX86_BUILTIN_BEXTRI32
31535 ? CODE_FOR_tbm_bextri_si
31536 : CODE_FOR_tbm_bextri_di);
31537 if (!CONST_INT_P (op1))
31539 error ("the last argument must be an immediate");
31540 return const0_rtx;
31542 else
31544 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31545 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31546 op1 = GEN_INT (length);
31547 op2 = GEN_INT (lsb_index);
31548 pat = GEN_FCN (icode) (target, op0, op1, op2);
31549 if (pat)
31550 emit_insn (pat);
31551 return target;
31554 case IX86_BUILTIN_RDRAND16_STEP:
31555 icode = CODE_FOR_rdrandhi_1;
31556 mode0 = HImode;
31557 goto rdrand_step;
31559 case IX86_BUILTIN_RDRAND32_STEP:
31560 icode = CODE_FOR_rdrandsi_1;
31561 mode0 = SImode;
31562 goto rdrand_step;
31564 case IX86_BUILTIN_RDRAND64_STEP:
31565 icode = CODE_FOR_rdranddi_1;
31566 mode0 = DImode;
31568 rdrand_step:
31569 op0 = gen_reg_rtx (mode0);
31570 emit_insn (GEN_FCN (icode) (op0));
31572 arg0 = CALL_EXPR_ARG (exp, 0);
31573 op1 = expand_normal (arg0);
31574 if (!address_operand (op1, VOIDmode))
31576 op1 = convert_memory_address (Pmode, op1);
31577 op1 = copy_addr_to_reg (op1);
31579 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31581 op1 = gen_reg_rtx (SImode);
31582 emit_move_insn (op1, CONST1_RTX (SImode));
31584 /* Emit SImode conditional move. */
31585 if (mode0 == HImode)
31587 op2 = gen_reg_rtx (SImode);
31588 emit_insn (gen_zero_extendhisi2 (op2, op0));
31590 else if (mode0 == SImode)
31591 op2 = op0;
31592 else
31593 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31595 if (target == 0)
31596 target = gen_reg_rtx (SImode);
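/* rdrand sets the carry flag when a random value was delivered.  The
   conditional move below returns 1 (op1) when CF is set, and otherwise
   the copied destination value op2, which the hardware leaves as zero
   on failure; this is the builtin's status result.  */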
31598 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31599 const0_rtx);
31600 emit_insn (gen_rtx_SET (VOIDmode, target,
31601 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31602 return target;
31604 case IX86_BUILTIN_RDSEED16_STEP:
31605 icode = CODE_FOR_rdseedhi_1;
31606 mode0 = HImode;
31607 goto rdseed_step;
31609 case IX86_BUILTIN_RDSEED32_STEP:
31610 icode = CODE_FOR_rdseedsi_1;
31611 mode0 = SImode;
31612 goto rdseed_step;
31614 case IX86_BUILTIN_RDSEED64_STEP:
31615 icode = CODE_FOR_rdseeddi_1;
31616 mode0 = DImode;
31618 rdseed_step:
31619 op0 = gen_reg_rtx (mode0);
31620 emit_insn (GEN_FCN (icode) (op0));
31622 arg0 = CALL_EXPR_ARG (exp, 0);
31623 op1 = expand_normal (arg0);
31624 if (!address_operand (op1, VOIDmode))
31626 op1 = convert_memory_address (Pmode, op1);
31627 op1 = copy_addr_to_reg (op1);
31629 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31631 op2 = gen_reg_rtx (QImode);
31633 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
31634 const0_rtx);
31635 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
31637 if (target == 0)
31638 target = gen_reg_rtx (SImode);
31640 emit_insn (gen_zero_extendqisi2 (target, op2));
31641 return target;
31643 case IX86_BUILTIN_ADDCARRYX32:
31644 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31645 mode0 = SImode;
31646 goto addcarryx;
31648 case IX86_BUILTIN_ADDCARRYX64:
31649 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31650 mode0 = DImode;
31652 addcarryx:
31653 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31654 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31655 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31656 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31658 op0 = gen_reg_rtx (QImode);
31660 /* Generate CF from input operand. */
31661 op1 = expand_normal (arg0);
31662 if (GET_MODE (op1) != QImode)
31663 op1 = convert_to_mode (QImode, op1, 1);
31664 op1 = copy_to_mode_reg (QImode, op1);
31665 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
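/* Adding 0xff (constm1_rtx in QImode) to the carry-in byte sets CF
   exactly when that byte is nonzero, recreating the carry flag from the
   builtin's c_in argument.  */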
31667 /* Generate the ADCX (or ADC) instruction to compute X + Y + CF. */
31668 op2 = expand_normal (arg1);
31669 op3 = expand_normal (arg2);
31671 if (!REG_P (op2))
31672 op2 = copy_to_mode_reg (mode0, op2);
31673 if (!REG_P (op3))
31674 op3 = copy_to_mode_reg (mode0, op3);
31676 op0 = gen_reg_rtx (mode0);
31678 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
31679 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
31680 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
31682 /* Store the result. */
31683 op4 = expand_normal (arg3);
31684 if (!address_operand (op4, VOIDmode))
31686 op4 = convert_memory_address (Pmode, op4);
31687 op4 = copy_addr_to_reg (op4);
31689 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
31691 /* Return current CF value. */
31692 if (target == 0)
31693 target = gen_reg_rtx (QImode);
31695 PUT_MODE (pat, QImode);
31696 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
31697 return target;
31699 case IX86_BUILTIN_GATHERSIV2DF:
31700 icode = CODE_FOR_avx2_gathersiv2df;
31701 goto gather_gen;
31702 case IX86_BUILTIN_GATHERSIV4DF:
31703 icode = CODE_FOR_avx2_gathersiv4df;
31704 goto gather_gen;
31705 case IX86_BUILTIN_GATHERDIV2DF:
31706 icode = CODE_FOR_avx2_gatherdiv2df;
31707 goto gather_gen;
31708 case IX86_BUILTIN_GATHERDIV4DF:
31709 icode = CODE_FOR_avx2_gatherdiv4df;
31710 goto gather_gen;
31711 case IX86_BUILTIN_GATHERSIV4SF:
31712 icode = CODE_FOR_avx2_gathersiv4sf;
31713 goto gather_gen;
31714 case IX86_BUILTIN_GATHERSIV8SF:
31715 icode = CODE_FOR_avx2_gathersiv8sf;
31716 goto gather_gen;
31717 case IX86_BUILTIN_GATHERDIV4SF:
31718 icode = CODE_FOR_avx2_gatherdiv4sf;
31719 goto gather_gen;
31720 case IX86_BUILTIN_GATHERDIV8SF:
31721 icode = CODE_FOR_avx2_gatherdiv8sf;
31722 goto gather_gen;
31723 case IX86_BUILTIN_GATHERSIV2DI:
31724 icode = CODE_FOR_avx2_gathersiv2di;
31725 goto gather_gen;
31726 case IX86_BUILTIN_GATHERSIV4DI:
31727 icode = CODE_FOR_avx2_gathersiv4di;
31728 goto gather_gen;
31729 case IX86_BUILTIN_GATHERDIV2DI:
31730 icode = CODE_FOR_avx2_gatherdiv2di;
31731 goto gather_gen;
31732 case IX86_BUILTIN_GATHERDIV4DI:
31733 icode = CODE_FOR_avx2_gatherdiv4di;
31734 goto gather_gen;
31735 case IX86_BUILTIN_GATHERSIV4SI:
31736 icode = CODE_FOR_avx2_gathersiv4si;
31737 goto gather_gen;
31738 case IX86_BUILTIN_GATHERSIV8SI:
31739 icode = CODE_FOR_avx2_gathersiv8si;
31740 goto gather_gen;
31741 case IX86_BUILTIN_GATHERDIV4SI:
31742 icode = CODE_FOR_avx2_gatherdiv4si;
31743 goto gather_gen;
31744 case IX86_BUILTIN_GATHERDIV8SI:
31745 icode = CODE_FOR_avx2_gatherdiv8si;
31746 goto gather_gen;
31747 case IX86_BUILTIN_GATHERALTSIV4DF:
31748 icode = CODE_FOR_avx2_gathersiv4df;
31749 goto gather_gen;
31750 case IX86_BUILTIN_GATHERALTDIV8SF:
31751 icode = CODE_FOR_avx2_gatherdiv8sf;
31752 goto gather_gen;
31753 case IX86_BUILTIN_GATHERALTSIV4DI:
31754 icode = CODE_FOR_avx2_gathersiv4di;
31755 goto gather_gen;
31756 case IX86_BUILTIN_GATHERALTDIV8SI:
31757 icode = CODE_FOR_avx2_gatherdiv8si;
31758 goto gather_gen;
31760 gather_gen:
31761 arg0 = CALL_EXPR_ARG (exp, 0);
31762 arg1 = CALL_EXPR_ARG (exp, 1);
31763 arg2 = CALL_EXPR_ARG (exp, 2);
31764 arg3 = CALL_EXPR_ARG (exp, 3);
31765 arg4 = CALL_EXPR_ARG (exp, 4);
31766 op0 = expand_normal (arg0);
31767 op1 = expand_normal (arg1);
31768 op2 = expand_normal (arg2);
31769 op3 = expand_normal (arg3);
31770 op4 = expand_normal (arg4);
31771 /* Note the arg order is different from the operand order. */
31772 mode0 = insn_data[icode].operand[1].mode;
31773 mode2 = insn_data[icode].operand[3].mode;
31774 mode3 = insn_data[icode].operand[4].mode;
31775 mode4 = insn_data[icode].operand[5].mode;
31777 if (target == NULL_RTX
31778 || GET_MODE (target) != insn_data[icode].operand[0].mode)
31779 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
31780 else
31781 subtarget = target;
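/* The GATHERALT variants take some operands in a wider vector mode than
   the underlying gather pattern uses; only the low half is meaningful,
   so extract it before expanding.  */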
31783 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
31784 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
31786 rtx half = gen_reg_rtx (V4SImode);
31787 if (!nonimmediate_operand (op2, V8SImode))
31788 op2 = copy_to_mode_reg (V8SImode, op2);
31789 emit_insn (gen_vec_extract_lo_v8si (half, op2));
31790 op2 = half;
31792 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
31793 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
31795 rtx (*gen) (rtx, rtx);
31796 rtx half = gen_reg_rtx (mode0);
31797 if (mode0 == V4SFmode)
31798 gen = gen_vec_extract_lo_v8sf;
31799 else
31800 gen = gen_vec_extract_lo_v8si;
31801 if (!nonimmediate_operand (op0, GET_MODE (op0)))
31802 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
31803 emit_insn (gen (half, op0));
31804 op0 = half;
31805 if (!nonimmediate_operand (op3, GET_MODE (op3)))
31806 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
31807 emit_insn (gen (half, op3));
31808 op3 = half;
31811 /* Force the memory operand to use only a base register here; we
31812 don't want to do this for the memory operands of other builtin
31813 functions. */
31814 if (GET_MODE (op1) != Pmode)
31815 op1 = convert_to_mode (Pmode, op1, 1);
31816 op1 = force_reg (Pmode, op1);
31818 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31819 op0 = copy_to_mode_reg (mode0, op0);
31820 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
31821 op1 = copy_to_mode_reg (Pmode, op1);
31822 if (!insn_data[icode].operand[3].predicate (op2, mode2))
31823 op2 = copy_to_mode_reg (mode2, op2);
31824 if (!insn_data[icode].operand[4].predicate (op3, mode3))
31825 op3 = copy_to_mode_reg (mode3, op3);
31826 if (!insn_data[icode].operand[5].predicate (op4, mode4))
31828 error ("the last argument must be a scale of 1, 2, 4 or 8");
31829 return const0_rtx;
31832 /* Optimize. If mask is known to have all high bits set,
31833 replace op0 with pc_rtx to signal that the instruction
31834 overwrites the whole destination and doesn't use its
31835 previous contents. */
31836 if (optimize)
31838 if (TREE_CODE (arg3) == VECTOR_CST)
31840 unsigned int negative = 0;
31841 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
31843 tree cst = VECTOR_CST_ELT (arg3, i);
31844 if (TREE_CODE (cst) == INTEGER_CST
31845 && tree_int_cst_sign_bit (cst))
31846 negative++;
31847 else if (TREE_CODE (cst) == REAL_CST
31848 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
31849 negative++;
31851 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
31852 op0 = pc_rtx;
31854 else if (TREE_CODE (arg3) == SSA_NAME)
31856 /* Recognize also when mask is like:
31857 __v2df src = _mm_setzero_pd ();
31858 __v2df mask = _mm_cmpeq_pd (src, src);
31860 __v8sf src = _mm256_setzero_ps ();
31861 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
31862 as that is a cheaper way to load all ones into
31863 a register than having to load a constant from
31864 memory. */
31865 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
31866 if (is_gimple_call (def_stmt))
31868 tree fndecl = gimple_call_fndecl (def_stmt);
31869 if (fndecl
31870 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31871 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
31873 case IX86_BUILTIN_CMPPD:
31874 case IX86_BUILTIN_CMPPS:
31875 case IX86_BUILTIN_CMPPD256:
31876 case IX86_BUILTIN_CMPPS256:
31877 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
31878 break;
31879 /* FALLTHRU */
31880 case IX86_BUILTIN_CMPEQPD:
31881 case IX86_BUILTIN_CMPEQPS:
31882 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
31883 && initializer_zerop (gimple_call_arg (def_stmt,
31884 1)))
31885 op0 = pc_rtx;
31886 break;
31887 default:
31888 break;
31894 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
31895 if (! pat)
31896 return const0_rtx;
31897 emit_insn (pat);
31899 if (fcode == IX86_BUILTIN_GATHERDIV8SF
31900 || fcode == IX86_BUILTIN_GATHERDIV8SI)
31902 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
31903 ? V4SFmode : V4SImode;
31904 if (target == NULL_RTX)
31905 target = gen_reg_rtx (tmode);
31906 if (tmode == V4SFmode)
31907 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
31908 else
31909 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
31911 else
31912 target = subtarget;
31914 return target;
31916 case IX86_BUILTIN_XABORT:
31917 icode = CODE_FOR_xabort;
31918 arg0 = CALL_EXPR_ARG (exp, 0);
31919 op0 = expand_normal (arg0);
31920 mode0 = insn_data[icode].operand[0].mode;
31921 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31923 error ("the argument to xabort must be an 8-bit immediate");
31924 return const0_rtx;
31926 emit_insn (gen_xabort (op0));
31927 return 0;
31929 default:
31930 break;
31933 for (i = 0, d = bdesc_special_args;
31934 i < ARRAY_SIZE (bdesc_special_args);
31935 i++, d++)
31936 if (d->code == fcode)
31937 return ix86_expand_special_args_builtin (d, exp, target);
31939 for (i = 0, d = bdesc_args;
31940 i < ARRAY_SIZE (bdesc_args);
31941 i++, d++)
31942 if (d->code == fcode)
31943 switch (fcode)
31945 case IX86_BUILTIN_FABSQ:
31946 case IX86_BUILTIN_COPYSIGNQ:
31947 if (!TARGET_SSE)
31948 /* Emit a normal call if SSE isn't available. */
31949 return expand_call (exp, target, ignore);
31950 default:
31951 return ix86_expand_args_builtin (d, exp, target);
31954 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31955 if (d->code == fcode)
31956 return ix86_expand_sse_comi (d, exp, target);
31958 for (i = 0, d = bdesc_pcmpestr;
31959 i < ARRAY_SIZE (bdesc_pcmpestr);
31960 i++, d++)
31961 if (d->code == fcode)
31962 return ix86_expand_sse_pcmpestr (d, exp, target);
31964 for (i = 0, d = bdesc_pcmpistr;
31965 i < ARRAY_SIZE (bdesc_pcmpistr);
31966 i++, d++)
31967 if (d->code == fcode)
31968 return ix86_expand_sse_pcmpistr (d, exp, target);
31970 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31971 if (d->code == fcode)
31972 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
31973 (enum ix86_builtin_func_type)
31974 d->flag, d->comparison);
31976 gcc_unreachable ();
31979 /* Returns a function decl for a vectorized version of the builtin function
31980 with builtin function code FN and the result vector type TYPE, or NULL_TREE
31981 if it is not available. */
31983 static tree
31984 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
31985 tree type_in)
31987 enum machine_mode in_mode, out_mode;
31988 int in_n, out_n;
31989 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
31991 if (TREE_CODE (type_out) != VECTOR_TYPE
31992 || TREE_CODE (type_in) != VECTOR_TYPE
31993 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
31994 return NULL_TREE;
31996 out_mode = TYPE_MODE (TREE_TYPE (type_out));
31997 out_n = TYPE_VECTOR_SUBPARTS (type_out);
31998 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31999 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32001 switch (fn)
32003 case BUILT_IN_SQRT:
32004 if (out_mode == DFmode && in_mode == DFmode)
32006 if (out_n == 2 && in_n == 2)
32007 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32008 else if (out_n == 4 && in_n == 4)
32009 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32011 break;
32013 case BUILT_IN_SQRTF:
32014 if (out_mode == SFmode && in_mode == SFmode)
32016 if (out_n == 4 && in_n == 4)
32017 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32018 else if (out_n == 8 && in_n == 8)
32019 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32021 break;
32023 case BUILT_IN_IFLOOR:
32024 case BUILT_IN_LFLOOR:
32025 case BUILT_IN_LLFLOOR:
32026 /* The round insn does not trap on denormals. */
32027 if (flag_trapping_math || !TARGET_ROUND)
32028 break;
32030 if (out_mode == SImode && in_mode == DFmode)
32032 if (out_n == 4 && in_n == 2)
32033 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32034 else if (out_n == 8 && in_n == 4)
32035 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32037 break;
32039 case BUILT_IN_IFLOORF:
32040 case BUILT_IN_LFLOORF:
32041 case BUILT_IN_LLFLOORF:
32042 /* The round insn does not trap on denormals. */
32043 if (flag_trapping_math || !TARGET_ROUND)
32044 break;
32046 if (out_mode == SImode && in_mode == SFmode)
32048 if (out_n == 4 && in_n == 4)
32049 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32050 else if (out_n == 8 && in_n == 8)
32051 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32053 break;
32055 case BUILT_IN_ICEIL:
32056 case BUILT_IN_LCEIL:
32057 case BUILT_IN_LLCEIL:
32058 /* The round insn does not trap on denormals. */
32059 if (flag_trapping_math || !TARGET_ROUND)
32060 break;
32062 if (out_mode == SImode && in_mode == DFmode)
32064 if (out_n == 4 && in_n == 2)
32065 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32066 else if (out_n == 8 && in_n == 4)
32067 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32069 break;
32071 case BUILT_IN_ICEILF:
32072 case BUILT_IN_LCEILF:
32073 case BUILT_IN_LLCEILF:
32074 /* The round insn does not trap on denormals. */
32075 if (flag_trapping_math || !TARGET_ROUND)
32076 break;
32078 if (out_mode == SImode && in_mode == SFmode)
32080 if (out_n == 4 && in_n == 4)
32081 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32082 else if (out_n == 8 && in_n == 8)
32083 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32085 break;
32087 case BUILT_IN_IRINT:
32088 case BUILT_IN_LRINT:
32089 case BUILT_IN_LLRINT:
32090 if (out_mode == SImode && in_mode == DFmode)
32092 if (out_n == 4 && in_n == 2)
32093 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32094 else if (out_n == 8 && in_n == 4)
32095 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32097 break;
32099 case BUILT_IN_IRINTF:
32100 case BUILT_IN_LRINTF:
32101 case BUILT_IN_LLRINTF:
32102 if (out_mode == SImode && in_mode == SFmode)
32104 if (out_n == 4 && in_n == 4)
32105 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32106 else if (out_n == 8 && in_n == 8)
32107 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32109 break;
32111 case BUILT_IN_IROUND:
32112 case BUILT_IN_LROUND:
32113 case BUILT_IN_LLROUND:
32114 /* The round insn does not trap on denormals. */
32115 if (flag_trapping_math || !TARGET_ROUND)
32116 break;
32118 if (out_mode == SImode && in_mode == DFmode)
32120 if (out_n == 4 && in_n == 2)
32121 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32122 else if (out_n == 8 && in_n == 4)
32123 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32125 break;
32127 case BUILT_IN_IROUNDF:
32128 case BUILT_IN_LROUNDF:
32129 case BUILT_IN_LLROUNDF:
32130 /* The round insn does not trap on denormals. */
32131 if (flag_trapping_math || !TARGET_ROUND)
32132 break;
32134 if (out_mode == SImode && in_mode == SFmode)
32136 if (out_n == 4 && in_n == 4)
32137 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32138 else if (out_n == 8 && in_n == 8)
32139 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32141 break;
32143 case BUILT_IN_COPYSIGN:
32144 if (out_mode == DFmode && in_mode == DFmode)
32146 if (out_n == 2 && in_n == 2)
32147 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32148 else if (out_n == 4 && in_n == 4)
32149 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32151 break;
32153 case BUILT_IN_COPYSIGNF:
32154 if (out_mode == SFmode && in_mode == SFmode)
32156 if (out_n == 4 && in_n == 4)
32157 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32158 else if (out_n == 8 && in_n == 8)
32159 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32161 break;
32163 case BUILT_IN_FLOOR:
32164 /* The round insn does not trap on denormals. */
32165 if (flag_trapping_math || !TARGET_ROUND)
32166 break;
32168 if (out_mode == DFmode && in_mode == DFmode)
32170 if (out_n == 2 && in_n == 2)
32171 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32172 else if (out_n == 4 && in_n == 4)
32173 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32175 break;
32177 case BUILT_IN_FLOORF:
32178 /* The round insn does not trap on denormals. */
32179 if (flag_trapping_math || !TARGET_ROUND)
32180 break;
32182 if (out_mode == SFmode && in_mode == SFmode)
32184 if (out_n == 4 && in_n == 4)
32185 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32186 else if (out_n == 8 && in_n == 8)
32187 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32189 break;
32191 case BUILT_IN_CEIL:
32192 /* The round insn does not trap on denormals. */
32193 if (flag_trapping_math || !TARGET_ROUND)
32194 break;
32196 if (out_mode == DFmode && in_mode == DFmode)
32198 if (out_n == 2 && in_n == 2)
32199 return ix86_builtins[IX86_BUILTIN_CEILPD];
32200 else if (out_n == 4 && in_n == 4)
32201 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32203 break;
32205 case BUILT_IN_CEILF:
32206 /* The round insn does not trap on denormals. */
32207 if (flag_trapping_math || !TARGET_ROUND)
32208 break;
32210 if (out_mode == SFmode && in_mode == SFmode)
32212 if (out_n == 4 && in_n == 4)
32213 return ix86_builtins[IX86_BUILTIN_CEILPS];
32214 else if (out_n == 8 && in_n == 8)
32215 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32217 break;
32219 case BUILT_IN_TRUNC:
32220 /* The round insn does not trap on denormals. */
32221 if (flag_trapping_math || !TARGET_ROUND)
32222 break;
32224 if (out_mode == DFmode && in_mode == DFmode)
32226 if (out_n == 2 && in_n == 2)
32227 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32228 else if (out_n == 4 && in_n == 4)
32229 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32231 break;
32233 case BUILT_IN_TRUNCF:
32234 /* The round insn does not trap on denormals. */
32235 if (flag_trapping_math || !TARGET_ROUND)
32236 break;
32238 if (out_mode == SFmode && in_mode == SFmode)
32240 if (out_n == 4 && in_n == 4)
32241 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32242 else if (out_n == 8 && in_n == 8)
32243 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32245 break;
32247 case BUILT_IN_RINT:
32248 /* The round insn does not trap on denormals. */
32249 if (flag_trapping_math || !TARGET_ROUND)
32250 break;
32252 if (out_mode == DFmode && in_mode == DFmode)
32254 if (out_n == 2 && in_n == 2)
32255 return ix86_builtins[IX86_BUILTIN_RINTPD];
32256 else if (out_n == 4 && in_n == 4)
32257 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32259 break;
32261 case BUILT_IN_RINTF:
32262 /* The round insn does not trap on denormals. */
32263 if (flag_trapping_math || !TARGET_ROUND)
32264 break;
32266 if (out_mode == SFmode && in_mode == SFmode)
32268 if (out_n == 4 && in_n == 4)
32269 return ix86_builtins[IX86_BUILTIN_RINTPS];
32270 else if (out_n == 8 && in_n == 8)
32271 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32273 break;
32275 case BUILT_IN_ROUND:
32276 /* The round insn does not trap on denormals. */
32277 if (flag_trapping_math || !TARGET_ROUND)
32278 break;
32280 if (out_mode == DFmode && in_mode == DFmode)
32282 if (out_n == 2 && in_n == 2)
32283 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32284 else if (out_n == 4 && in_n == 4)
32285 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32287 break;
32289 case BUILT_IN_ROUNDF:
32290 /* The round insn does not trap on denormals. */
32291 if (flag_trapping_math || !TARGET_ROUND)
32292 break;
32294 if (out_mode == SFmode && in_mode == SFmode)
32296 if (out_n == 4 && in_n == 4)
32297 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32298 else if (out_n == 8 && in_n == 8)
32299 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32301 break;
32303 case BUILT_IN_FMA:
32304 if (out_mode == DFmode && in_mode == DFmode)
32306 if (out_n == 2 && in_n == 2)
32307 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32308 if (out_n == 4 && in_n == 4)
32309 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32311 break;
32313 case BUILT_IN_FMAF:
32314 if (out_mode == SFmode && in_mode == SFmode)
32316 if (out_n == 4 && in_n == 4)
32317 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32318 if (out_n == 8 && in_n == 8)
32319 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32321 break;
32323 default:
32324 break;
32327 /* Dispatch to a handler for a vectorization library. */
32328 if (ix86_veclib_handler)
32329 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32330 type_in);
32332 return NULL_TREE;
32335 /* Handler for an SVML-style interface to
32336 a library with vectorized intrinsics. */
32338 static tree
32339 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32341 char name[20];
32342 tree fntype, new_fndecl, args;
32343 unsigned arity;
32344 const char *bname;
32345 enum machine_mode el_mode, in_mode;
32346 int n, in_n;
32348 /* The SVML is suitable for unsafe math only. */
32349 if (!flag_unsafe_math_optimizations)
32350 return NULL_TREE;
32352 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32353 n = TYPE_VECTOR_SUBPARTS (type_out);
32354 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32355 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32356 if (el_mode != in_mode
32357 || n != in_n)
32358 return NULL_TREE;
32360 switch (fn)
32362 case BUILT_IN_EXP:
32363 case BUILT_IN_LOG:
32364 case BUILT_IN_LOG10:
32365 case BUILT_IN_POW:
32366 case BUILT_IN_TANH:
32367 case BUILT_IN_TAN:
32368 case BUILT_IN_ATAN:
32369 case BUILT_IN_ATAN2:
32370 case BUILT_IN_ATANH:
32371 case BUILT_IN_CBRT:
32372 case BUILT_IN_SINH:
32373 case BUILT_IN_SIN:
32374 case BUILT_IN_ASINH:
32375 case BUILT_IN_ASIN:
32376 case BUILT_IN_COSH:
32377 case BUILT_IN_COS:
32378 case BUILT_IN_ACOSH:
32379 case BUILT_IN_ACOS:
32380 if (el_mode != DFmode || n != 2)
32381 return NULL_TREE;
32382 break;
32384 case BUILT_IN_EXPF:
32385 case BUILT_IN_LOGF:
32386 case BUILT_IN_LOG10F:
32387 case BUILT_IN_POWF:
32388 case BUILT_IN_TANHF:
32389 case BUILT_IN_TANF:
32390 case BUILT_IN_ATANF:
32391 case BUILT_IN_ATAN2F:
32392 case BUILT_IN_ATANHF:
32393 case BUILT_IN_CBRTF:
32394 case BUILT_IN_SINHF:
32395 case BUILT_IN_SINF:
32396 case BUILT_IN_ASINHF:
32397 case BUILT_IN_ASINF:
32398 case BUILT_IN_COSHF:
32399 case BUILT_IN_COSF:
32400 case BUILT_IN_ACOSHF:
32401 case BUILT_IN_ACOSF:
32402 if (el_mode != SFmode || n != 4)
32403 return NULL_TREE;
32404 break;
32406 default:
32407 return NULL_TREE;
32410 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32412 if (fn == BUILT_IN_LOGF)
32413 strcpy (name, "vmlsLn4");
32414 else if (fn == BUILT_IN_LOG)
32415 strcpy (name, "vmldLn2");
32416 else if (n == 4)
32418 sprintf (name, "vmls%s", bname+10);
32419 name[strlen (name)-1] = '4';
32421 else
32422 sprintf (name, "vmld%s2", bname+10);
32424 /* Convert to uppercase. */
32425 name[4] &= ~0x20;
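/* Clearing ASCII bit 0x20 capitalizes the first letter of the math
   function name, e.g. BUILT_IN_SIN yields "vmldSin2" and BUILT_IN_SINF
   yields "vmlsSin4".  */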
32427 arity = 0;
32428 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32429 args;
32430 args = TREE_CHAIN (args))
32431 arity++;
32433 if (arity == 1)
32434 fntype = build_function_type_list (type_out, type_in, NULL);
32435 else
32436 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32438 /* Build a function declaration for the vectorized function. */
32439 new_fndecl = build_decl (BUILTINS_LOCATION,
32440 FUNCTION_DECL, get_identifier (name), fntype);
32441 TREE_PUBLIC (new_fndecl) = 1;
32442 DECL_EXTERNAL (new_fndecl) = 1;
32443 DECL_IS_NOVOPS (new_fndecl) = 1;
32444 TREE_READONLY (new_fndecl) = 1;
32446 return new_fndecl;
32449 /* Handler for an ACML-style interface to
32450 a library with vectorized intrinsics. */
32452 static tree
32453 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32455 char name[20] = "__vr.._";
32456 tree fntype, new_fndecl, args;
32457 unsigned arity;
32458 const char *bname;
32459 enum machine_mode el_mode, in_mode;
32460 int n, in_n;
32462 /* The ACML is 64-bit only and suitable for unsafe math only, as
32463 it does not correctly support parts of IEEE, such as denormals,
32464 with the required precision. */
32465 if (!TARGET_64BIT
32466 || !flag_unsafe_math_optimizations)
32467 return NULL_TREE;
32469 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32470 n = TYPE_VECTOR_SUBPARTS (type_out);
32471 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32472 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32473 if (el_mode != in_mode
32474 || n != in_n)
32475 return NULL_TREE;
32477 switch (fn)
32479 case BUILT_IN_SIN:
32480 case BUILT_IN_COS:
32481 case BUILT_IN_EXP:
32482 case BUILT_IN_LOG:
32483 case BUILT_IN_LOG2:
32484 case BUILT_IN_LOG10:
32485 name[4] = 'd';
32486 name[5] = '2';
32487 if (el_mode != DFmode
32488 || n != 2)
32489 return NULL_TREE;
32490 break;
32492 case BUILT_IN_SINF:
32493 case BUILT_IN_COSF:
32494 case BUILT_IN_EXPF:
32495 case BUILT_IN_POWF:
32496 case BUILT_IN_LOGF:
32497 case BUILT_IN_LOG2F:
32498 case BUILT_IN_LOG10F:
32499 name[4] = 's';
32500 name[5] = '4';
32501 if (el_mode != SFmode
32502 || n != 4)
32503 return NULL_TREE;
32504 break;
32506 default:
32507 return NULL_TREE;
32510 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32511 sprintf (name + 7, "%s", bname+10);
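/* E.g. BUILT_IN_SIN becomes "__vrd2_sin" and BUILT_IN_SINF becomes
   "__vrs4_sinf".  */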
32513 arity = 0;
32514 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32515 args;
32516 args = TREE_CHAIN (args))
32517 arity++;
32519 if (arity == 1)
32520 fntype = build_function_type_list (type_out, type_in, NULL);
32521 else
32522 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32524 /* Build a function declaration for the vectorized function. */
32525 new_fndecl = build_decl (BUILTINS_LOCATION,
32526 FUNCTION_DECL, get_identifier (name), fntype);
32527 TREE_PUBLIC (new_fndecl) = 1;
32528 DECL_EXTERNAL (new_fndecl) = 1;
32529 DECL_IS_NOVOPS (new_fndecl) = 1;
32530 TREE_READONLY (new_fndecl) = 1;
32532 return new_fndecl;
32535 /* Returns a decl of a function that implements gather load with
32536 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
32537 Return NULL_TREE if it is not available. */
32539 static tree
32540 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32541 const_tree index_type, int scale)
32543 bool si;
32544 enum ix86_builtins code;
32546 if (! TARGET_AVX2)
32547 return NULL_TREE;
32549 if ((TREE_CODE (index_type) != INTEGER_TYPE
32550 && !POINTER_TYPE_P (index_type))
32551 || (TYPE_MODE (index_type) != SImode
32552 && TYPE_MODE (index_type) != DImode))
32553 return NULL_TREE;
32555 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32556 return NULL_TREE;
32558 /* v*gather* insn sign extends index to pointer mode. */
32559 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32560 && TYPE_UNSIGNED (index_type))
32561 return NULL_TREE;
32563 if (scale <= 0
32564 || scale > 8
32565 || (scale & (scale - 1)) != 0)
32566 return NULL_TREE;
32568 si = TYPE_MODE (index_type) == SImode;
32569 switch (TYPE_MODE (mem_vectype))
32571 case V2DFmode:
32572 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32573 break;
32574 case V4DFmode:
32575 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32576 break;
32577 case V2DImode:
32578 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32579 break;
32580 case V4DImode:
32581 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32582 break;
32583 case V4SFmode:
32584 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32585 break;
32586 case V8SFmode:
32587 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32588 break;
32589 case V4SImode:
32590 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32591 break;
32592 case V8SImode:
32593 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32594 break;
32595 default:
32596 return NULL_TREE;
32599 return ix86_builtins[code];
32602 /* Returns a code for a target-specific builtin that implements
32603 reciprocal of the function, or NULL_TREE if not available. */
32605 static tree
32606 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32607 bool sqrt ATTRIBUTE_UNUSED)
32609 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32610 && flag_finite_math_only && !flag_trapping_math
32611 && flag_unsafe_math_optimizations))
32612 return NULL_TREE;
32614 if (md_fn)
32615 /* Machine dependent builtins. */
32616 switch (fn)
32618 /* Vectorized version of sqrt to rsqrt conversion. */
32619 case IX86_BUILTIN_SQRTPS_NR:
32620 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
32622 case IX86_BUILTIN_SQRTPS_NR256:
32623 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
32625 default:
32626 return NULL_TREE;
32628 else
32629 /* Normal builtins. */
32630 switch (fn)
32632 /* Sqrt to rsqrt conversion. */
32633 case BUILT_IN_SQRTF:
32634 return ix86_builtins[IX86_BUILTIN_RSQRTF];
32636 default:
32637 return NULL_TREE;
32641 /* Helper for avx_vpermilps256_operand et al. This is also used by
32642 the expansion functions to turn the parallel back into a mask.
32643 The return value is 0 for no match and the imm8+1 for a match. */
32646 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32648 unsigned i, nelt = GET_MODE_NUNITS (mode);
32649 unsigned mask = 0;
32650 unsigned char ipar[8];
32652 if (XVECLEN (par, 0) != (int) nelt)
32653 return 0;
32655 /* Validate that all of the elements are constants, and not totally
32656 out of range. Copy the data into an integral array to make the
32657 subsequent checks easier. */
32658 for (i = 0; i < nelt; ++i)
32660 rtx er = XVECEXP (par, 0, i);
32661 unsigned HOST_WIDE_INT ei;
32663 if (!CONST_INT_P (er))
32664 return 0;
32665 ei = INTVAL (er);
32666 if (ei >= nelt)
32667 return 0;
32668 ipar[i] = ei;
32671 switch (mode)
32673 case V4DFmode:
32674 /* In the 256-bit DFmode case, we can only move elements within
32675 a 128-bit lane. */
32676 for (i = 0; i < 2; ++i)
32678 if (ipar[i] >= 2)
32679 return 0;
32680 mask |= ipar[i] << i;
32682 for (i = 2; i < 4; ++i)
32684 if (ipar[i] < 2)
32685 return 0;
32686 mask |= (ipar[i] - 2) << i;
32688 break;
32690 case V8SFmode:
32691 /* In the 256-bit SFmode case, we have full freedom of movement
32692 within the low 128-bit lane, but the high 128-bit lane must
32693 mirror the exact same pattern. */
32694 for (i = 0; i < 4; ++i)
32695 if (ipar[i] + 4 != ipar[i + 4])
32696 return 0;
32697 nelt = 4;
32698 /* FALLTHRU */
32700 case V2DFmode:
32701 case V4SFmode:
32702 /* In the 128-bit case, we've full freedom in the placement of
32703 the elements from the source operand. */
32704 for (i = 0; i < nelt; ++i)
32705 mask |= ipar[i] << (i * (nelt / 2));
32706 break;
32708 default:
32709 gcc_unreachable ();
32712 /* Make sure success has a non-zero value by adding one. */
32713 return mask + 1;
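/* Illustrative sketch, not part of GCC: the 128-bit V4SFmode case above
   packs the permutation into an immediate using two bits per element.
   For the selection { 1, 0, 3, 2 } this yields 0xb1, so the function
   above would return 0xb1 + 1.  Guarded out.  */
#if 0
#include <stdio.h>

int
main (void)
{
  static const unsigned char sel[4] = { 1, 0, 3, 2 };
  unsigned int mask = 0;
  unsigned int i;

  for (i = 0; i < 4; i++)
    mask |= sel[i] << (i * 2);   /* nelt == 4, so i * (nelt / 2) == i * 2 */

  printf ("imm8 = 0x%02x\n", mask);   /* prints imm8 = 0xb1 */
  return 0;
}
#endif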
32716 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
32717 the expansion functions to turn the parallel back into a mask.
32718 The return value is 0 for no match and the imm8+1 for a match. */
32721 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
32723 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
32724 unsigned mask = 0;
32725 unsigned char ipar[8];
32727 if (XVECLEN (par, 0) != (int) nelt)
32728 return 0;
32730 /* Validate that all of the elements are constants, and not totally
32731 out of range. Copy the data into an integral array to make the
32732 subsequent checks easier. */
32733 for (i = 0; i < nelt; ++i)
32735 rtx er = XVECEXP (par, 0, i);
32736 unsigned HOST_WIDE_INT ei;
32738 if (!CONST_INT_P (er))
32739 return 0;
32740 ei = INTVAL (er);
32741 if (ei >= 2 * nelt)
32742 return 0;
32743 ipar[i] = ei;
32746 /* Validate that each half of the permute selects consecutive elements. */
32747 for (i = 0; i < nelt2 - 1; ++i)
32748 if (ipar[i] + 1 != ipar[i + 1])
32749 return 0;
32750 for (i = nelt2; i < nelt - 1; ++i)
32751 if (ipar[i] + 1 != ipar[i + 1])
32752 return 0;
32754 /* Reconstruct the mask. */
32755 for (i = 0; i < 2; ++i)
32757 unsigned e = ipar[i * nelt2];
32758 if (e % nelt2)
32759 return 0;
32760 e /= nelt2;
32761 mask |= e << (i * 4);
32764 /* Make sure success has a non-zero value by adding one. */
32765 return mask + 1;
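/* Illustrative sketch, not part of GCC: reconstructing the vperm2f128
   immediate for a V8SFmode lane swap.  The parallel
   { 4, 5, 6, 7, 0, 1, 2, 3 } selects 128-bit lane 1 for the low half
   and lane 0 for the high half, giving an immediate of 0x01; the
   function above would return 0x01 + 1.  Guarded out.  */
#if 0
#include <stdio.h>

int
main (void)
{
  static const unsigned char sel[8] = { 4, 5, 6, 7, 0, 1, 2, 3 };
  const unsigned int nelt2 = 4;
  unsigned int mask = 0;
  unsigned int i;

  for (i = 0; i < 2; i++)
    mask |= (sel[i * nelt2] / nelt2) << (i * 4);   /* lane index per half */

  printf ("imm8 = 0x%02x\n", mask);                /* prints imm8 = 0x01 */
  return 0;
}
#endif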
32768 /* Store OPERAND to the memory after reload is completed. This means
32769 that we can't easily use assign_stack_local. */
32771 ix86_force_to_memory (enum machine_mode mode, rtx operand)
32773 rtx result;
32775 gcc_assert (reload_completed);
32776 if (ix86_using_red_zone ())
32778 result = gen_rtx_MEM (mode,
32779 gen_rtx_PLUS (Pmode,
32780 stack_pointer_rtx,
32781 GEN_INT (-RED_ZONE_SIZE)));
32782 emit_move_insn (result, operand);
32784 else if (TARGET_64BIT)
32786 switch (mode)
32788 case HImode:
32789 case SImode:
32790 operand = gen_lowpart (DImode, operand);
32791 /* FALLTHRU */
32792 case DImode:
32793 emit_insn (
32794 gen_rtx_SET (VOIDmode,
32795 gen_rtx_MEM (DImode,
32796 gen_rtx_PRE_DEC (DImode,
32797 stack_pointer_rtx)),
32798 operand));
32799 break;
32800 default:
32801 gcc_unreachable ();
32803 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32805 else
32807 switch (mode)
32809 case DImode:
32811 rtx operands[2];
32812 split_double_mode (mode, &operand, 1, operands, operands + 1);
32813 emit_insn (
32814 gen_rtx_SET (VOIDmode,
32815 gen_rtx_MEM (SImode,
32816 gen_rtx_PRE_DEC (Pmode,
32817 stack_pointer_rtx)),
32818 operands[1]));
32819 emit_insn (
32820 gen_rtx_SET (VOIDmode,
32821 gen_rtx_MEM (SImode,
32822 gen_rtx_PRE_DEC (Pmode,
32823 stack_pointer_rtx)),
32824 operands[0]));
32826 break;
32827 case HImode:
32828 /* Store HImodes as SImodes. */
32829 operand = gen_lowpart (SImode, operand);
32830 /* FALLTHRU */
32831 case SImode:
32832 emit_insn (
32833 gen_rtx_SET (VOIDmode,
32834 gen_rtx_MEM (GET_MODE (operand),
32835 gen_rtx_PRE_DEC (SImode,
32836 stack_pointer_rtx)),
32837 operand));
32838 break;
32839 default:
32840 gcc_unreachable ();
32842 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32844 return result;
32847 /* Free operand from the memory. */
32848 void
32849 ix86_free_from_memory (enum machine_mode mode)
32851 if (!ix86_using_red_zone ())
32853 int size;
32855 if (mode == DImode || TARGET_64BIT)
32856 size = 8;
32857 else
32858 size = 4;
32859 /* Use LEA to deallocate stack space. In peephole2 it will be converted
32860 to a pop or add instruction if registers are available. */
32861 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
32862 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
32863 GEN_INT (size))));
32867 /* Return true if we use LRA instead of reload pass. */
32868 static bool
32869 ix86_lra_p (void)
32871 return true;
32874 /* Return a register priority for hard reg REGNO. */
32875 static int
32876 ix86_register_priority (int hard_regno)
32878 /* ebp and r13 as the base always want a displacement, r12 as the
32879 base always wants an index. So discourage their usage in an
32880 address. */
32881 if (hard_regno == R12_REG || hard_regno == R13_REG)
32882 return 0;
32883 if (hard_regno == BP_REG)
32884 return 1;
32885 /* New x86-64 int registers result in bigger code size. Discourage
32886 them. */
32887 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
32888 return 2;
32889 /* New x86-64 SSE registers result in bigger code size. Discourage
32890 them. */
32891 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
32892 return 2;
32893 /* Usage of AX register results in smaller code. Prefer it. */
32894 if (hard_regno == 0)
32895 return 4;
32896 return 3;
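/* Illustrative sketch, not part of GCC: the priority ordering above,
   from least to most preferred.  Guarded out.  */
#if 0
static const char *const example_register_priorities[] =
{
  "r12, r13",               /* 0: as a base, need an index/displacement */
  "ebp",                    /* 1: as a base, needs a displacement       */
  "r8-r15, xmm8-xmm15",     /* 2: REX prefix enlarges the encoding      */
  "remaining registers",    /* 3: default                               */
  "eax"                     /* 4: shortest encodings                    */
};
#endif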
32899 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
32901 Put float CONST_DOUBLE in the constant pool instead of fp regs.
32902 QImode must go into class Q_REGS.
32903 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
32904 movdf to do mem-to-mem moves through integer regs. */
32906 static reg_class_t
32907 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
32909 enum machine_mode mode = GET_MODE (x);
32911 /* We're only allowed to return a subclass of CLASS. Many of the
32912 following checks fail for NO_REGS, so eliminate that early. */
32913 if (regclass == NO_REGS)
32914 return NO_REGS;
32916 /* All classes can load zeros. */
32917 if (x == CONST0_RTX (mode))
32918 return regclass;
32920 /* Force constants into memory if we are loading a (nonzero) constant into
32921 an MMX or SSE register. This is because there are no MMX/SSE instructions
32922 to load from a constant. */
32923 if (CONSTANT_P (x)
32924 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
32925 return NO_REGS;
32927 /* Prefer SSE regs only, if we can use them for math. */
32928 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
32929 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
32931 /* Floating-point constants need more complex checks. */
32932 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
32934 /* General regs can load everything. */
32935 if (reg_class_subset_p (regclass, GENERAL_REGS))
32936 return regclass;
32938 /* Floats can load 0 and 1 plus some others. Note that we eliminated
32939 zero above. We only want to wind up preferring 80387 registers if
32940 we plan on doing computation with them. */
32941 if (TARGET_80387
32942 && standard_80387_constant_p (x) > 0)
32944 /* Limit class to non-sse. */
32945 if (regclass == FLOAT_SSE_REGS)
32946 return FLOAT_REGS;
32947 if (regclass == FP_TOP_SSE_REGS)
32948 return FP_TOP_REG;
32949 if (regclass == FP_SECOND_SSE_REGS)
32950 return FP_SECOND_REG;
32951 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
32952 return regclass;
32955 return NO_REGS;
32958 /* Generally when we see PLUS here, it's the function invariant
32959 (plus soft-fp const_int), which can only be computed into general
32960 regs. */
32961 if (GET_CODE (x) == PLUS)
32962 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
32964 /* QImode constants are easy to load, but non-constant QImode data
32965 must go into Q_REGS. */
32966 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
32968 if (reg_class_subset_p (regclass, Q_REGS))
32969 return regclass;
32970 if (reg_class_subset_p (Q_REGS, regclass))
32971 return Q_REGS;
32972 return NO_REGS;
32975 return regclass;
32978 /* Discourage putting floating-point values in SSE registers unless
32979 SSE math is being used, and likewise for the 387 registers. */
32980 static reg_class_t
32981 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
32983 enum machine_mode mode = GET_MODE (x);
32985 /* Restrict the output reload class to the register bank that we are doing
32986 math on. If we would like not to return a subset of CLASS, reject this
32987 alternative: if reload cannot do this, it will still use its choice. */
32988 mode = GET_MODE (x);
32989 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
32990 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
32992 if (X87_FLOAT_MODE_P (mode))
32994 if (regclass == FP_TOP_SSE_REGS)
32995 return FP_TOP_REG;
32996 else if (regclass == FP_SECOND_SSE_REGS)
32997 return FP_SECOND_REG;
32998 else
32999 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33002 return regclass;
33005 static reg_class_t
33006 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33007 enum machine_mode mode, secondary_reload_info *sri)
33009 /* Double-word spills from general registers to non-offsettable memory
33010 references (zero-extended addresses) require special handling. */
33011 if (TARGET_64BIT
33012 && MEM_P (x)
33013 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33014 && rclass == GENERAL_REGS
33015 && !offsettable_memref_p (x))
33017 sri->icode = (in_p
33018 ? CODE_FOR_reload_noff_load
33019 : CODE_FOR_reload_noff_store);
33020 /* Add the cost of moving address to a temporary. */
33021 sri->extra_cost = 1;
33023 return NO_REGS;
33026 /* QImode spills from non-QI registers require
33027 an intermediate register on 32-bit targets. */
33028 if (!TARGET_64BIT
33029 && !in_p && mode == QImode
33030 && (rclass == GENERAL_REGS
33031 || rclass == LEGACY_REGS
33032 || rclass == NON_Q_REGS
33033 || rclass == SIREG
33034 || rclass == DIREG
33035 || rclass == INDEX_REGS))
33037 int regno;
33039 if (REG_P (x))
33040 regno = REGNO (x);
33041 else
33042 regno = -1;
33044 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33045 regno = true_regnum (x);
33047 /* Return Q_REGS if the operand is in memory. */
33048 if (regno == -1)
33049 return Q_REGS;
33052 /* This condition handles corner case where an expression involving
33053 pointers gets vectorized. We're trying to use the address of a
33054 stack slot as a vector initializer.
33056 (set (reg:V2DI 74 [ vect_cst_.2 ])
33057 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33059 Eventually frame gets turned into sp+offset like this:
33061 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33062 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33063 (const_int 392 [0x188]))))
33065 That later gets turned into:
33067 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33068 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33069 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33071 We'll have the following reload recorded:
33073 Reload 0: reload_in (DI) =
33074 (plus:DI (reg/f:DI 7 sp)
33075 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33076 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33077 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33078 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33079 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33080 reload_reg_rtx: (reg:V2DI 22 xmm1)
33082 Which isn't going to work since SSE instructions can't handle scalar
33083 additions. Returning GENERAL_REGS forces the addition into integer
33084 register and reload can handle subsequent reloads without problems. */
33086 if (in_p && GET_CODE (x) == PLUS
33087 && SSE_CLASS_P (rclass)
33088 && SCALAR_INT_MODE_P (mode))
33089 return GENERAL_REGS;
33091 return NO_REGS;
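/* Illustrative user-level sketch, not part of GCC: source of the shape
   described in the comment above, where the address of a stack slot is
   replicated into a vector.  Whether this particular fragment really
   vectorizes into a vec_duplicate of "sp + offset" depends on the
   vectorizer and the options used; it is only meant to show where such
   reloads come from.  Guarded out.  */
#if 0
static void
example_broadcast_stack_address (unsigned long *out, int n)
{
  long local[16];
  int i;

  for (i = 0; i < n; i++)
    out[i] = (unsigned long) &local[0];   /* same stack address replicated */
}
#endif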
33094 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33096 static bool
33097 ix86_class_likely_spilled_p (reg_class_t rclass)
33099 switch (rclass)
33101 case AREG:
33102 case DREG:
33103 case CREG:
33104 case BREG:
33105 case AD_REGS:
33106 case SIREG:
33107 case DIREG:
33108 case SSE_FIRST_REG:
33109 case FP_TOP_REG:
33110 case FP_SECOND_REG:
33111 return true;
33113 default:
33114 break;
33117 return false;
33120 /* If we are copying between general and FP registers, we need a memory
33121 location. The same is true for SSE and MMX registers.
33123 To optimize register_move_cost performance, allow inline variant.
33125 The macro can't work reliably when one of the CLASSES is a class containing
33126 registers from multiple units (SSE, MMX, integer). We avoid this by never
33127 combining those units in a single alternative in the machine description.
33128 Ensure that this constraint holds to avoid unexpected surprises.
33130 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33131 enforce these sanity checks. */
33133 static inline bool
33134 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33135 enum machine_mode mode, int strict)
33137 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33138 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33139 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33140 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33141 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33142 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33144 gcc_assert (!strict || lra_in_progress);
33145 return true;
33148 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33149 return true;
33151 /* ??? This is a lie. We do have moves between mmx/general, and for
33152 mmx/sse2. But by saying we need secondary memory we discourage the
33153 register allocator from using the mmx registers unless needed. */
33154 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33155 return true;
33157 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33159 /* SSE1 doesn't have any direct moves from other classes. */
33160 if (!TARGET_SSE2)
33161 return true;
33163 /* If the target says that inter-unit moves are more expensive
33164 than moving through memory, then don't generate them. */
33165 if (!TARGET_INTER_UNIT_MOVES)
33166 return true;
33168 /* Between SSE and general, we have moves no larger than word size. */
33169 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33170 return true;
33173 return false;
33176 bool
33177 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33178 enum machine_mode mode, int strict)
33180 return inline_secondary_memory_needed (class1, class2, mode, strict);
33183 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33185 On the 80386, this is the size of MODE in words,
33186 except in the FP regs, where a single reg is always enough. */
33188 static unsigned char
33189 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33191 if (MAYBE_INTEGER_CLASS_P (rclass))
33193 if (mode == XFmode)
33194 return (TARGET_64BIT ? 2 : 3);
33195 else if (mode == XCmode)
33196 return (TARGET_64BIT ? 4 : 6);
33197 else
33198 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33200 else
33202 if (COMPLEX_MODE_P (mode))
33203 return 2;
33204 else
33205 return 1;
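/* Illustrative sketch, not part of GCC: a few values consistent with
   the hook above, assuming 4-byte words on ia32 and 8-byte words on
   x86-64.  XFmode takes 3 integer registers on ia32 and 2 on x86-64,
   XCmode 6 and 4, DImode 2 on ia32; complex modes in FP/SSE classes
   take 2 registers and everything else 1.  Guarded out.  */
#if 0
#include <assert.h>

static void
example_class_max_nregs (void)
{
  /* The general formula: round the mode size up to whole words.  */
  assert ((12 + 4 - 1) / 4 == 3);   /* XFmode (12 bytes) on ia32    */
  assert ((8 + 4 - 1) / 4 == 2);    /* DImode (8 bytes) on ia32     */
  assert ((16 + 8 - 1) / 8 == 2);   /* XFmode (16 bytes) on x86-64  */
}
#endif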
33209 /* Return true if the registers in CLASS cannot represent the change from
33210 modes FROM to TO. */
33212 bool
33213 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33214 enum reg_class regclass)
33216 if (from == to)
33217 return false;
33219 /* x87 registers can't do subreg at all, as all values are reformatted
33220 to extended precision. */
33221 if (MAYBE_FLOAT_CLASS_P (regclass))
33222 return true;
33224 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33226 /* Vector registers do not support QI or HImode loads. If we don't
33227 disallow a change to these modes, reload will assume it's ok to
33228 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33229 the vec_dupv4hi pattern. */
33230 if (GET_MODE_SIZE (from) < 4)
33231 return true;
33233 /* Vector registers do not support subreg with nonzero offsets, which
33234 are otherwise valid for integer registers. Since we can't see
33235 whether we have a nonzero offset from here, prohibit all
33236 nonparadoxical subregs changing size. */
33237 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33238 return true;
33241 return false;
33244 /* Return the cost of moving data of mode M between a
33245 register and memory. A value of 2 is the default; this cost is
33246 relative to those in `REGISTER_MOVE_COST'.
33248 This function is used extensively by register_move_cost that is used to
33249 build tables at startup. Make it inline in this case.
33250 When IN is 2, return maximum of in and out move cost.
33252 If moving between registers and memory is more expensive than
33253 between two registers, you should define this macro to express the
33254 relative cost.
33256 Also model the increased cost of moving QImode registers in non
33257 Q_REGS classes.
33259 static inline int
33260 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33261 int in)
33263 int cost;
33264 if (FLOAT_CLASS_P (regclass))
33266 int index;
33267 switch (mode)
33269 case SFmode:
33270 index = 0;
33271 break;
33272 case DFmode:
33273 index = 1;
33274 break;
33275 case XFmode:
33276 index = 2;
33277 break;
33278 default:
33279 return 100;
33281 if (in == 2)
33282 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33283 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33285 if (SSE_CLASS_P (regclass))
33287 int index;
33288 switch (GET_MODE_SIZE (mode))
33290 case 4:
33291 index = 0;
33292 break;
33293 case 8:
33294 index = 1;
33295 break;
33296 case 16:
33297 index = 2;
33298 break;
33299 default:
33300 return 100;
33302 if (in == 2)
33303 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33304 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33306 if (MMX_CLASS_P (regclass))
33308 int index;
33309 switch (GET_MODE_SIZE (mode))
33311 case 4:
33312 index = 0;
33313 break;
33314 case 8:
33315 index = 1;
33316 break;
33317 default:
33318 return 100;
33320 if (in == 2)
33321 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33322 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33324 switch (GET_MODE_SIZE (mode))
33326 case 1:
33327 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33329 if (!in)
33330 return ix86_cost->int_store[0];
33331 if (TARGET_PARTIAL_REG_DEPENDENCY
33332 && optimize_function_for_speed_p (cfun))
33333 cost = ix86_cost->movzbl_load;
33334 else
33335 cost = ix86_cost->int_load[0];
33336 if (in == 2)
33337 return MAX (cost, ix86_cost->int_store[0]);
33338 return cost;
33340 else
33342 if (in == 2)
33343 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33344 if (in)
33345 return ix86_cost->movzbl_load;
33346 else
33347 return ix86_cost->int_store[0] + 4;
33349 break;
33350 case 2:
33351 if (in == 2)
33352 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33353 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33354 default:
33355 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33356 if (mode == TFmode)
33357 mode = XFmode;
33358 if (in == 2)
33359 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33360 else if (in)
33361 cost = ix86_cost->int_load[2];
33362 else
33363 cost = ix86_cost->int_store[2];
33364 return (cost * (((int) GET_MODE_SIZE (mode)
33365 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
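/* Illustrative sketch, not part of GCC: the final integer case above in
   isolation.  The per-processor cost tables differ, so only the scaling
   by the number of word-sized pieces is shown; a DImode (8-byte) access
   on a 4-byte-word target is costed as two word moves.  The helper name
   is made up.  Guarded out.  */
#if 0
static int
example_wide_int_move_cost (int mode_bytes, int units_per_word, int word_cost)
{
  /* Mirrors: cost * ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1)
     / UNITS_PER_WORD).  */
  return word_cost * ((mode_bytes + units_per_word - 1) / units_per_word);
}
/* example_wide_int_move_cost (8, 4, c) == 2 * c.  */
#endif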
33369 static int
33370 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33371 bool in)
33373 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33377 /* Return the cost of moving data from a register in class CLASS1 to
33378 one in class CLASS2.
33380 It is not required that the cost always equal 2 when FROM is the same as TO;
33381 on some machines it is expensive to move between registers if they are not
33382 general registers. */
33384 static int
33385 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33386 reg_class_t class2_i)
33388 enum reg_class class1 = (enum reg_class) class1_i;
33389 enum reg_class class2 = (enum reg_class) class2_i;
33391 /* In case we require secondary memory, compute cost of the store followed
33392 by load. In order to avoid bad register allocation choices, we need
33393 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33395 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33397 int cost = 1;
33399 cost += inline_memory_move_cost (mode, class1, 2);
33400 cost += inline_memory_move_cost (mode, class2, 2);
33402 /* In the case of copying from a general purpose register we may emit
33403 multiple stores followed by a single load, causing a memory size
33404 mismatch stall. Count this as an arbitrarily high cost of 20. */
33405 if (targetm.class_max_nregs (class1, mode)
33406 > targetm.class_max_nregs (class2, mode))
33407 cost += 20;
33409 /* In the case of FP/MMX moves, the registers actually overlap, and we
33410 have to switch modes in order to treat them differently. */
33411 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33412 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33413 cost += 20;
33415 return cost;
33418 /* Moves between SSE/MMX and integer unit are expensive. */
33419 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33420 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33422 /* ??? By keeping returned value relatively high, we limit the number
33423 of moves between integer and MMX/SSE registers for all targets.
33424 Additionally, high value prevents problem with x86_modes_tieable_p(),
33425 where integer modes in MMX/SSE registers are not tieable
33426 because of missing QImode and HImode moves to, from or between
33427 MMX/SSE registers. */
33428 return MAX (8, ix86_cost->mmxsse_to_integer);
33430 if (MAYBE_FLOAT_CLASS_P (class1))
33431 return ix86_cost->fp_move;
33432 if (MAYBE_SSE_CLASS_P (class1))
33433 return ix86_cost->sse_move;
33434 if (MAYBE_MMX_CLASS_P (class1))
33435 return ix86_cost->mmx_move;
33436 return 2;
33439 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33440 MODE. */
33442 bool
33443 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33445 /* Flags and only flags can only hold CCmode values. */
33446 if (CC_REGNO_P (regno))
33447 return GET_MODE_CLASS (mode) == MODE_CC;
33448 if (GET_MODE_CLASS (mode) == MODE_CC
33449 || GET_MODE_CLASS (mode) == MODE_RANDOM
33450 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33451 return false;
33452 if (STACK_REGNO_P (regno))
33453 return VALID_FP_MODE_P (mode);
33454 if (SSE_REGNO_P (regno))
33456 /* We implement the move patterns for all vector modes into and
33457 out of SSE registers, even when no operation instructions
33458 are available. OImode move is available only when AVX is
33459 enabled. */
33460 return ((TARGET_AVX && mode == OImode)
33461 || VALID_AVX256_REG_MODE (mode)
33462 || VALID_SSE_REG_MODE (mode)
33463 || VALID_SSE2_REG_MODE (mode)
33464 || VALID_MMX_REG_MODE (mode)
33465 || VALID_MMX_REG_MODE_3DNOW (mode));
33467 if (MMX_REGNO_P (regno))
33469 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33470 so if the register is available at all, then we can move data of
33471 the given mode into or out of it. */
33472 return (VALID_MMX_REG_MODE (mode)
33473 || VALID_MMX_REG_MODE_3DNOW (mode));
33476 if (mode == QImode)
33478 /* Take care for QImode values - they can be in non-QI regs,
33479 but then they do cause partial register stalls. */
33480 if (TARGET_64BIT || QI_REGNO_P (regno))
33481 return true;
33482 if (!TARGET_PARTIAL_REG_STALL)
33483 return true;
33484 return !can_create_pseudo_p ();
33486 /* We handle both integer and floats in the general purpose registers. */
33487 else if (VALID_INT_MODE_P (mode))
33488 return true;
33489 else if (VALID_FP_MODE_P (mode))
33490 return true;
33491 else if (VALID_DFP_MODE_P (mode))
33492 return true;
33493 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33494 on to use that value in smaller contexts, this can easily force a
33495 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33496 supporting DImode, allow it. */
33497 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33498 return true;
33500 return false;
33503 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33504 tieable integer mode. */
33506 static bool
33507 ix86_tieable_integer_mode_p (enum machine_mode mode)
33509 switch (mode)
33511 case HImode:
33512 case SImode:
33513 return true;
33515 case QImode:
33516 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33518 case DImode:
33519 return TARGET_64BIT;
33521 default:
33522 return false;
33526 /* Return true if MODE1 is accessible in a register that can hold MODE2
33527 without copying. That is, all register classes that can hold MODE2
33528 can also hold MODE1. */
33530 bool
33531 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33533 if (mode1 == mode2)
33534 return true;
33536 if (ix86_tieable_integer_mode_p (mode1)
33537 && ix86_tieable_integer_mode_p (mode2))
33538 return true;
33540 /* MODE2 being XFmode implies fp stack or general regs, which means we
33541 can tie any smaller floating point modes to it. Note that we do not
33542 tie this with TFmode. */
33543 if (mode2 == XFmode)
33544 return mode1 == SFmode || mode1 == DFmode;
33546 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33547 that we can tie it with SFmode. */
33548 if (mode2 == DFmode)
33549 return mode1 == SFmode;
33551 /* If MODE2 is only appropriate for an SSE register, then tie with
33552 any other mode acceptable to SSE registers. */
33553 if (GET_MODE_SIZE (mode2) == 32
33554 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33555 return (GET_MODE_SIZE (mode1) == 32
33556 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33557 if (GET_MODE_SIZE (mode2) == 16
33558 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33559 return (GET_MODE_SIZE (mode1) == 16
33560 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33562 /* If MODE2 is appropriate for an MMX register, then tie
33563 with any other mode acceptable to MMX registers. */
33564 if (GET_MODE_SIZE (mode2) == 8
33565 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33566 return (GET_MODE_SIZE (mode1) == 8
33567 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33569 return false;
33572 /* Return the cost of moving between two registers of mode MODE. */
33574 static int
33575 ix86_set_reg_reg_cost (enum machine_mode mode)
33577 unsigned int units = UNITS_PER_WORD;
33579 switch (GET_MODE_CLASS (mode))
33581 default:
33582 break;
33584 case MODE_CC:
33585 units = GET_MODE_SIZE (CCmode);
33586 break;
33588 case MODE_FLOAT:
33589 if ((TARGET_SSE && mode == TFmode)
33590 || (TARGET_80387 && mode == XFmode)
33591 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33592 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33593 units = GET_MODE_SIZE (mode);
33594 break;
33596 case MODE_COMPLEX_FLOAT:
33597 if ((TARGET_SSE && mode == TCmode)
33598 || (TARGET_80387 && mode == XCmode)
33599 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33600 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33601 units = GET_MODE_SIZE (mode);
33602 break;
33604 case MODE_VECTOR_INT:
33605 case MODE_VECTOR_FLOAT:
33606 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33607 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33608 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33609 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33610 units = GET_MODE_SIZE (mode);
33613 /* Return the cost of moving between two registers of mode MODE,
33614 assuming that the move will be in pieces of at most UNITS bytes. */
33615 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
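/* Illustrative sketch, not part of GCC: the formula above counts how
   many register-sized pieces a move needs.  A 32-byte vector mode is
   one piece when AVX is enabled (UNITS is the full mode size), but four
   word-sized pieces on a 64-bit target that cannot hold the mode in a
   single register.  Guarded out.  */
#if 0
static int
example_reg_reg_pieces (int mode_bytes, int unit_bytes)
{
  /* Mirrors: (GET_MODE_SIZE (mode) + units - 1) / units, which is then
     passed to COSTS_N_INSNS.  */
  return (mode_bytes + unit_bytes - 1) / unit_bytes;
}
/* example_reg_reg_pieces (32, 32) == 1, example_reg_reg_pieces (32, 8) == 4.  */
#endif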
33618 /* Compute a (partial) cost for rtx X. Return true if the complete
33619 cost has been computed, and false if subexpressions should be
33620 scanned. In either case, *TOTAL contains the cost result. */
33622 static bool
33623 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
33624 bool speed)
33626 enum rtx_code code = (enum rtx_code) code_i;
33627 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
33628 enum machine_mode mode = GET_MODE (x);
33629 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
33631 switch (code)
33633 case SET:
33634 if (register_operand (SET_DEST (x), VOIDmode)
33635 && reg_or_0_operand (SET_SRC (x), VOIDmode))
33637 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
33638 return true;
33640 return false;
33642 case CONST_INT:
33643 case CONST:
33644 case LABEL_REF:
33645 case SYMBOL_REF:
33646 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33647 *total = 3;
33648 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33649 *total = 2;
33650 else if (flag_pic && SYMBOLIC_CONST (x)
33651 && (!TARGET_64BIT
33652 || (GET_CODE (x) != LABEL_REF
33653 && (GET_CODE (x) != SYMBOL_REF
33654 || !SYMBOL_REF_LOCAL_P (x)))))
33655 *total = 1;
33656 else
33657 *total = 0;
33658 return true;
33660 case CONST_DOUBLE:
33661 if (mode == VOIDmode)
33663 *total = 0;
33664 return true;
33666 switch (standard_80387_constant_p (x))
33668 case 1: /* 0.0 */
33669 *total = 1;
33670 return true;
33671 default: /* Other constants */
33672 *total = 2;
33673 return true;
33674 case 0:
33675 case -1:
33676 break;
33678 if (SSE_FLOAT_MODE_P (mode))
33680 case CONST_VECTOR:
33681 switch (standard_sse_constant_p (x))
33683 case 0:
33684 break;
33685 case 1: /* 0: xor eliminates false dependency */
33686 *total = 0;
33687 return true;
33688 default: /* -1: cmp contains false dependency */
33689 *total = 1;
33690 return true;
33693 /* Fall back to (MEM (SYMBOL_REF)), since that's where
33694 it'll probably end up. Add a penalty for size. */
33695 *total = (COSTS_N_INSNS (1)
33696 + (flag_pic != 0 && !TARGET_64BIT)
33697 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
33698 return true;
33700 case ZERO_EXTEND:
33701 /* The zero extension is often completely free on x86_64, so make
33702 it as cheap as possible. */
33703 if (TARGET_64BIT && mode == DImode
33704 && GET_MODE (XEXP (x, 0)) == SImode)
33705 *total = 1;
33706 else if (TARGET_ZERO_EXTEND_WITH_AND)
33707 *total = cost->add;
33708 else
33709 *total = cost->movzx;
33710 return false;
33712 case SIGN_EXTEND:
33713 *total = cost->movsx;
33714 return false;
33716 case ASHIFT:
33717 if (SCALAR_INT_MODE_P (mode)
33718 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
33719 && CONST_INT_P (XEXP (x, 1)))
33721 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33722 if (value == 1)
33724 *total = cost->add;
33725 return false;
33727 if ((value == 2 || value == 3)
33728 && cost->lea <= cost->shift_const)
33730 *total = cost->lea;
33731 return false;
33734 /* FALLTHRU */
33736 case ROTATE:
33737 case ASHIFTRT:
33738 case LSHIFTRT:
33739 case ROTATERT:
33740 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33742 /* ??? Should be SSE vector operation cost. */
33743 /* At least for published AMD latencies, this really is the same
33744 as the latency for a simple fpu operation like fabs. */
33745 /* V*QImode is emulated with 1-11 insns. */
33746 if (mode == V16QImode || mode == V32QImode)
33748 int count = 11;
33749 if (TARGET_XOP && mode == V16QImode)
33751 /* For XOP we use vpshab, which requires a broadcast of the
33752 value to the variable shift insn. For constants this
33753 means a V16Q const in mem; even when we can perform the
33754 shift with one insn set the cost to prefer paddb. */
33755 if (CONSTANT_P (XEXP (x, 1)))
33757 *total = (cost->fabs
33758 + rtx_cost (XEXP (x, 0), code, 0, speed)
33759 + (speed ? 2 : COSTS_N_BYTES (16)));
33760 return true;
33762 count = 3;
33764 else if (TARGET_SSSE3)
33765 count = 7;
33766 *total = cost->fabs * count;
33768 else
33769 *total = cost->fabs;
33771 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33773 if (CONST_INT_P (XEXP (x, 1)))
33775 if (INTVAL (XEXP (x, 1)) > 32)
33776 *total = cost->shift_const + COSTS_N_INSNS (2);
33777 else
33778 *total = cost->shift_const * 2;
33780 else
33782 if (GET_CODE (XEXP (x, 1)) == AND)
33783 *total = cost->shift_var * 2;
33784 else
33785 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
33788 else
33790 if (CONST_INT_P (XEXP (x, 1)))
33791 *total = cost->shift_const;
33792 else
33793 *total = cost->shift_var;
33795 return false;
33797 case FMA:
33799 rtx sub;
33801 gcc_assert (FLOAT_MODE_P (mode));
33802 gcc_assert (TARGET_FMA || TARGET_FMA4);
33804 /* ??? SSE scalar/vector cost should be used here. */
33805 /* ??? Bald assumption that fma has the same cost as fmul. */
33806 *total = cost->fmul;
33807 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
33809 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
33810 sub = XEXP (x, 0);
33811 if (GET_CODE (sub) == NEG)
33812 sub = XEXP (sub, 0);
33813 *total += rtx_cost (sub, FMA, 0, speed);
33815 sub = XEXP (x, 2);
33816 if (GET_CODE (sub) == NEG)
33817 sub = XEXP (sub, 0);
33818 *total += rtx_cost (sub, FMA, 2, speed);
33819 return true;
33822 case MULT:
33823 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33825 /* ??? SSE scalar cost should be used here. */
33826 *total = cost->fmul;
33827 return false;
33829 else if (X87_FLOAT_MODE_P (mode))
33831 *total = cost->fmul;
33832 return false;
33834 else if (FLOAT_MODE_P (mode))
33836 /* ??? SSE vector cost should be used here. */
33837 *total = cost->fmul;
33838 return false;
33840 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33842 /* V*QImode is emulated with 7-13 insns. */
33843 if (mode == V16QImode || mode == V32QImode)
33845 int extra = 11;
33846 if (TARGET_XOP && mode == V16QImode)
33847 extra = 5;
33848 else if (TARGET_SSSE3)
33849 extra = 6;
33850 *total = cost->fmul * 2 + cost->fabs * extra;
33852 /* V*DImode is emulated with 5-8 insns. */
33853 else if (mode == V2DImode || mode == V4DImode)
33855 if (TARGET_XOP && mode == V2DImode)
33856 *total = cost->fmul * 2 + cost->fabs * 3;
33857 else
33858 *total = cost->fmul * 3 + cost->fabs * 5;
33860 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
33861 insns, including two PMULUDQ. */
33862 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
33863 *total = cost->fmul * 2 + cost->fabs * 5;
33864 else
33865 *total = cost->fmul;
33866 return false;
33868 else
33870 rtx op0 = XEXP (x, 0);
33871 rtx op1 = XEXP (x, 1);
33872 int nbits;
33873 if (CONST_INT_P (XEXP (x, 1)))
33875 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33876 for (nbits = 0; value != 0; value &= value - 1)
33877 nbits++;
33879 else
33880 /* This is arbitrary. */
33881 nbits = 7;
33883 /* Compute costs correctly for widening multiplication. */
33884 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
33885 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
33886 == GET_MODE_SIZE (mode))
33888 int is_mulwiden = 0;
33889 enum machine_mode inner_mode = GET_MODE (op0);
33891 if (GET_CODE (op0) == GET_CODE (op1))
33892 is_mulwiden = 1, op1 = XEXP (op1, 0);
33893 else if (CONST_INT_P (op1))
33895 if (GET_CODE (op0) == SIGN_EXTEND)
33896 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
33897 == INTVAL (op1);
33898 else
33899 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
33902 if (is_mulwiden)
33903 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
33906 *total = (cost->mult_init[MODE_INDEX (mode)]
33907 + nbits * cost->mult_bit
33908 + rtx_cost (op0, outer_code, opno, speed)
33909 + rtx_cost (op1, outer_code, opno, speed));
33911 return true;
33914 case DIV:
33915 case UDIV:
33916 case MOD:
33917 case UMOD:
33918 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33919 /* ??? SSE cost should be used here. */
33920 *total = cost->fdiv;
33921 else if (X87_FLOAT_MODE_P (mode))
33922 *total = cost->fdiv;
33923 else if (FLOAT_MODE_P (mode))
33924 /* ??? SSE vector cost should be used here. */
33925 *total = cost->fdiv;
33926 else
33927 *total = cost->divide[MODE_INDEX (mode)];
33928 return false;
33930 case PLUS:
33931 if (GET_MODE_CLASS (mode) == MODE_INT
33932 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
33934 if (GET_CODE (XEXP (x, 0)) == PLUS
33935 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
33936 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
33937 && CONSTANT_P (XEXP (x, 1)))
33939 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
33940 if (val == 2 || val == 4 || val == 8)
33942 *total = cost->lea;
33943 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33944 outer_code, opno, speed);
33945 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
33946 outer_code, opno, speed);
33947 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33948 return true;
33951 else if (GET_CODE (XEXP (x, 0)) == MULT
33952 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
33954 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
33955 if (val == 2 || val == 4 || val == 8)
33957 *total = cost->lea;
33958 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33959 outer_code, opno, speed);
33960 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33961 return true;
33964 else if (GET_CODE (XEXP (x, 0)) == PLUS)
33966 *total = cost->lea;
33967 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33968 outer_code, opno, speed);
33969 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33970 outer_code, opno, speed);
33971 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33972 return true;
33975 /* FALLTHRU */
33977 case MINUS:
33978 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33980 /* ??? SSE cost should be used here. */
33981 *total = cost->fadd;
33982 return false;
33984 else if (X87_FLOAT_MODE_P (mode))
33986 *total = cost->fadd;
33987 return false;
33989 else if (FLOAT_MODE_P (mode))
33991 /* ??? SSE vector cost should be used here. */
33992 *total = cost->fadd;
33993 return false;
33995 /* FALLTHRU */
33997 case AND:
33998 case IOR:
33999 case XOR:
34000 if (GET_MODE_CLASS (mode) == MODE_INT
34001 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34003 *total = (cost->add * 2
34004 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34005 << (GET_MODE (XEXP (x, 0)) != DImode))
34006 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34007 << (GET_MODE (XEXP (x, 1)) != DImode)));
34008 return true;
34010 /* FALLTHRU */
34012 case NEG:
34013 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34015 /* ??? SSE cost should be used here. */
34016 *total = cost->fchs;
34017 return false;
34019 else if (X87_FLOAT_MODE_P (mode))
34021 *total = cost->fchs;
34022 return false;
34024 else if (FLOAT_MODE_P (mode))
34026 /* ??? SSE vector cost should be used here. */
34027 *total = cost->fchs;
34028 return false;
34030 /* FALLTHRU */
34032 case NOT:
34033 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34035 /* ??? Should be SSE vector operation cost. */
34036 /* At least for published AMD latencies, this really is the same
34037 as the latency for a simple fpu operation like fabs. */
34038 *total = cost->fabs;
34040 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34041 *total = cost->add * 2;
34042 else
34043 *total = cost->add;
34044 return false;
34046 case COMPARE:
34047 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34048 && XEXP (XEXP (x, 0), 1) == const1_rtx
34049 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34050 && XEXP (x, 1) == const0_rtx)
34052 /* This kind of construct is implemented using test[bwl].
34053 Treat it as if we had an AND. */
34054 *total = (cost->add
34055 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34056 + rtx_cost (const1_rtx, outer_code, opno, speed));
34057 return true;
34059 return false;
34061 case FLOAT_EXTEND:
34062 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34063 *total = 0;
34064 return false;
34066 case ABS:
34067 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34068 /* ??? SSE cost should be used here. */
34069 *total = cost->fabs;
34070 else if (X87_FLOAT_MODE_P (mode))
34071 *total = cost->fabs;
34072 else if (FLOAT_MODE_P (mode))
34073 /* ??? SSE vector cost should be used here. */
34074 *total = cost->fabs;
34075 return false;
34077 case SQRT:
34078 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34079 /* ??? SSE cost should be used here. */
34080 *total = cost->fsqrt;
34081 else if (X87_FLOAT_MODE_P (mode))
34082 *total = cost->fsqrt;
34083 else if (FLOAT_MODE_P (mode))
34084 /* ??? SSE vector cost should be used here. */
34085 *total = cost->fsqrt;
34086 return false;
34088 case UNSPEC:
34089 if (XINT (x, 1) == UNSPEC_TP)
34090 *total = 0;
34091 return false;
34093 case VEC_SELECT:
34094 case VEC_CONCAT:
34095 case VEC_MERGE:
34096 case VEC_DUPLICATE:
34097 /* ??? Assume all of these vector manipulation patterns are
34098 recognizable. In which case they all pretty much have the
34099 same cost. */
34100 *total = cost->fabs;
34101 return true;
34103 default:
34104 return false;
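/* Illustrative user-level sketch, not part of GCC: the ASHIFT costing
   above treats a left shift by 1 as an add, and a shift by 2 or 3 as an
   LEA when LEA is no more expensive than a constant shift.  The actual
   instruction selection happens elsewhere; these expressions only show
   the shapes being costed.  Guarded out.  */
#if 0
static int
example_costed_shift_shapes (int x)
{
  int a = x << 1;   /* costed like x + x                  */
  int b = x << 2;   /* may be costed as an lea, scale 4   */
  int c = x << 3;   /* may be costed as an lea, scale 8   */
  return a + b + c;
}
#endif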
34108 #if TARGET_MACHO
34110 static int current_machopic_label_num;
34112 /* Given a symbol name and its associated stub, write out the
34113 definition of the stub. */
34115 void
34116 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34118 unsigned int length;
34119 char *binder_name, *symbol_name, lazy_ptr_name[32];
34120 int label = ++current_machopic_label_num;
34122 /* For 64-bit we shouldn't get here. */
34123 gcc_assert (!TARGET_64BIT);
34125 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34126 symb = targetm.strip_name_encoding (symb);
34128 length = strlen (stub);
34129 binder_name = XALLOCAVEC (char, length + 32);
34130 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34132 length = strlen (symb);
34133 symbol_name = XALLOCAVEC (char, length + 32);
34134 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34136 sprintf (lazy_ptr_name, "L%d$lz", label);
34138 if (MACHOPIC_ATT_STUB)
34139 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34140 else if (MACHOPIC_PURE)
34141 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34142 else
34143 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34145 fprintf (file, "%s:\n", stub);
34146 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34148 if (MACHOPIC_ATT_STUB)
34150 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34152 else if (MACHOPIC_PURE)
34154 /* PIC stub. */
34155 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34156 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34157 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34158 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34159 label, lazy_ptr_name, label);
34160 fprintf (file, "\tjmp\t*%%ecx\n");
34162 else
34163 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34165 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34166 it needs no stub-binding-helper. */
34167 if (MACHOPIC_ATT_STUB)
34168 return;
34170 fprintf (file, "%s:\n", binder_name);
34172 if (MACHOPIC_PURE)
34174 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34175 fprintf (file, "\tpushl\t%%ecx\n");
34177 else
34178 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34180 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34182 /* N.B. Keep the correspondence of these
34183 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34184 old-pic/new-pic/non-pic stubs; altering this will break
34185 compatibility with existing dylibs. */
34186 if (MACHOPIC_PURE)
34188 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34189 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34191 else
34192 /* 16-byte -mdynamic-no-pic stub. */
34193 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34195 fprintf (file, "%s:\n", lazy_ptr_name);
34196 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34197 fprintf (file, ASM_LONG "%s\n", binder_name);
34199 #endif /* TARGET_MACHO */
34201 /* Order the registers for register allocator. */
34203 void
34204 x86_order_regs_for_local_alloc (void)
34206 int pos = 0;
34207 int i;
34209 /* First allocate the local general purpose registers. */
34210 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34211 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34212 reg_alloc_order [pos++] = i;
34214 /* Global general purpose registers. */
34215 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34216 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34217 reg_alloc_order [pos++] = i;
34219 /* x87 registers come first in case we are doing FP math
34220 using them. */
34221 if (!TARGET_SSE_MATH)
34222 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34223 reg_alloc_order [pos++] = i;
34225 /* SSE registers. */
34226 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34227 reg_alloc_order [pos++] = i;
34228 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34229 reg_alloc_order [pos++] = i;
34231 /* x87 registers. */
34232 if (TARGET_SSE_MATH)
34233 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34234 reg_alloc_order [pos++] = i;
34236 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34237 reg_alloc_order [pos++] = i;
34239 /* Initialize the rest of the array as we do not allocate some registers
34240 at all. */
34241 while (pos < FIRST_PSEUDO_REGISTER)
34242 reg_alloc_order [pos++] = 0;
34245 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34246 in struct attribute_spec.handler. */
34247 static tree
34248 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34249 tree args,
34250 int flags ATTRIBUTE_UNUSED,
34251 bool *no_add_attrs)
34253 if (TREE_CODE (*node) != FUNCTION_TYPE
34254 && TREE_CODE (*node) != METHOD_TYPE
34255 && TREE_CODE (*node) != FIELD_DECL
34256 && TREE_CODE (*node) != TYPE_DECL)
34258 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34259 name);
34260 *no_add_attrs = true;
34261 return NULL_TREE;
34263 if (TARGET_64BIT)
34265 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34266 name);
34267 *no_add_attrs = true;
34268 return NULL_TREE;
34270 if (is_attribute_p ("callee_pop_aggregate_return", name))
34272 tree cst;
34274 cst = TREE_VALUE (args);
34275 if (TREE_CODE (cst) != INTEGER_CST)
34277 warning (OPT_Wattributes,
34278 "%qE attribute requires an integer constant argument",
34279 name);
34280 *no_add_attrs = true;
34282 else if (compare_tree_int (cst, 0) != 0
34283 && compare_tree_int (cst, 1) != 0)
34285 warning (OPT_Wattributes,
34286 "argument to %qE attribute is neither zero, nor one",
34287 name);
34288 *no_add_attrs = true;
34291 return NULL_TREE;
34294 return NULL_TREE;
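/* Illustrative user-level sketch, not part of GCC: what the handler
   above accepts.  The attribute applies to functions only, is only
   available for 32-bit targets, and takes an integer constant of 0 or 1
   saying whether the callee pops the hidden aggregate-return pointer.
   The struct and function names are made up.  Guarded out.  */
#if 0
struct example_big { int words[8]; };

__attribute__ ((callee_pop_aggregate_return (1)))
struct example_big example_returns_aggregate (void);
#endif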
34297 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34298 struct attribute_spec.handler. */
34299 static tree
34300 ix86_handle_abi_attribute (tree *node, tree name,
34301 tree args ATTRIBUTE_UNUSED,
34302 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34304 if (TREE_CODE (*node) != FUNCTION_TYPE
34305 && TREE_CODE (*node) != METHOD_TYPE
34306 && TREE_CODE (*node) != FIELD_DECL
34307 && TREE_CODE (*node) != TYPE_DECL)
34309 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34310 name);
34311 *no_add_attrs = true;
34312 return NULL_TREE;
34315 /* Can combine regparm with all attributes but fastcall. */
34316 if (is_attribute_p ("ms_abi", name))
34318 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34320 error ("ms_abi and sysv_abi attributes are not compatible");
34323 return NULL_TREE;
34325 else if (is_attribute_p ("sysv_abi", name))
34327 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34329 error ("ms_abi and sysv_abi attributes are not compatible");
34332 return NULL_TREE;
34335 return NULL_TREE;
34338 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34339 struct attribute_spec.handler. */
34340 static tree
34341 ix86_handle_struct_attribute (tree *node, tree name,
34342 tree args ATTRIBUTE_UNUSED,
34343 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34345 tree *type = NULL;
34346 if (DECL_P (*node))
34348 if (TREE_CODE (*node) == TYPE_DECL)
34349 type = &TREE_TYPE (*node);
34351 else
34352 type = node;
34354 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34356 warning (OPT_Wattributes, "%qE attribute ignored",
34357 name);
34358 *no_add_attrs = true;
34361 else if ((is_attribute_p ("ms_struct", name)
34362 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34363 || ((is_attribute_p ("gcc_struct", name)
34364 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34366 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34367 name);
34368 *no_add_attrs = true;
34371 return NULL_TREE;
34374 static tree
34375 ix86_handle_fndecl_attribute (tree *node, tree name,
34376 tree args ATTRIBUTE_UNUSED,
34377 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34379 if (TREE_CODE (*node) != FUNCTION_DECL)
34381 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34382 name);
34383 *no_add_attrs = true;
34385 return NULL_TREE;
34388 static bool
34389 ix86_ms_bitfield_layout_p (const_tree record_type)
34391 return ((TARGET_MS_BITFIELD_LAYOUT
34392 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34393 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34396 /* Returns an expression indicating where the this parameter is
34397 located on entry to the FUNCTION. */
34399 static rtx
34400 x86_this_parameter (tree function)
34402 tree type = TREE_TYPE (function);
34403 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34404 int nregs;
34406 if (TARGET_64BIT)
34408 const int *parm_regs;
34410 if (ix86_function_type_abi (type) == MS_ABI)
34411 parm_regs = x86_64_ms_abi_int_parameter_registers;
34412 else
34413 parm_regs = x86_64_int_parameter_registers;
34414 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34417 nregs = ix86_function_regparm (type, function);
34419 if (nregs > 0 && !stdarg_p (type))
34421 int regno;
34422 unsigned int ccvt = ix86_get_callcvt (type);
34424 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34425 regno = aggr ? DX_REG : CX_REG;
34426 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34428 regno = CX_REG;
34429 if (aggr)
34430 return gen_rtx_MEM (SImode,
34431 plus_constant (Pmode, stack_pointer_rtx, 4));
34433 else
34435 regno = AX_REG;
34436 if (aggr)
34438 regno = DX_REG;
34439 if (nregs == 1)
34440 return gen_rtx_MEM (SImode,
34441 plus_constant (Pmode,
34442 stack_pointer_rtx, 4));
34445 return gen_rtx_REG (SImode, regno);
34448 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34449 aggr ? 8 : 4));
34452 /* Determine whether x86_output_mi_thunk can succeed. */
34454 static bool
34455 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34456 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34457 HOST_WIDE_INT vcall_offset, const_tree function)
34459 /* 64-bit can handle anything. */
34460 if (TARGET_64BIT)
34461 return true;
34463 /* For 32-bit, everything's fine if we have one free register. */
34464 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34465 return true;
34467 /* Need a free register for vcall_offset. */
34468 if (vcall_offset)
34469 return false;
34471 /* Need a free register for GOT references. */
34472 if (flag_pic && !targetm.binds_local_p (function))
34473 return false;
34475 /* Otherwise ok. */
34476 return true;
34479 /* Output the assembler code for a thunk function. THUNK_DECL is the
34480 declaration for the thunk function itself, FUNCTION is the decl for
34481 the target function. DELTA is an immediate constant offset to be
34482 added to THIS. If VCALL_OFFSET is nonzero, the word at
34483 *(*this + vcall_offset) should be added to THIS. */
34485 static void
34486 x86_output_mi_thunk (FILE *file,
34487 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34488 HOST_WIDE_INT vcall_offset, tree function)
34490 rtx this_param = x86_this_parameter (function);
34491 rtx this_reg, tmp, fnaddr;
34492 unsigned int tmp_regno;
34494 if (TARGET_64BIT)
34495 tmp_regno = R10_REG;
34496 else
34498 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34499 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
34500 tmp_regno = AX_REG;
34501 else
34502 tmp_regno = CX_REG;
34505 emit_note (NOTE_INSN_PROLOGUE_END);
34507 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34508 pull it in now and let DELTA benefit. */
34509 if (REG_P (this_param))
34510 this_reg = this_param;
34511 else if (vcall_offset)
34513 /* Put the this parameter into %eax. */
34514 this_reg = gen_rtx_REG (Pmode, AX_REG);
34515 emit_move_insn (this_reg, this_param);
34517 else
34518 this_reg = NULL_RTX;
34520 /* Adjust the this parameter by a fixed constant. */
34521 if (delta)
34523 rtx delta_rtx = GEN_INT (delta);
34524 rtx delta_dst = this_reg ? this_reg : this_param;
34526 if (TARGET_64BIT)
34528 if (!x86_64_general_operand (delta_rtx, Pmode))
34530 tmp = gen_rtx_REG (Pmode, tmp_regno);
34531 emit_move_insn (tmp, delta_rtx);
34532 delta_rtx = tmp;
34536 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34539 /* Adjust the this parameter by a value stored in the vtable. */
34540 if (vcall_offset)
34542 rtx vcall_addr, vcall_mem, this_mem;
34544 tmp = gen_rtx_REG (Pmode, tmp_regno);
34546 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34547 if (Pmode != ptr_mode)
34548 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34549 emit_move_insn (tmp, this_mem);
34551 /* Adjust the this parameter. */
34552 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34553 if (TARGET_64BIT
34554 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34556 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34557 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34558 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34561 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34562 if (Pmode != ptr_mode)
34563 emit_insn (gen_addsi_1_zext (this_reg,
34564 gen_rtx_REG (ptr_mode,
34565 REGNO (this_reg)),
34566 vcall_mem));
34567 else
34568 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34571 /* If necessary, drop THIS back to its stack slot. */
34572 if (this_reg && this_reg != this_param)
34573 emit_move_insn (this_param, this_reg);
34575 fnaddr = XEXP (DECL_RTL (function), 0);
34576 if (TARGET_64BIT)
34578 if (!flag_pic || targetm.binds_local_p (function)
34579 || cfun->machine->call_abi == MS_ABI)
34581 else
34583 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34584 tmp = gen_rtx_CONST (Pmode, tmp);
34585 fnaddr = gen_rtx_MEM (Pmode, tmp);
34588 else
34590 if (!flag_pic || targetm.binds_local_p (function))
34592 #if TARGET_MACHO
34593 else if (TARGET_MACHO)
34595 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34596 fnaddr = XEXP (fnaddr, 0);
34598 #endif /* TARGET_MACHO */
34599 else
34601 tmp = gen_rtx_REG (Pmode, CX_REG);
34602 output_set_got (tmp, NULL_RTX);
34604 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34605 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34606 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34610 /* Our sibling call patterns do not allow memories, because we have no
34611 predicate that can distinguish between frame and non-frame memory.
34612 For our purposes here, we can get away with (ab)using a jump pattern,
34613 because we're going to do no optimization. */
34614 if (MEM_P (fnaddr))
34615 emit_jump_insn (gen_indirect_jump (fnaddr));
34616 else
34618 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
34619 fnaddr = legitimize_pic_address (fnaddr,
34620 gen_rtx_REG (Pmode, tmp_regno));
34622 if (!sibcall_insn_operand (fnaddr, word_mode))
34624 tmp = gen_rtx_REG (word_mode, tmp_regno);
34625 if (GET_MODE (fnaddr) != word_mode)
34626 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
34627 emit_move_insn (tmp, fnaddr);
34628 fnaddr = tmp;
34631 tmp = gen_rtx_MEM (QImode, fnaddr);
34632 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
34633 tmp = emit_call_insn (tmp);
34634 SIBLING_CALL_P (tmp) = 1;
34636 emit_barrier ();
34638 /* Emit just enough of rest_of_compilation to get the insns emitted.
34639 Note that use_thunk calls assemble_start_function et al. */
34640 tmp = get_insns ();
34641 shorten_branches (tmp);
34642 final_start_function (tmp, file, 1);
34643 final (tmp, file, 1);
34644 final_end_function ();
34647 static void
34648 x86_file_start (void)
34650 default_file_start ();
34651 #if TARGET_MACHO
34652 darwin_file_start ();
34653 #endif
34654 if (X86_FILE_START_VERSION_DIRECTIVE)
34655 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34656 if (X86_FILE_START_FLTUSED)
34657 fputs ("\t.global\t__fltused\n", asm_out_file);
34658 if (ix86_asm_dialect == ASM_INTEL)
34659 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34663 x86_field_alignment (tree field, int computed)
34665 enum machine_mode mode;
34666 tree type = TREE_TYPE (field);
34668 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34669 return computed;
34670 mode = TYPE_MODE (strip_array_types (type));
34671 if (mode == DFmode || mode == DCmode
34672 || GET_MODE_CLASS (mode) == MODE_INT
34673 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34674 return MIN (32, computed);
34675 return computed;
34678 /* Output assembler code to FILE to increment profiler label # LABELNO
34679 for profiling a function entry. */
34680 void
34681 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34683 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34684 : MCOUNT_NAME);
34686 if (TARGET_64BIT)
34688 #ifndef NO_PROFILE_COUNTERS
34689 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
34690 #endif
34692 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
34693 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
34694 else
34695 fprintf (file, "\tcall\t%s\n", mcount_name);
34697 else if (flag_pic)
34699 #ifndef NO_PROFILE_COUNTERS
34700 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
34701 LPREFIX, labelno);
34702 #endif
34703 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
34705 else
34707 #ifndef NO_PROFILE_COUNTERS
34708 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
34709 LPREFIX, labelno);
34710 #endif
34711 fprintf (file, "\tcall\t%s\n", mcount_name);
34715 /* We don't have exact information about the insn sizes, but we may assume
34716 quite safely that we are informed about all 1 byte insns and memory
34717 address sizes. This is enough to eliminate unnecessary padding in
34718 99% of cases. */
34720 static int
34721 min_insn_size (rtx insn)
34723 int l = 0, len;
34725 if (!INSN_P (insn) || !active_insn_p (insn))
34726 return 0;
34728   /* Discard alignments we've emitted and jump table data.  */
34729 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
34730 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
34731 return 0;
34732 if (JUMP_TABLE_DATA_P (insn))
34733 return 0;
34735 /* Important case - calls are always 5 bytes.
34736      It is common to have many calls in a row.  */
34737 if (CALL_P (insn)
34738 && symbolic_reference_mentioned_p (PATTERN (insn))
34739 && !SIBLING_CALL_P (insn))
34740 return 5;
34741 len = get_attr_length (insn);
34742 if (len <= 1)
34743 return 1;
34745 /* For normal instructions we rely on get_attr_length being exact,
34746 with a few exceptions. */
34747 if (!JUMP_P (insn))
34749 enum attr_type type = get_attr_type (insn);
34751 switch (type)
34753 case TYPE_MULTI:
34754 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
34755 || asm_noperands (PATTERN (insn)) >= 0)
34756 return 0;
34757 break;
34758 case TYPE_OTHER:
34759 case TYPE_FCMP:
34760 break;
34761 default:
34762 /* Otherwise trust get_attr_length. */
34763 return len;
34766 l = get_attr_length_address (insn);
34767 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
34768 l = 4;
34770 if (l)
34771 return 1+l;
34772 else
34773 return 2;
34776 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34778 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
34779    16-byte window.  */
34781 static void
34782 ix86_avoid_jump_mispredicts (void)
34784 rtx insn, start = get_insns ();
34785 int nbytes = 0, njumps = 0;
34786 int isjump = 0;
34788 /* Look for all minimal intervals of instructions containing 4 jumps.
34789 The intervals are bounded by START and INSN. NBYTES is the total
34790 size of instructions in the interval including INSN and not including
34791      START.  When NBYTES is smaller than 16 bytes, it is possible
34792      that START and INSN end up in the same 16-byte window.
34794      The smallest offset at which INSN can start in that window is where
34795      START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN),
34796      so we emit a p2align with maxskip 15 - NBYTES + sizeof (INSN).  */
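   /* In short: whenever a fourth jump could still share a 16-byte window
      with the previous three, a pad insn is emitted in front of it (the
      gen_pad call below) so that the assembler pushes it into the next
      window.  */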
34798 for (insn = start; insn; insn = NEXT_INSN (insn))
34800 int min_size;
34802 if (LABEL_P (insn))
34804 int align = label_to_alignment (insn);
34805 int max_skip = label_to_max_skip (insn);
34807 if (max_skip > 15)
34808 max_skip = 15;
34809          /* If align > 3, only up to 16 - max_skip - 1 bytes can be
34810             already in the current 16-byte window, because otherwise
34811             ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
34812             bytes to reach a 16-byte boundary.  */
34813 if (align <= 0
34814 || (align <= 3 && max_skip != (1 << align) - 1))
34815 max_skip = 0;
34816 if (dump_file)
34817 fprintf (dump_file, "Label %i with max_skip %i\n",
34818 INSN_UID (insn), max_skip);
34819 if (max_skip)
34821 while (nbytes + max_skip >= 16)
34823 start = NEXT_INSN (start);
34824 if ((JUMP_P (start)
34825 && GET_CODE (PATTERN (start)) != ADDR_VEC
34826 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34827 || CALL_P (start))
34828 njumps--, isjump = 1;
34829 else
34830 isjump = 0;
34831 nbytes -= min_insn_size (start);
34834 continue;
34837 min_size = min_insn_size (insn);
34838 nbytes += min_size;
34839 if (dump_file)
34840 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
34841 INSN_UID (insn), min_size);
34842 if ((JUMP_P (insn)
34843 && GET_CODE (PATTERN (insn)) != ADDR_VEC
34844 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
34845 || CALL_P (insn))
34846 njumps++;
34847 else
34848 continue;
34850 while (njumps > 3)
34852 start = NEXT_INSN (start);
34853 if ((JUMP_P (start)
34854 && GET_CODE (PATTERN (start)) != ADDR_VEC
34855 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34856 || CALL_P (start))
34857 njumps--, isjump = 1;
34858 else
34859 isjump = 0;
34860 nbytes -= min_insn_size (start);
34862 gcc_assert (njumps >= 0);
34863 if (dump_file)
34864 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
34865 INSN_UID (start), INSN_UID (insn), nbytes);
34867 if (njumps == 3 && isjump && nbytes < 16)
34869 int padsize = 15 - nbytes + min_insn_size (insn);
34871 if (dump_file)
34872 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
34873 INSN_UID (insn), padsize);
34874 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
34878 #endif
34880 /* The AMD Athlon works faster
34881    when RET is not the destination of a conditional jump or directly preceded
34882    by another jump instruction.  We avoid the penalty by inserting a NOP just
34883    before the RET instruction in such cases.  */
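/* Note: gen_simple_return_internal_long, used as the replacement below, is
   presumably the pattern that assembles to the two-byte `rep ret' idiom
   recommended for this situation; that detail is an assumption here, not
   something visible in this file.  */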
34884 static void
34885 ix86_pad_returns (void)
34887 edge e;
34888 edge_iterator ei;
34890 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34892 basic_block bb = e->src;
34893 rtx ret = BB_END (bb);
34894 rtx prev;
34895 bool replace = false;
34897 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
34898 || optimize_bb_for_size_p (bb))
34899 continue;
34900 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
34901 if (active_insn_p (prev) || LABEL_P (prev))
34902 break;
34903 if (prev && LABEL_P (prev))
34905 edge e;
34906 edge_iterator ei;
34908 FOR_EACH_EDGE (e, ei, bb->preds)
34909 if (EDGE_FREQUENCY (e) && e->src->index >= 0
34910 && !(e->flags & EDGE_FALLTHRU))
34911 replace = true;
34913 if (!replace)
34915 prev = prev_active_insn (ret);
34916 if (prev
34917 && ((JUMP_P (prev) && any_condjump_p (prev))
34918 || CALL_P (prev)))
34919 replace = true;
34920          /* Empty functions get a branch mispredict even when
34921             the jump destination is not visible to us.  */
34922 if (!prev && !optimize_function_for_size_p (cfun))
34923 replace = true;
34925 if (replace)
34927 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
34928 delete_insn (ret);
34933 /* Count the minimum number of instructions in BB. Return 4 if the
34934 number of instructions >= 4. */
34936 static int
34937 ix86_count_insn_bb (basic_block bb)
34939 rtx insn;
34940 int insn_count = 0;
34942 /* Count number of instructions in this block. Return 4 if the number
34943 of instructions >= 4. */
34944 FOR_BB_INSNS (bb, insn)
34946       /* This only happens in exit blocks.  */
34947 if (JUMP_P (insn)
34948 && ANY_RETURN_P (PATTERN (insn)))
34949 break;
34951 if (NONDEBUG_INSN_P (insn)
34952 && GET_CODE (PATTERN (insn)) != USE
34953 && GET_CODE (PATTERN (insn)) != CLOBBER)
34955 insn_count++;
34956 if (insn_count >= 4)
34957 return insn_count;
34961 return insn_count;
34965 /* Count the minimum number of instructions in a code path ending in BB.
34966    Return 4 if the number of instructions >= 4.  */
34968 static int
34969 ix86_count_insn (basic_block bb)
34971 edge e;
34972 edge_iterator ei;
34973 int min_prev_count;
34975 /* Only bother counting instructions along paths with no
34976 more than 2 basic blocks between entry and exit. Given
34977 that BB has an edge to exit, determine if a predecessor
34978 of BB has an edge from entry. If so, compute the number
34979 of instructions in the predecessor block. If there
34980 happen to be multiple such blocks, compute the minimum. */
34981 min_prev_count = 4;
34982 FOR_EACH_EDGE (e, ei, bb->preds)
34984 edge prev_e;
34985 edge_iterator prev_ei;
34987 if (e->src == ENTRY_BLOCK_PTR)
34989 min_prev_count = 0;
34990 break;
34992 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
34994 if (prev_e->src == ENTRY_BLOCK_PTR)
34996 int count = ix86_count_insn_bb (e->src);
34997 if (count < min_prev_count)
34998 min_prev_count = count;
34999 break;
35004 if (min_prev_count < 4)
35005 min_prev_count += ix86_count_insn_bb (bb);
35007 return min_prev_count;
35010 /* Pad short function to 4 instructions. */
35012 static void
35013 ix86_pad_short_function (void)
35015 edge e;
35016 edge_iterator ei;
35018 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35020 rtx ret = BB_END (e->src);
35021 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35023 int insn_count = ix86_count_insn (e->src);
35025 /* Pad short function. */
35026 if (insn_count < 4)
35028 rtx insn = ret;
35030 /* Find epilogue. */
35031 while (insn
35032 && (!NOTE_P (insn)
35033 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35034 insn = PREV_INSN (insn);
35036 if (!insn)
35037 insn = ret;
35039 /* Two NOPs count as one instruction. */
35040 insn_count = 2 * (4 - insn_count);
35041 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35047 /* Implement machine-specific optimizations.  We implement padding of returns
35048    for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
35049 static void
35050 ix86_reorg (void)
35052 /* We are freeing block_for_insn in the toplev to keep compatibility
35053 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35054 compute_bb_for_insn ();
35056 if (optimize && optimize_function_for_speed_p (cfun))
35058 if (TARGET_PAD_SHORT_FUNCTION)
35059 ix86_pad_short_function ();
35060 else if (TARGET_PAD_RETURNS)
35061 ix86_pad_returns ();
35062 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35063 if (TARGET_FOUR_JUMP_LIMIT)
35064 ix86_avoid_jump_mispredicts ();
35065 #endif
35069 /* Return nonzero when QImode register that must be represented via REX prefix
35070 is used. */
35071 bool
35072 x86_extended_QIreg_mentioned_p (rtx insn)
35074 int i;
35075 extract_insn_cached (insn);
35076 for (i = 0; i < recog_data.n_operands; i++)
35077 if (GENERAL_REG_P (recog_data.operand[i])
35078 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35079 return true;
35080 return false;
35083 /* Return nonzero when P points to register encoded via REX prefix.
35084 Called via for_each_rtx. */
35085 static int
35086 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35088 unsigned int regno;
35089 if (!REG_P (*p))
35090 return 0;
35091 regno = REGNO (*p);
35092 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35095 /* Return true when INSN mentions register that must be encoded using REX
35096 prefix. */
35097 bool
35098 x86_extended_reg_mentioned_p (rtx insn)
35100 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35101 extended_reg_mentioned_1, NULL);
35104 /* If profitable, negate (without causing overflow) integer constant
35105 of mode MODE at location LOC. Return true in this case. */
35106 bool
35107 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35109 HOST_WIDE_INT val;
35111 if (!CONST_INT_P (*loc))
35112 return false;
35114 switch (mode)
35116 case DImode:
35117 /* DImode x86_64 constants must fit in 32 bits. */
35118 gcc_assert (x86_64_immediate_operand (*loc, mode));
35120 mode = SImode;
35121 break;
35123 case SImode:
35124 case HImode:
35125 case QImode:
35126 break;
35128 default:
35129 gcc_unreachable ();
35132 /* Avoid overflows. */
35133 if (mode_signbit_p (mode, *loc))
35134 return false;
35136 val = INTVAL (*loc);
35138 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35139 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
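  /* For example: an immediate of 128 needs a 32-bit encoding as an add but
     fits the sign-extended 8-bit form once negated (`subl $-128, %eax'),
     while an immediate of -128 already fits imm8 and must not be turned
     into +128.  Hence the special cases for 128 and -128 below.  */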
35140 if ((val < 0 && val != -128)
35141 || val == 128)
35143 *loc = GEN_INT (-val);
35144 return true;
35147 return false;
35150 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35151 optabs would emit if we didn't have TFmode patterns. */
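/* Sketch of the expansion: when IN >= 0 a plain signed conversion is
   correct.  Otherwise IN is halved with the low bit folded back in
   ((IN >> 1) | (IN & 1), a round-to-odd step so that the final rounding
   is not affected), that value is converted as signed, and the result is
   doubled via f0 + f0.  */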
35153 void
35154 x86_emit_floatuns (rtx operands[2])
35156 rtx neglab, donelab, i0, i1, f0, in, out;
35157 enum machine_mode mode, inmode;
35159 inmode = GET_MODE (operands[1]);
35160 gcc_assert (inmode == SImode || inmode == DImode);
35162 out = operands[0];
35163 in = force_reg (inmode, operands[1]);
35164 mode = GET_MODE (out);
35165 neglab = gen_label_rtx ();
35166 donelab = gen_label_rtx ();
35167 f0 = gen_reg_rtx (mode);
35169 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35171 expand_float (out, in, 0);
35173 emit_jump_insn (gen_jump (donelab));
35174 emit_barrier ();
35176 emit_label (neglab);
35178 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35179 1, OPTAB_DIRECT);
35180 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35181 1, OPTAB_DIRECT);
35182 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35184 expand_float (f0, i0, 0);
35186 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35188 emit_label (donelab);
35191 /* AVX2 does support 32-byte integer vector operations,
35192 thus the longest vector we are faced with is V32QImode. */
35193 #define MAX_VECT_LEN 32
35195 struct expand_vec_perm_d
35197 rtx target, op0, op1;
35198 unsigned char perm[MAX_VECT_LEN];
35199 enum machine_mode vmode;
35200 unsigned char nelt;
35201 bool one_operand_p;
35202 bool testing_p;
35205 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35206 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35207 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35209 /* Get a vector mode of the same size as the original but with elements
35210 twice as wide. This is only guaranteed to apply to integral vectors. */
35212 static inline enum machine_mode
35213 get_mode_wider_vector (enum machine_mode o)
35215 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35216 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35217 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35218 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35219 return n;
35222 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35223 with all elements equal to VAR. Return true if successful. */
35225 static bool
35226 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35227 rtx target, rtx val)
35229 bool ok;
35231 switch (mode)
35233 case V2SImode:
35234 case V2SFmode:
35235 if (!mmx_ok)
35236 return false;
35237 /* FALLTHRU */
35239 case V4DFmode:
35240 case V4DImode:
35241 case V8SFmode:
35242 case V8SImode:
35243 case V2DFmode:
35244 case V2DImode:
35245 case V4SFmode:
35246 case V4SImode:
35248 rtx insn, dup;
35250 /* First attempt to recognize VAL as-is. */
35251 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35252 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35253 if (recog_memoized (insn) < 0)
35255 rtx seq;
35256 /* If that fails, force VAL into a register. */
35258 start_sequence ();
35259 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35260 seq = get_insns ();
35261 end_sequence ();
35262 if (seq)
35263 emit_insn_before (seq, insn);
35265 ok = recog_memoized (insn) >= 0;
35266 gcc_assert (ok);
35269 return true;
35271 case V4HImode:
35272 if (!mmx_ok)
35273 return false;
35274 if (TARGET_SSE || TARGET_3DNOW_A)
35276 rtx x;
35278 val = gen_lowpart (SImode, val);
35279 x = gen_rtx_TRUNCATE (HImode, val);
35280 x = gen_rtx_VEC_DUPLICATE (mode, x);
35281 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35282 return true;
35284 goto widen;
35286 case V8QImode:
35287 if (!mmx_ok)
35288 return false;
35289 goto widen;
35291 case V8HImode:
35292 if (TARGET_SSE2)
35294 struct expand_vec_perm_d dperm;
35295 rtx tmp1, tmp2;
35297 permute:
35298 memset (&dperm, 0, sizeof (dperm));
35299 dperm.target = target;
35300 dperm.vmode = mode;
35301 dperm.nelt = GET_MODE_NUNITS (mode);
35302 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35303 dperm.one_operand_p = true;
35305 /* Extend to SImode using a paradoxical SUBREG. */
35306 tmp1 = gen_reg_rtx (SImode);
35307 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35309 /* Insert the SImode value as low element of a V4SImode vector. */
35310 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35311 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35313 ok = (expand_vec_perm_1 (&dperm)
35314 || expand_vec_perm_broadcast_1 (&dperm));
35315 gcc_assert (ok);
35316 return ok;
35318 goto widen;
35320 case V16QImode:
35321 if (TARGET_SSE2)
35322 goto permute;
35323 goto widen;
35325 widen:
35326 /* Replicate the value once into the next wider mode and recurse. */
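      /* E.g. a V8QImode broadcast of the byte 0xab first builds the HImode
         value 0xabab with the shift/IOR below and then recurses as a
         V4HImode broadcast, until a mode with a native duplicate pattern
         is reached.  */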
35328 enum machine_mode smode, wsmode, wvmode;
35329 rtx x;
35331 smode = GET_MODE_INNER (mode);
35332 wvmode = get_mode_wider_vector (mode);
35333 wsmode = GET_MODE_INNER (wvmode);
35335 val = convert_modes (wsmode, smode, val, true);
35336 x = expand_simple_binop (wsmode, ASHIFT, val,
35337 GEN_INT (GET_MODE_BITSIZE (smode)),
35338 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35339 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35341 x = gen_lowpart (wvmode, target);
35342 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35343 gcc_assert (ok);
35344 return ok;
35347 case V16HImode:
35348 case V32QImode:
35350 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35351 rtx x = gen_reg_rtx (hvmode);
35353 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35354 gcc_assert (ok);
35356 x = gen_rtx_VEC_CONCAT (mode, x, x);
35357 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35359 return true;
35361 default:
35362 return false;
35366 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35367 whose ONE_VAR element is VAR, and other elements are zero. Return true
35368 if successful. */
35370 static bool
35371 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35372 rtx target, rtx var, int one_var)
35374 enum machine_mode vsimode;
35375 rtx new_target;
35376 rtx x, tmp;
35377 bool use_vector_set = false;
35379 switch (mode)
35381 case V2DImode:
35382 /* For SSE4.1, we normally use vector set. But if the second
35383 element is zero and inter-unit moves are OK, we use movq
35384 instead. */
35385 use_vector_set = (TARGET_64BIT
35386 && TARGET_SSE4_1
35387 && !(TARGET_INTER_UNIT_MOVES
35388 && one_var == 0));
35389 break;
35390 case V16QImode:
35391 case V4SImode:
35392 case V4SFmode:
35393 use_vector_set = TARGET_SSE4_1;
35394 break;
35395 case V8HImode:
35396 use_vector_set = TARGET_SSE2;
35397 break;
35398 case V4HImode:
35399 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35400 break;
35401 case V32QImode:
35402 case V16HImode:
35403 case V8SImode:
35404 case V8SFmode:
35405 case V4DFmode:
35406 use_vector_set = TARGET_AVX;
35407 break;
35408 case V4DImode:
35409 /* Use ix86_expand_vector_set in 64bit mode only. */
35410 use_vector_set = TARGET_AVX && TARGET_64BIT;
35411 break;
35412 default:
35413 break;
35416 if (use_vector_set)
35418 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35419 var = force_reg (GET_MODE_INNER (mode), var);
35420 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35421 return true;
35424 switch (mode)
35426 case V2SFmode:
35427 case V2SImode:
35428 if (!mmx_ok)
35429 return false;
35430 /* FALLTHRU */
35432 case V2DFmode:
35433 case V2DImode:
35434 if (one_var != 0)
35435 return false;
35436 var = force_reg (GET_MODE_INNER (mode), var);
35437 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35438 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35439 return true;
35441 case V4SFmode:
35442 case V4SImode:
35443 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35444 new_target = gen_reg_rtx (mode);
35445 else
35446 new_target = target;
35447 var = force_reg (GET_MODE_INNER (mode), var);
35448 x = gen_rtx_VEC_DUPLICATE (mode, var);
35449 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35450 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35451 if (one_var != 0)
35453 /* We need to shuffle the value to the correct position, so
35454 create a new pseudo to store the intermediate result. */
35456 /* With SSE2, we can use the integer shuffle insns. */
35457 if (mode != V4SFmode && TARGET_SSE2)
35459 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35460 const1_rtx,
35461 GEN_INT (one_var == 1 ? 0 : 1),
35462 GEN_INT (one_var == 2 ? 0 : 1),
35463 GEN_INT (one_var == 3 ? 0 : 1)));
35464 if (target != new_target)
35465 emit_move_insn (target, new_target);
35466 return true;
35469 /* Otherwise convert the intermediate result to V4SFmode and
35470 use the SSE1 shuffle instructions. */
35471 if (mode != V4SFmode)
35473 tmp = gen_reg_rtx (V4SFmode);
35474 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35476 else
35477 tmp = new_target;
35479 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35480 const1_rtx,
35481 GEN_INT (one_var == 1 ? 0 : 1),
35482 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35483 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35485 if (mode != V4SFmode)
35486 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35487 else if (tmp != target)
35488 emit_move_insn (target, tmp);
35490 else if (target != new_target)
35491 emit_move_insn (target, new_target);
35492 return true;
35494 case V8HImode:
35495 case V16QImode:
35496 vsimode = V4SImode;
35497 goto widen;
35498 case V4HImode:
35499 case V8QImode:
35500 if (!mmx_ok)
35501 return false;
35502 vsimode = V2SImode;
35503 goto widen;
35504 widen:
35505 if (one_var != 0)
35506 return false;
35508 /* Zero extend the variable element to SImode and recurse. */
35509 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35511 x = gen_reg_rtx (vsimode);
35512 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35513 var, one_var))
35514 gcc_unreachable ();
35516 emit_move_insn (target, gen_lowpart (mode, x));
35517 return true;
35519 default:
35520 return false;
35524 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35525 consisting of the values in VALS. It is known that all elements
35526 except ONE_VAR are constants. Return true if successful. */
35528 static bool
35529 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35530 rtx target, rtx vals, int one_var)
35532 rtx var = XVECEXP (vals, 0, one_var);
35533 enum machine_mode wmode;
35534 rtx const_vec, x;
35536 const_vec = copy_rtx (vals);
35537 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35538 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35540 switch (mode)
35542 case V2DFmode:
35543 case V2DImode:
35544 case V2SFmode:
35545 case V2SImode:
35546 /* For the two element vectors, it's just as easy to use
35547 the general case. */
35548 return false;
35550 case V4DImode:
35551 /* Use ix86_expand_vector_set in 64bit mode only. */
35552 if (!TARGET_64BIT)
35553 return false;
35554 case V4DFmode:
35555 case V8SFmode:
35556 case V8SImode:
35557 case V16HImode:
35558 case V32QImode:
35559 case V4SFmode:
35560 case V4SImode:
35561 case V8HImode:
35562 case V4HImode:
35563 break;
35565 case V16QImode:
35566 if (TARGET_SSE4_1)
35567 break;
35568 wmode = V8HImode;
35569 goto widen;
35570 case V8QImode:
35571 wmode = V4HImode;
35572 goto widen;
35573 widen:
35574 /* There's no way to set one QImode entry easily. Combine
35575 the variable value with its adjacent constant value, and
35576 promote to an HImode set. */
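      /* Concretely: when ONE_VAR is odd the variable byte lands in the high
         half of the HImode word, (var << 8) | adjacent_const; when it is
         even it lands in the low half, (adjacent_const << 8) | var.  */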
35577 x = XVECEXP (vals, 0, one_var ^ 1);
35578 if (one_var & 1)
35580 var = convert_modes (HImode, QImode, var, true);
35581 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35582 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35583 x = GEN_INT (INTVAL (x) & 0xff);
35585 else
35587 var = convert_modes (HImode, QImode, var, true);
35588 x = gen_int_mode (INTVAL (x) << 8, HImode);
35590 if (x != const0_rtx)
35591 var = expand_simple_binop (HImode, IOR, var, x, var,
35592 1, OPTAB_LIB_WIDEN);
35594 x = gen_reg_rtx (wmode);
35595 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35596 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35598 emit_move_insn (target, gen_lowpart (mode, x));
35599 return true;
35601 default:
35602 return false;
35605 emit_move_insn (target, const_vec);
35606 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35607 return true;
35610 /* A subroutine of ix86_expand_vector_init_general. Use vector
35611 concatenate to handle the most general case: all values variable,
35612 and none identical. */
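/* Roughly: for N == 2 the two inputs are VEC_CONCATed directly; for N == 4
   and N == 8 the inputs are first combined pairwise into narrower vectors,
   which are then concatenated recursively into the full-width result.  */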
35614 static void
35615 ix86_expand_vector_init_concat (enum machine_mode mode,
35616 rtx target, rtx *ops, int n)
35618 enum machine_mode cmode, hmode = VOIDmode;
35619 rtx first[8], second[4];
35620 rtvec v;
35621 int i, j;
35623 switch (n)
35625 case 2:
35626 switch (mode)
35628 case V8SImode:
35629 cmode = V4SImode;
35630 break;
35631 case V8SFmode:
35632 cmode = V4SFmode;
35633 break;
35634 case V4DImode:
35635 cmode = V2DImode;
35636 break;
35637 case V4DFmode:
35638 cmode = V2DFmode;
35639 break;
35640 case V4SImode:
35641 cmode = V2SImode;
35642 break;
35643 case V4SFmode:
35644 cmode = V2SFmode;
35645 break;
35646 case V2DImode:
35647 cmode = DImode;
35648 break;
35649 case V2SImode:
35650 cmode = SImode;
35651 break;
35652 case V2DFmode:
35653 cmode = DFmode;
35654 break;
35655 case V2SFmode:
35656 cmode = SFmode;
35657 break;
35658 default:
35659 gcc_unreachable ();
35662 if (!register_operand (ops[1], cmode))
35663 ops[1] = force_reg (cmode, ops[1]);
35664 if (!register_operand (ops[0], cmode))
35665 ops[0] = force_reg (cmode, ops[0]);
35666 emit_insn (gen_rtx_SET (VOIDmode, target,
35667 gen_rtx_VEC_CONCAT (mode, ops[0],
35668 ops[1])));
35669 break;
35671 case 4:
35672 switch (mode)
35674 case V4DImode:
35675 cmode = V2DImode;
35676 break;
35677 case V4DFmode:
35678 cmode = V2DFmode;
35679 break;
35680 case V4SImode:
35681 cmode = V2SImode;
35682 break;
35683 case V4SFmode:
35684 cmode = V2SFmode;
35685 break;
35686 default:
35687 gcc_unreachable ();
35689 goto half;
35691 case 8:
35692 switch (mode)
35694 case V8SImode:
35695 cmode = V2SImode;
35696 hmode = V4SImode;
35697 break;
35698 case V8SFmode:
35699 cmode = V2SFmode;
35700 hmode = V4SFmode;
35701 break;
35702 default:
35703 gcc_unreachable ();
35705 goto half;
35707 half:
35708 /* FIXME: We process inputs backward to help RA. PR 36222. */
35709 i = n - 1;
35710 j = (n >> 1) - 1;
35711 for (; i > 0; i -= 2, j--)
35713 first[j] = gen_reg_rtx (cmode);
35714 v = gen_rtvec (2, ops[i - 1], ops[i]);
35715 ix86_expand_vector_init (false, first[j],
35716 gen_rtx_PARALLEL (cmode, v));
35719 n >>= 1;
35720 if (n > 2)
35722 gcc_assert (hmode != VOIDmode);
35723 for (i = j = 0; i < n; i += 2, j++)
35725 second[j] = gen_reg_rtx (hmode);
35726 ix86_expand_vector_init_concat (hmode, second [j],
35727 &first [i], 2);
35729 n >>= 1;
35730 ix86_expand_vector_init_concat (mode, target, second, n);
35732 else
35733 ix86_expand_vector_init_concat (mode, target, first, n);
35734 break;
35736 default:
35737 gcc_unreachable ();
35741 /* A subroutine of ix86_expand_vector_init_general. Use vector
35742 interleave to handle the most general case: all values variable,
35743 and none identical. */
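/* Roughly: each pair of adjacent scalar elements is first packed into its
   own small vector (one element via a SUBREG move into the low lane, its
   neighbour via a vec_set into the second lane), and those vectors are then
   merged with successively wider interleave-low (punpckl-style) operations
   until a single full-width vector remains.  */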
35745 static void
35746 ix86_expand_vector_init_interleave (enum machine_mode mode,
35747 rtx target, rtx *ops, int n)
35749 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
35750 int i, j;
35751 rtx op0, op1;
35752 rtx (*gen_load_even) (rtx, rtx, rtx);
35753 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
35754 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
35756 switch (mode)
35758 case V8HImode:
35759 gen_load_even = gen_vec_setv8hi;
35760 gen_interleave_first_low = gen_vec_interleave_lowv4si;
35761 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35762 inner_mode = HImode;
35763 first_imode = V4SImode;
35764 second_imode = V2DImode;
35765 third_imode = VOIDmode;
35766 break;
35767 case V16QImode:
35768 gen_load_even = gen_vec_setv16qi;
35769 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
35770 gen_interleave_second_low = gen_vec_interleave_lowv4si;
35771 inner_mode = QImode;
35772 first_imode = V8HImode;
35773 second_imode = V4SImode;
35774 third_imode = V2DImode;
35775 break;
35776 default:
35777 gcc_unreachable ();
35780 for (i = 0; i < n; i++)
35782       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
35783 op0 = gen_reg_rtx (SImode);
35784 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
35786 /* Insert the SImode value as low element of V4SImode vector. */
35787 op1 = gen_reg_rtx (V4SImode);
35788 op0 = gen_rtx_VEC_MERGE (V4SImode,
35789 gen_rtx_VEC_DUPLICATE (V4SImode,
35790 op0),
35791 CONST0_RTX (V4SImode),
35792 const1_rtx);
35793 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
35795       /* Cast the V4SImode vector back to a vector in the original mode.  */
35796 op0 = gen_reg_rtx (mode);
35797 emit_move_insn (op0, gen_lowpart (mode, op1));
35799       /* Load even elements into the second position.  */
35800 emit_insn (gen_load_even (op0,
35801 force_reg (inner_mode,
35802 ops [i + i + 1]),
35803 const1_rtx));
35805 /* Cast vector to FIRST_IMODE vector. */
35806 ops[i] = gen_reg_rtx (first_imode);
35807 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
35810 /* Interleave low FIRST_IMODE vectors. */
35811 for (i = j = 0; i < n; i += 2, j++)
35813 op0 = gen_reg_rtx (first_imode);
35814 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
35816 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
35817 ops[j] = gen_reg_rtx (second_imode);
35818 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
35821 /* Interleave low SECOND_IMODE vectors. */
35822 switch (second_imode)
35824 case V4SImode:
35825 for (i = j = 0; i < n / 2; i += 2, j++)
35827 op0 = gen_reg_rtx (second_imode);
35828 emit_insn (gen_interleave_second_low (op0, ops[i],
35829 ops[i + 1]));
35831 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
35832 vector. */
35833 ops[j] = gen_reg_rtx (third_imode);
35834 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
35836 second_imode = V2DImode;
35837 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35838 /* FALLTHRU */
35840 case V2DImode:
35841 op0 = gen_reg_rtx (second_imode);
35842 emit_insn (gen_interleave_second_low (op0, ops[0],
35843 ops[1]));
35845       /* Cast the SECOND_IMODE vector back to a vector in the original
35846          mode.  */
35847 emit_insn (gen_rtx_SET (VOIDmode, target,
35848 gen_lowpart (mode, op0)));
35849 break;
35851 default:
35852 gcc_unreachable ();
35856 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
35857 all values variable, and none identical. */
35859 static void
35860 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
35861 rtx target, rtx vals)
35863 rtx ops[32], op0, op1;
35864 enum machine_mode half_mode = VOIDmode;
35865 int n, i;
35867 switch (mode)
35869 case V2SFmode:
35870 case V2SImode:
35871 if (!mmx_ok && !TARGET_SSE)
35872 break;
35873 /* FALLTHRU */
35875 case V8SFmode:
35876 case V8SImode:
35877 case V4DFmode:
35878 case V4DImode:
35879 case V4SFmode:
35880 case V4SImode:
35881 case V2DFmode:
35882 case V2DImode:
35883 n = GET_MODE_NUNITS (mode);
35884 for (i = 0; i < n; i++)
35885 ops[i] = XVECEXP (vals, 0, i);
35886 ix86_expand_vector_init_concat (mode, target, ops, n);
35887 return;
35889 case V32QImode:
35890 half_mode = V16QImode;
35891 goto half;
35893 case V16HImode:
35894 half_mode = V8HImode;
35895 goto half;
35897 half:
35898 n = GET_MODE_NUNITS (mode);
35899 for (i = 0; i < n; i++)
35900 ops[i] = XVECEXP (vals, 0, i);
35901 op0 = gen_reg_rtx (half_mode);
35902 op1 = gen_reg_rtx (half_mode);
35903 ix86_expand_vector_init_interleave (half_mode, op0, ops,
35904 n >> 2);
35905 ix86_expand_vector_init_interleave (half_mode, op1,
35906 &ops [n >> 1], n >> 2);
35907 emit_insn (gen_rtx_SET (VOIDmode, target,
35908 gen_rtx_VEC_CONCAT (mode, op0, op1)));
35909 return;
35911 case V16QImode:
35912 if (!TARGET_SSE4_1)
35913 break;
35914 /* FALLTHRU */
35916 case V8HImode:
35917 if (!TARGET_SSE2)
35918 break;
35920 /* Don't use ix86_expand_vector_init_interleave if we can't
35921 move from GPR to SSE register directly. */
35922 if (!TARGET_INTER_UNIT_MOVES)
35923 break;
35925 n = GET_MODE_NUNITS (mode);
35926 for (i = 0; i < n; i++)
35927 ops[i] = XVECEXP (vals, 0, i);
35928 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
35929 return;
35931 case V4HImode:
35932 case V8QImode:
35933 break;
35935 default:
35936 gcc_unreachable ();
35940 int i, j, n_elts, n_words, n_elt_per_word;
35941 enum machine_mode inner_mode;
35942 rtx words[4], shift;
35944 inner_mode = GET_MODE_INNER (mode);
35945 n_elts = GET_MODE_NUNITS (mode);
35946 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
35947 n_elt_per_word = n_elts / n_words;
35948 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
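    /* Illustration (assuming 32-bit words): for V8QImode this gives
       n_words == 2 and n_elt_per_word == 4, and each word is assembled
       below as (((e3 << 8 | e2) << 8 | e1) << 8 | e0), i.e. with the
       lowest-numbered element in the least significant byte.  */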
35950 for (i = 0; i < n_words; ++i)
35952 rtx word = NULL_RTX;
35954 for (j = 0; j < n_elt_per_word; ++j)
35956 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
35957 elt = convert_modes (word_mode, inner_mode, elt, true);
35959 if (j == 0)
35960 word = elt;
35961 else
35963 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
35964 word, 1, OPTAB_LIB_WIDEN);
35965 word = expand_simple_binop (word_mode, IOR, word, elt,
35966 word, 1, OPTAB_LIB_WIDEN);
35970 words[i] = word;
35973 if (n_words == 1)
35974 emit_move_insn (target, gen_lowpart (mode, words[0]));
35975 else if (n_words == 2)
35977 rtx tmp = gen_reg_rtx (mode);
35978 emit_clobber (tmp);
35979 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
35980 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
35981 emit_move_insn (target, tmp);
35983 else if (n_words == 4)
35985 rtx tmp = gen_reg_rtx (V4SImode);
35986 gcc_assert (word_mode == SImode);
35987 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
35988 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
35989 emit_move_insn (target, gen_lowpart (mode, tmp));
35991 else
35992 gcc_unreachable ();
35996 /* Initialize vector TARGET via VALS. Suppress the use of MMX
35997 instructions unless MMX_OK is true. */
35999 void
36000 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36002 enum machine_mode mode = GET_MODE (target);
36003 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36004 int n_elts = GET_MODE_NUNITS (mode);
36005 int n_var = 0, one_var = -1;
36006 bool all_same = true, all_const_zero = true;
36007 int i;
36008 rtx x;
36010 for (i = 0; i < n_elts; ++i)
36012 x = XVECEXP (vals, 0, i);
36013 if (!(CONST_INT_P (x)
36014 || GET_CODE (x) == CONST_DOUBLE
36015 || GET_CODE (x) == CONST_FIXED))
36016 n_var++, one_var = i;
36017 else if (x != CONST0_RTX (inner_mode))
36018 all_const_zero = false;
36019 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36020 all_same = false;
36023 /* Constants are best loaded from the constant pool. */
36024 if (n_var == 0)
36026 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36027 return;
36030 /* If all values are identical, broadcast the value. */
36031 if (all_same
36032 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36033 XVECEXP (vals, 0, 0)))
36034 return;
36036 /* Values where only one field is non-constant are best loaded from
36037 the pool and overwritten via move later. */
36038 if (n_var == 1)
36040 if (all_const_zero
36041 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36042 XVECEXP (vals, 0, one_var),
36043 one_var))
36044 return;
36046 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36047 return;
36050 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36053 void
36054 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36056 enum machine_mode mode = GET_MODE (target);
36057 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36058 enum machine_mode half_mode;
36059 bool use_vec_merge = false;
36060 rtx tmp;
36061 static rtx (*gen_extract[6][2]) (rtx, rtx)
36063 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36064 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36065 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36066 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36067 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36068 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36070 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36072 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36073 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36074 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36075 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36076 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36077 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36079 int i, j, n;
36081 switch (mode)
36083 case V2SFmode:
36084 case V2SImode:
36085 if (mmx_ok)
36087 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36088 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36089 if (elt == 0)
36090 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36091 else
36092 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36093 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36094 return;
36096 break;
36098 case V2DImode:
36099 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36100 if (use_vec_merge)
36101 break;
36103 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36104 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36105 if (elt == 0)
36106 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36107 else
36108 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36109 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36110 return;
36112 case V2DFmode:
36114 rtx op0, op1;
36116 /* For the two element vectors, we implement a VEC_CONCAT with
36117 the extraction of the other element. */
36119 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36120 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36122 if (elt == 0)
36123 op0 = val, op1 = tmp;
36124 else
36125 op0 = tmp, op1 = val;
36127 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36128 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36130 return;
36132 case V4SFmode:
36133 use_vec_merge = TARGET_SSE4_1;
36134 if (use_vec_merge)
36135 break;
36137 switch (elt)
36139 case 0:
36140 use_vec_merge = true;
36141 break;
36143 case 1:
36144 /* tmp = target = A B C D */
36145 tmp = copy_to_reg (target);
36146 /* target = A A B B */
36147 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36148 /* target = X A B B */
36149 ix86_expand_vector_set (false, target, val, 0);
36150 /* target = A X C D */
36151 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36152 const1_rtx, const0_rtx,
36153 GEN_INT (2+4), GEN_INT (3+4)));
36154 return;
36156 case 2:
36157 /* tmp = target = A B C D */
36158 tmp = copy_to_reg (target);
36159 /* tmp = X B C D */
36160 ix86_expand_vector_set (false, tmp, val, 0);
36161 /* target = A B X D */
36162 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36163 const0_rtx, const1_rtx,
36164 GEN_INT (0+4), GEN_INT (3+4)));
36165 return;
36167 case 3:
36168 /* tmp = target = A B C D */
36169 tmp = copy_to_reg (target);
36170 /* tmp = X B C D */
36171 ix86_expand_vector_set (false, tmp, val, 0);
36172        /* target = A B C X */
36173 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36174 const0_rtx, const1_rtx,
36175 GEN_INT (2+4), GEN_INT (0+4)));
36176 return;
36178 default:
36179 gcc_unreachable ();
36181 break;
36183 case V4SImode:
36184 use_vec_merge = TARGET_SSE4_1;
36185 if (use_vec_merge)
36186 break;
36188 /* Element 0 handled by vec_merge below. */
36189 if (elt == 0)
36191 use_vec_merge = true;
36192 break;
36195 if (TARGET_SSE2)
36197 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36198 store into element 0, then shuffle them back. */
36200 rtx order[4];
36202 order[0] = GEN_INT (elt);
36203 order[1] = const1_rtx;
36204 order[2] = const2_rtx;
36205 order[3] = GEN_INT (3);
36206 order[elt] = const0_rtx;
36208 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36209 order[1], order[2], order[3]));
36211 ix86_expand_vector_set (false, target, val, 0);
36213 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36214 order[1], order[2], order[3]));
36216 else
36218 /* For SSE1, we have to reuse the V4SF code. */
36219 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36220 gen_lowpart (SFmode, val), elt);
36222 return;
36224 case V8HImode:
36225 use_vec_merge = TARGET_SSE2;
36226 break;
36227 case V4HImode:
36228 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36229 break;
36231 case V16QImode:
36232 use_vec_merge = TARGET_SSE4_1;
36233 break;
36235 case V8QImode:
36236 break;
36238 case V32QImode:
36239 half_mode = V16QImode;
36240 j = 0;
36241 n = 16;
36242 goto half;
36244 case V16HImode:
36245 half_mode = V8HImode;
36246 j = 1;
36247 n = 8;
36248 goto half;
36250 case V8SImode:
36251 half_mode = V4SImode;
36252 j = 2;
36253 n = 4;
36254 goto half;
36256 case V4DImode:
36257 half_mode = V2DImode;
36258 j = 3;
36259 n = 2;
36260 goto half;
36262 case V8SFmode:
36263 half_mode = V4SFmode;
36264 j = 4;
36265 n = 4;
36266 goto half;
36268 case V4DFmode:
36269 half_mode = V2DFmode;
36270 j = 5;
36271 n = 2;
36272 goto half;
36274 half:
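      /* The 256-bit modes are handled by working on the 128-bit half that
         contains ELT: e.g. for V8SImode, n == 4, so elt == 6 selects the
         high half (i == 1) at position 2 within it.  */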
36275 /* Compute offset. */
36276 i = elt / n;
36277 elt %= n;
36279 gcc_assert (i <= 1);
36281 /* Extract the half. */
36282 tmp = gen_reg_rtx (half_mode);
36283 emit_insn (gen_extract[j][i] (tmp, target));
36285 /* Put val in tmp at elt. */
36286 ix86_expand_vector_set (false, tmp, val, elt);
36288 /* Put it back. */
36289 emit_insn (gen_insert[j][i] (target, target, tmp));
36290 return;
36292 default:
36293 break;
36296 if (use_vec_merge)
36298 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36299 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36300 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36302 else
36304 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36306 emit_move_insn (mem, target);
36308 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36309 emit_move_insn (tmp, val);
36311 emit_move_insn (target, mem);
36315 void
36316 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36318 enum machine_mode mode = GET_MODE (vec);
36319 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36320 bool use_vec_extr = false;
36321 rtx tmp;
36323 switch (mode)
36325 case V2SImode:
36326 case V2SFmode:
36327 if (!mmx_ok)
36328 break;
36329 /* FALLTHRU */
36331 case V2DFmode:
36332 case V2DImode:
36333 use_vec_extr = true;
36334 break;
36336 case V4SFmode:
36337 use_vec_extr = TARGET_SSE4_1;
36338 if (use_vec_extr)
36339 break;
36341 switch (elt)
36343 case 0:
36344 tmp = vec;
36345 break;
36347 case 1:
36348 case 3:
36349 tmp = gen_reg_rtx (mode);
36350 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36351 GEN_INT (elt), GEN_INT (elt),
36352 GEN_INT (elt+4), GEN_INT (elt+4)));
36353 break;
36355 case 2:
36356 tmp = gen_reg_rtx (mode);
36357 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36358 break;
36360 default:
36361 gcc_unreachable ();
36363 vec = tmp;
36364 use_vec_extr = true;
36365 elt = 0;
36366 break;
36368 case V4SImode:
36369 use_vec_extr = TARGET_SSE4_1;
36370 if (use_vec_extr)
36371 break;
36373 if (TARGET_SSE2)
36375 switch (elt)
36377 case 0:
36378 tmp = vec;
36379 break;
36381 case 1:
36382 case 3:
36383 tmp = gen_reg_rtx (mode);
36384 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36385 GEN_INT (elt), GEN_INT (elt),
36386 GEN_INT (elt), GEN_INT (elt)));
36387 break;
36389 case 2:
36390 tmp = gen_reg_rtx (mode);
36391 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36392 break;
36394 default:
36395 gcc_unreachable ();
36397 vec = tmp;
36398 use_vec_extr = true;
36399 elt = 0;
36401 else
36403 /* For SSE1, we have to reuse the V4SF code. */
36404 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36405 gen_lowpart (V4SFmode, vec), elt);
36406 return;
36408 break;
36410 case V8HImode:
36411 use_vec_extr = TARGET_SSE2;
36412 break;
36413 case V4HImode:
36414 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36415 break;
36417 case V16QImode:
36418 use_vec_extr = TARGET_SSE4_1;
36419 break;
36421 case V8SFmode:
36422 if (TARGET_AVX)
36424 tmp = gen_reg_rtx (V4SFmode);
36425 if (elt < 4)
36426 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36427 else
36428 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36429 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36430 return;
36432 break;
36434 case V4DFmode:
36435 if (TARGET_AVX)
36437 tmp = gen_reg_rtx (V2DFmode);
36438 if (elt < 2)
36439 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36440 else
36441 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36442 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36443 return;
36445 break;
36447 case V32QImode:
36448 if (TARGET_AVX)
36450 tmp = gen_reg_rtx (V16QImode);
36451 if (elt < 16)
36452 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36453 else
36454 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36455 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36456 return;
36458 break;
36460 case V16HImode:
36461 if (TARGET_AVX)
36463 tmp = gen_reg_rtx (V8HImode);
36464 if (elt < 8)
36465 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36466 else
36467 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36468 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36469 return;
36471 break;
36473 case V8SImode:
36474 if (TARGET_AVX)
36476 tmp = gen_reg_rtx (V4SImode);
36477 if (elt < 4)
36478 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36479 else
36480 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36481 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36482 return;
36484 break;
36486 case V4DImode:
36487 if (TARGET_AVX)
36489 tmp = gen_reg_rtx (V2DImode);
36490 if (elt < 2)
36491 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36492 else
36493 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36494 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36495 return;
36497 break;
36499 case V8QImode:
36500 /* ??? Could extract the appropriate HImode element and shift. */
36501 default:
36502 break;
36505 if (use_vec_extr)
36507 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36508 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36510 /* Let the rtl optimizers know about the zero extension performed. */
36511 if (inner_mode == QImode || inner_mode == HImode)
36513 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36514 target = gen_lowpart (SImode, target);
36517 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36519 else
36521 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36523 emit_move_insn (mem, vec);
36525 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36526 emit_move_insn (target, tmp);
36530 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36531 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36532 The upper bits of DEST are undefined, though they shouldn't cause
36533 exceptions (some bits from src or all zeros are ok). */
36535 static void
36536 emit_reduc_half (rtx dest, rtx src, int i)
36538 rtx tem;
36539 switch (GET_MODE (src))
36541 case V4SFmode:
36542 if (i == 128)
36543 tem = gen_sse_movhlps (dest, src, src);
36544 else
36545 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36546 GEN_INT (1 + 4), GEN_INT (1 + 4));
36547 break;
36548 case V2DFmode:
36549 tem = gen_vec_interleave_highv2df (dest, src, src);
36550 break;
36551 case V16QImode:
36552 case V8HImode:
36553 case V4SImode:
36554 case V2DImode:
36555 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36556 gen_lowpart (V1TImode, src),
36557 GEN_INT (i / 2));
36558 break;
36559 case V8SFmode:
36560 if (i == 256)
36561 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36562 else
36563 tem = gen_avx_shufps256 (dest, src, src,
36564 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36565 break;
36566 case V4DFmode:
36567 if (i == 256)
36568 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36569 else
36570 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36571 break;
36572 case V32QImode:
36573 case V16HImode:
36574 case V8SImode:
36575 case V4DImode:
36576 if (i == 256)
36577 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36578 gen_lowpart (V4DImode, src),
36579 gen_lowpart (V4DImode, src),
36580 const1_rtx);
36581 else
36582 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36583 gen_lowpart (V2TImode, src),
36584 GEN_INT (i / 2));
36585 break;
36586 default:
36587 gcc_unreachable ();
36589 emit_insn (tem);
36592 /* Expand a vector reduction. FN is the binary pattern to reduce;
36593 DEST is the destination; IN is the input vector. */
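/* The loop below reduces by repeated halving: e.g. for a V4SImode input it
   first combines the upper and lower 64-bit halves (i == 128) and then the
   two remaining 32-bit lanes (i == 64), leaving the reduction result in
   element 0 of DEST.  */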
36595 void
36596 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36598 rtx half, dst, vec = in;
36599 enum machine_mode mode = GET_MODE (in);
36600 int i;
36602   /* SSE4.1 has a special instruction for V8HImode UMIN reduction.  */
36603 if (TARGET_SSE4_1
36604 && mode == V8HImode
36605 && fn == gen_uminv8hi3)
36607 emit_insn (gen_sse4_1_phminposuw (dest, in));
36608 return;
36611 for (i = GET_MODE_BITSIZE (mode);
36612 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36613 i >>= 1)
36615 half = gen_reg_rtx (mode);
36616 emit_reduc_half (half, vec, i);
36617 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
36618 dst = dest;
36619 else
36620 dst = gen_reg_rtx (mode);
36621 emit_insn (fn (dst, half, vec));
36622 vec = dst;
36626 /* Target hook for scalar_mode_supported_p. */
36627 static bool
36628 ix86_scalar_mode_supported_p (enum machine_mode mode)
36630 if (DECIMAL_FLOAT_MODE_P (mode))
36631 return default_decimal_float_supported_p ();
36632 else if (mode == TFmode)
36633 return true;
36634 else
36635 return default_scalar_mode_supported_p (mode);
36638 /* Implements target hook vector_mode_supported_p. */
36639 static bool
36640 ix86_vector_mode_supported_p (enum machine_mode mode)
36642 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36643 return true;
36644 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36645 return true;
36646 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36647 return true;
36648 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36649 return true;
36650 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36651 return true;
36652 return false;
36655 /* Target hook for c_mode_for_suffix. */
36656 static enum machine_mode
36657 ix86_c_mode_for_suffix (char suffix)
36659 if (suffix == 'q')
36660 return TFmode;
36661 if (suffix == 'w')
36662 return XFmode;
36664 return VOIDmode;
36667 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36669 We do this in the new i386 backend to maintain source compatibility
36670 with the old cc0-based compiler. */
36672 static tree
36673 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36674 tree inputs ATTRIBUTE_UNUSED,
36675 tree clobbers)
36677 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36678 clobbers);
36679 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36680 clobbers);
36681 return clobbers;
36684 /* Implements the target hook targetm.encode_section_info.  */
36686 static void ATTRIBUTE_UNUSED
36687 ix86_encode_section_info (tree decl, rtx rtl, int first)
36689 default_encode_section_info (decl, rtl, first);
36691 if (TREE_CODE (decl) == VAR_DECL
36692 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
36693 && ix86_in_large_data_p (decl))
36694 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
36697 /* Worker function for REVERSE_CONDITION. */
36699 enum rtx_code
36700 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
36702 return (mode != CCFPmode && mode != CCFPUmode
36703 ? reverse_condition (code)
36704 : reverse_condition_maybe_unordered (code));
36707 /* Output code to perform an x87 FP register move, from OPERANDS[1]
36708 to OPERANDS[0]. */
36710 const char *
36711 output_387_reg_move (rtx insn, rtx *operands)
36713 if (REG_P (operands[0]))
36715 if (REG_P (operands[1])
36716 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36718 if (REGNO (operands[0]) == FIRST_STACK_REG)
36719 return output_387_ffreep (operands, 0);
36720 return "fstp\t%y0";
36722 if (STACK_TOP_P (operands[0]))
36723 return "fld%Z1\t%y1";
36724 return "fst\t%y0";
36726 else if (MEM_P (operands[0]))
36728 gcc_assert (REG_P (operands[1]));
36729 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36730 return "fstp%Z0\t%y0";
36731 else
36733 /* There is no non-popping store to memory for XFmode.
36734 So if we need one, follow the store with a load. */
36735 if (GET_MODE (operands[0]) == XFmode)
36736 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
36737 else
36738 return "fst%Z0\t%y0";
36741 else
36742 gcc_unreachable();
36745 /* Output code to perform a conditional jump to LABEL, if C2 flag in
36746 FP status register is set. */
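/* Roughly: the status word is fetched with FNSTSW.  When SAHF is usable its
   high byte is copied into EFLAGS, where C2 shows up as PF, hence the
   UNORDERED test; otherwise the TEST against 0x04 picks C2 (bit 10 of the
   status word, bit 2 of its high byte) directly.  */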
36748 void
36749 ix86_emit_fp_unordered_jump (rtx label)
36751 rtx reg = gen_reg_rtx (HImode);
36752 rtx temp;
36754 emit_insn (gen_x86_fnstsw_1 (reg));
36756 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
36758 emit_insn (gen_x86_sahf_1 (reg));
36760 temp = gen_rtx_REG (CCmode, FLAGS_REG);
36761 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
36763 else
36765 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
36767 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
36768 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
36771 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
36772 gen_rtx_LABEL_REF (VOIDmode, label),
36773 pc_rtx);
36774 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
36776 emit_jump_insn (temp);
36777 predict_jump (REG_BR_PROB_BASE * 10 / 100);
36780 /* Output code to perform a log1p XFmode calculation. */
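/* Roughly: log1p(x) is computed as ln(2) * log2(1 + x).  For small |x|
   (below 1 - sqrt(2)/2, the 0.2928... constant used below) fyl2xp1 is used,
   as it evaluates log2(1 + x) accurately near zero; for larger |x| the
   plain fyl2x on 1 + x is sufficient.  */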
36782 void ix86_emit_i387_log1p (rtx op0, rtx op1)
36784 rtx label1 = gen_label_rtx ();
36785 rtx label2 = gen_label_rtx ();
36787 rtx tmp = gen_reg_rtx (XFmode);
36788 rtx tmp2 = gen_reg_rtx (XFmode);
36789 rtx test;
36791 emit_insn (gen_absxf2 (tmp, op1));
36792 test = gen_rtx_GE (VOIDmode, tmp,
36793 CONST_DOUBLE_FROM_REAL_VALUE (
36794 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
36795 XFmode));
36796 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
36798 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36799 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
36800 emit_jump (label2);
36802 emit_label (label1);
36803 emit_move_insn (tmp, CONST1_RTX (XFmode));
36804 emit_insn (gen_addxf3 (tmp, op1, tmp));
36805 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36806 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
36808 emit_label (label2);
36811 /* Emit code for round calculation. */
36812 void ix86_emit_i387_round (rtx op0, rtx op1)
36814 enum machine_mode inmode = GET_MODE (op1);
36815 enum machine_mode outmode = GET_MODE (op0);
36816 rtx e1, e2, res, tmp, tmp1, half;
36817 rtx scratch = gen_reg_rtx (HImode);
36818 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
36819 rtx jump_label = gen_label_rtx ();
36820 rtx insn;
36821 rtx (*gen_abs) (rtx, rtx);
36822 rtx (*gen_neg) (rtx, rtx);
36824 switch (inmode)
36826 case SFmode:
36827 gen_abs = gen_abssf2;
36828 break;
36829 case DFmode:
36830 gen_abs = gen_absdf2;
36831 break;
36832 case XFmode:
36833 gen_abs = gen_absxf2;
36834 break;
36835 default:
36836 gcc_unreachable ();
36839 switch (outmode)
36841 case SFmode:
36842 gen_neg = gen_negsf2;
36843 break;
36844 case DFmode:
36845 gen_neg = gen_negdf2;
36846 break;
36847 case XFmode:
36848 gen_neg = gen_negxf2;
36849 break;
36850 case HImode:
36851 gen_neg = gen_neghi2;
36852 break;
36853 case SImode:
36854 gen_neg = gen_negsi2;
36855 break;
36856 case DImode:
36857 gen_neg = gen_negdi2;
36858 break;
36859 default:
36860 gcc_unreachable ();
36863 e1 = gen_reg_rtx (inmode);
36864 e2 = gen_reg_rtx (inmode);
36865 res = gen_reg_rtx (outmode);
36867 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
36869 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
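/* Informal example: for op1 = -2.3, fabs gives 2.3, adding 0.5 gives 2.8,
   floor gives 2.0, and the sign test below negates the result to -2.0.
   Halfway cases such as 2.5 round away from zero (floor (3.0) = 3.0).  */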
36871 /* scratch = fxam(op1) */
36872 emit_insn (gen_rtx_SET (VOIDmode, scratch,
36873 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
36874 UNSPEC_FXAM)));
36875 /* e1 = fabs(op1) */
36876 emit_insn (gen_abs (e1, op1));
36878 /* e2 = e1 + 0.5 */
36879 half = force_reg (inmode, half);
36880 emit_insn (gen_rtx_SET (VOIDmode, e2,
36881 gen_rtx_PLUS (inmode, e1, half)));
36883 /* res = floor(e2) */
36884 if (inmode != XFmode)
36886 tmp1 = gen_reg_rtx (XFmode);
36888 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
36889 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
36891 else
36892 tmp1 = e2;
36894 switch (outmode)
36896 case SFmode:
36897 case DFmode:
36899 rtx tmp0 = gen_reg_rtx (XFmode);
36901 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
36903 emit_insn (gen_rtx_SET (VOIDmode, res,
36904 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
36905 UNSPEC_TRUNC_NOOP)));
36907 break;
36908 case XFmode:
36909 emit_insn (gen_frndintxf2_floor (res, tmp1));
36910 break;
36911 case HImode:
36912 emit_insn (gen_lfloorxfhi2 (res, tmp1));
36913 break;
36914 case SImode:
36915 emit_insn (gen_lfloorxfsi2 (res, tmp1));
36916 break;
36917 case DImode:
36918 emit_insn (gen_lfloorxfdi2 (res, tmp1));
36919 break;
36920 default:
36921 gcc_unreachable ();
36924 /* flags = signbit(a) */
36925 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
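/* Bit 1 (mask 0x02) of the fxam result is the C1 condition flag, which holds
   the sign of the examined operand; the EQ branch below therefore skips the
   negation for non-negative inputs.  */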
36927 /* if (flags) then res = -res */
36928 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
36929 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
36930 gen_rtx_LABEL_REF (VOIDmode, jump_label),
36931 pc_rtx);
36932 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36933 predict_jump (REG_BR_PROB_BASE * 50 / 100);
36934 JUMP_LABEL (insn) = jump_label;
36936 emit_insn (gen_neg (res, res));
36938 emit_label (jump_label);
36939 LABEL_NUSES (jump_label) = 1;
36941 emit_move_insn (op0, res);
36944 /* Output code to perform a Newton-Raphson approximation of a single precision
36945 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
36947 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
36949 rtx x0, x1, e0, e1;
36951 x0 = gen_reg_rtx (mode);
36952 e0 = gen_reg_rtx (mode);
36953 e1 = gen_reg_rtx (mode);
36954 x1 = gen_reg_rtx (mode);
36956 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
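/* This is one Newton-Raphson refinement step: with x0 = rcp(b), the value
   (x0 + x0) - (b * x0 * x0) equals x0 * (2 - b * x0), the standard iteration
   for 1/b, factored here into adds and multiplies on the estimate.  */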
36958 b = force_reg (mode, b);
36960 /* x0 = rcp(b) estimate */
36961 emit_insn (gen_rtx_SET (VOIDmode, x0,
36962 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
36963 UNSPEC_RCP)));
36964 /* e0 = x0 * b */
36965 emit_insn (gen_rtx_SET (VOIDmode, e0,
36966 gen_rtx_MULT (mode, x0, b)));
36968 /* e0 = x0 * e0 */
36969 emit_insn (gen_rtx_SET (VOIDmode, e0,
36970 gen_rtx_MULT (mode, x0, e0)));
36972 /* e1 = x0 + x0 */
36973 emit_insn (gen_rtx_SET (VOIDmode, e1,
36974 gen_rtx_PLUS (mode, x0, x0)));
36976 /* x1 = e1 - e0 */
36977 emit_insn (gen_rtx_SET (VOIDmode, x1,
36978 gen_rtx_MINUS (mode, e1, e0)));
36980 /* res = a * x1 */
36981 emit_insn (gen_rtx_SET (VOIDmode, res,
36982 gen_rtx_MULT (mode, a, x1)));
36985 /* Output code to perform a Newton-Raphson approximation of a
36986 single precision floating point [reciprocal] square root. */
36988 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
36989 bool recip)
36991 rtx x0, e0, e1, e2, e3, mthree, mhalf;
36992 REAL_VALUE_TYPE r;
36994 x0 = gen_reg_rtx (mode);
36995 e0 = gen_reg_rtx (mode);
36996 e1 = gen_reg_rtx (mode);
36997 e2 = gen_reg_rtx (mode);
36998 e3 = gen_reg_rtx (mode);
37000 real_from_integer (&r, VOIDmode, -3, -1, 0);
37001 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37003 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37004 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37006 if (VECTOR_MODE_P (mode))
37008 mthree = ix86_build_const_vector (mode, true, mthree);
37009 mhalf = ix86_build_const_vector (mode, true, mhalf);
37012 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37013 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
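/* Both forms are one Newton-Raphson step x1 = 0.5 * x0 * (3 - a * x0 * x0)
   with x0 = rsqrtss(a), written with a -0.5 factor so that the same
   e2 = a * x0 * x0 - 3 term serves both cases: multiply by a * x0 for sqrt,
   or by x0 for rsqrt.  */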
37015 a = force_reg (mode, a);
37017 /* x0 = rsqrt(a) estimate */
37018 emit_insn (gen_rtx_SET (VOIDmode, x0,
37019 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37020 UNSPEC_RSQRT)));
37022 /* If a == 0.0, filter out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
37023 if (!recip)
37025 rtx zero, mask;
37027 zero = gen_reg_rtx (mode);
37028 mask = gen_reg_rtx (mode);
37030 zero = force_reg (mode, CONST0_RTX(mode));
37031 emit_insn (gen_rtx_SET (VOIDmode, mask,
37032 gen_rtx_NE (mode, zero, a)));
37034 emit_insn (gen_rtx_SET (VOIDmode, x0,
37035 gen_rtx_AND (mode, x0, mask)));
37038 /* e0 = x0 * a */
37039 emit_insn (gen_rtx_SET (VOIDmode, e0,
37040 gen_rtx_MULT (mode, x0, a)));
37041 /* e1 = e0 * x0 */
37042 emit_insn (gen_rtx_SET (VOIDmode, e1,
37043 gen_rtx_MULT (mode, e0, x0)));
37045 /* e2 = e1 - 3. */
37046 mthree = force_reg (mode, mthree);
37047 emit_insn (gen_rtx_SET (VOIDmode, e2,
37048 gen_rtx_PLUS (mode, e1, mthree)));
37050 mhalf = force_reg (mode, mhalf);
37051 if (recip)
37052 /* e3 = -.5 * x0 */
37053 emit_insn (gen_rtx_SET (VOIDmode, e3,
37054 gen_rtx_MULT (mode, x0, mhalf)));
37055 else
37056 /* e3 = -.5 * e0 */
37057 emit_insn (gen_rtx_SET (VOIDmode, e3,
37058 gen_rtx_MULT (mode, e0, mhalf)));
37059 /* ret = e2 * e3 */
37060 emit_insn (gen_rtx_SET (VOIDmode, res,
37061 gen_rtx_MULT (mode, e2, e3)));
37064 #ifdef TARGET_SOLARIS
37065 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37067 static void
37068 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37069 tree decl)
37071 /* With Binutils 2.15, the "@unwind" marker must be specified on
37072 every occurrence of the ".eh_frame" section, not just the first
37073 one. */
37074 if (TARGET_64BIT
37075 && strcmp (name, ".eh_frame") == 0)
37077 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37078 flags & SECTION_WRITE ? "aw" : "a");
37079 return;
37082 #ifndef USE_GAS
37083 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37085 solaris_elf_asm_comdat_section (name, flags, decl);
37086 return;
37088 #endif
37090 default_elf_asm_named_section (name, flags, decl);
37092 #endif /* TARGET_SOLARIS */
37094 /* Return the mangling of TYPE if it is an extended fundamental type. */
37096 static const char *
37097 ix86_mangle_type (const_tree type)
37099 type = TYPE_MAIN_VARIANT (type);
37101 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37102 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37103 return NULL;
37105 switch (TYPE_MODE (type))
37107 case TFmode:
37108 /* __float128 is "g". */
37109 return "g";
37110 case XFmode:
37111 /* "long double" or __float80 is "e". */
37112 return "e";
37113 default:
37114 return NULL;
37118 /* For 32-bit code we can save PIC register setup by using the
37119 __stack_chk_fail_local hidden function instead of calling
37120 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37121 register, so it is better to call __stack_chk_fail directly. */
37123 static tree ATTRIBUTE_UNUSED
37124 ix86_stack_protect_fail (void)
37126 return TARGET_64BIT
37127 ? default_external_stack_protect_fail ()
37128 : default_hidden_stack_protect_fail ();
37131 /* Select a format to encode pointers in exception handling data. CODE
37132 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37133 true if the symbol may be affected by dynamic relocations.
37135 ??? All x86 object file formats are capable of representing this.
37136 After all, the relocation needed is the same as for the call insn.
37137 Whether or not a particular assembler allows us to enter such, I
37138 guess we'll have to see. */
37139 int
37140 asm_preferred_eh_data_format (int code, int global)
37142 if (flag_pic)
37144 int type = DW_EH_PE_sdata8;
37145 if (!TARGET_64BIT
37146 || ix86_cmodel == CM_SMALL_PIC
37147 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37148 type = DW_EH_PE_sdata4;
37149 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37151 if (ix86_cmodel == CM_SMALL
37152 || (ix86_cmodel == CM_MEDIUM && code))
37153 return DW_EH_PE_udata4;
37154 return DW_EH_PE_absptr;
37157 /* Expand copysign from SIGN to the positive value ABS_VALUE
37158 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37159 the sign-bit. */
37160 static void
37161 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37163 enum machine_mode mode = GET_MODE (sign);
37164 rtx sgn = gen_reg_rtx (mode);
37165 if (mask == NULL_RTX)
37167 enum machine_mode vmode;
37169 if (mode == SFmode)
37170 vmode = V4SFmode;
37171 else if (mode == DFmode)
37172 vmode = V2DFmode;
37173 else
37174 vmode = mode;
37176 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37177 if (!VECTOR_MODE_P (mode))
37179 /* We need to generate a scalar mode mask in this case. */
37180 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37181 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37182 mask = gen_reg_rtx (mode);
37183 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37186 else
37187 mask = gen_rtx_NOT (mode, mask);
37188 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37189 gen_rtx_AND (mode, mask, sign)));
37190 emit_insn (gen_rtx_SET (VOIDmode, result,
37191 gen_rtx_IOR (mode, abs_value, sgn)));
37194 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37195 mask for masking out the sign-bit is stored in *SMASK, if that is
37196 non-null. */
37197 static rtx
37198 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37200 enum machine_mode vmode, mode = GET_MODE (op0);
37201 rtx xa, mask;
37203 xa = gen_reg_rtx (mode);
37204 if (mode == SFmode)
37205 vmode = V4SFmode;
37206 else if (mode == DFmode)
37207 vmode = V2DFmode;
37208 else
37209 vmode = mode;
37210 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37211 if (!VECTOR_MODE_P (mode))
37213 /* We need to generate a scalar mode mask in this case. */
37214 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37215 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37216 mask = gen_reg_rtx (mode);
37217 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37219 emit_insn (gen_rtx_SET (VOIDmode, xa,
37220 gen_rtx_AND (mode, op0, mask)));
37222 if (smask)
37223 *smask = mask;
37225 return xa;
37228 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37229 swapping the operands if SWAP_OPERANDS is true. The expanded
37230 code is a forward jump to a newly created label in case the
37231 comparison is true. The generated label rtx is returned. */
37232 static rtx
37233 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37234 bool swap_operands)
37236 rtx label, tmp;
37238 if (swap_operands)
37240 tmp = op0;
37241 op0 = op1;
37242 op1 = tmp;
37245 label = gen_label_rtx ();
37246 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37247 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37248 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37249 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37250 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37251 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37252 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37253 JUMP_LABEL (tmp) = label;
37255 return label;
37258 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37259 using comparison code CODE. Operands are swapped for the comparison if
37260 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37261 static rtx
37262 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37263 bool swap_operands)
37265 rtx (*insn)(rtx, rtx, rtx, rtx);
37266 enum machine_mode mode = GET_MODE (op0);
37267 rtx mask = gen_reg_rtx (mode);
37269 if (swap_operands)
37271 rtx tmp = op0;
37272 op0 = op1;
37273 op1 = tmp;
37276 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37278 emit_insn (insn (mask, op0, op1,
37279 gen_rtx_fmt_ee (code, mode, op0, op1)));
37280 return mask;
37283 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37284 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37285 static rtx
37286 ix86_gen_TWO52 (enum machine_mode mode)
37288 REAL_VALUE_TYPE TWO52r;
37289 rtx TWO52;
37291 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37292 TWO52 = const_double_from_real_value (TWO52r, mode);
37293 TWO52 = force_reg (mode, TWO52);
37295 return TWO52;
37298 /* Expand SSE sequence for computing lround from OP1 storing
37299 into OP0. */
37300 void
37301 ix86_expand_lround (rtx op0, rtx op1)
37303 /* C code for the stuff we're doing below:
37304 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37305 return (long)tmp;
37307 enum machine_mode mode = GET_MODE (op1);
37308 const struct real_format *fmt;
37309 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37310 rtx adj;
37312 /* load nextafter (0.5, 0.0) */
37313 fmt = REAL_MODE_FORMAT (mode);
37314 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37315 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
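/* pred_half is 0.5 - 2**(-p-1), i.e. nextafter (0.5, 0.0), the largest
   representable value below 0.5.  Using it instead of 0.5 presumably keeps
   inputs just under a halfway point from being pushed up to the next
   integer by rounding in the addition below.  */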
37317 /* adj = copysign (0.5, op1) */
37318 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37319 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37321 /* adj = op1 + adj */
37322 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37324 /* op0 = (imode)adj */
37325 expand_fix (op0, adj, 0);
37328 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37329 into OPERAND0. */
37330 void
37331 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37333 /* C code for the stuff we're doing below (for do_floor):
37334 xi = (long)op1;
37335 xi -= (double)xi > op1 ? 1 : 0;
37336 return xi;
37338 enum machine_mode fmode = GET_MODE (op1);
37339 enum machine_mode imode = GET_MODE (op0);
37340 rtx ireg, freg, label, tmp;
37342 /* reg = (long)op1 */
37343 ireg = gen_reg_rtx (imode);
37344 expand_fix (ireg, op1, 0);
37346 /* freg = (double)reg */
37347 freg = gen_reg_rtx (fmode);
37348 expand_float (freg, ireg, 0);
37350 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37351 label = ix86_expand_sse_compare_and_jump (UNLE,
37352 freg, op1, !do_floor);
37353 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37354 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37355 emit_move_insn (ireg, tmp);
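/* For do_floor == false (lceil) the comparison operands are swapped and PLUS
   is used above, so the adjustment becomes
   xi += (double) xi < op1 ? 1 : 0.  */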
37357 emit_label (label);
37358 LABEL_NUSES (label) = 1;
37360 emit_move_insn (op0, ireg);
37363 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37364 result in OPERAND0. */
37365 void
37366 ix86_expand_rint (rtx operand0, rtx operand1)
37368 /* C code for the stuff we're doing below:
37369 xa = fabs (operand1);
37370 if (!isless (xa, 2**52))
37371 return operand1;
37372 xa = xa + 2**52 - 2**52;
37373 return copysign (xa, operand1);
37375 enum machine_mode mode = GET_MODE (operand0);
37376 rtx res, xa, label, TWO52, mask;
37378 res = gen_reg_rtx (mode);
37379 emit_move_insn (res, operand1);
37381 /* xa = abs (operand1) */
37382 xa = ix86_expand_sse_fabs (res, &mask);
37384 /* if (!isless (xa, TWO52)) goto label; */
37385 TWO52 = ix86_gen_TWO52 (mode);
37386 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37388 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37389 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37391 ix86_sse_copysign_to_positive (res, xa, res, mask);
37393 emit_label (label);
37394 LABEL_NUSES (label) = 1;
37396 emit_move_insn (operand0, res);
37399 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37400 into OPERAND0. */
37401 void
37402 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37404 /* C code for the stuff we expand below.
37405 double xa = fabs (x), x2;
37406 if (!isless (xa, TWO52))
37407 return x;
37408 xa = xa + TWO52 - TWO52;
37409 x2 = copysign (xa, x);
37410 Compensate. Floor:
37411 if (x2 > x)
37412 x2 -= 1;
37413 Compensate. Ceil:
37414 if (x2 < x)
37415 x2 -= -1;
37416 return x2;
37418 enum machine_mode mode = GET_MODE (operand0);
37419 rtx xa, TWO52, tmp, label, one, res, mask;
37421 TWO52 = ix86_gen_TWO52 (mode);
37423 /* Temporary for holding the result, initialized to the input
37424 operand to ease control flow. */
37425 res = gen_reg_rtx (mode);
37426 emit_move_insn (res, operand1);
37428 /* xa = abs (operand1) */
37429 xa = ix86_expand_sse_fabs (res, &mask);
37431 /* if (!isless (xa, TWO52)) goto label; */
37432 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37434 /* xa = xa + TWO52 - TWO52; */
37435 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37436 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37438 /* xa = copysign (xa, operand1) */
37439 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37441 /* generate 1.0 or -1.0 */
37442 one = force_reg (mode,
37443 const_double_from_real_value (do_floor
37444 ? dconst1 : dconstm1, mode));
37446 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37447 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37448 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37449 gen_rtx_AND (mode, one, tmp)));
37450 /* We always need to subtract here to preserve signed zero. */
37451 tmp = expand_simple_binop (mode, MINUS,
37452 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37453 emit_move_insn (res, tmp);
37455 emit_label (label);
37456 LABEL_NUSES (label) = 1;
37458 emit_move_insn (operand0, res);
37461 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37462 into OPERAND0. */
37463 void
37464 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37466 /* C code for the stuff we expand below.
37467 double xa = fabs (x), x2;
37468 if (!isless (xa, TWO52))
37469 return x;
37470 x2 = (double)(long)x;
37471 Compensate. Floor:
37472 if (x2 > x)
37473 x2 -= 1;
37474 Compensate. Ceil:
37475 if (x2 < x)
37476 x2 += 1;
37477 if (HONOR_SIGNED_ZEROS (mode))
37478 return copysign (x2, x);
37479 return x2;
37481 enum machine_mode mode = GET_MODE (operand0);
37482 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37484 TWO52 = ix86_gen_TWO52 (mode);
37486 /* Temporary for holding the result, initialized to the input
37487 operand to ease control flow. */
37488 res = gen_reg_rtx (mode);
37489 emit_move_insn (res, operand1);
37491 /* xa = abs (operand1) */
37492 xa = ix86_expand_sse_fabs (res, &mask);
37494 /* if (!isless (xa, TWO52)) goto label; */
37495 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37497 /* xa = (double)(long)x */
37498 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37499 expand_fix (xi, res, 0);
37500 expand_float (xa, xi, 0);
37502 /* generate 1.0 */
37503 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37505 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37506 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37507 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37508 gen_rtx_AND (mode, one, tmp)));
37509 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37510 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37511 emit_move_insn (res, tmp);
37513 if (HONOR_SIGNED_ZEROS (mode))
37514 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37516 emit_label (label);
37517 LABEL_NUSES (label) = 1;
37519 emit_move_insn (operand0, res);
37522 /* Expand SSE sequence for computing round from OPERAND1 storing
37523 into OPERAND0. This sequence works without relying on DImode truncation
37524 via cvttsd2siq, which is only available on 64-bit targets. */
37525 void
37526 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37528 /* C code for the stuff we expand below.
37529 double xa = fabs (x), xa2, x2;
37530 if (!isless (xa, TWO52))
37531 return x;
37532 Using the absolute value and copying back sign makes
37533 -0.0 -> -0.0 correct.
37534 xa2 = xa + TWO52 - TWO52;
37535 Compensate.
37536 dxa = xa2 - xa;
37537 if (dxa <= -0.5)
37538 xa2 += 1;
37539 else if (dxa > 0.5)
37540 xa2 -= 1;
37541 x2 = copysign (xa2, x);
37542 return x2;
37544 enum machine_mode mode = GET_MODE (operand0);
37545 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37547 TWO52 = ix86_gen_TWO52 (mode);
37549 /* Temporary for holding the result, initialized to the input
37550 operand to ease control flow. */
37551 res = gen_reg_rtx (mode);
37552 emit_move_insn (res, operand1);
37554 /* xa = abs (operand1) */
37555 xa = ix86_expand_sse_fabs (res, &mask);
37557 /* if (!isless (xa, TWO52)) goto label; */
37558 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37560 /* xa2 = xa + TWO52 - TWO52; */
37561 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37562 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37564 /* dxa = xa2 - xa; */
37565 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37567 /* generate 0.5, 1.0 and -0.5 */
37568 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37569 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37570 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37571 0, OPTAB_DIRECT);
37573 /* Compensate. */
37574 tmp = gen_reg_rtx (mode);
37575 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37576 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37577 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37578 gen_rtx_AND (mode, one, tmp)));
37579 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37580 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37581 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37582 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37583 gen_rtx_AND (mode, one, tmp)));
37584 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37586 /* res = copysign (xa2, operand1) */
37587 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37589 emit_label (label);
37590 LABEL_NUSES (label) = 1;
37592 emit_move_insn (operand0, res);
37595 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37596 into OPERAND0. */
37597 void
37598 ix86_expand_trunc (rtx operand0, rtx operand1)
37600 /* C code for SSE variant we expand below.
37601 double xa = fabs (x), x2;
37602 if (!isless (xa, TWO52))
37603 return x;
37604 x2 = (double)(long)x;
37605 if (HONOR_SIGNED_ZEROS (mode))
37606 return copysign (x2, x);
37607 return x2;
37609 enum machine_mode mode = GET_MODE (operand0);
37610 rtx xa, xi, TWO52, label, res, mask;
37612 TWO52 = ix86_gen_TWO52 (mode);
37614 /* Temporary for holding the result, initialized to the input
37615 operand to ease control flow. */
37616 res = gen_reg_rtx (mode);
37617 emit_move_insn (res, operand1);
37619 /* xa = abs (operand1) */
37620 xa = ix86_expand_sse_fabs (res, &mask);
37622 /* if (!isless (xa, TWO52)) goto label; */
37623 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37625 /* x = (double)(long)x */
37626 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37627 expand_fix (xi, res, 0);
37628 expand_float (res, xi, 0);
37630 if (HONOR_SIGNED_ZEROS (mode))
37631 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37633 emit_label (label);
37634 LABEL_NUSES (label) = 1;
37636 emit_move_insn (operand0, res);
37639 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37640 into OPERAND0. */
37641 void
37642 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37644 enum machine_mode mode = GET_MODE (operand0);
37645 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37647 /* C code for SSE variant we expand below.
37648 double xa = fabs (x), x2;
37649 if (!isless (xa, TWO52))
37650 return x;
37651 xa2 = xa + TWO52 - TWO52;
37652 Compensate:
37653 if (xa2 > xa)
37654 xa2 -= 1.0;
37655 x2 = copysign (xa2, x);
37656 return x2;
37659 TWO52 = ix86_gen_TWO52 (mode);
37661 /* Temporary for holding the result, initialized to the input
37662 operand to ease control flow. */
37663 res = gen_reg_rtx (mode);
37664 emit_move_insn (res, operand1);
37666 /* xa = abs (operand1) */
37667 xa = ix86_expand_sse_fabs (res, &smask);
37669 /* if (!isless (xa, TWO52)) goto label; */
37670 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37672 /* res = xa + TWO52 - TWO52; */
37673 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37674 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37675 emit_move_insn (res, tmp);
37677 /* generate 1.0 */
37678 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37680 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37681 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37682 emit_insn (gen_rtx_SET (VOIDmode, mask,
37683 gen_rtx_AND (mode, mask, one)));
37684 tmp = expand_simple_binop (mode, MINUS,
37685 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
37686 emit_move_insn (res, tmp);
37688 /* res = copysign (res, operand1) */
37689 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
37691 emit_label (label);
37692 LABEL_NUSES (label) = 1;
37694 emit_move_insn (operand0, res);
37697 /* Expand SSE sequence for computing round from OPERAND1 storing
37698 into OPERAND0. */
37699 void
37700 ix86_expand_round (rtx operand0, rtx operand1)
37702 /* C code for the stuff we're doing below:
37703 double xa = fabs (x);
37704 if (!isless (xa, TWO52))
37705 return x;
37706 xa = (double)(long)(xa + nextafter (0.5, 0.0));
37707 return copysign (xa, x);
37709 enum machine_mode mode = GET_MODE (operand0);
37710 rtx res, TWO52, xa, label, xi, half, mask;
37711 const struct real_format *fmt;
37712 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37714 /* Temporary for holding the result, initialized to the input
37715 operand to ease control flow. */
37716 res = gen_reg_rtx (mode);
37717 emit_move_insn (res, operand1);
37719 TWO52 = ix86_gen_TWO52 (mode);
37720 xa = ix86_expand_sse_fabs (res, &mask);
37721 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37723 /* load nextafter (0.5, 0.0) */
37724 fmt = REAL_MODE_FORMAT (mode);
37725 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37726 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37728 /* xa = xa + 0.5 */
37729 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
37730 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
37732 /* xa = (double)(int64_t)xa */
37733 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37734 expand_fix (xi, xa, 0);
37735 expand_float (xa, xi, 0);
37737 /* res = copysign (xa, operand1) */
37738 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
37740 emit_label (label);
37741 LABEL_NUSES (label) = 1;
37743 emit_move_insn (operand0, res);
37746 /* Expand SSE sequence for computing round
37747 from OP1 storing into OP0 using sse4 round insn. */
37748 void
37749 ix86_expand_round_sse4 (rtx op0, rtx op1)
37751 enum machine_mode mode = GET_MODE (op0);
37752 rtx e1, e2, res, half;
37753 const struct real_format *fmt;
37754 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37755 rtx (*gen_copysign) (rtx, rtx, rtx);
37756 rtx (*gen_round) (rtx, rtx, rtx);
37758 switch (mode)
37760 case SFmode:
37761 gen_copysign = gen_copysignsf3;
37762 gen_round = gen_sse4_1_roundsf2;
37763 break;
37764 case DFmode:
37765 gen_copysign = gen_copysigndf3;
37766 gen_round = gen_sse4_1_rounddf2;
37767 break;
37768 default:
37769 gcc_unreachable ();
37772 /* round (a) = trunc (a + copysign (0.5, a)) */
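/* As in ix86_expand_round, the constant actually loaded below is
   nextafter (0.5, 0.0) rather than 0.5, so that values just below a halfway
   point are not rounded up by the addition.  */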
37774 /* load nextafter (0.5, 0.0) */
37775 fmt = REAL_MODE_FORMAT (mode);
37776 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37777 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37778 half = const_double_from_real_value (pred_half, mode);
37780 /* e1 = copysign (0.5, op1) */
37781 e1 = gen_reg_rtx (mode);
37782 emit_insn (gen_copysign (e1, half, op1));
37784 /* e2 = op1 + e1 */
37785 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
37787 /* res = trunc (e2) */
37788 res = gen_reg_rtx (mode);
37789 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
37791 emit_move_insn (op0, res);
37795 /* Table of valid machine attributes. */
37796 static const struct attribute_spec ix86_attribute_table[] =
37798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
37799 affects_type_identity } */
37800 /* Stdcall attribute says callee is responsible for popping arguments
37801 if they are not variable. */
37802 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37803 true },
37804 /* Fastcall attribute says callee is responsible for popping arguments
37805 if they are not variable. */
37806 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37807 true },
37808 /* Thiscall attribute says callee is responsible for popping arguments
37809 if they are not variable. */
37810 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37811 true },
37812 /* Cdecl attribute says the callee is a normal C declaration */
37813 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37814 true },
37815 /* Regparm attribute specifies how many integer arguments are to be
37816 passed in registers. */
37817 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
37818 true },
37819 /* Sseregparm attribute says we are using x86_64 calling conventions
37820 for FP arguments. */
37821 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37822 true },
37823 /* The transactional memory builtins are implicitly regparm or fastcall
37824 depending on the ABI. Override the generic do-nothing attribute that
37825 these builtins were declared with. */
37826 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
37827 true },
37828 /* force_align_arg_pointer says this function realigns the stack at entry. */
37829 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
37830 false, true, true, ix86_handle_cconv_attribute, false },
37831 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37832 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
37833 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
37834 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
37835 false },
37836 #endif
37837 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37838 false },
37839 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37840 false },
37841 #ifdef SUBTARGET_ATTRIBUTE_TABLE
37842 SUBTARGET_ATTRIBUTE_TABLE,
37843 #endif
37844 /* ms_abi and sysv_abi calling convention function attributes. */
37845 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37846 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37847 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
37848 false },
37849 { "callee_pop_aggregate_return", 1, 1, false, true, true,
37850 ix86_handle_callee_pop_aggregate_return, true },
37851 /* End element. */
37852 { NULL, 0, 0, false, false, false, NULL, false }
37855 /* Implement targetm.vectorize.builtin_vectorization_cost. */
37856 static int
37857 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
37858 tree vectype,
37859 int misalign ATTRIBUTE_UNUSED)
37861 unsigned elements;
37863 switch (type_of_cost)
37865 case scalar_stmt:
37866 return ix86_cost->scalar_stmt_cost;
37868 case scalar_load:
37869 return ix86_cost->scalar_load_cost;
37871 case scalar_store:
37872 return ix86_cost->scalar_store_cost;
37874 case vector_stmt:
37875 return ix86_cost->vec_stmt_cost;
37877 case vector_load:
37878 return ix86_cost->vec_align_load_cost;
37880 case vector_store:
37881 return ix86_cost->vec_store_cost;
37883 case vec_to_scalar:
37884 return ix86_cost->vec_to_scalar_cost;
37886 case scalar_to_vec:
37887 return ix86_cost->scalar_to_vec_cost;
37889 case unaligned_load:
37890 case unaligned_store:
37891 return ix86_cost->vec_unalign_load_cost;
37893 case cond_branch_taken:
37894 return ix86_cost->cond_taken_branch_cost;
37896 case cond_branch_not_taken:
37897 return ix86_cost->cond_not_taken_branch_cost;
37899 case vec_perm:
37900 case vec_promote_demote:
37901 return ix86_cost->vec_stmt_cost;
37903 case vec_construct:
37904 elements = TYPE_VECTOR_SUBPARTS (vectype);
37905 return elements / 2 + 1;
37907 default:
37908 gcc_unreachable ();
37912 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
37913 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
37914 insn every time. */
37916 static GTY(()) rtx vselect_insn;
37918 /* Initialize vselect_insn. */
37920 static void
37921 init_vselect_insn (void)
37923 unsigned i;
37924 rtx x;
37926 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
37927 for (i = 0; i < MAX_VECT_LEN; ++i)
37928 XVECEXP (x, 0, i) = const0_rtx;
37929 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
37930 const0_rtx), x);
37931 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
37932 start_sequence ();
37933 vselect_insn = emit_insn (x);
37934 end_sequence ();
37937 /* Construct (set target (vec_select op0 (parallel perm))) and
37938 return true if that's a valid instruction in the active ISA. */
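/* Note that instead of building a fresh pattern, the cached vselect_insn is
   rewritten in place (selection vector, source operand, mode and
   destination), recog_memoized checks whether the active ISA has a matching
   pattern, and the insn is then restored for the next caller.  */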
37940 static bool
37941 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
37942 unsigned nelt, bool testing_p)
37944 unsigned int i;
37945 rtx x, save_vconcat;
37946 int icode;
37948 if (vselect_insn == NULL_RTX)
37949 init_vselect_insn ();
37951 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
37952 PUT_NUM_ELEM (XVEC (x, 0), nelt);
37953 for (i = 0; i < nelt; ++i)
37954 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
37955 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37956 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
37957 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
37958 SET_DEST (PATTERN (vselect_insn)) = target;
37959 icode = recog_memoized (vselect_insn);
37961 if (icode >= 0 && !testing_p)
37962 emit_insn (copy_rtx (PATTERN (vselect_insn)));
37964 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
37965 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
37966 INSN_CODE (vselect_insn) = -1;
37968 return icode >= 0;
37971 /* Similar, but generate a vec_concat from op0 and op1 as well. */
37973 static bool
37974 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
37975 const unsigned char *perm, unsigned nelt,
37976 bool testing_p)
37978 enum machine_mode v2mode;
37979 rtx x;
37980 bool ok;
37982 if (vselect_insn == NULL_RTX)
37983 init_vselect_insn ();
37985 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
37986 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37987 PUT_MODE (x, v2mode);
37988 XEXP (x, 0) = op0;
37989 XEXP (x, 1) = op1;
37990 ok = expand_vselect (target, x, perm, nelt, testing_p);
37991 XEXP (x, 0) = const0_rtx;
37992 XEXP (x, 1) = const0_rtx;
37993 return ok;
37996 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37997 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
37999 static bool
38000 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38002 enum machine_mode vmode = d->vmode;
38003 unsigned i, mask, nelt = d->nelt;
38004 rtx target, op0, op1, x;
38005 rtx rperm[32], vperm;
38007 if (d->one_operand_p)
38008 return false;
38009 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38011 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38013 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38015 else
38016 return false;
38018 /* This is a blend, not a permute. Elements must stay in their
38019 respective lanes. */
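/* For example, for V4SFmode the permutation {0, 5, 2, 7} is a valid blend:
   element i is taken from op0 when perm[i] == i and from op1 when
   perm[i] == i + nelt.  */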
38020 for (i = 0; i < nelt; ++i)
38022 unsigned e = d->perm[i];
38023 if (!(e == i || e == i + nelt))
38024 return false;
38027 if (d->testing_p)
38028 return true;
38030 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38031 decision should be extracted elsewhere, so that we only try that
38032 sequence once all budget==3 options have been tried. */
38033 target = d->target;
38034 op0 = d->op0;
38035 op1 = d->op1;
38036 mask = 0;
38038 switch (vmode)
38040 case V4DFmode:
38041 case V8SFmode:
38042 case V2DFmode:
38043 case V4SFmode:
38044 case V8HImode:
38045 case V8SImode:
38046 for (i = 0; i < nelt; ++i)
38047 mask |= (d->perm[i] >= nelt) << i;
38048 break;
38050 case V2DImode:
38051 for (i = 0; i < 2; ++i)
38052 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38053 vmode = V8HImode;
38054 goto do_subreg;
38056 case V4SImode:
38057 for (i = 0; i < 4; ++i)
38058 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38059 vmode = V8HImode;
38060 goto do_subreg;
38062 case V16QImode:
38063 /* See if bytes move in pairs so we can use pblendw with
38064 an immediate argument, rather than pblendvb with a vector
38065 argument. */
38066 for (i = 0; i < 16; i += 2)
38067 if (d->perm[i] + 1 != d->perm[i + 1])
38069 use_pblendvb:
38070 for (i = 0; i < nelt; ++i)
38071 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38073 finish_pblendvb:
38074 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38075 vperm = force_reg (vmode, vperm);
38077 if (GET_MODE_SIZE (vmode) == 16)
38078 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38079 else
38080 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38081 return true;
38084 for (i = 0; i < 8; ++i)
38085 mask |= (d->perm[i * 2] >= 16) << i;
38086 vmode = V8HImode;
38087 /* FALLTHRU */
38089 do_subreg:
38090 target = gen_lowpart (vmode, target);
38091 op0 = gen_lowpart (vmode, op0);
38092 op1 = gen_lowpart (vmode, op1);
38093 break;
38095 case V32QImode:
38096 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38097 for (i = 0; i < 32; i += 2)
38098 if (d->perm[i] + 1 != d->perm[i + 1])
38099 goto use_pblendvb;
38100 /* See if bytes move in quadruplets. If yes, vpblendd
38101 with immediate can be used. */
38102 for (i = 0; i < 32; i += 4)
38103 if (d->perm[i] + 2 != d->perm[i + 2])
38104 break;
38105 if (i < 32)
38107 /* See if bytes move the same in both lanes. If yes,
38108 vpblendw with immediate can be used. */
38109 for (i = 0; i < 16; i += 2)
38110 if (d->perm[i] + 16 != d->perm[i + 16])
38111 goto use_pblendvb;
38113 /* Use vpblendw. */
38114 for (i = 0; i < 16; ++i)
38115 mask |= (d->perm[i * 2] >= 32) << i;
38116 vmode = V16HImode;
38117 goto do_subreg;
38120 /* Use vpblendd. */
38121 for (i = 0; i < 8; ++i)
38122 mask |= (d->perm[i * 4] >= 32) << i;
38123 vmode = V8SImode;
38124 goto do_subreg;
38126 case V16HImode:
38127 /* See if words move in pairs. If yes, vpblendd can be used. */
38128 for (i = 0; i < 16; i += 2)
38129 if (d->perm[i] + 1 != d->perm[i + 1])
38130 break;
38131 if (i < 16)
38133 /* See if words move the same in both lanes. If not,
38134 vpblendvb must be used. */
38135 for (i = 0; i < 8; i++)
38136 if (d->perm[i] + 8 != d->perm[i + 8])
38138 /* Use vpblendvb. */
38139 for (i = 0; i < 32; ++i)
38140 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38142 vmode = V32QImode;
38143 nelt = 32;
38144 target = gen_lowpart (vmode, target);
38145 op0 = gen_lowpart (vmode, op0);
38146 op1 = gen_lowpart (vmode, op1);
38147 goto finish_pblendvb;
38150 /* Use vpblendw. */
38151 for (i = 0; i < 16; ++i)
38152 mask |= (d->perm[i] >= 16) << i;
38153 break;
38156 /* Use vpblendd. */
38157 for (i = 0; i < 8; ++i)
38158 mask |= (d->perm[i * 2] >= 16) << i;
38159 vmode = V8SImode;
38160 goto do_subreg;
38162 case V4DImode:
38163 /* Use vpblendd. */
38164 for (i = 0; i < 4; ++i)
38165 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38166 vmode = V8SImode;
38167 goto do_subreg;
38169 default:
38170 gcc_unreachable ();
38173 /* This matches five different patterns with the different modes. */
38174 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38175 x = gen_rtx_SET (VOIDmode, target, x);
38176 emit_insn (x);
38178 return true;
38181 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38182 in terms of the variable form of vpermilps.
38184 Note that we will have already failed the immediate input vpermilps,
38185 which requires that the high and low part shuffle be identical; the
38186 variable form doesn't require that. */
38188 static bool
38189 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38191 rtx rperm[8], vperm;
38192 unsigned i;
38194 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38195 return false;
38197 /* We can only permute within the 128-bit lane. */
38198 for (i = 0; i < 8; ++i)
38200 unsigned e = d->perm[i];
38201 if (i < 4 ? e >= 4 : e < 4)
38202 return false;
38205 if (d->testing_p)
38206 return true;
38208 for (i = 0; i < 8; ++i)
38210 unsigned e = d->perm[i];
38212 /* Within each 128-bit lane, the elements of op0 are numbered
38213 from 0 and the elements of op1 are numbered from 4. */
38214 if (e >= 8 + 4)
38215 e -= 8;
38216 else if (e >= 4)
38217 e -= 4;
38219 rperm[i] = GEN_INT (e);
38222 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38223 vperm = force_reg (V8SImode, vperm);
38224 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38226 return true;
38229 /* Return true if permutation D can be performed as VMODE permutation
38230 instead. */
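/* For example, a V16QImode permutation whose indices move in aligned groups
   of four consecutive bytes can be carried out as a V4SImode permutation;
   the chunk test below checks exactly this.  */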
38232 static bool
38233 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38235 unsigned int i, j, chunk;
38237 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38238 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38239 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38240 return false;
38242 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38243 return true;
38245 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38246 for (i = 0; i < d->nelt; i += chunk)
38247 if (d->perm[i] & (chunk - 1))
38248 return false;
38249 else
38250 for (j = 1; j < chunk; ++j)
38251 if (d->perm[i] + j != d->perm[i + j])
38252 return false;
38254 return true;
38257 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38258 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38260 static bool
38261 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38263 unsigned i, nelt, eltsz, mask;
38264 unsigned char perm[32];
38265 enum machine_mode vmode = V16QImode;
38266 rtx rperm[32], vperm, target, op0, op1;
38268 nelt = d->nelt;
38270 if (!d->one_operand_p)
38272 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38274 if (TARGET_AVX2
38275 && valid_perm_using_mode_p (V2TImode, d))
38277 if (d->testing_p)
38278 return true;
38280 /* Use vperm2i128 insn. The pattern uses
38281 V4DImode instead of V2TImode. */
38282 target = gen_lowpart (V4DImode, d->target);
38283 op0 = gen_lowpart (V4DImode, d->op0);
38284 op1 = gen_lowpart (V4DImode, d->op1);
38285 rperm[0]
38286 = GEN_INT ((d->perm[0] / (nelt / 2))
38287 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38288 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38289 return true;
38291 return false;
38294 else
38296 if (GET_MODE_SIZE (d->vmode) == 16)
38298 if (!TARGET_SSSE3)
38299 return false;
38301 else if (GET_MODE_SIZE (d->vmode) == 32)
38303 if (!TARGET_AVX2)
38304 return false;
38306 /* V4DImode should be already handled through
38307 expand_vselect by vpermq instruction. */
38308 gcc_assert (d->vmode != V4DImode);
38310 vmode = V32QImode;
38311 if (d->vmode == V8SImode
38312 || d->vmode == V16HImode
38313 || d->vmode == V32QImode)
38315 /* First see if vpermq can be used for
38316 V8SImode/V16HImode/V32QImode. */
38317 if (valid_perm_using_mode_p (V4DImode, d))
38319 for (i = 0; i < 4; i++)
38320 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38321 if (d->testing_p)
38322 return true;
38323 return expand_vselect (gen_lowpart (V4DImode, d->target),
38324 gen_lowpart (V4DImode, d->op0),
38325 perm, 4, false);
38328 /* Next see if vpermd can be used. */
38329 if (valid_perm_using_mode_p (V8SImode, d))
38330 vmode = V8SImode;
38332 /* Or if vpermps can be used. */
38333 else if (d->vmode == V8SFmode)
38334 vmode = V8SImode;
38336 if (vmode == V32QImode)
38338 /* vpshufb only works intra lanes, it is not
38339 possible to shuffle bytes in between the lanes. */
38340 for (i = 0; i < nelt; ++i)
38341 if ((d->perm[i] ^ i) & (nelt / 2))
38342 return false;
38345 else
38346 return false;
38349 if (d->testing_p)
38350 return true;
38352 if (vmode == V8SImode)
38353 for (i = 0; i < 8; ++i)
38354 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38355 else
38357 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38358 if (!d->one_operand_p)
38359 mask = 2 * nelt - 1;
38360 else if (vmode == V16QImode)
38361 mask = nelt - 1;
38362 else
38363 mask = nelt / 2 - 1;
38365 for (i = 0; i < nelt; ++i)
38367 unsigned j, e = d->perm[i] & mask;
38368 for (j = 0; j < eltsz; ++j)
38369 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38373 vperm = gen_rtx_CONST_VECTOR (vmode,
38374 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38375 vperm = force_reg (vmode, vperm);
38377 target = gen_lowpart (vmode, d->target);
38378 op0 = gen_lowpart (vmode, d->op0);
38379 if (d->one_operand_p)
38381 if (vmode == V16QImode)
38382 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38383 else if (vmode == V32QImode)
38384 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38385 else if (vmode == V8SFmode)
38386 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38387 else
38388 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38390 else
38392 op1 = gen_lowpart (vmode, d->op1);
38393 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38396 return true;
38399 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38400 in a single instruction. */
38402 static bool
38403 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38405 unsigned i, nelt = d->nelt;
38406 unsigned char perm2[MAX_VECT_LEN];
38408 /* Check plain VEC_SELECT first, because AVX has instructions that could
38409 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38410 input where SEL+CONCAT may not. */
38411 if (d->one_operand_p)
38413 int mask = nelt - 1;
38414 bool identity_perm = true;
38415 bool broadcast_perm = true;
38417 for (i = 0; i < nelt; i++)
38419 perm2[i] = d->perm[i] & mask;
38420 if (perm2[i] != i)
38421 identity_perm = false;
38422 if (perm2[i])
38423 broadcast_perm = false;
38426 if (identity_perm)
38428 if (!d->testing_p)
38429 emit_move_insn (d->target, d->op0);
38430 return true;
38432 else if (broadcast_perm && TARGET_AVX2)
38434 /* Use vpbroadcast{b,w,d}. */
38435 rtx (*gen) (rtx, rtx) = NULL;
38436 switch (d->vmode)
38438 case V32QImode:
38439 gen = gen_avx2_pbroadcastv32qi_1;
38440 break;
38441 case V16HImode:
38442 gen = gen_avx2_pbroadcastv16hi_1;
38443 break;
38444 case V8SImode:
38445 gen = gen_avx2_pbroadcastv8si_1;
38446 break;
38447 case V16QImode:
38448 gen = gen_avx2_pbroadcastv16qi;
38449 break;
38450 case V8HImode:
38451 gen = gen_avx2_pbroadcastv8hi;
38452 break;
38453 case V8SFmode:
38454 gen = gen_avx2_vec_dupv8sf_1;
38455 break;
38456 /* For other modes prefer other shuffles this function creates. */
38457 default: break;
38459 if (gen != NULL)
38461 if (!d->testing_p)
38462 emit_insn (gen (d->target, d->op0));
38463 return true;
38467 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38468 return true;
38470 /* There are plenty of patterns in sse.md that are written for
38471 SEL+CONCAT and are not replicated for a single op. Perhaps
38472 that should be changed, to avoid the nastiness here. */
38474 /* Recognize interleave style patterns, which means incrementing
38475 every other permutation operand. */
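/* For example, on V4SImode the duplicate-pairs permutation {0, 0, 1, 1}
   becomes {0, 4, 1, 5} over the concatenation of op0 with itself, which
   matches the punpckldq pattern.  */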
38476 for (i = 0; i < nelt; i += 2)
38478 perm2[i] = d->perm[i] & mask;
38479 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38481 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38482 d->testing_p))
38483 return true;
38485 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38486 if (nelt >= 4)
38488 for (i = 0; i < nelt; i += 4)
38490 perm2[i + 0] = d->perm[i + 0] & mask;
38491 perm2[i + 1] = d->perm[i + 1] & mask;
38492 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38493 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38496 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38497 d->testing_p))
38498 return true;
38502 /* Finally, try the fully general two operand permute. */
38503 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38504 d->testing_p))
38505 return true;
38507 /* Recognize interleave style patterns with reversed operands. */
38508 if (!d->one_operand_p)
38510 for (i = 0; i < nelt; ++i)
38512 unsigned e = d->perm[i];
38513 if (e >= nelt)
38514 e -= nelt;
38515 else
38516 e += nelt;
38517 perm2[i] = e;
38520 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38521 d->testing_p))
38522 return true;
38525 /* Try the SSE4.1 blend variable merge instructions. */
38526 if (expand_vec_perm_blend (d))
38527 return true;
38529 /* Try one of the AVX vpermil variable permutations. */
38530 if (expand_vec_perm_vpermil (d))
38531 return true;
38533 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38534 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38535 if (expand_vec_perm_pshufb (d))
38536 return true;
38538 return false;
38541 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38542 in terms of a pair of pshuflw + pshufhw instructions. */
38544 static bool
38545 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38547 unsigned char perm2[MAX_VECT_LEN];
38548 unsigned i;
38549 bool ok;
38551 if (d->vmode != V8HImode || !d->one_operand_p)
38552 return false;
38554 /* The two permutations only operate in 64-bit lanes. */
38555 for (i = 0; i < 4; ++i)
38556 if (d->perm[i] >= 4)
38557 return false;
38558 for (i = 4; i < 8; ++i)
38559 if (d->perm[i] < 4)
38560 return false;
38562 if (d->testing_p)
38563 return true;
38565 /* Emit the pshuflw. */
38566 memcpy (perm2, d->perm, 4);
38567 for (i = 4; i < 8; ++i)
38568 perm2[i] = i;
38569 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38570 gcc_assert (ok);
38572 /* Emit the pshufhw. */
38573 memcpy (perm2 + 4, d->perm + 4, 4);
38574 for (i = 0; i < 4; ++i)
38575 perm2[i] = i;
38576 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38577 gcc_assert (ok);
38579 return true;
38582 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38583 the permutation using the SSSE3 palignr instruction. This succeeds
38584 when all of the elements in PERM fit within one vector and we merely
38585 need to shift them down so that a single vector permutation has a
38586 chance to succeed. */
38588 static bool
38589 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38591 unsigned i, nelt = d->nelt;
38592 unsigned min, max;
38593 bool in_order, ok;
38594 rtx shift;
38596 /* Even with AVX, palignr only operates on 128-bit vectors. */
38597 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38598 return false;
38600 min = nelt, max = 0;
38601 for (i = 0; i < nelt; ++i)
38603 unsigned e = d->perm[i];
38604 if (e < min)
38605 min = e;
38606 if (e > max)
38607 max = e;
38609 if (min == 0 || max - min >= nelt)
38610 return false;
38612 /* Given that we have SSSE3, we know we'll be able to implement the
38613 single operand permutation after the palignr with pshufb. */
38614 if (d->testing_p)
38615 return true;
38617 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
38618 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
38619 gen_lowpart (TImode, d->op1),
38620 gen_lowpart (TImode, d->op0), shift));
38622 d->op0 = d->op1 = d->target;
38623 d->one_operand_p = true;
38625 in_order = true;
38626 for (i = 0; i < nelt; ++i)
38628 unsigned e = d->perm[i] - min;
38629 if (e != i)
38630 in_order = false;
38631 d->perm[i] = e;
38634 /* Test for the degenerate case where the alignment by itself
38635 produces the desired permutation. */
38636 if (in_order)
38637 return true;
38639 ok = expand_vec_perm_1 (d);
38640 gcc_assert (ok);
38642 return ok;
38645 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38647 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38648 a two vector permutation into a single vector permutation by using
38649 an interleave operation to merge the vectors. */
38651 static bool
38652 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38654 struct expand_vec_perm_d dremap, dfinal;
38655 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38656 unsigned HOST_WIDE_INT contents;
38657 unsigned char remap[2 * MAX_VECT_LEN];
38658 rtx seq;
38659 bool ok, same_halves = false;
38661 if (GET_MODE_SIZE (d->vmode) == 16)
38663 if (d->one_operand_p)
38664 return false;
38666 else if (GET_MODE_SIZE (d->vmode) == 32)
38668 if (!TARGET_AVX)
38669 return false;
38670 /* For 32-byte modes allow even d->one_operand_p.
38671 The lack of cross-lane shuffling in some instructions
38672 might prevent a single insn shuffle. */
38673 dfinal = *d;
38674 dfinal.testing_p = true;
38675 /* If expand_vec_perm_interleave3 can expand this into
38676 a 3 insn sequence, give up and let it be expanded as
38677 3 insn sequence. While that is one insn longer,
38678 it doesn't need a memory operand and in the common
38679 case that both interleave low and high permutations
38680 with the same operands are adjacent needs 4 insns
38681 for both after CSE. */
38682 if (expand_vec_perm_interleave3 (&dfinal))
38683 return false;
38685 else
38686 return false;
38688 /* Examine from whence the elements come. */
38689 contents = 0;
38690 for (i = 0; i < nelt; ++i)
38691 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
38693 memset (remap, 0xff, sizeof (remap));
38694 dremap = *d;
38696 if (GET_MODE_SIZE (d->vmode) == 16)
38698 unsigned HOST_WIDE_INT h1, h2, h3, h4;
38700 /* Split the two input vectors into 4 halves. */
38701 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
38702 h2 = h1 << nelt2;
38703 h3 = h2 << nelt2;
38704 h4 = h3 << nelt2;
38706 /* If all elements come from the two low halves, use interleave low;
38707 similarly, use interleave high for the two high halves. If the elements
38708 are from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
38709 if ((contents & (h1 | h3)) == contents)
38711 /* punpckl* */
38712 for (i = 0; i < nelt2; ++i)
38714 remap[i] = i * 2;
38715 remap[i + nelt] = i * 2 + 1;
38716 dremap.perm[i * 2] = i;
38717 dremap.perm[i * 2 + 1] = i + nelt;
38719 if (!TARGET_SSE2 && d->vmode == V4SImode)
38720 dremap.vmode = V4SFmode;
38722 else if ((contents & (h2 | h4)) == contents)
38724 /* punpckh* */
38725 for (i = 0; i < nelt2; ++i)
38727 remap[i + nelt2] = i * 2;
38728 remap[i + nelt + nelt2] = i * 2 + 1;
38729 dremap.perm[i * 2] = i + nelt2;
38730 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
38732 if (!TARGET_SSE2 && d->vmode == V4SImode)
38733 dremap.vmode = V4SFmode;
38735 else if ((contents & (h1 | h4)) == contents)
38737 /* shufps */
38738 for (i = 0; i < nelt2; ++i)
38740 remap[i] = i;
38741 remap[i + nelt + nelt2] = i + nelt2;
38742 dremap.perm[i] = i;
38743 dremap.perm[i + nelt2] = i + nelt + nelt2;
38745 if (nelt != 4)
38747 /* shufpd */
38748 dremap.vmode = V2DImode;
38749 dremap.nelt = 2;
38750 dremap.perm[0] = 0;
38751 dremap.perm[1] = 3;
38754 else if ((contents & (h2 | h3)) == contents)
38756 /* shufps */
38757 for (i = 0; i < nelt2; ++i)
38759 remap[i + nelt2] = i;
38760 remap[i + nelt] = i + nelt2;
38761 dremap.perm[i] = i + nelt2;
38762 dremap.perm[i + nelt2] = i + nelt;
38764 if (nelt != 4)
38766 /* shufpd */
38767 dremap.vmode = V2DImode;
38768 dremap.nelt = 2;
38769 dremap.perm[0] = 1;
38770 dremap.perm[1] = 2;
38773 else
38774 return false;
38776 else
38778 unsigned int nelt4 = nelt / 4, nzcnt = 0;
38779 unsigned HOST_WIDE_INT q[8];
38780 unsigned int nonzero_halves[4];
38782 /* Split the two input vectors into 8 quarters. */
38783 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
38784 for (i = 1; i < 8; ++i)
38785 q[i] = q[0] << (nelt4 * i);
38786 for (i = 0; i < 4; ++i)
38787 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
38789 nonzero_halves[nzcnt] = i;
38790 ++nzcnt;
38793 if (nzcnt == 1)
38795 gcc_assert (d->one_operand_p);
38796 nonzero_halves[1] = nonzero_halves[0];
38797 same_halves = true;
38799 else if (d->one_operand_p)
38801 gcc_assert (nonzero_halves[0] == 0);
38802 gcc_assert (nonzero_halves[1] == 1);
38805 if (nzcnt <= 2)
38807 if (d->perm[0] / nelt2 == nonzero_halves[1])
38809 /* Attempt to increase the likelihood that dfinal
38810 shuffle will be intra-lane. */
38811 char tmph = nonzero_halves[0];
38812 nonzero_halves[0] = nonzero_halves[1];
38813 nonzero_halves[1] = tmph;
38816 /* vperm2f128 or vperm2i128. */
38817 for (i = 0; i < nelt2; ++i)
38819 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
38820 remap[i + nonzero_halves[0] * nelt2] = i;
38821 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
38822 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
38825 if (d->vmode != V8SFmode
38826 && d->vmode != V4DFmode
38827 && d->vmode != V8SImode)
38829 dremap.vmode = V8SImode;
38830 dremap.nelt = 8;
38831 for (i = 0; i < 4; ++i)
38833 dremap.perm[i] = i + nonzero_halves[0] * 4;
38834 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
38838 else if (d->one_operand_p)
38839 return false;
38840 else if (TARGET_AVX2
38841 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
38843 /* vpunpckl* */
38844 for (i = 0; i < nelt4; ++i)
38846 remap[i] = i * 2;
38847 remap[i + nelt] = i * 2 + 1;
38848 remap[i + nelt2] = i * 2 + nelt2;
38849 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
38850 dremap.perm[i * 2] = i;
38851 dremap.perm[i * 2 + 1] = i + nelt;
38852 dremap.perm[i * 2 + nelt2] = i + nelt2;
38853 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
38856 else if (TARGET_AVX2
38857 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
38859 /* vpunpckh* */
38860 for (i = 0; i < nelt4; ++i)
38862 remap[i + nelt4] = i * 2;
38863 remap[i + nelt + nelt4] = i * 2 + 1;
38864 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
38865 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
38866 dremap.perm[i * 2] = i + nelt4;
38867 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
38868 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
38869 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
38872 else
38873 return false;
38876 /* Use the remapping array set up above to move the elements from their
38877 swizzled locations into their final destinations. */
38878 dfinal = *d;
38879 for (i = 0; i < nelt; ++i)
38881 unsigned e = remap[d->perm[i]];
38882 gcc_assert (e < nelt);
38883 /* If same_halves is true, both halves of the remapped vector are the
38884 same. Avoid cross-lane accesses if possible. */
38885 if (same_halves && i >= nelt2)
38887 gcc_assert (e < nelt2);
38888 dfinal.perm[i] = e + nelt2;
38890 else
38891 dfinal.perm[i] = e;
38893 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
38894 dfinal.op1 = dfinal.op0;
38895 dfinal.one_operand_p = true;
38896 dremap.target = dfinal.op0;
38898 /* Test if the final remap can be done with a single insn. For V4SFmode or
38899 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
38900 start_sequence ();
38901 ok = expand_vec_perm_1 (&dfinal);
38902 seq = get_insns ();
38903 end_sequence ();
38905 if (!ok)
38906 return false;
38908 if (d->testing_p)
38909 return true;
38911 if (dremap.vmode != dfinal.vmode)
38913 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
38914 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
38915 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
38918 ok = expand_vec_perm_1 (&dremap);
38919 gcc_assert (ok);
38921 emit_insn (seq);
38922 return true;
38925 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38926 a single vector cross-lane permutation into vpermq followed
38927 by any of the single insn permutations. */
38929 static bool
38930 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
38932 struct expand_vec_perm_d dremap, dfinal;
38933 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
38934 unsigned contents[2];
38935 bool ok;
38937 if (!(TARGET_AVX2
38938 && (d->vmode == V32QImode || d->vmode == V16HImode)
38939 && d->one_operand_p))
38940 return false;
38942 contents[0] = 0;
38943 contents[1] = 0;
38944 for (i = 0; i < nelt2; ++i)
38946 contents[0] |= 1u << (d->perm[i] / nelt4);
38947 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
38950 for (i = 0; i < 2; ++i)
38952 unsigned int cnt = 0;
38953 for (j = 0; j < 4; ++j)
38954 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
38955 return false;
38958 if (d->testing_p)
38959 return true;
38961 dremap = *d;
38962 dremap.vmode = V4DImode;
38963 dremap.nelt = 4;
38964 dremap.target = gen_reg_rtx (V4DImode);
38965 dremap.op0 = gen_lowpart (V4DImode, d->op0);
38966 dremap.op1 = dremap.op0;
38967 dremap.one_operand_p = true;
38968 for (i = 0; i < 2; ++i)
38970 unsigned int cnt = 0;
38971 for (j = 0; j < 4; ++j)
38972 if ((contents[i] & (1u << j)) != 0)
38973 dremap.perm[2 * i + cnt++] = j;
38974 for (; cnt < 2; ++cnt)
38975 dremap.perm[2 * i + cnt] = 0;
38978 dfinal = *d;
38979 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
38980 dfinal.op1 = dfinal.op0;
38981 dfinal.one_operand_p = true;
38982 for (i = 0, j = 0; i < nelt; ++i)
38984 if (i == nelt2)
38985 j = 2;
38986 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
38987 if ((d->perm[i] / nelt4) == dremap.perm[j])
38989 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
38990 dfinal.perm[i] |= nelt4;
38991 else
38992 gcc_unreachable ();
38995 ok = expand_vec_perm_1 (&dremap);
38996 gcc_assert (ok);
38998 ok = expand_vec_perm_1 (&dfinal);
38999 gcc_assert (ok);
39001 return true;
39004 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39005 a vector permutation using two instructions, vperm2f128 resp.
39006 vperm2i128 followed by any single in-lane permutation. */
39008 static bool
39009 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39011 struct expand_vec_perm_d dfirst, dsecond;
39012 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39013 bool ok;
39015 if (!TARGET_AVX
39016 || GET_MODE_SIZE (d->vmode) != 32
39017 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39018 return false;
39020 dsecond = *d;
39021 dsecond.one_operand_p = false;
39022 dsecond.testing_p = true;
39024 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39025 immediate. For perm < 16 the second permutation uses
39026 d->op0 as first operand, for perm >= 16 it uses d->op1
39027 as first operand. The second operand is the result of
39028 vperm2[fi]128. */
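/* E.g. perm == 6 selects the low lane of d->op1 for the low half of the
result and the high lane of d->op0 for the high half; the immediate is then
((6 << 2) | 6) & 0x33 == 0x12, whose bits [1:0] pick the source of the low
128-bit lane of the result and bits [5:4] the source of its high lane
(0/1 = lanes of the first operand, 2/3 = lanes of the second). */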
39029 for (perm = 0; perm < 32; perm++)
39031 /* Ignore permutations which do not move anything cross-lane. */
39032 if (perm < 16)
39034 /* The second shuffle for e.g. V4DFmode has
39035 0123 and ABCD operands.
39036 Ignore AB23, as 23 is already in the second lane
39037 of the first operand. */
39038 if ((perm & 0xc) == (1 << 2)) continue;
39039 /* And 01CD, as 01 is in the first lane of the first
39040 operand. */
39041 if ((perm & 3) == 0) continue;
39042 /* And 4567, as then the vperm2[fi]128 doesn't change
39043 anything on the original 4567 second operand. */
39044 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39046 else
39048 /* The second shuffle for e.g. V4DFmode has
39049 4567 and ABCD operands.
39050 Ignore AB67, as 67 is already in the second lane
39051 of the first operand. */
39052 if ((perm & 0xc) == (3 << 2)) continue;
39053 /* And 45CD, as 45 is in the first lane of the first
39054 operand. */
39055 if ((perm & 3) == 2) continue;
39056 /* And 0123, as then the vperm2[fi]128 doesn't change
39057 anything on the original 0123 first operand. */
39058 if ((perm & 0xf) == (1 << 2)) continue;
39061 for (i = 0; i < nelt; i++)
39063 j = d->perm[i] / nelt2;
39064 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39065 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39066 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39067 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39068 else
39069 break;
39072 if (i == nelt)
39074 start_sequence ();
39075 ok = expand_vec_perm_1 (&dsecond);
39076 end_sequence ();
39078 else
39079 ok = false;
39081 if (ok)
39083 if (d->testing_p)
39084 return true;
39086 /* Found a usable second shuffle. dfirst will be
39087 vperm2f128 on d->op0 and d->op1. */
39088 dsecond.testing_p = false;
39089 dfirst = *d;
39090 dfirst.target = gen_reg_rtx (d->vmode);
39091 for (i = 0; i < nelt; i++)
39092 dfirst.perm[i] = (i & (nelt2 - 1))
39093 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39095 ok = expand_vec_perm_1 (&dfirst);
39096 gcc_assert (ok);
39098 /* And dsecond is some single insn shuffle, taking
39099 d->op0 and result of vperm2f128 (if perm < 16) or
39100 d->op1 and result of vperm2f128 (otherwise). */
39101 dsecond.op1 = dfirst.target;
39102 if (perm >= 16)
39103 dsecond.op0 = dfirst.op1;
39105 ok = expand_vec_perm_1 (&dsecond);
39106 gcc_assert (ok);
39108 return true;
39111 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39112 if (d->one_operand_p)
39113 return false;
39116 return false;
39119 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39120 a two vector permutation using 2 intra-lane interleave insns
39121 and cross-lane shuffle for 32-byte vectors. */
39123 static bool
39124 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39126 unsigned i, nelt;
39127 rtx (*gen) (rtx, rtx, rtx);
39129 if (d->one_operand_p)
39130 return false;
39131 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39133 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39135 else
39136 return false;
39138 nelt = d->nelt;
39139 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39140 return false;
39141 for (i = 0; i < nelt; i += 2)
39142 if (d->perm[i] != d->perm[0] + i / 2
39143 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39144 return false;
39146 if (d->testing_p)
39147 return true;
39149 switch (d->vmode)
39151 case V32QImode:
39152 if (d->perm[0])
39153 gen = gen_vec_interleave_highv32qi;
39154 else
39155 gen = gen_vec_interleave_lowv32qi;
39156 break;
39157 case V16HImode:
39158 if (d->perm[0])
39159 gen = gen_vec_interleave_highv16hi;
39160 else
39161 gen = gen_vec_interleave_lowv16hi;
39162 break;
39163 case V8SImode:
39164 if (d->perm[0])
39165 gen = gen_vec_interleave_highv8si;
39166 else
39167 gen = gen_vec_interleave_lowv8si;
39168 break;
39169 case V4DImode:
39170 if (d->perm[0])
39171 gen = gen_vec_interleave_highv4di;
39172 else
39173 gen = gen_vec_interleave_lowv4di;
39174 break;
39175 case V8SFmode:
39176 if (d->perm[0])
39177 gen = gen_vec_interleave_highv8sf;
39178 else
39179 gen = gen_vec_interleave_lowv8sf;
39180 break;
39181 case V4DFmode:
39182 if (d->perm[0])
39183 gen = gen_vec_interleave_highv4df;
39184 else
39185 gen = gen_vec_interleave_lowv4df;
39186 break;
39187 default:
39188 gcc_unreachable ();
39191 emit_insn (gen (d->target, d->op0, d->op1));
39192 return true;
39195 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39196 a single vector permutation using a single intra-lane vector
39197 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39198 the non-swapped and swapped vectors together. */
39200 static bool
39201 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39203 struct expand_vec_perm_d dfirst, dsecond;
39204 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39205 rtx seq;
39206 bool ok;
39207 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39209 if (!TARGET_AVX
39210 || TARGET_AVX2
39211 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39212 || !d->one_operand_p)
39213 return false;
39215 dfirst = *d;
39216 for (i = 0; i < nelt; i++)
39217 dfirst.perm[i] = 0xff;
39218 for (i = 0, msk = 0; i < nelt; i++)
39220 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39221 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39222 return false;
39223 dfirst.perm[j] = d->perm[i];
39224 if (j != i)
39225 msk |= (1 << i);
39227 for (i = 0; i < nelt; i++)
39228 if (dfirst.perm[i] == 0xff)
39229 dfirst.perm[i] = i;
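/* Here dfirst is an intra-lane permutation: each requested element has been
placed in the lane its source element lives in. msk has a bit set for every
result position that must instead come from the lane-swapped copy, and is
used below as the vblend* immediate, where set bits select the second blend
operand. */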
39231 if (!d->testing_p)
39232 dfirst.target = gen_reg_rtx (dfirst.vmode);
39234 start_sequence ();
39235 ok = expand_vec_perm_1 (&dfirst);
39236 seq = get_insns ();
39237 end_sequence ();
39239 if (!ok)
39240 return false;
39242 if (d->testing_p)
39243 return true;
39245 emit_insn (seq);
39247 dsecond = *d;
39248 dsecond.op0 = dfirst.target;
39249 dsecond.op1 = dfirst.target;
39250 dsecond.one_operand_p = true;
39251 dsecond.target = gen_reg_rtx (dsecond.vmode);
39252 for (i = 0; i < nelt; i++)
39253 dsecond.perm[i] = i ^ nelt2;
39255 ok = expand_vec_perm_1 (&dsecond);
39256 gcc_assert (ok);
39258 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39259 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39260 return true;
39263 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39264 permutation using two vperm2f128, followed by a vshufpd insn blending
39265 the two vectors together. */
39267 static bool
39268 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39270 struct expand_vec_perm_d dfirst, dsecond, dthird;
39271 bool ok;
39273 if (!TARGET_AVX || (d->vmode != V4DFmode))
39274 return false;
39276 if (d->testing_p)
39277 return true;
39279 dfirst = *d;
39280 dsecond = *d;
39281 dthird = *d;
39283 dfirst.perm[0] = (d->perm[0] & ~1);
39284 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39285 dfirst.perm[2] = (d->perm[2] & ~1);
39286 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39287 dsecond.perm[0] = (d->perm[1] & ~1);
39288 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39289 dsecond.perm[2] = (d->perm[3] & ~1);
39290 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39291 dthird.perm[0] = (d->perm[0] % 2);
39292 dthird.perm[1] = (d->perm[1] % 2) + 4;
39293 dthird.perm[2] = (d->perm[2] % 2) + 2;
39294 dthird.perm[3] = (d->perm[3] % 2) + 6;
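/* dfirst gathers the 128-bit lane containing d->perm[0] into the low lane
and the lane containing d->perm[2] into the high lane; dsecond does the same
for d->perm[1] and d->perm[3]. dthird is then an in-lane vshufpd taking its
even elements from dfirst and its odd elements from dsecond. */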
39296 dfirst.target = gen_reg_rtx (dfirst.vmode);
39297 dsecond.target = gen_reg_rtx (dsecond.vmode);
39298 dthird.op0 = dfirst.target;
39299 dthird.op1 = dsecond.target;
39300 dthird.one_operand_p = false;
39302 canonicalize_perm (&dfirst);
39303 canonicalize_perm (&dsecond);
39305 ok = expand_vec_perm_1 (&dfirst)
39306 && expand_vec_perm_1 (&dsecond)
39307 && expand_vec_perm_1 (&dthird);
39309 gcc_assert (ok);
39311 return true;
39314 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39315 permutation with two pshufb insns and an ior. We should have already
39316 failed all two instruction sequences. */
39318 static bool
39319 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39321 rtx rperm[2][16], vperm, l, h, op, m128;
39322 unsigned int i, nelt, eltsz;
39324 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39325 return false;
39326 gcc_assert (!d->one_operand_p);
39328 nelt = d->nelt;
39329 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39331 /* Generate two permutation masks. If the required element is within
39332 the given vector it is shuffled into the proper lane. If the required
39333 element is in the other vector, force a zero into the lane by setting
39334 bit 7 in the permutation mask. */
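/* E.g. for a V8HImode extract-even permutation { 0 2 4 6 8 10 12 14 }
(eltsz == 2) the first mask becomes { 0 1 4 5 8 9 12 13 -128 ... -128 } and
the second { -128 ... -128 0 1 4 5 8 9 12 13 }. */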
39335 m128 = GEN_INT (-128);
39336 for (i = 0; i < nelt; ++i)
39338 unsigned j, e = d->perm[i];
39339 unsigned which = (e >= nelt);
39340 if (e >= nelt)
39341 e -= nelt;
39343 for (j = 0; j < eltsz; ++j)
39345 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39346 rperm[1-which][i*eltsz + j] = m128;
39350 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39351 vperm = force_reg (V16QImode, vperm);
39353 l = gen_reg_rtx (V16QImode);
39354 op = gen_lowpart (V16QImode, d->op0);
39355 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39357 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39358 vperm = force_reg (V16QImode, vperm);
39360 h = gen_reg_rtx (V16QImode);
39361 op = gen_lowpart (V16QImode, d->op1);
39362 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39364 op = gen_lowpart (V16QImode, d->target);
39365 emit_insn (gen_iorv16qi3 (op, l, h));
39367 return true;
39370 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39371 with two vpshufb insns, vpermq and vpor. We should have already failed
39372 all two or three instruction sequences. */
39374 static bool
39375 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39377 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39378 unsigned int i, nelt, eltsz;
39380 if (!TARGET_AVX2
39381 || !d->one_operand_p
39382 || (d->vmode != V32QImode && d->vmode != V16HImode))
39383 return false;
39385 if (d->testing_p)
39386 return true;
39388 nelt = d->nelt;
39389 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39391 /* Generate two permutation masks. If the required element is within
39392 the same lane, it is shuffled in. If the required element is from the
39393 other lane, force a zero by setting bit 7 in the permutation mask.
39394 The other mask has non-negative elements where the element is requested
39395 from the other lane, but the entry is also moved to the other lane,
39396 so that the result of vpshufb can have the two V2TImode halves
39397 swapped. */
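/* Below, WHICH is 16 (one 128-bit lane, in bytes) when d->perm[i] lives in
the other lane than position i; such entries go into the second mask at the
lane-swapped byte position, so that after the vpermq lane swap they line up
at position i. */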
39398 m128 = GEN_INT (-128);
39399 for (i = 0; i < nelt; ++i)
39401 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39402 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39404 for (j = 0; j < eltsz; ++j)
39406 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39407 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39411 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39412 vperm = force_reg (V32QImode, vperm);
39414 h = gen_reg_rtx (V32QImode);
39415 op = gen_lowpart (V32QImode, d->op0);
39416 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39418 /* Swap the 128-bit lanes of h into hp. */
39419 hp = gen_reg_rtx (V4DImode);
39420 op = gen_lowpart (V4DImode, h);
39421 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39422 const1_rtx));
39424 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39425 vperm = force_reg (V32QImode, vperm);
39427 l = gen_reg_rtx (V32QImode);
39428 op = gen_lowpart (V32QImode, d->op0);
39429 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39431 op = gen_lowpart (V32QImode, d->target);
39432 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39434 return true;
39437 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39438 and extract-odd permutations of two V32QImode or V16HImode operands
39439 with two vpshufb insns, vpor and vpermq. We should have already
39440 failed all two or three instruction sequences. */
39442 static bool
39443 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39445 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39446 unsigned int i, nelt, eltsz;
39448 if (!TARGET_AVX2
39449 || d->one_operand_p
39450 || (d->vmode != V32QImode && d->vmode != V16HImode))
39451 return false;
39453 for (i = 0; i < d->nelt; ++i)
39454 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39455 return false;
39457 if (d->testing_p)
39458 return true;
39460 nelt = d->nelt;
39461 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39463 /* Generate two permutation masks. In the first permutation mask
39464 the first quarter will contain indexes for the first half
39465 of the op0, the second quarter will contain bit 7 set, third quarter
39466 will contain indexes for the second half of the op0 and the
39467 last quarter bit 7 set. In the second permutation mask
39468 the first quarter will contain bit 7 set, the second quarter
39469 indexes for the first half of the op1, the third quarter bit 7 set
39470 and last quarter indexes for the second half of the op1.
39471 I.e. the first mask e.g. for V32QImode extract even will be:
39472 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39473 (all values masked with 0xf except for -128) and second mask
39474 for extract even will be
39475 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39476 m128 = GEN_INT (-128);
39477 for (i = 0; i < nelt; ++i)
39479 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39480 unsigned which = d->perm[i] >= nelt;
39481 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39483 for (j = 0; j < eltsz; ++j)
39485 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39486 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39490 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39491 vperm = force_reg (V32QImode, vperm);
39493 l = gen_reg_rtx (V32QImode);
39494 op = gen_lowpart (V32QImode, d->op0);
39495 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39497 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39498 vperm = force_reg (V32QImode, vperm);
39500 h = gen_reg_rtx (V32QImode);
39501 op = gen_lowpart (V32QImode, d->op1);
39502 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39504 ior = gen_reg_rtx (V32QImode);
39505 emit_insn (gen_iorv32qi3 (ior, l, h));
39507 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39508 op = gen_lowpart (V4DImode, d->target);
39509 ior = gen_lowpart (V4DImode, ior);
39510 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39511 const1_rtx, GEN_INT (3)));
39513 return true;
39516 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
39517 and extract-odd permutations. */
39519 static bool
39520 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39522 rtx t1, t2, t3;
39524 switch (d->vmode)
39526 case V4DFmode:
39527 t1 = gen_reg_rtx (V4DFmode);
39528 t2 = gen_reg_rtx (V4DFmode);
39530 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39531 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39532 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39534 /* Now an unpck[lh]pd will produce the result required. */
39535 if (odd)
39536 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39537 else
39538 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39539 emit_insn (t3);
39540 break;
39542 case V8SFmode:
39544 int mask = odd ? 0xdd : 0x88;
39546 t1 = gen_reg_rtx (V8SFmode);
39547 t2 = gen_reg_rtx (V8SFmode);
39548 t3 = gen_reg_rtx (V8SFmode);
39550 /* Shuffle within the 128-bit lanes to produce:
39551 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39552 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39553 GEN_INT (mask)));
39555 /* Shuffle the lanes around to produce:
39556 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39557 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39558 GEN_INT (0x3)));
39560 /* Shuffle within the 128-bit lanes to produce:
39561 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39562 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39564 /* Shuffle within the 128-bit lanes to produce:
39565 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39566 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39568 /* Shuffle the lanes around to produce:
39569 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39570 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39571 GEN_INT (0x20)));
39573 break;
39575 case V2DFmode:
39576 case V4SFmode:
39577 case V2DImode:
39578 case V4SImode:
39579 /* These are always directly implementable by expand_vec_perm_1. */
39580 gcc_unreachable ();
39582 case V8HImode:
39583 if (TARGET_SSSE3)
39584 return expand_vec_perm_pshufb2 (d);
39585 else
39587 /* We need 2*log2(N)-1 operations to achieve odd/even
39588 with interleave. */
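/* E.g. with d->op0 = { 0 1 2 3 4 5 6 7 } and d->op1 = { 8 9 a b c d e f }:
t1 = { 4 c 5 d 6 e 7 f }, target = { 0 8 1 9 2 a 3 b }, then
t2 = { 2 6 a e 3 7 b f }, target = { 0 4 8 c 1 5 9 d }, and the final
interleave yields { 0 2 4 6 8 a c e } (even) or { 1 3 5 7 9 b d f } (odd). */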
39589 t1 = gen_reg_rtx (V8HImode);
39590 t2 = gen_reg_rtx (V8HImode);
39591 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39592 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39593 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39594 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39595 if (odd)
39596 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39597 else
39598 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39599 emit_insn (t3);
39601 break;
39603 case V16QImode:
39604 if (TARGET_SSSE3)
39605 return expand_vec_perm_pshufb2 (d);
39606 else
39608 t1 = gen_reg_rtx (V16QImode);
39609 t2 = gen_reg_rtx (V16QImode);
39610 t3 = gen_reg_rtx (V16QImode);
39611 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39612 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39613 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39614 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39615 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39616 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
39617 if (odd)
39618 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
39619 else
39620 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
39621 emit_insn (t3);
39623 break;
39625 case V16HImode:
39626 case V32QImode:
39627 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
39629 case V4DImode:
39630 if (!TARGET_AVX2)
39632 struct expand_vec_perm_d d_copy = *d;
39633 d_copy.vmode = V4DFmode;
39634 d_copy.target = gen_lowpart (V4DFmode, d->target);
39635 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
39636 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39637 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39640 t1 = gen_reg_rtx (V4DImode);
39641 t2 = gen_reg_rtx (V4DImode);
39643 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39644 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39645 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39647 /* Now a vpunpck[lh]qdq will produce the result required. */
39648 if (odd)
39649 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39650 else
39651 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39652 emit_insn (t3);
39653 break;
39655 case V8SImode:
39656 if (!TARGET_AVX2)
39658 struct expand_vec_perm_d d_copy = *d;
39659 d_copy.vmode = V8SFmode;
39660 d_copy.target = gen_lowpart (V8SFmode, d->target);
39661 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39662 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39663 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39666 t1 = gen_reg_rtx (V8SImode);
39667 t2 = gen_reg_rtx (V8SImode);
39669 /* Shuffle the lanes around into
39670 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39671 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39672 gen_lowpart (V4DImode, d->op0),
39673 gen_lowpart (V4DImode, d->op1),
39674 GEN_INT (0x20)));
39675 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39676 gen_lowpart (V4DImode, d->op0),
39677 gen_lowpart (V4DImode, d->op1),
39678 GEN_INT (0x31)));
39680 /* Swap the 2nd and 3rd position in each lane into
39681 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39682 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39683 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39684 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39685 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39687 /* Now a vpunpck[lh]qdq will produce
39688 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
39689 if (odd)
39690 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
39691 gen_lowpart (V4DImode, t1),
39692 gen_lowpart (V4DImode, t2));
39693 else
39694 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
39695 gen_lowpart (V4DImode, t1),
39696 gen_lowpart (V4DImode, t2));
39697 emit_insn (t3);
39698 break;
39700 default:
39701 gcc_unreachable ();
39704 return true;
39707 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39708 extract-even and extract-odd permutations. */
39710 static bool
39711 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
39713 unsigned i, odd, nelt = d->nelt;
39715 odd = d->perm[0];
39716 if (odd != 0 && odd != 1)
39717 return false;
39719 for (i = 1; i < nelt; ++i)
39720 if (d->perm[i] != 2 * i + odd)
39721 return false;
39723 return expand_vec_perm_even_odd_1 (d, odd);
39726 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
39727 permutations. We assume that expand_vec_perm_1 has already failed. */
39729 static bool
39730 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
39732 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
39733 enum machine_mode vmode = d->vmode;
39734 unsigned char perm2[4];
39735 rtx op0 = d->op0;
39736 bool ok;
39738 switch (vmode)
39740 case V4DFmode:
39741 case V8SFmode:
39742 /* These are special-cased in sse.md so that we can optionally
39743 use the vbroadcast instruction. They expand to two insns
39744 if the input happens to be in a register. */
39745 gcc_unreachable ();
39747 case V2DFmode:
39748 case V2DImode:
39749 case V4SFmode:
39750 case V4SImode:
39751 /* These are always implementable using standard shuffle patterns. */
39752 gcc_unreachable ();
39754 case V8HImode:
39755 case V16QImode:
39756 /* These can be implemented via interleave. We save one insn by
39757 stopping once we have promoted to V4SImode and then using pshufd. */
39760 rtx dest;
39761 rtx (*gen) (rtx, rtx, rtx)
39762 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
39763 : gen_vec_interleave_lowv8hi;
39765 if (elt >= nelt2)
39767 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
39768 : gen_vec_interleave_highv8hi;
39769 elt -= nelt2;
39771 nelt2 /= 2;
39773 dest = gen_reg_rtx (vmode);
39774 emit_insn (gen (dest, op0, op0));
39775 vmode = get_mode_wider_vector (vmode);
39776 op0 = gen_lowpart (vmode, dest);
39778 while (vmode != V4SImode);
39780 memset (perm2, elt, 4);
39781 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
39782 d->testing_p);
39783 gcc_assert (ok);
39784 return true;
39786 case V32QImode:
39787 case V16HImode:
39788 case V8SImode:
39789 case V4DImode:
39790 /* For AVX2 broadcasts of the first element vpbroadcast* or
39791 vpermq should be used by expand_vec_perm_1. */
39792 gcc_assert (!TARGET_AVX2 || d->perm[0]);
39793 return false;
39795 default:
39796 gcc_unreachable ();
39800 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39801 broadcast permutations. */
39803 static bool
39804 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
39806 unsigned i, elt, nelt = d->nelt;
39808 if (!d->one_operand_p)
39809 return false;
39811 elt = d->perm[0];
39812 for (i = 1; i < nelt; ++i)
39813 if (d->perm[i] != elt)
39814 return false;
39816 return expand_vec_perm_broadcast_1 (d);
39819 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
39820 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
39821 all the shorter instruction sequences. */
39823 static bool
39824 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
39826 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
39827 unsigned int i, nelt, eltsz;
39828 bool used[4];
39830 if (!TARGET_AVX2
39831 || d->one_operand_p
39832 || (d->vmode != V32QImode && d->vmode != V16HImode))
39833 return false;
39835 if (d->testing_p)
39836 return true;
39838 nelt = d->nelt;
39839 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39841 /* Generate 4 permutation masks. If the required element is within
39842 the same lane, it is shuffled in. If the required element is from the
39843 other lane, force a zero by setting bit 7 in the permutation mask.
39844 The cross-lane masks have non-negative elements where the element is
39845 requested from the other lane, but the entry is also moved to the other lane,
39846 so that the result of vpshufb can have the two V2TImode halves
39847 swapped. */
39848 m128 = GEN_INT (-128);
39849 for (i = 0; i < 32; ++i)
39851 rperm[0][i] = m128;
39852 rperm[1][i] = m128;
39853 rperm[2][i] = m128;
39854 rperm[3][i] = m128;
39856 used[0] = false;
39857 used[1] = false;
39858 used[2] = false;
39859 used[3] = false;
39860 for (i = 0; i < nelt; ++i)
39862 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39863 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39864 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
39866 for (j = 0; j < eltsz; ++j)
39867 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
39868 used[which] = true;
39871 for (i = 0; i < 2; ++i)
39873 if (!used[2 * i + 1])
39875 h[i] = NULL_RTX;
39876 continue;
39878 vperm = gen_rtx_CONST_VECTOR (V32QImode,
39879 gen_rtvec_v (32, rperm[2 * i + 1]));
39880 vperm = force_reg (V32QImode, vperm);
39881 h[i] = gen_reg_rtx (V32QImode);
39882 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39883 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
39886 /* Swap the 128-bit lanes of h[X]. */
39887 for (i = 0; i < 2; ++i)
39889 if (h[i] == NULL_RTX)
39890 continue;
39891 op = gen_reg_rtx (V4DImode);
39892 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
39893 const2_rtx, GEN_INT (3), const0_rtx,
39894 const1_rtx));
39895 h[i] = gen_lowpart (V32QImode, op);
39898 for (i = 0; i < 2; ++i)
39900 if (!used[2 * i])
39902 l[i] = NULL_RTX;
39903 continue;
39905 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
39906 vperm = force_reg (V32QImode, vperm);
39907 l[i] = gen_reg_rtx (V32QImode);
39908 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39909 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
39912 for (i = 0; i < 2; ++i)
39914 if (h[i] && l[i])
39916 op = gen_reg_rtx (V32QImode);
39917 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
39918 l[i] = op;
39920 else if (h[i])
39921 l[i] = h[i];
39924 gcc_assert (l[0] && l[1]);
39925 op = gen_lowpart (V32QImode, d->target);
39926 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
39927 return true;
39930 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
39931 With all of the interface bits taken care of, perform the expansion
39932 in D and return true on success. */
39934 static bool
39935 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
39937 /* Try a single instruction expansion. */
39938 if (expand_vec_perm_1 (d))
39939 return true;
39941 /* Try sequences of two instructions. */
39943 if (expand_vec_perm_pshuflw_pshufhw (d))
39944 return true;
39946 if (expand_vec_perm_palignr (d))
39947 return true;
39949 if (expand_vec_perm_interleave2 (d))
39950 return true;
39952 if (expand_vec_perm_broadcast (d))
39953 return true;
39955 if (expand_vec_perm_vpermq_perm_1 (d))
39956 return true;
39958 if (expand_vec_perm_vperm2f128 (d))
39959 return true;
39961 /* Try sequences of three instructions. */
39963 if (expand_vec_perm_2vperm2f128_vshuf (d))
39964 return true;
39966 if (expand_vec_perm_pshufb2 (d))
39967 return true;
39969 if (expand_vec_perm_interleave3 (d))
39970 return true;
39972 if (expand_vec_perm_vperm2f128_vblend (d))
39973 return true;
39975 /* Try sequences of four instructions. */
39977 if (expand_vec_perm_vpshufb2_vpermq (d))
39978 return true;
39980 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
39981 return true;
39983 /* ??? Look for narrow permutations whose element orderings would
39984 allow the promotion to a wider mode. */
39986 /* ??? Look for sequences of interleave or a wider permute that place
39987 the data into the correct lanes for a half-vector shuffle like
39988 pshuf[lh]w or vpermilps. */
39990 /* ??? Look for sequences of interleave that produce the desired results.
39991 The combinatorics of punpck[lh] get pretty ugly... */
39993 if (expand_vec_perm_even_odd (d))
39994 return true;
39996 /* Even longer sequences. */
39997 if (expand_vec_perm_vpshufb4_vpermq2 (d))
39998 return true;
40000 return false;
40003 /* If a permutation only uses one operand, make it clear. Returns true
40004 if the permutation references both operands. */
40006 static bool
40007 canonicalize_perm (struct expand_vec_perm_d *d)
40009 int i, which, nelt = d->nelt;
40011 for (i = which = 0; i < nelt; ++i)
40012 which |= (d->perm[i] < nelt ? 1 : 2);
40014 d->one_operand_p = true;
40015 switch (which)
40017 default:
40018 gcc_unreachable();
40020 case 3:
40021 if (!rtx_equal_p (d->op0, d->op1))
40023 d->one_operand_p = false;
40024 break;
40026 /* The elements of PERM do not suggest that only the first operand
40027 is used, but both operands are identical. Allow easier matching
40028 of the permutation by folding the permutation into the single
40029 input vector. */
40030 /* FALLTHRU */
40032 case 2:
40033 for (i = 0; i < nelt; ++i)
40034 d->perm[i] &= nelt - 1;
40035 d->op0 = d->op1;
40036 break;
40038 case 1:
40039 d->op1 = d->op0;
40040 break;
40043 return (which == 3);
40046 bool
40047 ix86_expand_vec_perm_const (rtx operands[4])
40049 struct expand_vec_perm_d d;
40050 unsigned char perm[MAX_VECT_LEN];
40051 int i, nelt;
40052 bool two_args;
40053 rtx sel;
40055 d.target = operands[0];
40056 d.op0 = operands[1];
40057 d.op1 = operands[2];
40058 sel = operands[3];
40060 d.vmode = GET_MODE (d.target);
40061 gcc_assert (VECTOR_MODE_P (d.vmode));
40062 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40063 d.testing_p = false;
40065 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40066 gcc_assert (XVECLEN (sel, 0) == nelt);
40067 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40069 for (i = 0; i < nelt; ++i)
40071 rtx e = XVECEXP (sel, 0, i);
40072 int ei = INTVAL (e) & (2 * nelt - 1);
40073 d.perm[i] = ei;
40074 perm[i] = ei;
40077 two_args = canonicalize_perm (&d);
40079 if (ix86_expand_vec_perm_const_1 (&d))
40080 return true;
40082 /* If the selector says both arguments are needed, but the operands are the
40083 same, the above tried to expand with one_operand_p and flattened selector.
40084 If that didn't work, retry without one_operand_p; we succeeded with that
40085 during testing. */
40086 if (two_args && d.one_operand_p)
40088 d.one_operand_p = false;
40089 memcpy (d.perm, perm, sizeof (perm));
40090 return ix86_expand_vec_perm_const_1 (&d);
40093 return false;
40096 /* Implement targetm.vectorize.vec_perm_const_ok. */
40098 static bool
40099 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40100 const unsigned char *sel)
40102 struct expand_vec_perm_d d;
40103 unsigned int i, nelt, which;
40104 bool ret;
40106 d.vmode = vmode;
40107 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40108 d.testing_p = true;
40110 /* Given sufficient ISA support we can just return true here
40111 for selected vector modes. */
40112 if (GET_MODE_SIZE (d.vmode) == 16)
40114 /* All implementable with a single vpperm insn. */
40115 if (TARGET_XOP)
40116 return true;
40117 /* All implementable with 2 pshufb + 1 ior. */
40118 if (TARGET_SSSE3)
40119 return true;
40120 /* All implementable with shufpd or unpck[lh]pd. */
40121 if (d.nelt == 2)
40122 return true;
40125 /* Extract the values from the vector CST into the permutation
40126 array in D. */
40127 memcpy (d.perm, sel, nelt);
40128 for (i = which = 0; i < nelt; ++i)
40130 unsigned char e = d.perm[i];
40131 gcc_assert (e < 2 * nelt);
40132 which |= (e < nelt ? 1 : 2);
40135 /* For all elements from second vector, fold the elements to first. */
40136 if (which == 2)
40137 for (i = 0; i < nelt; ++i)
40138 d.perm[i] -= nelt;
40140 /* Check whether the mask can be applied to the vector type. */
40141 d.one_operand_p = (which != 3);
40143 /* Implementable with shufps or pshufd. */
40144 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40145 return true;
40147 /* Otherwise we have to go through the motions and see if we can
40148 figure out how to generate the requested permutation. */
40149 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40150 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40151 if (!d.one_operand_p)
40152 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40154 start_sequence ();
40155 ret = ix86_expand_vec_perm_const_1 (&d);
40156 end_sequence ();
40158 return ret;
40161 void
40162 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40164 struct expand_vec_perm_d d;
40165 unsigned i, nelt;
40167 d.target = targ;
40168 d.op0 = op0;
40169 d.op1 = op1;
40170 d.vmode = GET_MODE (targ);
40171 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40172 d.one_operand_p = false;
40173 d.testing_p = false;
40175 for (i = 0; i < nelt; ++i)
40176 d.perm[i] = i * 2 + odd;
40178 /* We'll either be able to implement the permutation directly... */
40179 if (expand_vec_perm_1 (&d))
40180 return;
40182 /* ... or we use the special-case patterns. */
40183 expand_vec_perm_even_odd_1 (&d, odd);
40186 static void
40187 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40189 struct expand_vec_perm_d d;
40190 unsigned i, nelt, base;
40191 bool ok;
40193 d.target = targ;
40194 d.op0 = op0;
40195 d.op1 = op1;
40196 d.vmode = GET_MODE (targ);
40197 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40198 d.one_operand_p = false;
40199 d.testing_p = false;
40201 base = high_p ? nelt / 2 : 0;
40202 for (i = 0; i < nelt / 2; ++i)
40204 d.perm[i * 2] = i + base;
40205 d.perm[i * 2 + 1] = i + base + nelt;
40208 /* Note that for AVX this isn't one instruction. */
40209 ok = ix86_expand_vec_perm_const_1 (&d);
40210 gcc_assert (ok);
40214 /* Expand a vector operation CODE for a V*QImode in terms of the
40215 same operation on V*HImode. */
40217 void
40218 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40220 enum machine_mode qimode = GET_MODE (dest);
40221 enum machine_mode himode;
40222 rtx (*gen_il) (rtx, rtx, rtx);
40223 rtx (*gen_ih) (rtx, rtx, rtx);
40224 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40225 struct expand_vec_perm_d d;
40226 bool ok, full_interleave;
40227 bool uns_p = false;
40228 int i;
40230 switch (qimode)
40232 case V16QImode:
40233 himode = V8HImode;
40234 gen_il = gen_vec_interleave_lowv16qi;
40235 gen_ih = gen_vec_interleave_highv16qi;
40236 break;
40237 case V32QImode:
40238 himode = V16HImode;
40239 gen_il = gen_avx2_interleave_lowv32qi;
40240 gen_ih = gen_avx2_interleave_highv32qi;
40241 break;
40242 default:
40243 gcc_unreachable ();
40246 op2_l = op2_h = op2;
40247 switch (code)
40249 case MULT:
40250 /* Unpack data such that we've got a source byte in each low byte of
40251 each word. We don't care what goes into the high byte of each word.
40252 Rather than trying to get zero in there, it is most convenient to let
40253 it be a copy of the low byte. */
40254 op2_l = gen_reg_rtx (qimode);
40255 op2_h = gen_reg_rtx (qimode);
40256 emit_insn (gen_il (op2_l, op2, op2));
40257 emit_insn (gen_ih (op2_h, op2, op2));
40258 /* FALLTHRU */
40260 op1_l = gen_reg_rtx (qimode);
40261 op1_h = gen_reg_rtx (qimode);
40262 emit_insn (gen_il (op1_l, op1, op1));
40263 emit_insn (gen_ih (op1_h, op1, op1));
40264 full_interleave = qimode == V16QImode;
40265 break;
40267 case ASHIFT:
40268 case LSHIFTRT:
40269 uns_p = true;
40270 /* FALLTHRU */
40271 case ASHIFTRT:
40272 op1_l = gen_reg_rtx (himode);
40273 op1_h = gen_reg_rtx (himode);
40274 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40275 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40276 full_interleave = true;
40277 break;
40278 default:
40279 gcc_unreachable ();
40282 /* Perform the operation. */
40283 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40284 1, OPTAB_DIRECT);
40285 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40286 1, OPTAB_DIRECT);
40287 gcc_assert (res_l && res_h);
40289 /* Merge the data back into the right place. */
40290 d.target = dest;
40291 d.op0 = gen_lowpart (qimode, res_l);
40292 d.op1 = gen_lowpart (qimode, res_h);
40293 d.vmode = qimode;
40294 d.nelt = GET_MODE_NUNITS (qimode);
40295 d.one_operand_p = false;
40296 d.testing_p = false;
40298 if (full_interleave)
40300 /* For SSE2, we used a full interleave, so the desired
40301 results are in the even elements. */
40302 for (i = 0; i < 32; ++i)
40303 d.perm[i] = i * 2;
40305 else
40307 /* For AVX, the interleave used above was not cross-lane. So the
40308 extraction takes the even elements, but with the second and third quarters swapped.
40309 Happily, that is even one insn shorter than plain even extraction. */
40310 for (i = 0; i < 32; ++i)
40311 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
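/* I.e. d.perm selects, in order, the even bytes of the low lane of res_l,
the low lane of res_h, the high lane of res_l and the high lane of res_h. */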
40314 ok = ix86_expand_vec_perm_const_1 (&d);
40315 gcc_assert (ok);
40317 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40318 gen_rtx_fmt_ee (code, qimode, op1, op2));
40321 void
40322 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40323 bool uns_p, bool odd_p)
40325 enum machine_mode mode = GET_MODE (op1);
40326 enum machine_mode wmode = GET_MODE (dest);
40327 rtx x;
40329 /* We only play even/odd games with vectors of SImode. */
40330 gcc_assert (mode == V4SImode || mode == V8SImode);
40332 /* If we're looking for the odd results, shift those members down to
40333 the even slots. For some cpus this is faster than a PSHUFD. */
40334 if (odd_p)
40336 if (TARGET_XOP && mode == V4SImode)
40338 x = force_reg (wmode, CONST0_RTX (wmode));
40339 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40340 return;
40343 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40344 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40345 x, NULL, 1, OPTAB_DIRECT);
40346 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40347 x, NULL, 1, OPTAB_DIRECT);
40348 op1 = gen_lowpart (mode, op1);
40349 op2 = gen_lowpart (mode, op2);
40352 if (mode == V8SImode)
40354 if (uns_p)
40355 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40356 else
40357 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40359 else if (uns_p)
40360 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40361 else if (TARGET_SSE4_1)
40362 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40363 else
40365 rtx s1, s2, t0, t1, t2;
40367 /* The easiest way to implement this without PMULDQ is to go through
40368 the motions as if we are performing a full 64-bit multiply, except
40369 that we need to do less shuffling of the elements. */
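/* Writing each signed element as lo + 2^32 * hi with hi in { 0, -1 }, the
64-bit product is lo(A)*lo(B) + ((hi(A)*lo(B) + lo(A)*hi(B)) << 32); the
2^64 cross term vanishes modulo 2^64. The three unsigned even-multiplies
and the shift below compute exactly this. */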
40371 /* Compute the sign-extension, aka highparts, of the two operands. */
40372 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40373 op1, pc_rtx, pc_rtx);
40374 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40375 op2, pc_rtx, pc_rtx);
40377 /* Multiply LO(A) * HI(B), and vice-versa. */
40378 t1 = gen_reg_rtx (wmode);
40379 t2 = gen_reg_rtx (wmode);
40380 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40381 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40383 /* Multiply LO(A) * LO(B). */
40384 t0 = gen_reg_rtx (wmode);
40385 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40387 /* Combine and shift the highparts into place. */
40388 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40389 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40390 1, OPTAB_DIRECT);
40392 /* Combine high and low parts. */
40393 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40394 return;
40396 emit_insn (x);
40399 void
40400 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40401 bool uns_p, bool high_p)
40403 enum machine_mode wmode = GET_MODE (dest);
40404 enum machine_mode mode = GET_MODE (op1);
40405 rtx t1, t2, t3, t4, mask;
40407 switch (mode)
40409 case V4SImode:
40410 t1 = gen_reg_rtx (mode);
40411 t2 = gen_reg_rtx (mode);
40412 if (TARGET_XOP && !uns_p)
40414 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40415 shuffle the elements once so that all elements are in the right
40416 place for immediate use: { A C B D }. */
40417 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40418 const1_rtx, GEN_INT (3)));
40419 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40420 const1_rtx, GEN_INT (3)));
40422 else
40424 /* Put the elements into place for the multiply. */
40425 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40426 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40427 high_p = false;
40429 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40430 break;
40432 case V8SImode:
40433 /* Shuffle the elements between the lanes. After this we
40434 have { A B E F | C D G H } for each operand. */
40435 t1 = gen_reg_rtx (V4DImode);
40436 t2 = gen_reg_rtx (V4DImode);
40437 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40438 const0_rtx, const2_rtx,
40439 const1_rtx, GEN_INT (3)));
40440 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40441 const0_rtx, const2_rtx,
40442 const1_rtx, GEN_INT (3)));
40444 /* Shuffle the elements within the lanes. After this we
40445 have { A A B B | C C D D } or { E E F F | G G H H }. */
40446 t3 = gen_reg_rtx (V8SImode);
40447 t4 = gen_reg_rtx (V8SImode);
40448 mask = GEN_INT (high_p
40449 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40450 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
40451 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40452 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40454 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40455 break;
40457 case V8HImode:
40458 case V16HImode:
40459 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40460 uns_p, OPTAB_DIRECT);
40461 t2 = expand_binop (mode,
40462 uns_p ? umul_highpart_optab : smul_highpart_optab,
40463 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40464 gcc_assert (t1 && t2);
40466 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40467 break;
40469 case V16QImode:
40470 case V32QImode:
40471 t1 = gen_reg_rtx (wmode);
40472 t2 = gen_reg_rtx (wmode);
40473 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40474 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40476 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40477 break;
40479 default:
40480 gcc_unreachable ();
40484 void
40485 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40487 rtx res_1, res_2;
40489 res_1 = gen_reg_rtx (V4SImode);
40490 res_2 = gen_reg_rtx (V4SImode);
40491 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40492 op1, op2, true, false);
40493 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40494 op1, op2, true, true);
40496 /* Move the results in element 2 down to element 1; we don't care
40497 what goes in elements 2 and 3. Then we can merge the parts
40498 back together with an interleave.
40500 Note that two other sequences were tried:
40501 (1) Use interleaves at the start instead of psrldq, which allows
40502 us to use a single shufps to merge things back at the end.
40503 (2) Use shufps here to combine the two vectors, then pshufd to
40504 put the elements in the correct order.
40505 In both cases the cost of the reformatting stall was too high
40506 and the overall sequence slower. */
40508 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40509 const0_rtx, const0_rtx));
40510 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40511 const0_rtx, const0_rtx));
40512 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40514 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40517 void
40518 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40520 enum machine_mode mode = GET_MODE (op0);
40521 rtx t1, t2, t3, t4, t5, t6;
40523 if (TARGET_XOP && mode == V2DImode)
40525 /* op1: A,B,C,D, op2: E,F,G,H */
40526 op1 = gen_lowpart (V4SImode, op1);
40527 op2 = gen_lowpart (V4SImode, op2);
40529 t1 = gen_reg_rtx (V4SImode);
40530 t2 = gen_reg_rtx (V4SImode);
40531 t3 = gen_reg_rtx (V2DImode);
40532 t4 = gen_reg_rtx (V2DImode);
40534 /* t1: B,A,D,C */
40535 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40536 GEN_INT (1),
40537 GEN_INT (0),
40538 GEN_INT (3),
40539 GEN_INT (2)));
40541 /* t2: (B*E),(A*F),(D*G),(C*H) */
40542 emit_insn (gen_mulv4si3 (t2, t1, op2));
40544 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40545 emit_insn (gen_xop_phadddq (t3, t2));
40547 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40548 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40550 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40551 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40553 else
40555 enum machine_mode nmode;
40556 rtx (*umul) (rtx, rtx, rtx);
40558 if (mode == V2DImode)
40560 umul = gen_vec_widen_umult_even_v4si;
40561 nmode = V4SImode;
40563 else if (mode == V4DImode)
40565 umul = gen_vec_widen_umult_even_v8si;
40566 nmode = V8SImode;
40568 else
40569 gcc_unreachable ();
40572 /* Multiply low parts. */
40573 t1 = gen_reg_rtx (mode);
40574 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40576 /* Shift input vectors right 32 bits so we can multiply high parts. */
40577 t6 = GEN_INT (32);
40578 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40579 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40581 /* Multiply high parts by low parts. */
40582 t4 = gen_reg_rtx (mode);
40583 t5 = gen_reg_rtx (mode);
40584 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40585 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40587 /* Combine and shift the highparts back. */
40588 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40589 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40591 /* Combine high and low parts. */
40592 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40595 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40596 gen_rtx_MULT (mode, op1, op2));
40599 /* Expand an insert into a vector register through pinsr insn.
40600 Return true if successful. */
40602 bool
40603 ix86_expand_pinsr (rtx *operands)
40605 rtx dst = operands[0];
40606 rtx src = operands[3];
40608 unsigned int size = INTVAL (operands[1]);
40609 unsigned int pos = INTVAL (operands[2]);
40611 if (GET_CODE (dst) == SUBREG)
40613 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40614 dst = SUBREG_REG (dst);
40617 if (GET_CODE (src) == SUBREG)
40618 src = SUBREG_REG (src);
40620 switch (GET_MODE (dst))
40622 case V16QImode:
40623 case V8HImode:
40624 case V4SImode:
40625 case V2DImode:
40627 enum machine_mode srcmode, dstmode;
40628 rtx (*pinsr)(rtx, rtx, rtx, rtx);
40630 srcmode = mode_for_size (size, MODE_INT, 0);
40632 switch (srcmode)
40634 case QImode:
40635 if (!TARGET_SSE4_1)
40636 return false;
40637 dstmode = V16QImode;
40638 pinsr = gen_sse4_1_pinsrb;
40639 break;
40641 case HImode:
40642 if (!TARGET_SSE2)
40643 return false;
40644 dstmode = V8HImode;
40645 pinsr = gen_sse2_pinsrw;
40646 break;
40648 case SImode:
40649 if (!TARGET_SSE4_1)
40650 return false;
40651 dstmode = V4SImode;
40652 pinsr = gen_sse4_1_pinsrd;
40653 break;
40655 case DImode:
40656 gcc_assert (TARGET_64BIT);
40657 if (!TARGET_SSE4_1)
40658 return false;
40659 dstmode = V2DImode;
40660 pinsr = gen_sse4_1_pinsrq;
40661 break;
40663 default:
40664 return false;
40667 dst = gen_lowpart (dstmode, dst);
40668 src = gen_lowpart (srcmode, src);
40670 pos /= size;
40672 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40673 return true;
40676 default:
40677 return false;
40681 /* This function returns the calling-ABI-specific va_list type node
40682 appropriate for FNDECL. */
40684 static tree
40685 ix86_fn_abi_va_list (tree fndecl)
40687 if (!TARGET_64BIT)
40688 return va_list_type_node;
40689 gcc_assert (fndecl != NULL_TREE);
40691 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
40692 return ms_va_list_type_node;
40693 else
40694 return sysv_va_list_type_node;
40697 /* Returns the canonical va_list type specified by TYPE. If there
40698 is no valid TYPE provided, it returns NULL_TREE. */
40700 static tree
40701 ix86_canonical_va_list_type (tree type)
40703 tree wtype, htype;
40705 /* Resolve references and pointers to va_list type. */
40706 if (TREE_CODE (type) == MEM_REF)
40707 type = TREE_TYPE (type);
40708 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
40709 type = TREE_TYPE (type);
40710 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
40711 type = TREE_TYPE (type);
40713 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
40715 wtype = va_list_type_node;
40716 gcc_assert (wtype != NULL_TREE);
40717 htype = type;
40718 if (TREE_CODE (wtype) == ARRAY_TYPE)
40720 /* If va_list is an array type, the argument may have decayed
40721 to a pointer type, e.g. by being passed to another function.
40722 In that case, unwrap both types so that we can compare the
40723 underlying records. */
40724 if (TREE_CODE (htype) == ARRAY_TYPE
40725 || POINTER_TYPE_P (htype))
40727 wtype = TREE_TYPE (wtype);
40728 htype = TREE_TYPE (htype);
40731 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40732 return va_list_type_node;
40733 wtype = sysv_va_list_type_node;
40734 gcc_assert (wtype != NULL_TREE);
40735 htype = type;
40736 if (TREE_CODE (wtype) == ARRAY_TYPE)
40738 /* If va_list is an array type, the argument may have decayed
40739 to a pointer type, e.g. by being passed to another function.
40740 In that case, unwrap both types so that we can compare the
40741 underlying records. */
40742 if (TREE_CODE (htype) == ARRAY_TYPE
40743 || POINTER_TYPE_P (htype))
40745 wtype = TREE_TYPE (wtype);
40746 htype = TREE_TYPE (htype);
40749 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40750 return sysv_va_list_type_node;
40751 wtype = ms_va_list_type_node;
40752 gcc_assert (wtype != NULL_TREE);
40753 htype = type;
40754 if (TREE_CODE (wtype) == ARRAY_TYPE)
40756 /* If va_list is an array type, the argument may have decayed
40757 to a pointer type, e.g. by being passed to another function.
40758 In that case, unwrap both types so that we can compare the
40759 underlying records. */
40760 if (TREE_CODE (htype) == ARRAY_TYPE
40761 || POINTER_TYPE_P (htype))
40763 wtype = TREE_TYPE (wtype);
40764 htype = TREE_TYPE (htype);
40767 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40768 return ms_va_list_type_node;
40769 return NULL_TREE;
40771 return std_canonical_va_list_type (type);
40774 /* Iterate through the target-specific builtin types for va_list.
40775 IDX denotes the iterator, *PTREE is set to the result type of
40776 the va_list builtin, and *PNAME to its internal type.
40777 Returns zero if there is no element for this index, otherwise
40778 IDX should be increased upon the next call.
40779 Note, do not iterate a base builtin's name like __builtin_va_list.
40780 Used from c_common_nodes_and_builtins. */
40782 static int
40783 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
40785 if (TARGET_64BIT)
40787 switch (idx)
40789 default:
40790 break;
40792 case 0:
40793 *ptree = ms_va_list_type_node;
40794 *pname = "__builtin_ms_va_list";
40795 return 1;
40797 case 1:
40798 *ptree = sysv_va_list_type_node;
40799 *pname = "__builtin_sysv_va_list";
40800 return 1;
40804 return 0;
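/* A sketch of how a caller typically walks this hook (assuming the generic
   targetm.enum_va_list_p interface; register_va_list_builtin below is a
   hypothetical placeholder for whatever the front end does with each type):

     const char *name;
     tree type;
     for (int i = 0; targetm.enum_va_list_p (i, &name, &type); i++)
       register_va_list_builtin (name, type);

   Each nonzero return supplies one additional va_list builtin.  */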
40807 #undef TARGET_SCHED_DISPATCH
40808 #define TARGET_SCHED_DISPATCH has_dispatch
40809 #undef TARGET_SCHED_DISPATCH_DO
40810 #define TARGET_SCHED_DISPATCH_DO do_dispatch
40811 #undef TARGET_SCHED_REASSOCIATION_WIDTH
40812 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
40813 #undef TARGET_SCHED_REORDER
40814 #define TARGET_SCHED_REORDER ix86_sched_reorder
40815 #undef TARGET_SCHED_ADJUST_PRIORITY
40816 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
40817 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
40818 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
40820 /* The size of the dispatch window is the total number of bytes of
40821 object code allowed in a window. */
40822 #define DISPATCH_WINDOW_SIZE 16
40824 /* Number of dispatch windows considered for scheduling. */
40825 #define MAX_DISPATCH_WINDOWS 3
40827 /* Maximum number of instructions in a window. */
40828 #define MAX_INSN 4
40830 /* Maximum number of immediate operands in a window. */
40831 #define MAX_IMM 4
40833 /* Maximum number of immediate bits allowed in a window. */
40834 #define MAX_IMM_SIZE 128
40836 /* Maximum number of 32 bit immediates allowed in a window. */
40837 #define MAX_IMM_32 4
40839 /* Maximum number of 64 bit immediates allowed in a window. */
40840 #define MAX_IMM_64 2
40842 /* Maximum total of loads or prefetches allowed in a window. */
40843 #define MAX_LOAD 2
40845 /* Maximum total of stores allowed in a window. */
40846 #define MAX_STORE 1
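/* Sanity arithmetic for the limits above: MAX_IMM_SIZE (128 bits) equals
   MAX_IMM_32 * 32 and also MAX_IMM_64 * 64, and the hard-coded 48-byte
   checks further below match MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE
   (3 * 16 bytes).  */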
40848 #undef BIG
40849 #define BIG 100
40852 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
40853 enum dispatch_group {
40854 disp_no_group = 0,
40855 disp_load,
40856 disp_store,
40857 disp_load_store,
40858 disp_prefetch,
40859 disp_imm,
40860 disp_imm_32,
40861 disp_imm_64,
40862 disp_branch,
40863 disp_cmp,
40864 disp_jcc,
40865 disp_last
40868 /* Number of allowable groups in a dispatch window. It is an array
40869 indexed by the dispatch_group enum. 100 is used as a big number
40870 because the number of these kinds of operations has no effect on
40871 the dispatch window, but entries for them are still needed in
40872 the table. */
40873 static unsigned int num_allowable_groups[disp_last] = {
40874 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
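/* Reading the table in dispatch_group order: e.g. disp_load -> 2,
   disp_store -> 1, disp_imm -> 4, disp_imm_64 -> 2, while disp_cmp and
   disp_jcc get BIG because their count alone never limits a window.  */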
40877 char group_name[disp_last + 1][16] = {
40878 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
40879 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
40880 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
40883 /* Instruction path. */
40884 enum insn_path {
40885 no_path = 0,
40886 path_single, /* Single micro op. */
40887 path_double, /* Double micro op. */
40888 path_multi, /* Instructions with more than 2 micro ops. */
40889 last_path
40892 /* sched_insn_info defines a window to the instructions scheduled in
40893 the basic block. It contains a pointer to the insn_info table and
40894 the instruction scheduled.
40896 Windows are allocated for each basic block and are linked
40897 together. */
40898 typedef struct sched_insn_info_s {
40899 rtx insn;
40900 enum dispatch_group group;
40901 enum insn_path path;
40902 int byte_len;
40903 int imm_bytes;
40904 } sched_insn_info;
40906 /* Linked list of dispatch windows. This is a two-way list of
40907 dispatch windows of a basic block. It contains information about
40908 the number of uops in the window and the total number of
40909 instructions and of bytes in the object code for this dispatch
40910 window. */
40911 typedef struct dispatch_windows_s {
40912 int num_insn; /* Number of insn in the window. */
40913 int num_uops; /* Number of uops in the window. */
40914 int window_size; /* Number of bytes in the window. */
40915 int window_num; /* Window number, either 0 or 1. */
40916 int num_imm; /* Number of immediates in the window. */
40917 int num_imm_32; /* Number of 32 bit immediates in the window. */
40918 int num_imm_64; /* Number of 64 bit immediates in the window. */
40919 int imm_size; /* Total size in bytes of immediates in the window. */
40920 int num_loads; /* Total memory loads in the window. */
40921 int num_stores; /* Total memory stores in the window. */
40922 int violation; /* Violation exists in window. */
40923 sched_insn_info *window; /* Pointer to the window. */
40924 struct dispatch_windows_s *next;
40925 struct dispatch_windows_s *prev;
40926 } dispatch_windows;
40928 /* Immediate values used in an insn. */
40929 typedef struct imm_info_s
40931 int imm;
40932 int imm32;
40933 int imm64;
40934 } imm_info;
40936 static dispatch_windows *dispatch_window_list;
40937 static dispatch_windows *dispatch_window_list1;
40939 /* Get dispatch group of insn. */
40941 static enum dispatch_group
40942 get_mem_group (rtx insn)
40944 enum attr_memory memory;
40946 if (INSN_CODE (insn) < 0)
40947 return disp_no_group;
40948 memory = get_attr_memory (insn);
40949 if (memory == MEMORY_STORE)
40950 return disp_store;
40952 if (memory == MEMORY_LOAD)
40953 return disp_load;
40955 if (memory == MEMORY_BOTH)
40956 return disp_load_store;
40958 return disp_no_group;
40961 /* Return true if insn is a compare instruction. */
40963 static bool
40964 is_cmp (rtx insn)
40966 enum attr_type type;
40968 type = get_attr_type (insn);
40969 return (type == TYPE_TEST
40970 || type == TYPE_ICMP
40971 || type == TYPE_FCMP
40972 || GET_CODE (PATTERN (insn)) == COMPARE);
40975 /* Return true if a dispatch violation was encountered. */
40977 static bool
40978 dispatch_violation (void)
40980 if (dispatch_window_list->next)
40981 return dispatch_window_list->next->violation;
40982 return dispatch_window_list->violation;
40985 /* Return true if insn is a branch instruction. */
40987 static bool
40988 is_branch (rtx insn)
40990 return (CALL_P (insn) || JUMP_P (insn));
40993 /* Return true if insn is a prefetch instruction. */
40995 static bool
40996 is_prefetch (rtx insn)
40998 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41001 /* This function initializes a dispatch window and the list container holding a
41002 pointer to the window. */
41004 static void
41005 init_window (int window_num)
41007 int i;
41008 dispatch_windows *new_list;
41010 if (window_num == 0)
41011 new_list = dispatch_window_list;
41012 else
41013 new_list = dispatch_window_list1;
41015 new_list->num_insn = 0;
41016 new_list->num_uops = 0;
41017 new_list->window_size = 0;
41018 new_list->next = NULL;
41019 new_list->prev = NULL;
41020 new_list->window_num = window_num;
41021 new_list->num_imm = 0;
41022 new_list->num_imm_32 = 0;
41023 new_list->num_imm_64 = 0;
41024 new_list->imm_size = 0;
41025 new_list->num_loads = 0;
41026 new_list->num_stores = 0;
41027 new_list->violation = false;
41029 for (i = 0; i < MAX_INSN; i++)
41031 new_list->window[i].insn = NULL;
41032 new_list->window[i].group = disp_no_group;
41033 new_list->window[i].path = no_path;
41034 new_list->window[i].byte_len = 0;
41035 new_list->window[i].imm_bytes = 0;
41037 return;
41040 /* This function allocates and initializes a dispatch window and the
41041 list container holding a pointer to the window. */
41043 static dispatch_windows *
41044 allocate_window (void)
41046 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41047 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41049 return new_list;
41052 /* This routine initializes the dispatch scheduling information. It
41053 initiates building dispatch scheduler tables and constructs the
41054 first dispatch window. */
41056 static void
41057 init_dispatch_sched (void)
41059 /* Allocate a dispatch list and a window. */
41060 dispatch_window_list = allocate_window ();
41061 dispatch_window_list1 = allocate_window ();
41062 init_window (0);
41063 init_window (1);
41066 /* This function returns true if a branch is detected. End of a basic block
41067 does not have to be a branch, but here we assume only branches end a
41068 window. */
41070 static bool
41071 is_end_basic_block (enum dispatch_group group)
41073 return group == disp_branch;
41076 /* This function is called when the end of a window processing is reached. */
41078 static void
41079 process_end_window (void)
41081 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41082 if (dispatch_window_list->next)
41084 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41085 gcc_assert (dispatch_window_list->window_size
41086 + dispatch_window_list1->window_size <= 48);
41087 init_window (1);
41089 init_window (0);
41092 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41093 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41094 for 48 bytes of instructions. Note that these windows are not dispatch
41095 windows whose sizes are DISPATCH_WINDOW_SIZE. */
41097 static dispatch_windows *
41098 allocate_next_window (int window_num)
41100 if (window_num == 0)
41102 if (dispatch_window_list->next)
41103 init_window (1);
41104 init_window (0);
41105 return dispatch_window_list;
41108 dispatch_window_list->next = dispatch_window_list1;
41109 dispatch_window_list1->prev = dispatch_window_list;
41111 return dispatch_window_list1;
41114 /* Increment the number of immediate operands of an instruction. */
41116 static int
41117 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41119 if (*in_rtx == 0)
41120 return 0;
41122 switch (GET_CODE (*in_rtx))
41124 case CONST:
41125 case SYMBOL_REF:
41126 case CONST_INT:
41127 (imm_values->imm)++;
41128 if (x86_64_immediate_operand (*in_rtx, SImode))
41129 (imm_values->imm32)++;
41130 else
41131 (imm_values->imm64)++;
41132 break;
41134 case CONST_DOUBLE:
41135 (imm_values->imm)++;
41136 (imm_values->imm64)++;
41137 break;
41139 case CODE_LABEL:
41140 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41142 (imm_values->imm)++;
41143 (imm_values->imm32)++;
41145 break;
41147 default:
41148 break;
41151 return 0;
41154 /* Compute number of immediate operands of an instruction. */
41156 static void
41157 find_constant (rtx in_rtx, imm_info *imm_values)
41159 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41160 (rtx_function) find_constant_1, (void *) imm_values);
41163 /* Return total size of immediate operands of an instruction along with number
41164 of corresponding immediate-operands. It initializes its parameters to zero
41165 before calling FIND_CONSTANT.
41166 INSN is the input instruction. IMM is the total of immediates.
41167 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41168 bit immediates. */
41170 static int
41171 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41173 imm_info imm_values = {0, 0, 0};
41175 find_constant (insn, &imm_values);
41176 *imm = imm_values.imm;
41177 *imm32 = imm_values.imm32;
41178 *imm64 = imm_values.imm64;
41179 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
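/* E.g. an insn with one 32-bit and one 64-bit immediate reports
   *IMM = 2, *IMM32 = 1, *IMM64 = 1 and returns 4 + 8 = 12 bytes.  */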
41182 /* Return true if any operand of instruction INSN is an
41183 immediate. */
41185 static bool
41186 has_immediate (rtx insn)
41188 int num_imm_operand;
41189 int num_imm32_operand;
41190 int num_imm64_operand;
41192 if (insn)
41193 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41194 &num_imm64_operand);
41195 return false;
41198 /* Return single or double path for instructions. */
41200 static enum insn_path
41201 get_insn_path (rtx insn)
41203 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41205 if ((int)path == 0)
41206 return path_single;
41208 if ((int)path == 1)
41209 return path_double;
41211 return path_multi;
41214 /* Return insn dispatch group. */
41216 static enum dispatch_group
41217 get_insn_group (rtx insn)
41219 enum dispatch_group group = get_mem_group (insn);
41220 if (group)
41221 return group;
41223 if (is_branch (insn))
41224 return disp_branch;
41226 if (is_cmp (insn))
41227 return disp_cmp;
41229 if (has_immediate (insn))
41230 return disp_imm;
41232 if (is_prefetch (insn))
41233 return disp_prefetch;
41235 return disp_no_group;
41238 /* Count number of GROUP restricted instructions in a dispatch
41239 window WINDOW_LIST. */
41241 static int
41242 count_num_restricted (rtx insn, dispatch_windows *window_list)
41244 enum dispatch_group group = get_insn_group (insn);
41245 int imm_size;
41246 int num_imm_operand;
41247 int num_imm32_operand;
41248 int num_imm64_operand;
41250 if (group == disp_no_group)
41251 return 0;
41253 if (group == disp_imm)
41255 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41256 &num_imm64_operand);
41257 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41258 || num_imm_operand + window_list->num_imm > MAX_IMM
41259 || (num_imm32_operand > 0
41260 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41261 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41262 || (num_imm64_operand > 0
41263 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41264 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41265 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41266 && num_imm64_operand > 0
41267 && ((window_list->num_imm_64 > 0
41268 && window_list->num_insn >= 2)
41269 || window_list->num_insn >= 3)))
41270 return BIG;
41272 return 1;
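/* Worked example of the immediate limits above: a window that already
   holds one 64-bit immediate effectively occupies two of the MAX_IMM_32
   32-bit slots (num_imm_64 * 2), so at most two further 32-bit
   immediates can be accepted before the insn is rejected with BIG.  */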
41275 if ((group == disp_load_store
41276 && (window_list->num_loads >= MAX_LOAD
41277 || window_list->num_stores >= MAX_STORE))
41278 || ((group == disp_load
41279 || group == disp_prefetch)
41280 && window_list->num_loads >= MAX_LOAD)
41281 || (group == disp_store
41282 && window_list->num_stores >= MAX_STORE))
41283 return BIG;
41285 return 1;
41288 /* This function returns true if insn satisfies dispatch rules on the
41289 last window scheduled. */
41291 static bool
41292 fits_dispatch_window (rtx insn)
41294 dispatch_windows *window_list = dispatch_window_list;
41295 dispatch_windows *window_list_next = dispatch_window_list->next;
41296 unsigned int num_restrict;
41297 enum dispatch_group group = get_insn_group (insn);
41298 enum insn_path path = get_insn_path (insn);
41299 int sum;
41301 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41302 instructions should be given the lowest priority in the
41303 scheduling process in the Haifa scheduler to make sure they will be
41304 scheduled in the same dispatch window as the reference to them. */
41305 if (group == disp_jcc || group == disp_cmp)
41306 return false;
41308 /* Check nonrestricted. */
41309 if (group == disp_no_group || group == disp_branch)
41310 return true;
41312 /* Get last dispatch window. */
41313 if (window_list_next)
41314 window_list = window_list_next;
41316 if (window_list->window_num == 1)
41318 sum = window_list->prev->window_size + window_list->window_size;
41320 if (sum == 32
41321 || (min_insn_size (insn) + sum) >= 48)
41322 /* Window 1 is full. Go for next window. */
41323 return true;
41326 num_restrict = count_num_restricted (insn, window_list);
41328 if (num_restrict > num_allowable_groups[group])
41329 return false;
41331 /* See if it fits in the first window. */
41332 if (window_list->window_num == 0)
41334 /* The first window should have only single and double path
41335 uops. */
41336 if (path == path_double
41337 && (window_list->num_uops + 2) > MAX_INSN)
41338 return false;
41339 else if (path != path_single)
41340 return false;
41342 return true;
41345 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41346 dispatch window WINDOW_LIST. */
41348 static void
41349 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41351 int byte_len = min_insn_size (insn);
41352 int num_insn = window_list->num_insn;
41353 int imm_size;
41354 sched_insn_info *window = window_list->window;
41355 enum dispatch_group group = get_insn_group (insn);
41356 enum insn_path path = get_insn_path (insn);
41357 int num_imm_operand;
41358 int num_imm32_operand;
41359 int num_imm64_operand;
41361 if (!window_list->violation && group != disp_cmp
41362 && !fits_dispatch_window (insn))
41363 window_list->violation = true;
41365 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41366 &num_imm64_operand);
41368 /* Initialize window with new instruction. */
41369 window[num_insn].insn = insn;
41370 window[num_insn].byte_len = byte_len;
41371 window[num_insn].group = group;
41372 window[num_insn].path = path;
41373 window[num_insn].imm_bytes = imm_size;
41375 window_list->window_size += byte_len;
41376 window_list->num_insn = num_insn + 1;
41377 window_list->num_uops = window_list->num_uops + num_uops;
41378 window_list->imm_size += imm_size;
41379 window_list->num_imm += num_imm_operand;
41380 window_list->num_imm_32 += num_imm32_operand;
41381 window_list->num_imm_64 += num_imm64_operand;
41383 if (group == disp_store)
41384 window_list->num_stores += 1;
41385 else if (group == disp_load
41386 || group == disp_prefetch)
41387 window_list->num_loads += 1;
41388 else if (group == disp_load_store)
41390 window_list->num_stores += 1;
41391 window_list->num_loads += 1;
41395 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41396 If the total bytes of instructions or the number of instructions in
41397 the window exceed the allowed limits, it allocates a new window. */
41399 static void
41400 add_to_dispatch_window (rtx insn)
41402 int byte_len;
41403 dispatch_windows *window_list;
41404 dispatch_windows *next_list;
41405 dispatch_windows *window0_list;
41406 enum insn_path path;
41407 enum dispatch_group insn_group;
41408 bool insn_fits;
41409 int num_insn;
41410 int num_uops;
41411 int window_num;
41412 int insn_num_uops;
41413 int sum;
41415 if (INSN_CODE (insn) < 0)
41416 return;
41418 byte_len = min_insn_size (insn);
41419 window_list = dispatch_window_list;
41420 next_list = window_list->next;
41421 path = get_insn_path (insn);
41422 insn_group = get_insn_group (insn);
41424 /* Get the last dispatch window. */
41425 if (next_list)
41426 window_list = dispatch_window_list->next;
41428 if (path == path_single)
41429 insn_num_uops = 1;
41430 else if (path == path_double)
41431 insn_num_uops = 2;
41432 else
41433 insn_num_uops = (int) path;
41435 /* If current window is full, get a new window.
41436 Window number zero is full if MAX_INSN uops are scheduled in it.
41437 Window number one is full if window zero's bytes plus window
41438 one's bytes equal 32, or if adding the bytes of the new
41439 instruction makes the total reach or exceed 48, or if it already
41440 has MAX_INSN instructions in it.
41441 num_insn = window_list->num_insn;
41442 num_uops = window_list->num_uops;
41443 window_num = window_list->window_num;
41444 insn_fits = fits_dispatch_window (insn);
41446 if (num_insn >= MAX_INSN
41447 || num_uops + insn_num_uops > MAX_INSN
41448 || !(insn_fits))
41450 window_num = ~window_num & 1;
41451 window_list = allocate_next_window (window_num);
41454 if (window_num == 0)
41456 add_insn_window (insn, window_list, insn_num_uops);
41457 if (window_list->num_insn >= MAX_INSN
41458 && insn_group == disp_branch)
41460 process_end_window ();
41461 return;
41464 else if (window_num == 1)
41466 window0_list = window_list->prev;
41467 sum = window0_list->window_size + window_list->window_size;
41468 if (sum == 32
41469 || (byte_len + sum) >= 48)
41471 process_end_window ();
41472 window_list = dispatch_window_list;
41475 add_insn_window (insn, window_list, insn_num_uops);
41477 else
41478 gcc_unreachable ();
41480 if (is_end_basic_block (insn_group))
41482 /* End of basic block is reached; do the end-basic-block processing. */
41483 process_end_window ();
41484 return;
41488 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41490 DEBUG_FUNCTION static void
41491 debug_dispatch_window_file (FILE *file, int window_num)
41493 dispatch_windows *list;
41494 int i;
41496 if (window_num == 0)
41497 list = dispatch_window_list;
41498 else
41499 list = dispatch_window_list1;
41501 fprintf (file, "Window #%d:\n", list->window_num);
41502 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41503 list->num_insn, list->num_uops, list->window_size);
41504 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41505 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41507 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41508 list->num_stores);
41509 fprintf (file, " insn info:\n");
41511 for (i = 0; i < MAX_INSN; i++)
41513 if (!list->window[i].insn)
41514 break;
41515 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41516 i, group_name[list->window[i].group],
41517 i, (void *)list->window[i].insn,
41518 i, list->window[i].path,
41519 i, list->window[i].byte_len,
41520 i, list->window[i].imm_bytes);
41524 /* Print to stdout a dispatch window. */
41526 DEBUG_FUNCTION void
41527 debug_dispatch_window (int window_num)
41529 debug_dispatch_window_file (stdout, window_num);
41532 /* Print INSN dispatch information to FILE. */
41534 DEBUG_FUNCTION static void
41535 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41537 int byte_len;
41538 enum insn_path path;
41539 enum dispatch_group group;
41540 int imm_size;
41541 int num_imm_operand;
41542 int num_imm32_operand;
41543 int num_imm64_operand;
41545 if (INSN_CODE (insn) < 0)
41546 return;
41548 byte_len = min_insn_size (insn);
41549 path = get_insn_path (insn);
41550 group = get_insn_group (insn);
41551 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41552 &num_imm64_operand);
41554 fprintf (file, " insn info:\n");
41555 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41556 group_name[group], path, byte_len);
41557 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41558 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41561 /* Print to STDOUT the status of the ready list with respect to
41562 dispatch windows. */
41564 DEBUG_FUNCTION void
41565 debug_ready_dispatch (void)
41567 int i;
41568 int no_ready = number_in_ready ();
41570 fprintf (stdout, "Number of ready: %d\n", no_ready);
41572 for (i = 0; i < no_ready; i++)
41573 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41576 /* This routine is the driver of the dispatch scheduler. */
41578 static void
41579 do_dispatch (rtx insn, int mode)
41581 if (mode == DISPATCH_INIT)
41582 init_dispatch_sched ();
41583 else if (mode == ADD_TO_DISPATCH_WINDOW)
41584 add_to_dispatch_window (insn);
41587 /* Return TRUE if Dispatch Scheduling is supported. */
41589 static bool
41590 has_dispatch (rtx insn, int action)
41592 if ((TARGET_BDVER1 || TARGET_BDVER2)
41593 && flag_dispatch_scheduler)
41594 switch (action)
41596 default:
41597 return false;
41599 case IS_DISPATCH_ON:
41600 return true;
41601 break;
41603 case IS_CMP:
41604 return is_cmp (insn);
41606 case DISPATCH_VIOLATION:
41607 return dispatch_violation ();
41609 case FITS_DISPATCH_WINDOW:
41610 return fits_dispatch_window (insn);
41613 return false;
41616 /* Implementation of reassociation_width target hook used by
41617 reassoc phase to identify parallelism level in reassociated
41618 tree. The statement's tree_code is passed in OPC. The argument type
41619 is passed in MODE.
41621 Currently parallel reassociation is enabled for Atom
41622 processors only and we set reassociation width to be 2
41623 because Atom may issue up to 2 instructions per cycle.
41625 Return value should be fixed if parallel reassociation is
41626 enabled for other processors. */
41628 static int
41629 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
41630 enum machine_mode mode)
41632 int res = 1;
41634 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
41635 res = 2;
41636 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41637 res = 2;
41639 return res;
41642 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41643 place emms and femms instructions. */
41645 static enum machine_mode
41646 ix86_preferred_simd_mode (enum machine_mode mode)
41648 if (!TARGET_SSE)
41649 return word_mode;
41651 switch (mode)
41653 case QImode:
41654 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41655 case HImode:
41656 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41657 case SImode:
41658 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41659 case DImode:
41660 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41662 case SFmode:
41663 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41664 return V8SFmode;
41665 else
41666 return V4SFmode;
41668 case DFmode:
41669 if (!TARGET_VECTORIZE_DOUBLE)
41670 return word_mode;
41671 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41672 return V4DFmode;
41673 else if (TARGET_SSE2)
41674 return V2DFmode;
41675 /* FALLTHRU */
41677 default:
41678 return word_mode;
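/* For illustration: with AVX enabled and 256-bit vectors preferred,
   SImode maps to V8SImode and SFmode to V8SFmode (eight 32-bit lanes);
   otherwise the 128-bit V4SImode / V4SFmode forms are used.  */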
41682 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
41683 vectors. */
41685 static unsigned int
41686 ix86_autovectorize_vector_sizes (void)
41688 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
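/* The result is a bitmask of candidate vector sizes in bytes: 32 | 16
   (0x30) asks the vectorizer to try both 32-byte and 16-byte vectors,
   while 0 means only the size implied by the preferred SIMD mode.  */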
41693 /* Return class of registers which could be used for pseudo of MODE
41694 and of class RCLASS for spilling instead of memory. Return NO_REGS
41695 if it is not possible or not profitable. */
41696 static reg_class_t
41697 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
41699 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
41700 && hard_reg_set_subset_p (reg_class_contents[rclass],
41701 reg_class_contents[GENERAL_REGS])
41702 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
41703 return SSE_REGS;
41704 return NO_REGS;
41707 /* Implement targetm.vectorize.init_cost. */
41709 static void *
41710 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
41712 unsigned *cost = XNEWVEC (unsigned, 3);
41713 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
41714 return cost;
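/* The three slots are indexed by enum vect_cost_model_location
   (vect_prologue, vect_body, vect_epilogue); ix86_add_stmt_cost below
   accumulates into them and ix86_finish_cost reads them back out.  */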
41717 /* Implement targetm.vectorize.add_stmt_cost. */
41719 static unsigned
41720 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
41721 struct _stmt_vec_info *stmt_info, int misalign,
41722 enum vect_cost_model_location where)
41724 unsigned *cost = (unsigned *) data;
41725 unsigned retval = 0;
41727 if (flag_vect_cost_model)
41729 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
41730 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
41732 /* Statements in an inner loop relative to the loop being
41733 vectorized are weighted more heavily. The value here is
41734 arbitrary and could potentially be improved with analysis. */
41735 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
41736 count *= 50; /* FIXME. */
41738 retval = (unsigned) (count * stmt_cost);
41739 cost[where] += retval;
41742 return retval;
41745 /* Implement targetm.vectorize.finish_cost. */
41747 static void
41748 ix86_finish_cost (void *data, unsigned *prologue_cost,
41749 unsigned *body_cost, unsigned *epilogue_cost)
41751 unsigned *cost = (unsigned *) data;
41752 *prologue_cost = cost[vect_prologue];
41753 *body_cost = cost[vect_body];
41754 *epilogue_cost = cost[vect_epilogue];
41757 /* Implement targetm.vectorize.destroy_cost_data. */
41759 static void
41760 ix86_destroy_cost_data (void *data)
41762 free (data);
41765 /* Validate target specific memory model bits in VAL. */
41767 static unsigned HOST_WIDE_INT
41768 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
41770 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
41771 unsigned HOST_WIDE_INT strong;
41773 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
41774 |MEMMODEL_MASK)
41775 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
41777 warning (OPT_Winvalid_memory_model,
41778 "Unknown architecture specific memory model");
41779 return MEMMODEL_SEQ_CST;
41781 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
41782 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
41784 warning (OPT_Winvalid_memory_model,
41785 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
41786 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
41788 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
41790 warning (OPT_Winvalid_memory_model,
41791 "HLE_RELEASE not used with RELEASE or stronger memory model");
41792 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
41794 return val;
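/* Usage sketch (illustrative): __atomic_exchange_n (p, 1,
   __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE) passes through unchanged,
   whereas pairing __ATOMIC_HLE_ACQUIRE with __ATOMIC_RELAXED triggers
   the warning above and degrades to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */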
41797 /* Initialize the GCC target structure. */
41798 #undef TARGET_RETURN_IN_MEMORY
41799 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
41801 #undef TARGET_LEGITIMIZE_ADDRESS
41802 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
41804 #undef TARGET_ATTRIBUTE_TABLE
41805 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
41806 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41807 # undef TARGET_MERGE_DECL_ATTRIBUTES
41808 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
41809 #endif
41811 #undef TARGET_COMP_TYPE_ATTRIBUTES
41812 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
41814 #undef TARGET_INIT_BUILTINS
41815 #define TARGET_INIT_BUILTINS ix86_init_builtins
41816 #undef TARGET_BUILTIN_DECL
41817 #define TARGET_BUILTIN_DECL ix86_builtin_decl
41818 #undef TARGET_EXPAND_BUILTIN
41819 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
41821 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
41822 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
41823 ix86_builtin_vectorized_function
41825 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
41826 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
41828 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
41829 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
41831 #undef TARGET_VECTORIZE_BUILTIN_GATHER
41832 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
41834 #undef TARGET_BUILTIN_RECIPROCAL
41835 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
41837 #undef TARGET_ASM_FUNCTION_EPILOGUE
41838 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
41840 #undef TARGET_ENCODE_SECTION_INFO
41841 #ifndef SUBTARGET_ENCODE_SECTION_INFO
41842 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
41843 #else
41844 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
41845 #endif
41847 #undef TARGET_ASM_OPEN_PAREN
41848 #define TARGET_ASM_OPEN_PAREN ""
41849 #undef TARGET_ASM_CLOSE_PAREN
41850 #define TARGET_ASM_CLOSE_PAREN ""
41852 #undef TARGET_ASM_BYTE_OP
41853 #define TARGET_ASM_BYTE_OP ASM_BYTE
41855 #undef TARGET_ASM_ALIGNED_HI_OP
41856 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
41857 #undef TARGET_ASM_ALIGNED_SI_OP
41858 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
41859 #ifdef ASM_QUAD
41860 #undef TARGET_ASM_ALIGNED_DI_OP
41861 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
41862 #endif
41864 #undef TARGET_PROFILE_BEFORE_PROLOGUE
41865 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
41867 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
41868 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
41870 #undef TARGET_ASM_UNALIGNED_HI_OP
41871 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
41872 #undef TARGET_ASM_UNALIGNED_SI_OP
41873 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
41874 #undef TARGET_ASM_UNALIGNED_DI_OP
41875 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
41877 #undef TARGET_PRINT_OPERAND
41878 #define TARGET_PRINT_OPERAND ix86_print_operand
41879 #undef TARGET_PRINT_OPERAND_ADDRESS
41880 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
41881 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
41882 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
41883 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
41884 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
41886 #undef TARGET_SCHED_INIT_GLOBAL
41887 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
41888 #undef TARGET_SCHED_ADJUST_COST
41889 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
41890 #undef TARGET_SCHED_ISSUE_RATE
41891 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
41892 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
41893 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
41894 ia32_multipass_dfa_lookahead
41896 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
41897 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
41899 #undef TARGET_MEMMODEL_CHECK
41900 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
41902 #ifdef HAVE_AS_TLS
41903 #undef TARGET_HAVE_TLS
41904 #define TARGET_HAVE_TLS true
41905 #endif
41906 #undef TARGET_CANNOT_FORCE_CONST_MEM
41907 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
41908 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
41909 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
41911 #undef TARGET_DELEGITIMIZE_ADDRESS
41912 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
41914 #undef TARGET_MS_BITFIELD_LAYOUT_P
41915 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
41917 #if TARGET_MACHO
41918 #undef TARGET_BINDS_LOCAL_P
41919 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
41920 #endif
41921 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41922 #undef TARGET_BINDS_LOCAL_P
41923 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
41924 #endif
41926 #undef TARGET_ASM_OUTPUT_MI_THUNK
41927 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
41928 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
41929 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
41931 #undef TARGET_ASM_FILE_START
41932 #define TARGET_ASM_FILE_START x86_file_start
41934 #undef TARGET_OPTION_OVERRIDE
41935 #define TARGET_OPTION_OVERRIDE ix86_option_override
41937 #undef TARGET_REGISTER_MOVE_COST
41938 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
41939 #undef TARGET_MEMORY_MOVE_COST
41940 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
41941 #undef TARGET_RTX_COSTS
41942 #define TARGET_RTX_COSTS ix86_rtx_costs
41943 #undef TARGET_ADDRESS_COST
41944 #define TARGET_ADDRESS_COST ix86_address_cost
41946 #undef TARGET_FIXED_CONDITION_CODE_REGS
41947 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
41948 #undef TARGET_CC_MODES_COMPATIBLE
41949 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
41951 #undef TARGET_MACHINE_DEPENDENT_REORG
41952 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
41954 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
41955 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
41957 #undef TARGET_BUILD_BUILTIN_VA_LIST
41958 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
41960 #undef TARGET_FOLD_BUILTIN
41961 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
41963 #undef TARGET_COMPARE_VERSION_PRIORITY
41964 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
41966 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
41967 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
41968 ix86_generate_version_dispatcher_body
41970 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
41971 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
41972 ix86_get_function_versions_dispatcher
41974 #undef TARGET_ENUM_VA_LIST_P
41975 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
41977 #undef TARGET_FN_ABI_VA_LIST
41978 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
41980 #undef TARGET_CANONICAL_VA_LIST_TYPE
41981 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
41983 #undef TARGET_EXPAND_BUILTIN_VA_START
41984 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
41986 #undef TARGET_MD_ASM_CLOBBERS
41987 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
41989 #undef TARGET_PROMOTE_PROTOTYPES
41990 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
41991 #undef TARGET_STRUCT_VALUE_RTX
41992 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
41993 #undef TARGET_SETUP_INCOMING_VARARGS
41994 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
41995 #undef TARGET_MUST_PASS_IN_STACK
41996 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
41997 #undef TARGET_FUNCTION_ARG_ADVANCE
41998 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
41999 #undef TARGET_FUNCTION_ARG
42000 #define TARGET_FUNCTION_ARG ix86_function_arg
42001 #undef TARGET_FUNCTION_ARG_BOUNDARY
42002 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42003 #undef TARGET_PASS_BY_REFERENCE
42004 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42005 #undef TARGET_INTERNAL_ARG_POINTER
42006 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42007 #undef TARGET_UPDATE_STACK_BOUNDARY
42008 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42009 #undef TARGET_GET_DRAP_RTX
42010 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42011 #undef TARGET_STRICT_ARGUMENT_NAMING
42012 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42013 #undef TARGET_STATIC_CHAIN
42014 #define TARGET_STATIC_CHAIN ix86_static_chain
42015 #undef TARGET_TRAMPOLINE_INIT
42016 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42017 #undef TARGET_RETURN_POPS_ARGS
42018 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42020 #undef TARGET_LEGITIMATE_COMBINED_INSN
42021 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42023 #undef TARGET_ASAN_SHADOW_OFFSET
42024 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42026 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42027 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42029 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42030 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42032 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42033 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42035 #undef TARGET_C_MODE_FOR_SUFFIX
42036 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42038 #ifdef HAVE_AS_TLS
42039 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42040 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42041 #endif
42043 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42044 #undef TARGET_INSERT_ATTRIBUTES
42045 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42046 #endif
42048 #undef TARGET_MANGLE_TYPE
42049 #define TARGET_MANGLE_TYPE ix86_mangle_type
42051 #if !TARGET_MACHO
42052 #undef TARGET_STACK_PROTECT_FAIL
42053 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42054 #endif
42056 #undef TARGET_FUNCTION_VALUE
42057 #define TARGET_FUNCTION_VALUE ix86_function_value
42059 #undef TARGET_FUNCTION_VALUE_REGNO_P
42060 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42062 #undef TARGET_PROMOTE_FUNCTION_MODE
42063 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42065 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42066 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42068 #undef TARGET_INSTANTIATE_DECLS
42069 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42071 #undef TARGET_SECONDARY_RELOAD
42072 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42074 #undef TARGET_CLASS_MAX_NREGS
42075 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42077 #undef TARGET_PREFERRED_RELOAD_CLASS
42078 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42079 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42080 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42081 #undef TARGET_CLASS_LIKELY_SPILLED_P
42082 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42084 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42085 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42086 ix86_builtin_vectorization_cost
42087 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42088 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42089 ix86_vectorize_vec_perm_const_ok
42090 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42091 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42092 ix86_preferred_simd_mode
42093 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42094 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42095 ix86_autovectorize_vector_sizes
42096 #undef TARGET_VECTORIZE_INIT_COST
42097 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42098 #undef TARGET_VECTORIZE_ADD_STMT_COST
42099 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42100 #undef TARGET_VECTORIZE_FINISH_COST
42101 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42102 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42103 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42105 #undef TARGET_SET_CURRENT_FUNCTION
42106 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42108 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42109 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42111 #undef TARGET_OPTION_SAVE
42112 #define TARGET_OPTION_SAVE ix86_function_specific_save
42114 #undef TARGET_OPTION_RESTORE
42115 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42117 #undef TARGET_OPTION_PRINT
42118 #define TARGET_OPTION_PRINT ix86_function_specific_print
42120 #undef TARGET_OPTION_FUNCTION_VERSIONS
42121 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42123 #undef TARGET_CAN_INLINE_P
42124 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42126 #undef TARGET_EXPAND_TO_RTL_HOOK
42127 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42129 #undef TARGET_LEGITIMATE_ADDRESS_P
42130 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42132 #undef TARGET_LRA_P
42133 #define TARGET_LRA_P ix86_lra_p
42135 #undef TARGET_REGISTER_PRIORITY
42136 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42138 #undef TARGET_LEGITIMATE_CONSTANT_P
42139 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42141 #undef TARGET_FRAME_POINTER_REQUIRED
42142 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42144 #undef TARGET_CAN_ELIMINATE
42145 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42147 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42148 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42150 #undef TARGET_ASM_CODE_END
42151 #define TARGET_ASM_CODE_END ix86_code_end
42153 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42154 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42156 #if TARGET_MACHO
42157 #undef TARGET_INIT_LIBFUNCS
42158 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42159 #endif
42161 #undef TARGET_SPILL_CLASS
42162 #define TARGET_SPILL_CLASS ix86_spill_class
42164 struct gcc_target targetm = TARGET_INITIALIZER;
42166 #include "gt-i386.h"