[official-gcc.git] / gcc / config / i386 / i386.c (PR target/57098)
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
67 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
68 static rtx legitimize_pe_coff_symbol (rtx, bool);
70 #ifndef CHECK_STACK_LIMIT
71 #define CHECK_STACK_LIMIT (-1)
72 #endif
74 /* Return index of given mode in mult and division cost tables. */
75 #define MODE_INDEX(mode) \
76 ((mode) == QImode ? 0 \
77 : (mode) == HImode ? 1 \
78 : (mode) == SImode ? 2 \
79 : (mode) == DImode ? 3 \
80 : 4)
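/* Editorial sketch, not part of the original file: MODE_INDEX picks the
   per-mode entry (QI, HI, SI, DI, "other") out of the five-element cost
   arrays in the tables below.  A minimal example of the idiom, assuming the
   mult_init[]/divide[] fields and the active-table pointer ix86_cost
   declared in i386.h; the function name is illustrative only.  */
#if 0
static int
example_division_cost (enum machine_mode mode)
{
  /* E.g. the i386 table below prices this at COSTS_N_INSNS (23) for
     every mode.  */
  return ix86_cost->divide[MODE_INDEX (mode)];
}
#endif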
82 /* Processor costs (relative to an add) */
83 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
84 #define COSTS_N_BYTES(N) ((N) * 2)
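/* Editorial note, not part of the original file: since rtl.h defines
   COSTS_N_INSNS (N) as (N) * 4 and an add is assumed to be 2 bytes, the two
   scales coincide at the add: COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1).
   A 3-byte instruction in the size table below therefore costs 6, i.e. the
   equivalent of 1.5 adds.  */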
86 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
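/* Editorial note, not part of the original file: how to read the stringop
   descriptors in the tables below.  Each one has the shape
   {unknown_size_alg, {{max, alg, noalign}, ...}}: the leading member is the
   algorithm used when the block size is unknown at compile time, each
   {max, alg, noalign} entry covers known sizes up to MAX, and max == -1
   closes the list with no upper bound.  For instance the Pentium memcpy
   entry {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}
   means: unknown size -> libcall, known size <= 256 -> rep movsl, anything
   larger -> libcall.  Each cost table carries a pair of descriptors for
   memcpy and a pair for memset; as I read the i386.h layout, the first
   element of a pair is used for 32-bit code and the second for 64-bit code,
   which is why 32-bit-era processors fill the second slot with
   DUMMY_STRINGOP_ALGS.  */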
88 const
89 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
90 COSTS_N_BYTES (2), /* cost of an add instruction */
91 COSTS_N_BYTES (3), /* cost of a lea instruction */
92 COSTS_N_BYTES (2), /* variable shift costs */
93 COSTS_N_BYTES (3), /* constant shift costs */
94 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
95 COSTS_N_BYTES (3), /* HI */
96 COSTS_N_BYTES (3), /* SI */
97 COSTS_N_BYTES (3), /* DI */
98 COSTS_N_BYTES (5)}, /* other */
99 0, /* cost of multiply per each bit set */
100 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
101 COSTS_N_BYTES (3), /* HI */
102 COSTS_N_BYTES (3), /* SI */
103 COSTS_N_BYTES (3), /* DI */
104 COSTS_N_BYTES (5)}, /* other */
105 COSTS_N_BYTES (3), /* cost of movsx */
106 COSTS_N_BYTES (3), /* cost of movzx */
107 0, /* "large" insn */
108 2, /* MOVE_RATIO */
109 2, /* cost for loading QImode using movzbl */
110 {2, 2, 2}, /* cost of loading integer registers
111 in QImode, HImode and SImode.
112 Relative to reg-reg move (2). */
113 {2, 2, 2}, /* cost of storing integer registers */
114 2, /* cost of reg,reg fld/fst */
115 {2, 2, 2}, /* cost of loading fp registers
116 in SFmode, DFmode and XFmode */
117 {2, 2, 2}, /* cost of storing fp registers
118 in SFmode, DFmode and XFmode */
119 3, /* cost of moving MMX register */
120 {3, 3}, /* cost of loading MMX registers
121 in SImode and DImode */
122 {3, 3}, /* cost of storing MMX registers
123 in SImode and DImode */
124 3, /* cost of moving SSE register */
125 {3, 3, 3}, /* cost of loading SSE registers
126 in SImode, DImode and TImode */
127 {3, 3, 3}, /* cost of storing SSE registers
128 in SImode, DImode and TImode */
129 3, /* MMX or SSE register to integer */
130 0, /* size of l1 cache */
131 0, /* size of l2 cache */
132 0, /* size of prefetch block */
133 0, /* number of parallel prefetches */
134 2, /* Branch cost */
135 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
136 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
137 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
138 COSTS_N_BYTES (2), /* cost of FABS instruction. */
139 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
140 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
145 1, /* scalar_stmt_cost. */
146 1, /* scalar load_cost. */
147 1, /* scalar_store_cost. */
148 1, /* vec_stmt_cost. */
149 1, /* vec_to_scalar_cost. */
150 1, /* scalar_to_vec_cost. */
151 1, /* vec_align_load_cost. */
152 1, /* vec_unalign_load_cost. */
153 1, /* vec_store_cost. */
154 1, /* cond_taken_branch_cost. */
155 1, /* cond_not_taken_branch_cost. */
156 };
158 /* Processor costs (relative to an add) */
159 static const
160 struct processor_costs i386_cost = { /* 386 specific costs */
161 COSTS_N_INSNS (1), /* cost of an add instruction */
162 COSTS_N_INSNS (1), /* cost of a lea instruction */
163 COSTS_N_INSNS (3), /* variable shift costs */
164 COSTS_N_INSNS (2), /* constant shift costs */
165 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
166 COSTS_N_INSNS (6), /* HI */
167 COSTS_N_INSNS (6), /* SI */
168 COSTS_N_INSNS (6), /* DI */
169 COSTS_N_INSNS (6)}, /* other */
170 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
171 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
172 COSTS_N_INSNS (23), /* HI */
173 COSTS_N_INSNS (23), /* SI */
174 COSTS_N_INSNS (23), /* DI */
175 COSTS_N_INSNS (23)}, /* other */
176 COSTS_N_INSNS (3), /* cost of movsx */
177 COSTS_N_INSNS (2), /* cost of movzx */
178 15, /* "large" insn */
179 3, /* MOVE_RATIO */
180 4, /* cost for loading QImode using movzbl */
181 {2, 4, 2}, /* cost of loading integer registers
182 in QImode, HImode and SImode.
183 Relative to reg-reg move (2). */
184 {2, 4, 2}, /* cost of storing integer registers */
185 2, /* cost of reg,reg fld/fst */
186 {8, 8, 8}, /* cost of loading fp registers
187 in SFmode, DFmode and XFmode */
188 {8, 8, 8}, /* cost of storing fp registers
189 in SFmode, DFmode and XFmode */
190 2, /* cost of moving MMX register */
191 {4, 8}, /* cost of loading MMX registers
192 in SImode and DImode */
193 {4, 8}, /* cost of storing MMX registers
194 in SImode and DImode */
195 2, /* cost of moving SSE register */
196 {4, 8, 16}, /* cost of loading SSE registers
197 in SImode, DImode and TImode */
198 {4, 8, 16}, /* cost of storing SSE registers
199 in SImode, DImode and TImode */
200 3, /* MMX or SSE register to integer */
201 0, /* size of l1 cache */
202 0, /* size of l2 cache */
203 0, /* size of prefetch block */
204 0, /* number of parallel prefetches */
205 1, /* Branch cost */
206 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
207 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
208 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
209 COSTS_N_INSNS (22), /* cost of FABS instruction. */
210 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
211 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
215 DUMMY_STRINGOP_ALGS},
216 1, /* scalar_stmt_cost. */
217 1, /* scalar load_cost. */
218 1, /* scalar_store_cost. */
219 1, /* vec_stmt_cost. */
220 1, /* vec_to_scalar_cost. */
221 1, /* scalar_to_vec_cost. */
222 1, /* vec_align_load_cost. */
223 2, /* vec_unalign_load_cost. */
224 1, /* vec_store_cost. */
225 3, /* cond_taken_branch_cost. */
226 1, /* cond_not_taken_branch_cost. */
227 };
229 static const
230 struct processor_costs i486_cost = { /* 486 specific costs */
231 COSTS_N_INSNS (1), /* cost of an add instruction */
232 COSTS_N_INSNS (1), /* cost of a lea instruction */
233 COSTS_N_INSNS (3), /* variable shift costs */
234 COSTS_N_INSNS (2), /* constant shift costs */
235 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
236 COSTS_N_INSNS (12), /* HI */
237 COSTS_N_INSNS (12), /* SI */
238 COSTS_N_INSNS (12), /* DI */
239 COSTS_N_INSNS (12)}, /* other */
240 1, /* cost of multiply per each bit set */
241 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
242 COSTS_N_INSNS (40), /* HI */
243 COSTS_N_INSNS (40), /* SI */
244 COSTS_N_INSNS (40), /* DI */
245 COSTS_N_INSNS (40)}, /* other */
246 COSTS_N_INSNS (3), /* cost of movsx */
247 COSTS_N_INSNS (2), /* cost of movzx */
248 15, /* "large" insn */
249 3, /* MOVE_RATIO */
250 4, /* cost for loading QImode using movzbl */
251 {2, 4, 2}, /* cost of loading integer registers
252 in QImode, HImode and SImode.
253 Relative to reg-reg move (2). */
254 {2, 4, 2}, /* cost of storing integer registers */
255 2, /* cost of reg,reg fld/fst */
256 {8, 8, 8}, /* cost of loading fp registers
257 in SFmode, DFmode and XFmode */
258 {8, 8, 8}, /* cost of storing fp registers
259 in SFmode, DFmode and XFmode */
260 2, /* cost of moving MMX register */
261 {4, 8}, /* cost of loading MMX registers
262 in SImode and DImode */
263 {4, 8}, /* cost of storing MMX registers
264 in SImode and DImode */
265 2, /* cost of moving SSE register */
266 {4, 8, 16}, /* cost of loading SSE registers
267 in SImode, DImode and TImode */
268 {4, 8, 16}, /* cost of storing SSE registers
269 in SImode, DImode and TImode */
270 3, /* MMX or SSE register to integer */
271 4, /* size of l1 cache. 486 has 8kB cache
272 shared for code and data, so 4kB is
273 not really precise. */
274 4, /* size of l2 cache */
275 0, /* size of prefetch block */
276 0, /* number of parallel prefetches */
277 1, /* Branch cost */
278 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
279 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
280 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
281 COSTS_N_INSNS (3), /* cost of FABS instruction. */
282 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
283 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
287 DUMMY_STRINGOP_ALGS},
288 1, /* scalar_stmt_cost. */
289 1, /* scalar load_cost. */
290 1, /* scalar_store_cost. */
291 1, /* vec_stmt_cost. */
292 1, /* vec_to_scalar_cost. */
293 1, /* scalar_to_vec_cost. */
294 1, /* vec_align_load_cost. */
295 2, /* vec_unalign_load_cost. */
296 1, /* vec_store_cost. */
297 3, /* cond_taken_branch_cost. */
298 1, /* cond_not_taken_branch_cost. */
299 };
301 static const
302 struct processor_costs pentium_cost = {
303 COSTS_N_INSNS (1), /* cost of an add instruction */
304 COSTS_N_INSNS (1), /* cost of a lea instruction */
305 COSTS_N_INSNS (4), /* variable shift costs */
306 COSTS_N_INSNS (1), /* constant shift costs */
307 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
308 COSTS_N_INSNS (11), /* HI */
309 COSTS_N_INSNS (11), /* SI */
310 COSTS_N_INSNS (11), /* DI */
311 COSTS_N_INSNS (11)}, /* other */
312 0, /* cost of multiply per each bit set */
313 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
314 COSTS_N_INSNS (25), /* HI */
315 COSTS_N_INSNS (25), /* SI */
316 COSTS_N_INSNS (25), /* DI */
317 COSTS_N_INSNS (25)}, /* other */
318 COSTS_N_INSNS (3), /* cost of movsx */
319 COSTS_N_INSNS (2), /* cost of movzx */
320 8, /* "large" insn */
321 6, /* MOVE_RATIO */
322 6, /* cost for loading QImode using movzbl */
323 {2, 4, 2}, /* cost of loading integer registers
324 in QImode, HImode and SImode.
325 Relative to reg-reg move (2). */
326 {2, 4, 2}, /* cost of storing integer registers */
327 2, /* cost of reg,reg fld/fst */
328 {2, 2, 6}, /* cost of loading fp registers
329 in SFmode, DFmode and XFmode */
330 {4, 4, 6}, /* cost of storing fp registers
331 in SFmode, DFmode and XFmode */
332 8, /* cost of moving MMX register */
333 {8, 8}, /* cost of loading MMX registers
334 in SImode and DImode */
335 {8, 8}, /* cost of storing MMX registers
336 in SImode and DImode */
337 2, /* cost of moving SSE register */
338 {4, 8, 16}, /* cost of loading SSE registers
339 in SImode, DImode and TImode */
340 {4, 8, 16}, /* cost of storing SSE registers
341 in SImode, DImode and TImode */
342 3, /* MMX or SSE register to integer */
343 8, /* size of l1 cache. */
344 8, /* size of l2 cache */
345 0, /* size of prefetch block */
346 0, /* number of parallel prefetches */
347 2, /* Branch cost */
348 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
349 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
350 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
351 COSTS_N_INSNS (1), /* cost of FABS instruction. */
352 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
353 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
354 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
355 DUMMY_STRINGOP_ALGS},
356 {{libcall, {{-1, rep_prefix_4_byte, false}}},
357 DUMMY_STRINGOP_ALGS},
358 1, /* scalar_stmt_cost. */
359 1, /* scalar load_cost. */
360 1, /* scalar_store_cost. */
361 1, /* vec_stmt_cost. */
362 1, /* vec_to_scalar_cost. */
363 1, /* scalar_to_vec_cost. */
364 1, /* vec_align_load_cost. */
365 2, /* vec_unalign_load_cost. */
366 1, /* vec_store_cost. */
367 3, /* cond_taken_branch_cost. */
368 1, /* cond_not_taken_branch_cost. */
369 };
371 static const
372 struct processor_costs pentiumpro_cost = {
373 COSTS_N_INSNS (1), /* cost of an add instruction */
374 COSTS_N_INSNS (1), /* cost of a lea instruction */
375 COSTS_N_INSNS (1), /* variable shift costs */
376 COSTS_N_INSNS (1), /* constant shift costs */
377 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
378 COSTS_N_INSNS (4), /* HI */
379 COSTS_N_INSNS (4), /* SI */
380 COSTS_N_INSNS (4), /* DI */
381 COSTS_N_INSNS (4)}, /* other */
382 0, /* cost of multiply per each bit set */
383 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
384 COSTS_N_INSNS (17), /* HI */
385 COSTS_N_INSNS (17), /* SI */
386 COSTS_N_INSNS (17), /* DI */
387 COSTS_N_INSNS (17)}, /* other */
388 COSTS_N_INSNS (1), /* cost of movsx */
389 COSTS_N_INSNS (1), /* cost of movzx */
390 8, /* "large" insn */
391 6, /* MOVE_RATIO */
392 2, /* cost for loading QImode using movzbl */
393 {4, 4, 4}, /* cost of loading integer registers
394 in QImode, HImode and SImode.
395 Relative to reg-reg move (2). */
396 {2, 2, 2}, /* cost of storing integer registers */
397 2, /* cost of reg,reg fld/fst */
398 {2, 2, 6}, /* cost of loading fp registers
399 in SFmode, DFmode and XFmode */
400 {4, 4, 6}, /* cost of storing fp registers
401 in SFmode, DFmode and XFmode */
402 2, /* cost of moving MMX register */
403 {2, 2}, /* cost of loading MMX registers
404 in SImode and DImode */
405 {2, 2}, /* cost of storing MMX registers
406 in SImode and DImode */
407 2, /* cost of moving SSE register */
408 {2, 2, 8}, /* cost of loading SSE registers
409 in SImode, DImode and TImode */
410 {2, 2, 8}, /* cost of storing SSE registers
411 in SImode, DImode and TImode */
412 3, /* MMX or SSE register to integer */
413 8, /* size of l1 cache. */
414 256, /* size of l2 cache */
415 32, /* size of prefetch block */
416 6, /* number of parallel prefetches */
417 2, /* Branch cost */
418 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
419 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
420 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
423 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
424 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
425 (we ensure the alignment). For small blocks an inline loop is still a
426 noticeable win; for bigger blocks either rep movsl or rep movsb is the
427 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
428 but after 4K the difference is down in the noise. */
429 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
430 {8192, rep_prefix_4_byte, false},
431 {-1, rep_prefix_1_byte, false}}},
432 DUMMY_STRINGOP_ALGS},
433 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
434 {8192, rep_prefix_4_byte, false},
435 {-1, libcall, false}}},
436 DUMMY_STRINGOP_ALGS},
437 1, /* scalar_stmt_cost. */
438 1, /* scalar load_cost. */
439 1, /* scalar_store_cost. */
440 1, /* vec_stmt_cost. */
441 1, /* vec_to_scalar_cost. */
442 1, /* scalar_to_vec_cost. */
443 1, /* vec_align_load_cost. */
444 2, /* vec_unalign_load_cost. */
445 1, /* vec_store_cost. */
446 3, /* cond_taken_branch_cost. */
447 1, /* cond_not_taken_branch_cost. */
448 };
450 static const
451 struct processor_costs geode_cost = {
452 COSTS_N_INSNS (1), /* cost of an add instruction */
453 COSTS_N_INSNS (1), /* cost of a lea instruction */
454 COSTS_N_INSNS (2), /* variable shift costs */
455 COSTS_N_INSNS (1), /* constant shift costs */
456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
457 COSTS_N_INSNS (4), /* HI */
458 COSTS_N_INSNS (7), /* SI */
459 COSTS_N_INSNS (7), /* DI */
460 COSTS_N_INSNS (7)}, /* other */
461 0, /* cost of multiply per each bit set */
462 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
463 COSTS_N_INSNS (23), /* HI */
464 COSTS_N_INSNS (39), /* SI */
465 COSTS_N_INSNS (39), /* DI */
466 COSTS_N_INSNS (39)}, /* other */
467 COSTS_N_INSNS (1), /* cost of movsx */
468 COSTS_N_INSNS (1), /* cost of movzx */
469 8, /* "large" insn */
470 4, /* MOVE_RATIO */
471 1, /* cost for loading QImode using movzbl */
472 {1, 1, 1}, /* cost of loading integer registers
473 in QImode, HImode and SImode.
474 Relative to reg-reg move (2). */
475 {1, 1, 1}, /* cost of storing integer registers */
476 1, /* cost of reg,reg fld/fst */
477 {1, 1, 1}, /* cost of loading fp registers
478 in SFmode, DFmode and XFmode */
479 {4, 6, 6}, /* cost of storing fp registers
480 in SFmode, DFmode and XFmode */
482 1, /* cost of moving MMX register */
483 {1, 1}, /* cost of loading MMX registers
484 in SImode and DImode */
485 {1, 1}, /* cost of storing MMX registers
486 in SImode and DImode */
487 1, /* cost of moving SSE register */
488 {1, 1, 1}, /* cost of loading SSE registers
489 in SImode, DImode and TImode */
490 {1, 1, 1}, /* cost of storing SSE registers
491 in SImode, DImode and TImode */
492 1, /* MMX or SSE register to integer */
493 64, /* size of l1 cache. */
494 128, /* size of l2 cache. */
495 32, /* size of prefetch block */
496 1, /* number of parallel prefetches */
497 1, /* Branch cost */
498 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
499 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
500 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
501 COSTS_N_INSNS (1), /* cost of FABS instruction. */
502 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
503 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
507 DUMMY_STRINGOP_ALGS},
508 1, /* scalar_stmt_cost. */
509 1, /* scalar load_cost. */
510 1, /* scalar_store_cost. */
511 1, /* vec_stmt_cost. */
512 1, /* vec_to_scalar_cost. */
513 1, /* scalar_to_vec_cost. */
514 1, /* vec_align_load_cost. */
515 2, /* vec_unalign_load_cost. */
516 1, /* vec_store_cost. */
517 3, /* cond_taken_branch_cost. */
518 1, /* cond_not_taken_branch_cost. */
519 };
521 static const
522 struct processor_costs k6_cost = {
523 COSTS_N_INSNS (1), /* cost of an add instruction */
524 COSTS_N_INSNS (2), /* cost of a lea instruction */
525 COSTS_N_INSNS (1), /* variable shift costs */
526 COSTS_N_INSNS (1), /* constant shift costs */
527 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
528 COSTS_N_INSNS (3), /* HI */
529 COSTS_N_INSNS (3), /* SI */
530 COSTS_N_INSNS (3), /* DI */
531 COSTS_N_INSNS (3)}, /* other */
532 0, /* cost of multiply per each bit set */
533 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
534 COSTS_N_INSNS (18), /* HI */
535 COSTS_N_INSNS (18), /* SI */
536 COSTS_N_INSNS (18), /* DI */
537 COSTS_N_INSNS (18)}, /* other */
538 COSTS_N_INSNS (2), /* cost of movsx */
539 COSTS_N_INSNS (2), /* cost of movzx */
540 8, /* "large" insn */
541 4, /* MOVE_RATIO */
542 3, /* cost for loading QImode using movzbl */
543 {4, 5, 4}, /* cost of loading integer registers
544 in QImode, HImode and SImode.
545 Relative to reg-reg move (2). */
546 {2, 3, 2}, /* cost of storing integer registers */
547 4, /* cost of reg,reg fld/fst */
548 {6, 6, 6}, /* cost of loading fp registers
549 in SFmode, DFmode and XFmode */
550 {4, 4, 4}, /* cost of storing fp registers
551 in SFmode, DFmode and XFmode */
552 2, /* cost of moving MMX register */
553 {2, 2}, /* cost of loading MMX registers
554 in SImode and DImode */
555 {2, 2}, /* cost of storing MMX registers
556 in SImode and DImode */
557 2, /* cost of moving SSE register */
558 {2, 2, 8}, /* cost of loading SSE registers
559 in SImode, DImode and TImode */
560 {2, 2, 8}, /* cost of storing SSE registers
561 in SImode, DImode and TImode */
562 6, /* MMX or SSE register to integer */
563 32, /* size of l1 cache. */
564 32, /* size of l2 cache. Some models
565 have integrated l2 cache, but
566 optimizing for k6 is not important
567 enough to worry about that. */
568 32, /* size of prefetch block */
569 1, /* number of parallel prefetches */
570 1, /* Branch cost */
571 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
572 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
573 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
574 COSTS_N_INSNS (2), /* cost of FABS instruction. */
575 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
576 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
580 DUMMY_STRINGOP_ALGS},
581 1, /* scalar_stmt_cost. */
582 1, /* scalar load_cost. */
583 1, /* scalar_store_cost. */
584 1, /* vec_stmt_cost. */
585 1, /* vec_to_scalar_cost. */
586 1, /* scalar_to_vec_cost. */
587 1, /* vec_align_load_cost. */
588 2, /* vec_unalign_load_cost. */
589 1, /* vec_store_cost. */
590 3, /* cond_taken_branch_cost. */
591 1, /* cond_not_taken_branch_cost. */
592 };
594 static const
595 struct processor_costs athlon_cost = {
596 COSTS_N_INSNS (1), /* cost of an add instruction */
597 COSTS_N_INSNS (2), /* cost of a lea instruction */
598 COSTS_N_INSNS (1), /* variable shift costs */
599 COSTS_N_INSNS (1), /* constant shift costs */
600 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
601 COSTS_N_INSNS (5), /* HI */
602 COSTS_N_INSNS (5), /* SI */
603 COSTS_N_INSNS (5), /* DI */
604 COSTS_N_INSNS (5)}, /* other */
605 0, /* cost of multiply per each bit set */
606 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
607 COSTS_N_INSNS (26), /* HI */
608 COSTS_N_INSNS (42), /* SI */
609 COSTS_N_INSNS (74), /* DI */
610 COSTS_N_INSNS (74)}, /* other */
611 COSTS_N_INSNS (1), /* cost of movsx */
612 COSTS_N_INSNS (1), /* cost of movzx */
613 8, /* "large" insn */
614 9, /* MOVE_RATIO */
615 4, /* cost for loading QImode using movzbl */
616 {3, 4, 3}, /* cost of loading integer registers
617 in QImode, HImode and SImode.
618 Relative to reg-reg move (2). */
619 {3, 4, 3}, /* cost of storing integer registers */
620 4, /* cost of reg,reg fld/fst */
621 {4, 4, 12}, /* cost of loading fp registers
622 in SFmode, DFmode and XFmode */
623 {6, 6, 8}, /* cost of storing fp registers
624 in SFmode, DFmode and XFmode */
625 2, /* cost of moving MMX register */
626 {4, 4}, /* cost of loading MMX registers
627 in SImode and DImode */
628 {4, 4}, /* cost of storing MMX registers
629 in SImode and DImode */
630 2, /* cost of moving SSE register */
631 {4, 4, 6}, /* cost of loading SSE registers
632 in SImode, DImode and TImode */
633 {4, 4, 5}, /* cost of storing SSE registers
634 in SImode, DImode and TImode */
635 5, /* MMX or SSE register to integer */
636 64, /* size of l1 cache. */
637 256, /* size of l2 cache. */
638 64, /* size of prefetch block */
639 6, /* number of parallel prefetches */
640 5, /* Branch cost */
641 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
642 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
643 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
644 COSTS_N_INSNS (2), /* cost of FABS instruction. */
645 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
646 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
647 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
648 compared to K8. Alignment becomes important after 8 bytes for memcpy and
649 128 bytes for memset. */
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
653 DUMMY_STRINGOP_ALGS},
654 1, /* scalar_stmt_cost. */
655 1, /* scalar load_cost. */
656 1, /* scalar_store_cost. */
657 1, /* vec_stmt_cost. */
658 1, /* vec_to_scalar_cost. */
659 1, /* scalar_to_vec_cost. */
660 1, /* vec_align_load_cost. */
661 2, /* vec_unalign_load_cost. */
662 1, /* vec_store_cost. */
663 3, /* cond_taken_branch_cost. */
664 1, /* cond_not_taken_branch_cost. */
665 };
667 static const
668 struct processor_costs k8_cost = {
669 COSTS_N_INSNS (1), /* cost of an add instruction */
670 COSTS_N_INSNS (2), /* cost of a lea instruction */
671 COSTS_N_INSNS (1), /* variable shift costs */
672 COSTS_N_INSNS (1), /* constant shift costs */
673 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
674 COSTS_N_INSNS (4), /* HI */
675 COSTS_N_INSNS (3), /* SI */
676 COSTS_N_INSNS (4), /* DI */
677 COSTS_N_INSNS (5)}, /* other */
678 0, /* cost of multiply per each bit set */
679 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
680 COSTS_N_INSNS (26), /* HI */
681 COSTS_N_INSNS (42), /* SI */
682 COSTS_N_INSNS (74), /* DI */
683 COSTS_N_INSNS (74)}, /* other */
684 COSTS_N_INSNS (1), /* cost of movsx */
685 COSTS_N_INSNS (1), /* cost of movzx */
686 8, /* "large" insn */
687 9, /* MOVE_RATIO */
688 4, /* cost for loading QImode using movzbl */
689 {3, 4, 3}, /* cost of loading integer registers
690 in QImode, HImode and SImode.
691 Relative to reg-reg move (2). */
692 {3, 4, 3}, /* cost of storing integer registers */
693 4, /* cost of reg,reg fld/fst */
694 {4, 4, 12}, /* cost of loading fp registers
695 in SFmode, DFmode and XFmode */
696 {6, 6, 8}, /* cost of storing fp registers
697 in SFmode, DFmode and XFmode */
698 2, /* cost of moving MMX register */
699 {3, 3}, /* cost of loading MMX registers
700 in SImode and DImode */
701 {4, 4}, /* cost of storing MMX registers
702 in SImode and DImode */
703 2, /* cost of moving SSE register */
704 {4, 3, 6}, /* cost of loading SSE registers
705 in SImode, DImode and TImode */
706 {4, 4, 5}, /* cost of storing SSE registers
707 in SImode, DImode and TImode */
708 5, /* MMX or SSE register to integer */
709 64, /* size of l1 cache. */
710 512, /* size of l2 cache. */
711 64, /* size of prefetch block */
712 /* New AMD processors never drop prefetches; if they cannot be performed
713 immediately, they are queued. We set the number of simultaneous prefetches
714 to a large constant to reflect this (it is probably not a good idea not
715 to limit the number of prefetches at all, as their execution also takes some
716 time). */
717 100, /* number of parallel prefetches */
718 3, /* Branch cost */
719 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
720 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
721 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
722 COSTS_N_INSNS (2), /* cost of FABS instruction. */
723 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
724 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
725 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
726 small blocks it is better to use a loop. For large blocks, a libcall can
727 do nontemporal accesses and beat inline code considerably. */
728 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
729 {-1, rep_prefix_4_byte, false}}},
730 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
731 {-1, libcall, false}}}},
732 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
733 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 {libcall, {{48, unrolled_loop, false},
735 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
736 4, /* scalar_stmt_cost. */
737 2, /* scalar load_cost. */
738 2, /* scalar_store_cost. */
739 5, /* vec_stmt_cost. */
740 0, /* vec_to_scalar_cost. */
741 2, /* scalar_to_vec_cost. */
742 2, /* vec_align_load_cost. */
743 3, /* vec_unalign_load_cost. */
744 3, /* vec_store_cost. */
745 3, /* cond_taken_branch_cost. */
746 2, /* cond_not_taken_branch_cost. */
747 };
749 struct processor_costs amdfam10_cost = {
750 COSTS_N_INSNS (1), /* cost of an add instruction */
751 COSTS_N_INSNS (2), /* cost of a lea instruction */
752 COSTS_N_INSNS (1), /* variable shift costs */
753 COSTS_N_INSNS (1), /* constant shift costs */
754 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
755 COSTS_N_INSNS (4), /* HI */
756 COSTS_N_INSNS (3), /* SI */
757 COSTS_N_INSNS (4), /* DI */
758 COSTS_N_INSNS (5)}, /* other */
759 0, /* cost of multiply per each bit set */
760 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
761 COSTS_N_INSNS (35), /* HI */
762 COSTS_N_INSNS (51), /* SI */
763 COSTS_N_INSNS (83), /* DI */
764 COSTS_N_INSNS (83)}, /* other */
765 COSTS_N_INSNS (1), /* cost of movsx */
766 COSTS_N_INSNS (1), /* cost of movzx */
767 8, /* "large" insn */
768 9, /* MOVE_RATIO */
769 4, /* cost for loading QImode using movzbl */
770 {3, 4, 3}, /* cost of loading integer registers
771 in QImode, HImode and SImode.
772 Relative to reg-reg move (2). */
773 {3, 4, 3}, /* cost of storing integer registers */
774 4, /* cost of reg,reg fld/fst */
775 {4, 4, 12}, /* cost of loading fp registers
776 in SFmode, DFmode and XFmode */
777 {6, 6, 8}, /* cost of storing fp registers
778 in SFmode, DFmode and XFmode */
779 2, /* cost of moving MMX register */
780 {3, 3}, /* cost of loading MMX registers
781 in SImode and DImode */
782 {4, 4}, /* cost of storing MMX registers
783 in SImode and DImode */
784 2, /* cost of moving SSE register */
785 {4, 4, 3}, /* cost of loading SSE registers
786 in SImode, DImode and TImode */
787 {4, 4, 5}, /* cost of storing SSE registers
788 in SImode, DImode and TImode */
789 3, /* MMX or SSE register to integer */
790 /* On K8:
791 MOVD reg64, xmmreg Double FSTORE 4
792 MOVD reg32, xmmreg Double FSTORE 4
793 On AMDFAM10:
794 MOVD reg64, xmmreg Double FADD 3
795 1/1 1/1
796 MOVD reg32, xmmreg Double FADD 3
797 1/1 1/1 */
798 64, /* size of l1 cache. */
799 512, /* size of l2 cache. */
800 64, /* size of prefetch block */
801 /* New AMD processors never drop prefetches; if they cannot be performed
802 immediately, they are queued. We set the number of simultaneous prefetches
803 to a large constant to reflect this (it is probably not a good idea not
804 to limit the number of prefetches at all, as their execution also takes some
805 time). */
806 100, /* number of parallel prefetches */
807 2, /* Branch cost */
808 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
809 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
810 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
811 COSTS_N_INSNS (2), /* cost of FABS instruction. */
812 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
813 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
815 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
816 very small blocks it is better to use a loop. For large blocks, a libcall can
817 do nontemporal accesses and beat inline code considerably. */
818 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
819 {-1, rep_prefix_4_byte, false}}},
820 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
821 {-1, libcall, false}}}},
822 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
823 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
824 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
825 {-1, libcall, false}}}},
826 4, /* scalar_stmt_cost. */
827 2, /* scalar load_cost. */
828 2, /* scalar_store_cost. */
829 6, /* vec_stmt_cost. */
830 0, /* vec_to_scalar_cost. */
831 2, /* scalar_to_vec_cost. */
832 2, /* vec_align_load_cost. */
833 2, /* vec_unalign_load_cost. */
834 2, /* vec_store_cost. */
835 2, /* cond_taken_branch_cost. */
836 1, /* cond_not_taken_branch_cost. */
837 };
839 struct processor_costs bdver1_cost = {
840 COSTS_N_INSNS (1), /* cost of an add instruction */
841 COSTS_N_INSNS (1), /* cost of a lea instruction */
842 COSTS_N_INSNS (1), /* variable shift costs */
843 COSTS_N_INSNS (1), /* constant shift costs */
844 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
845 COSTS_N_INSNS (4), /* HI */
846 COSTS_N_INSNS (4), /* SI */
847 COSTS_N_INSNS (6), /* DI */
848 COSTS_N_INSNS (6)}, /* other */
849 0, /* cost of multiply per each bit set */
850 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
851 COSTS_N_INSNS (35), /* HI */
852 COSTS_N_INSNS (51), /* SI */
853 COSTS_N_INSNS (83), /* DI */
854 COSTS_N_INSNS (83)}, /* other */
855 COSTS_N_INSNS (1), /* cost of movsx */
856 COSTS_N_INSNS (1), /* cost of movzx */
857 8, /* "large" insn */
858 9, /* MOVE_RATIO */
859 4, /* cost for loading QImode using movzbl */
860 {5, 5, 4}, /* cost of loading integer registers
861 in QImode, HImode and SImode.
862 Relative to reg-reg move (2). */
863 {4, 4, 4}, /* cost of storing integer registers */
864 2, /* cost of reg,reg fld/fst */
865 {5, 5, 12}, /* cost of loading fp registers
866 in SFmode, DFmode and XFmode */
867 {4, 4, 8}, /* cost of storing fp registers
868 in SFmode, DFmode and XFmode */
869 2, /* cost of moving MMX register */
870 {4, 4}, /* cost of loading MMX registers
871 in SImode and DImode */
872 {4, 4}, /* cost of storing MMX registers
873 in SImode and DImode */
874 2, /* cost of moving SSE register */
875 {4, 4, 4}, /* cost of loading SSE registers
876 in SImode, DImode and TImode */
877 {4, 4, 4}, /* cost of storing SSE registers
878 in SImode, DImode and TImode */
879 2, /* MMX or SSE register to integer */
880 /* On K8:
881 MOVD reg64, xmmreg Double FSTORE 4
882 MOVD reg32, xmmreg Double FSTORE 4
883 On AMDFAM10:
884 MOVD reg64, xmmreg Double FADD 3
885 1/1 1/1
886 MOVD reg32, xmmreg Double FADD 3
887 1/1 1/1 */
888 16, /* size of l1 cache. */
889 2048, /* size of l2 cache. */
890 64, /* size of prefetch block */
891 /* New AMD processors never drop prefetches; if they cannot be performed
892 immediately, they are queued. We set the number of simultaneous prefetches
893 to a large constant to reflect this (it is probably not a good idea not
894 to limit the number of prefetches at all, as their execution also takes some
895 time). */
896 100, /* number of parallel prefetches */
897 2, /* Branch cost */
898 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
899 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
900 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
901 COSTS_N_INSNS (2), /* cost of FABS instruction. */
902 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
903 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
905 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
906 very small blocks it is better to use a loop. For large blocks, a libcall
907 can do nontemporal accesses and beat inline code considerably. */
908 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
909 {-1, rep_prefix_4_byte, false}}},
910 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
911 {-1, libcall, false}}}},
912 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
913 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
914 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
915 {-1, libcall, false}}}},
916 6, /* scalar_stmt_cost. */
917 4, /* scalar load_cost. */
918 4, /* scalar_store_cost. */
919 6, /* vec_stmt_cost. */
920 0, /* vec_to_scalar_cost. */
921 2, /* scalar_to_vec_cost. */
922 4, /* vec_align_load_cost. */
923 4, /* vec_unalign_load_cost. */
924 4, /* vec_store_cost. */
925 2, /* cond_taken_branch_cost. */
926 1, /* cond_not_taken_branch_cost. */
927 };
929 struct processor_costs bdver2_cost = {
930 COSTS_N_INSNS (1), /* cost of an add instruction */
931 COSTS_N_INSNS (1), /* cost of a lea instruction */
932 COSTS_N_INSNS (1), /* variable shift costs */
933 COSTS_N_INSNS (1), /* constant shift costs */
934 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
935 COSTS_N_INSNS (4), /* HI */
936 COSTS_N_INSNS (4), /* SI */
937 COSTS_N_INSNS (6), /* DI */
938 COSTS_N_INSNS (6)}, /* other */
939 0, /* cost of multiply per each bit set */
940 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
941 COSTS_N_INSNS (35), /* HI */
942 COSTS_N_INSNS (51), /* SI */
943 COSTS_N_INSNS (83), /* DI */
944 COSTS_N_INSNS (83)}, /* other */
945 COSTS_N_INSNS (1), /* cost of movsx */
946 COSTS_N_INSNS (1), /* cost of movzx */
947 8, /* "large" insn */
948 9, /* MOVE_RATIO */
949 4, /* cost for loading QImode using movzbl */
950 {5, 5, 4}, /* cost of loading integer registers
951 in QImode, HImode and SImode.
952 Relative to reg-reg move (2). */
953 {4, 4, 4}, /* cost of storing integer registers */
954 2, /* cost of reg,reg fld/fst */
955 {5, 5, 12}, /* cost of loading fp registers
956 in SFmode, DFmode and XFmode */
957 {4, 4, 8}, /* cost of storing fp registers
958 in SFmode, DFmode and XFmode */
959 2, /* cost of moving MMX register */
960 {4, 4}, /* cost of loading MMX registers
961 in SImode and DImode */
962 {4, 4}, /* cost of storing MMX registers
963 in SImode and DImode */
964 2, /* cost of moving SSE register */
965 {4, 4, 4}, /* cost of loading SSE registers
966 in SImode, DImode and TImode */
967 {4, 4, 4}, /* cost of storing SSE registers
968 in SImode, DImode and TImode */
969 2, /* MMX or SSE register to integer */
970 /* On K8:
971 MOVD reg64, xmmreg Double FSTORE 4
972 MOVD reg32, xmmreg Double FSTORE 4
973 On AMDFAM10:
974 MOVD reg64, xmmreg Double FADD 3
975 1/1 1/1
976 MOVD reg32, xmmreg Double FADD 3
977 1/1 1/1 */
978 16, /* size of l1 cache. */
979 2048, /* size of l2 cache. */
980 64, /* size of prefetch block */
981 /* New AMD processors never drop prefetches; if they cannot be performed
982 immediately, they are queued. We set the number of simultaneous prefetches
983 to a large constant to reflect this (it is probably not a good idea not
984 to limit the number of prefetches at all, as their execution also takes some
985 time). */
986 100, /* number of parallel prefetches */
987 2, /* Branch cost */
988 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
989 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
990 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
991 COSTS_N_INSNS (2), /* cost of FABS instruction. */
992 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
993 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
995 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
996 very small blocks it is better to use a loop. For large blocks, a libcall
997 can do nontemporal accesses and beat inline code considerably. */
998 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
999 {-1, rep_prefix_4_byte, false}}},
1000 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1001 {-1, libcall, false}}}},
1002 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1003 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1004 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1005 {-1, libcall, false}}}},
1006 6, /* scalar_stmt_cost. */
1007 4, /* scalar load_cost. */
1008 4, /* scalar_store_cost. */
1009 6, /* vec_stmt_cost. */
1010 0, /* vec_to_scalar_cost. */
1011 2, /* scalar_to_vec_cost. */
1012 4, /* vec_align_load_cost. */
1013 4, /* vec_unalign_load_cost. */
1014 4, /* vec_store_cost. */
1015 2, /* cond_taken_branch_cost. */
1016 1, /* cond_not_taken_branch_cost. */
1017 };
1019 struct processor_costs bdver3_cost = {
1020 COSTS_N_INSNS (1), /* cost of an add instruction */
1021 COSTS_N_INSNS (1), /* cost of a lea instruction */
1022 COSTS_N_INSNS (1), /* variable shift costs */
1023 COSTS_N_INSNS (1), /* constant shift costs */
1024 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1025 COSTS_N_INSNS (4), /* HI */
1026 COSTS_N_INSNS (4), /* SI */
1027 COSTS_N_INSNS (6), /* DI */
1028 COSTS_N_INSNS (6)}, /* other */
1029 0, /* cost of multiply per each bit set */
1030 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1031 COSTS_N_INSNS (35), /* HI */
1032 COSTS_N_INSNS (51), /* SI */
1033 COSTS_N_INSNS (83), /* DI */
1034 COSTS_N_INSNS (83)}, /* other */
1035 COSTS_N_INSNS (1), /* cost of movsx */
1036 COSTS_N_INSNS (1), /* cost of movzx */
1037 8, /* "large" insn */
1038 9, /* MOVE_RATIO */
1039 4, /* cost for loading QImode using movzbl */
1040 {5, 5, 4}, /* cost of loading integer registers
1041 in QImode, HImode and SImode.
1042 Relative to reg-reg move (2). */
1043 {4, 4, 4}, /* cost of storing integer registers */
1044 2, /* cost of reg,reg fld/fst */
1045 {5, 5, 12}, /* cost of loading fp registers
1046 in SFmode, DFmode and XFmode */
1047 {4, 4, 8}, /* cost of storing fp registers
1048 in SFmode, DFmode and XFmode */
1049 2, /* cost of moving MMX register */
1050 {4, 4}, /* cost of loading MMX registers
1051 in SImode and DImode */
1052 {4, 4}, /* cost of storing MMX registers
1053 in SImode and DImode */
1054 2, /* cost of moving SSE register */
1055 {4, 4, 4}, /* cost of loading SSE registers
1056 in SImode, DImode and TImode */
1057 {4, 4, 4}, /* cost of storing SSE registers
1058 in SImode, DImode and TImode */
1059 2, /* MMX or SSE register to integer */
1060 16, /* size of l1 cache. */
1061 2048, /* size of l2 cache. */
1062 64, /* size of prefetch block */
1063 /* New AMD processors never drop prefetches; if they cannot be performed
1064 immediately, they are queued. We set the number of simultaneous prefetches
1065 to a large constant to reflect this (it is probably not a good idea not
1066 to limit the number of prefetches at all, as their execution also takes some
1067 time). */
1068 100, /* number of parallel prefetches */
1069 2, /* Branch cost */
1070 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1071 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1072 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1073 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1074 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1075 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1077 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1078 very small blocks it is better to use a loop. For large blocks, a libcall
1079 can do nontemporal accesses and beat inline code considerably. */
1080 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1081 {-1, rep_prefix_4_byte, false}}},
1082 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1083 {-1, libcall, false}}}},
1084 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1085 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1086 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}},
1088 6, /* scalar_stmt_cost. */
1089 4, /* scalar load_cost. */
1090 4, /* scalar_store_cost. */
1091 6, /* vec_stmt_cost. */
1092 0, /* vec_to_scalar_cost. */
1093 2, /* scalar_to_vec_cost. */
1094 4, /* vec_align_load_cost. */
1095 4, /* vec_unalign_load_cost. */
1096 4, /* vec_store_cost. */
1097 2, /* cond_taken_branch_cost. */
1098 1, /* cond_not_taken_branch_cost. */
1099 };
1101 struct processor_costs btver1_cost = {
1102 COSTS_N_INSNS (1), /* cost of an add instruction */
1103 COSTS_N_INSNS (2), /* cost of a lea instruction */
1104 COSTS_N_INSNS (1), /* variable shift costs */
1105 COSTS_N_INSNS (1), /* constant shift costs */
1106 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1107 COSTS_N_INSNS (4), /* HI */
1108 COSTS_N_INSNS (3), /* SI */
1109 COSTS_N_INSNS (4), /* DI */
1110 COSTS_N_INSNS (5)}, /* other */
1111 0, /* cost of multiply per each bit set */
1112 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1113 COSTS_N_INSNS (35), /* HI */
1114 COSTS_N_INSNS (51), /* SI */
1115 COSTS_N_INSNS (83), /* DI */
1116 COSTS_N_INSNS (83)}, /* other */
1117 COSTS_N_INSNS (1), /* cost of movsx */
1118 COSTS_N_INSNS (1), /* cost of movzx */
1119 8, /* "large" insn */
1120 9, /* MOVE_RATIO */
1121 4, /* cost for loading QImode using movzbl */
1122 {3, 4, 3}, /* cost of loading integer registers
1123 in QImode, HImode and SImode.
1124 Relative to reg-reg move (2). */
1125 {3, 4, 3}, /* cost of storing integer registers */
1126 4, /* cost of reg,reg fld/fst */
1127 {4, 4, 12}, /* cost of loading fp registers
1128 in SFmode, DFmode and XFmode */
1129 {6, 6, 8}, /* cost of storing fp registers
1130 in SFmode, DFmode and XFmode */
1131 2, /* cost of moving MMX register */
1132 {3, 3}, /* cost of loading MMX registers
1133 in SImode and DImode */
1134 {4, 4}, /* cost of storing MMX registers
1135 in SImode and DImode */
1136 2, /* cost of moving SSE register */
1137 {4, 4, 3}, /* cost of loading SSE registers
1138 in SImode, DImode and TImode */
1139 {4, 4, 5}, /* cost of storing SSE registers
1140 in SImode, DImode and TImode */
1141 3, /* MMX or SSE register to integer */
1142 /* On K8:
1143 MOVD reg64, xmmreg Double FSTORE 4
1144 MOVD reg32, xmmreg Double FSTORE 4
1145 On AMDFAM10:
1146 MOVD reg64, xmmreg Double FADD 3
1147 1/1 1/1
1148 MOVD reg32, xmmreg Double FADD 3
1149 1/1 1/1 */
1150 32, /* size of l1 cache. */
1151 512, /* size of l2 cache. */
1152 64, /* size of prefetch block */
1153 100, /* number of parallel prefetches */
1154 2, /* Branch cost */
1155 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1156 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1157 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1158 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1159 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1160 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1162 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1163 very small blocks it is better to use a loop. For large blocks, a libcall can
1164 do nontemporal accesses and beat inline code considerably. */
1165 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1166 {-1, rep_prefix_4_byte, false}}},
1167 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1168 {-1, libcall, false}}}},
1169 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1170 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1171 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1172 {-1, libcall, false}}}},
1173 4, /* scalar_stmt_cost. */
1174 2, /* scalar load_cost. */
1175 2, /* scalar_store_cost. */
1176 6, /* vec_stmt_cost. */
1177 0, /* vec_to_scalar_cost. */
1178 2, /* scalar_to_vec_cost. */
1179 2, /* vec_align_load_cost. */
1180 2, /* vec_unalign_load_cost. */
1181 2, /* vec_store_cost. */
1182 2, /* cond_taken_branch_cost. */
1183 1, /* cond_not_taken_branch_cost. */
1184 };
1186 struct processor_costs btver2_cost = {
1187 COSTS_N_INSNS (1), /* cost of an add instruction */
1188 COSTS_N_INSNS (2), /* cost of a lea instruction */
1189 COSTS_N_INSNS (1), /* variable shift costs */
1190 COSTS_N_INSNS (1), /* constant shift costs */
1191 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1192 COSTS_N_INSNS (4), /* HI */
1193 COSTS_N_INSNS (3), /* SI */
1194 COSTS_N_INSNS (4), /* DI */
1195 COSTS_N_INSNS (5)}, /* other */
1196 0, /* cost of multiply per each bit set */
1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1198 COSTS_N_INSNS (35), /* HI */
1199 COSTS_N_INSNS (51), /* SI */
1200 COSTS_N_INSNS (83), /* DI */
1201 COSTS_N_INSNS (83)}, /* other */
1202 COSTS_N_INSNS (1), /* cost of movsx */
1203 COSTS_N_INSNS (1), /* cost of movzx */
1204 8, /* "large" insn */
1205 9, /* MOVE_RATIO */
1206 4, /* cost for loading QImode using movzbl */
1207 {3, 4, 3}, /* cost of loading integer registers
1208 in QImode, HImode and SImode.
1209 Relative to reg-reg move (2). */
1210 {3, 4, 3}, /* cost of storing integer registers */
1211 4, /* cost of reg,reg fld/fst */
1212 {4, 4, 12}, /* cost of loading fp registers
1213 in SFmode, DFmode and XFmode */
1214 {6, 6, 8}, /* cost of storing fp registers
1215 in SFmode, DFmode and XFmode */
1216 2, /* cost of moving MMX register */
1217 {3, 3}, /* cost of loading MMX registers
1218 in SImode and DImode */
1219 {4, 4}, /* cost of storing MMX registers
1220 in SImode and DImode */
1221 2, /* cost of moving SSE register */
1222 {4, 4, 3}, /* cost of loading SSE registers
1223 in SImode, DImode and TImode */
1224 {4, 4, 5}, /* cost of storing SSE registers
1225 in SImode, DImode and TImode */
1226 3, /* MMX or SSE register to integer */
1227 /* On K8:
1228 MOVD reg64, xmmreg Double FSTORE 4
1229 MOVD reg32, xmmreg Double FSTORE 4
1230 On AMDFAM10:
1231 MOVD reg64, xmmreg Double FADD 3
1232 1/1 1/1
1233 MOVD reg32, xmmreg Double FADD 3
1234 1/1 1/1 */
1235 32, /* size of l1 cache. */
1236 2048, /* size of l2 cache. */
1237 64, /* size of prefetch block */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1247 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1248 {-1, rep_prefix_4_byte, false}}},
1249 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1250 {-1, libcall, false}}}},
1251 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1252 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1253 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1254 {-1, libcall, false}}}},
1255 4, /* scalar_stmt_cost. */
1256 2, /* scalar load_cost. */
1257 2, /* scalar_store_cost. */
1258 6, /* vec_stmt_cost. */
1259 0, /* vec_to_scalar_cost. */
1260 2, /* scalar_to_vec_cost. */
1261 2, /* vec_align_load_cost. */
1262 2, /* vec_unalign_load_cost. */
1263 2, /* vec_store_cost. */
1264 2, /* cond_taken_branch_cost. */
1265 1, /* cond_not_taken_branch_cost. */
1266 };
1268 static const
1269 struct processor_costs pentium4_cost = {
1270 COSTS_N_INSNS (1), /* cost of an add instruction */
1271 COSTS_N_INSNS (3), /* cost of a lea instruction */
1272 COSTS_N_INSNS (4), /* variable shift costs */
1273 COSTS_N_INSNS (4), /* constant shift costs */
1274 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1275 COSTS_N_INSNS (15), /* HI */
1276 COSTS_N_INSNS (15), /* SI */
1277 COSTS_N_INSNS (15), /* DI */
1278 COSTS_N_INSNS (15)}, /* other */
1279 0, /* cost of multiply per each bit set */
1280 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1281 COSTS_N_INSNS (56), /* HI */
1282 COSTS_N_INSNS (56), /* SI */
1283 COSTS_N_INSNS (56), /* DI */
1284 COSTS_N_INSNS (56)}, /* other */
1285 COSTS_N_INSNS (1), /* cost of movsx */
1286 COSTS_N_INSNS (1), /* cost of movzx */
1287 16, /* "large" insn */
1288 6, /* MOVE_RATIO */
1289 2, /* cost for loading QImode using movzbl */
1290 {4, 5, 4}, /* cost of loading integer registers
1291 in QImode, HImode and SImode.
1292 Relative to reg-reg move (2). */
1293 {2, 3, 2}, /* cost of storing integer registers */
1294 2, /* cost of reg,reg fld/fst */
1295 {2, 2, 6}, /* cost of loading fp registers
1296 in SFmode, DFmode and XFmode */
1297 {4, 4, 6}, /* cost of storing fp registers
1298 in SFmode, DFmode and XFmode */
1299 2, /* cost of moving MMX register */
1300 {2, 2}, /* cost of loading MMX registers
1301 in SImode and DImode */
1302 {2, 2}, /* cost of storing MMX registers
1303 in SImode and DImode */
1304 12, /* cost of moving SSE register */
1305 {12, 12, 12}, /* cost of loading SSE registers
1306 in SImode, DImode and TImode */
1307 {2, 2, 8}, /* cost of storing SSE registers
1308 in SImode, DImode and TImode */
1309 10, /* MMX or SSE register to integer */
1310 8, /* size of l1 cache. */
1311 256, /* size of l2 cache. */
1312 64, /* size of prefetch block */
1313 6, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1321 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1322 DUMMY_STRINGOP_ALGS},
1323 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1324 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1325 DUMMY_STRINGOP_ALGS},
1326 1, /* scalar_stmt_cost. */
1327 1, /* scalar load_cost. */
1328 1, /* scalar_store_cost. */
1329 1, /* vec_stmt_cost. */
1330 1, /* vec_to_scalar_cost. */
1331 1, /* scalar_to_vec_cost. */
1332 1, /* vec_align_load_cost. */
1333 2, /* vec_unalign_load_cost. */
1334 1, /* vec_store_cost. */
1335 3, /* cond_taken_branch_cost. */
1336 1, /* cond_not_taken_branch_cost. */
1337 };
1339 static const
1340 struct processor_costs nocona_cost = {
1341 COSTS_N_INSNS (1), /* cost of an add instruction */
1342 COSTS_N_INSNS (1), /* cost of a lea instruction */
1343 COSTS_N_INSNS (1), /* variable shift costs */
1344 COSTS_N_INSNS (1), /* constant shift costs */
1345 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1346 COSTS_N_INSNS (10), /* HI */
1347 COSTS_N_INSNS (10), /* SI */
1348 COSTS_N_INSNS (10), /* DI */
1349 COSTS_N_INSNS (10)}, /* other */
1350 0, /* cost of multiply per each bit set */
1351 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1352 COSTS_N_INSNS (66), /* HI */
1353 COSTS_N_INSNS (66), /* SI */
1354 COSTS_N_INSNS (66), /* DI */
1355 COSTS_N_INSNS (66)}, /* other */
1356 COSTS_N_INSNS (1), /* cost of movsx */
1357 COSTS_N_INSNS (1), /* cost of movzx */
1358 16, /* "large" insn */
1359 17, /* MOVE_RATIO */
1360 4, /* cost for loading QImode using movzbl */
1361 {4, 4, 4}, /* cost of loading integer registers
1362 in QImode, HImode and SImode.
1363 Relative to reg-reg move (2). */
1364 {4, 4, 4}, /* cost of storing integer registers */
1365 3, /* cost of reg,reg fld/fst */
1366 {12, 12, 12}, /* cost of loading fp registers
1367 in SFmode, DFmode and XFmode */
1368 {4, 4, 4}, /* cost of storing fp registers
1369 in SFmode, DFmode and XFmode */
1370 6, /* cost of moving MMX register */
1371 {12, 12}, /* cost of loading MMX registers
1372 in SImode and DImode */
1373 {12, 12}, /* cost of storing MMX registers
1374 in SImode and DImode */
1375 6, /* cost of moving SSE register */
1376 {12, 12, 12}, /* cost of loading SSE registers
1377 in SImode, DImode and TImode */
1378 {12, 12, 12}, /* cost of storing SSE registers
1379 in SImode, DImode and TImode */
1380 8, /* MMX or SSE register to integer */
1381 8, /* size of l1 cache. */
1382 1024, /* size of l2 cache. */
1383 128, /* size of prefetch block */
1384 8, /* number of parallel prefetches */
1385 1, /* Branch cost */
1386 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1387 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1388 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1389 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1390 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1391 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1392 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1393 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1394 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1395 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1396 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1397 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1398 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1399 1, /* scalar_stmt_cost. */
1400 1, /* scalar load_cost. */
1401 1, /* scalar_store_cost. */
1402 1, /* vec_stmt_cost. */
1403 1, /* vec_to_scalar_cost. */
1404 1, /* scalar_to_vec_cost. */
1405 1, /* vec_align_load_cost. */
1406 2, /* vec_unalign_load_cost. */
1407 1, /* vec_store_cost. */
1408 3, /* cond_taken_branch_cost. */
1409 1, /* cond_not_taken_branch_cost. */
1412 static const
1413 struct processor_costs atom_cost = {
1414 COSTS_N_INSNS (1), /* cost of an add instruction */
1415 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1416 COSTS_N_INSNS (1), /* variable shift costs */
1417 COSTS_N_INSNS (1), /* constant shift costs */
1418 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1419 COSTS_N_INSNS (4), /* HI */
1420 COSTS_N_INSNS (3), /* SI */
1421 COSTS_N_INSNS (4), /* DI */
1422 COSTS_N_INSNS (2)}, /* other */
1423 0, /* cost of multiply per each bit set */
1424 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1425 COSTS_N_INSNS (26), /* HI */
1426 COSTS_N_INSNS (42), /* SI */
1427 COSTS_N_INSNS (74), /* DI */
1428 COSTS_N_INSNS (74)}, /* other */
1429 COSTS_N_INSNS (1), /* cost of movsx */
1430 COSTS_N_INSNS (1), /* cost of movzx */
1431 8, /* "large" insn */
1432 17, /* MOVE_RATIO */
1433 4, /* cost for loading QImode using movzbl */
1434 {4, 4, 4}, /* cost of loading integer registers
1435 in QImode, HImode and SImode.
1436 Relative to reg-reg move (2). */
1437 {4, 4, 4}, /* cost of storing integer registers */
1438 4, /* cost of reg,reg fld/fst */
1439 {12, 12, 12}, /* cost of loading fp registers
1440 in SFmode, DFmode and XFmode */
1441 {6, 6, 8}, /* cost of storing fp registers
1442 in SFmode, DFmode and XFmode */
1443 2, /* cost of moving MMX register */
1444 {8, 8}, /* cost of loading MMX registers
1445 in SImode and DImode */
1446 {8, 8}, /* cost of storing MMX registers
1447 in SImode and DImode */
1448 2, /* cost of moving SSE register */
1449 {8, 8, 8}, /* cost of loading SSE registers
1450 in SImode, DImode and TImode */
1451 {8, 8, 8}, /* cost of storing SSE registers
1452 in SImode, DImode and TImode */
1453 5, /* MMX or SSE register to integer */
1454 32, /* size of l1 cache. */
1455 256, /* size of l2 cache. */
1456 64, /* size of prefetch block */
1457 6, /* number of parallel prefetches */
1458 3, /* Branch cost */
1459 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1460 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1461 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1462 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1463 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1464 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1465 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1466 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1467 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1468 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1469 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1470 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1471 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1472 1, /* scalar_stmt_cost. */
1473 1, /* scalar_load_cost. */
1474 1, /* scalar_store_cost. */
1475 1, /* vec_stmt_cost. */
1476 1, /* vec_to_scalar_cost. */
1477 1, /* scalar_to_vec_cost. */
1478 1, /* vec_align_load_cost. */
1479 2, /* vec_unalign_load_cost. */
1480 1, /* vec_store_cost. */
1481 3, /* cond_taken_branch_cost. */
1482 1, /* cond_not_taken_branch_cost. */
1485 /* Generic64 should produce code tuned for Nocona and K8. */
1486 static const
1487 struct processor_costs generic64_cost = {
1488 COSTS_N_INSNS (1), /* cost of an add instruction */
1489 /* On all chips taken into consideration lea is 2 cycles or more. With
1490 this cost, however, our current implementation of synth_mult results in
1491 the use of unnecessary temporary registers, causing regressions on several
1492 SPECfp benchmarks. */
1493 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1494 COSTS_N_INSNS (1), /* variable shift costs */
1495 COSTS_N_INSNS (1), /* constant shift costs */
1496 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1497 COSTS_N_INSNS (4), /* HI */
1498 COSTS_N_INSNS (3), /* SI */
1499 COSTS_N_INSNS (4), /* DI */
1500 COSTS_N_INSNS (2)}, /* other */
1501 0, /* cost of multiply per each bit set */
1502 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1503 COSTS_N_INSNS (26), /* HI */
1504 COSTS_N_INSNS (42), /* SI */
1505 COSTS_N_INSNS (74), /* DI */
1506 COSTS_N_INSNS (74)}, /* other */
1507 COSTS_N_INSNS (1), /* cost of movsx */
1508 COSTS_N_INSNS (1), /* cost of movzx */
1509 8, /* "large" insn */
1510 17, /* MOVE_RATIO */
1511 4, /* cost for loading QImode using movzbl */
1512 {4, 4, 4}, /* cost of loading integer registers
1513 in QImode, HImode and SImode.
1514 Relative to reg-reg move (2). */
1515 {4, 4, 4}, /* cost of storing integer registers */
1516 4, /* cost of reg,reg fld/fst */
1517 {12, 12, 12}, /* cost of loading fp registers
1518 in SFmode, DFmode and XFmode */
1519 {6, 6, 8}, /* cost of storing fp registers
1520 in SFmode, DFmode and XFmode */
1521 2, /* cost of moving MMX register */
1522 {8, 8}, /* cost of loading MMX registers
1523 in SImode and DImode */
1524 {8, 8}, /* cost of storing MMX registers
1525 in SImode and DImode */
1526 2, /* cost of moving SSE register */
1527 {8, 8, 8}, /* cost of loading SSE registers
1528 in SImode, DImode and TImode */
1529 {8, 8, 8}, /* cost of storing SSE registers
1530 in SImode, DImode and TImode */
1531 5, /* MMX or SSE register to integer */
1532 32, /* size of l1 cache. */
1533 512, /* size of l2 cache. */
1534 64, /* size of prefetch block */
1535 6, /* number of parallel prefetches */
1536 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1537 value is increased to the perhaps more appropriate value of 5. */
1538 3, /* Branch cost */
1539 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1540 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1541 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1542 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1543 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1544 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1545 {DUMMY_STRINGOP_ALGS,
1546 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1547 {-1, libcall, false}}}},
1548 {DUMMY_STRINGOP_ALGS,
1549 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1550 {-1, libcall, false}}}},
1551 1, /* scalar_stmt_cost. */
1552 1, /* scalar_load_cost. */
1553 1, /* scalar_store_cost. */
1554 1, /* vec_stmt_cost. */
1555 1, /* vec_to_scalar_cost. */
1556 1, /* scalar_to_vec_cost. */
1557 1, /* vec_align_load_cost. */
1558 2, /* vec_unalign_load_cost. */
1559 1, /* vec_store_cost. */
1560 3, /* cond_taken_branch_cost. */
1561 1, /* cond_not_taken_branch_cost. */
1564 /* core_cost should produce code tuned for the Core family of CPUs. */
1565 static const
1566 struct processor_costs core_cost = {
1567 COSTS_N_INSNS (1), /* cost of an add instruction */
1568 /* On all chips taken into consideration lea is 2 cycles or more. With
1569 this cost, however, our current implementation of synth_mult results in
1570 the use of unnecessary temporary registers, causing regressions on several
1571 SPECfp benchmarks. */
1572 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1573 COSTS_N_INSNS (1), /* variable shift costs */
1574 COSTS_N_INSNS (1), /* constant shift costs */
1575 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1576 COSTS_N_INSNS (4), /* HI */
1577 COSTS_N_INSNS (3), /* SI */
1578 COSTS_N_INSNS (4), /* DI */
1579 COSTS_N_INSNS (2)}, /* other */
1580 0, /* cost of multiply per each bit set */
1581 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1582 COSTS_N_INSNS (26), /* HI */
1583 COSTS_N_INSNS (42), /* SI */
1584 COSTS_N_INSNS (74), /* DI */
1585 COSTS_N_INSNS (74)}, /* other */
1586 COSTS_N_INSNS (1), /* cost of movsx */
1587 COSTS_N_INSNS (1), /* cost of movzx */
1588 8, /* "large" insn */
1589 17, /* MOVE_RATIO */
1590 4, /* cost for loading QImode using movzbl */
1591 {4, 4, 4}, /* cost of loading integer registers
1592 in QImode, HImode and SImode.
1593 Relative to reg-reg move (2). */
1594 {4, 4, 4}, /* cost of storing integer registers */
1595 4, /* cost of reg,reg fld/fst */
1596 {12, 12, 12}, /* cost of loading fp registers
1597 in SFmode, DFmode and XFmode */
1598 {6, 6, 8}, /* cost of storing fp registers
1599 in SFmode, DFmode and XFmode */
1600 2, /* cost of moving MMX register */
1601 {8, 8}, /* cost of loading MMX registers
1602 in SImode and DImode */
1603 {8, 8}, /* cost of storing MMX registers
1604 in SImode and DImode */
1605 2, /* cost of moving SSE register */
1606 {8, 8, 8}, /* cost of loading SSE registers
1607 in SImode, DImode and TImode */
1608 {8, 8, 8}, /* cost of storing SSE registers
1609 in SImode, DImode and TImode */
1610 5, /* MMX or SSE register to integer */
1611 64, /* size of l1 cache. */
1612 512, /* size of l2 cache. */
1613 64, /* size of prefetch block */
1614 6, /* number of parallel prefetches */
1615 /* FIXME: perhaps a more appropriate value is 5. */
1616 3, /* Branch cost */
1617 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1618 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1619 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1620 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1621 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1622 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1623 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1624 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1625 {-1, libcall, false}}}},
1626 {{libcall, {{6, loop_1_byte, true},
1627 {24, loop, true},
1628 {8192, rep_prefix_4_byte, true},
1629 {-1, libcall, false}}},
1630 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1631 {-1, libcall, false}}}},
1632 1, /* scalar_stmt_cost. */
1633 1, /* scalar_load_cost. */
1634 1, /* scalar_store_cost. */
1635 1, /* vec_stmt_cost. */
1636 1, /* vec_to_scalar_cost. */
1637 1, /* scalar_to_vec_cost. */
1638 1, /* vec_align_load_cost. */
1639 2, /* vec_unalign_load_cost. */
1640 1, /* vec_store_cost. */
1641 3, /* cond_taken_branch_cost. */
1642 1, /* cond_not_taken_branch_cost. */
1645 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1646 Athlon and K8. */
1647 static const
1648 struct processor_costs generic32_cost = {
1649 COSTS_N_INSNS (1), /* cost of an add instruction */
1650 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1651 COSTS_N_INSNS (1), /* variable shift costs */
1652 COSTS_N_INSNS (1), /* constant shift costs */
1653 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1654 COSTS_N_INSNS (4), /* HI */
1655 COSTS_N_INSNS (3), /* SI */
1656 COSTS_N_INSNS (4), /* DI */
1657 COSTS_N_INSNS (2)}, /* other */
1658 0, /* cost of multiply per each bit set */
1659 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1660 COSTS_N_INSNS (26), /* HI */
1661 COSTS_N_INSNS (42), /* SI */
1662 COSTS_N_INSNS (74), /* DI */
1663 COSTS_N_INSNS (74)}, /* other */
1664 COSTS_N_INSNS (1), /* cost of movsx */
1665 COSTS_N_INSNS (1), /* cost of movzx */
1666 8, /* "large" insn */
1667 17, /* MOVE_RATIO */
1668 4, /* cost for loading QImode using movzbl */
1669 {4, 4, 4}, /* cost of loading integer registers
1670 in QImode, HImode and SImode.
1671 Relative to reg-reg move (2). */
1672 {4, 4, 4}, /* cost of storing integer registers */
1673 4, /* cost of reg,reg fld/fst */
1674 {12, 12, 12}, /* cost of loading fp registers
1675 in SFmode, DFmode and XFmode */
1676 {6, 6, 8}, /* cost of storing fp registers
1677 in SFmode, DFmode and XFmode */
1678 2, /* cost of moving MMX register */
1679 {8, 8}, /* cost of loading MMX registers
1680 in SImode and DImode */
1681 {8, 8}, /* cost of storing MMX registers
1682 in SImode and DImode */
1683 2, /* cost of moving SSE register */
1684 {8, 8, 8}, /* cost of loading SSE registers
1685 in SImode, DImode and TImode */
1686 {8, 8, 8}, /* cost of storing SSE registers
1687 in SImode, DImode and TImode */
1688 5, /* MMX or SSE register to integer */
1689 32, /* size of l1 cache. */
1690 256, /* size of l2 cache. */
1691 64, /* size of prefetch block */
1692 6, /* number of parallel prefetches */
1693 3, /* Branch cost */
1694 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1695 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1696 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1697 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1698 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1699 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1700 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1701 {-1, libcall, false}}},
1702 DUMMY_STRINGOP_ALGS},
1703 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1704 {-1, libcall, false}}},
1705 DUMMY_STRINGOP_ALGS},
1706 1, /* scalar_stmt_cost. */
1707 1, /* scalar_load_cost. */
1708 1, /* scalar_store_cost. */
1709 1, /* vec_stmt_cost. */
1710 1, /* vec_to_scalar_cost. */
1711 1, /* scalar_to_vec_cost. */
1712 1, /* vec_align_load_cost. */
1713 2, /* vec_unalign_load_cost. */
1714 1, /* vec_store_cost. */
1715 3, /* cond_taken_branch_cost. */
1716 1, /* cond_not_taken_branch_cost. */
1719 /* Set by -mtune. */
1720 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1722 /* Set by -mtune or -Os. */
1723 const struct processor_costs *ix86_cost = &pentium_cost;
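/* Illustrative sketch (not part of the original file): the cost tables above
   are consulted through the ix86_cost / ix86_tune_cost pointers, for example
   when computing RTX costs.  Field names follow the processor_costs layout
   declared in i386.h; this only shows the access pattern, not the actual
   ix86_rtx_costs logic.  */
#if 0
static int
example_mult_cost (enum machine_mode mode, unsigned HOST_WIDE_INT bits_set)
{
  /* Start-up cost of a multiply in MODE plus the per-set-bit component.  */
  return ix86_cost->mult_init[MODE_INDEX (mode)]
	 + ix86_cost->mult_bit * bits_set;
}
#endif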
1725 /* Processor feature/optimization bitmasks. */
1726 #define m_386 (1<<PROCESSOR_I386)
1727 #define m_486 (1<<PROCESSOR_I486)
1728 #define m_PENT (1<<PROCESSOR_PENTIUM)
1729 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1730 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1731 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1732 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1733 #define m_CORE2 (1<<PROCESSOR_CORE2)
1734 #define m_COREI7 (1<<PROCESSOR_COREI7)
1735 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1736 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1737 #define m_ATOM (1<<PROCESSOR_ATOM)
1739 #define m_GEODE (1<<PROCESSOR_GEODE)
1740 #define m_K6 (1<<PROCESSOR_K6)
1741 #define m_K6_GEODE (m_K6 | m_GEODE)
1742 #define m_K8 (1<<PROCESSOR_K8)
1743 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1744 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1745 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1746 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1747 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1748 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1749 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1750 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1751 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1752 #define m_BTVER (m_BTVER1 | m_BTVER2)
1753 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1755 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1756 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1758 /* Generic instruction choice should be a common subset of the supported CPUs
1759 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1760 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1762 /* Feature tests against the various tunings. */
1763 unsigned char ix86_tune_features[X86_TUNE_LAST];
1765 /* Feature tests against the various tunings used to create ix86_tune_features
1766 based on the processor mask. */
1767 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1768 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1769 negatively, so enabling it for Generic64 seems like a good code size
1770 tradeoff. We can't enable it for 32bit generic because it does not
1771 work well with PPro based chips. */
1772 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1774 /* X86_TUNE_PUSH_MEMORY */
1775 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1777 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1778 m_486 | m_PENT,
1780 /* X86_TUNE_UNROLL_STRLEN */
1781 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1783 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
1784 on simulation results, but after the P4 was made no performance benefit
1785 was observed from branch hints, and they also increase code size.
1786 As a result, icc never generates branch hints. */
1789 /* X86_TUNE_DOUBLE_WITH_ADD */
1790 ~m_386,
1792 /* X86_TUNE_USE_SAHF */
1793 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1795 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1796 partial dependencies. */
1797 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1799 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1800 register stalls on the Generic32 compilation setting as well. However,
1801 in the current implementation the partial register stalls are not eliminated
1802 very well - they can be introduced via subregs synthesized by combine
1803 and can happen in caller/callee saving sequences. Because this option
1804 pays back little on PPro based chips and is in conflict with partial reg
1805 dependencies used by Athlon/P4 based chips, it is better to leave it off
1806 for generic32 for now. */
1807 m_PPRO,
1809 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1810 m_CORE_ALL | m_GENERIC,
1812 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1813 on 16-bit immediate moves into memory on Core2 and Corei7. */
1814 m_CORE_ALL | m_GENERIC,
1816 /* X86_TUNE_USE_HIMODE_FIOP */
1817 m_386 | m_486 | m_K6_GEODE,
1819 /* X86_TUNE_USE_SIMODE_FIOP */
1820 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1822 /* X86_TUNE_USE_MOV0 */
1823 m_K6,
1825 /* X86_TUNE_USE_CLTD */
1826 ~(m_PENT | m_ATOM | m_K6),
1828 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1829 m_PENT4,
1831 /* X86_TUNE_SPLIT_LONG_MOVES */
1832 m_PPRO,
1834 /* X86_TUNE_READ_MODIFY_WRITE */
1835 ~m_PENT,
1837 /* X86_TUNE_READ_MODIFY */
1838 ~(m_PENT | m_PPRO),
1840 /* X86_TUNE_PROMOTE_QIMODE */
1841 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1843 /* X86_TUNE_FAST_PREFIX */
1844 ~(m_386 | m_486 | m_PENT),
1846 /* X86_TUNE_SINGLE_STRINGOP */
1847 m_386 | m_P4_NOCONA,
1849 /* X86_TUNE_QIMODE_MATH */
1852 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1853 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1854 might be considered for Generic32 if our scheme for avoiding partial
1855 stalls were more effective. */
1856 ~m_PPRO,
1858 /* X86_TUNE_PROMOTE_QI_REGS */
1861 /* X86_TUNE_PROMOTE_HI_REGS */
1862 m_PPRO,
1864 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1865 over esp addition. */
1866 m_386 | m_486 | m_PENT | m_PPRO,
1868 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1869 over esp addition. */
1870 m_PENT,
1872 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1873 over esp subtraction. */
1874 m_386 | m_486 | m_PENT | m_K6_GEODE,
1876 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1877 over esp subtraction. */
1878 m_PENT | m_K6_GEODE,
1880 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1881 for DFmode copies */
1882 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1884 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1885 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1887 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1888 conflict here between PPro/Pentium4 based chips that treat 128bit
1889 SSE registers as single units versus K8 based chips that divide SSE
1890 registers into two 64bit halves. This knob promotes all store destinations
1891 to be 128bit to allow register renaming on 128bit SSE units, but usually
1892 results in one extra microop on 64bit SSE units. Experimental results
1893 show that disabling this option on P4 brings over a 20% SPECfp regression,
1894 while enabling it on K8 brings roughly a 2.4% regression that can be partly
1895 masked by careful scheduling of moves. */
1896 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1898 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1899 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1901 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1902 m_COREI7 | m_BDVER,
1904 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1905 m_BDVER,
1907 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1908 are resolved on SSE register parts instead of whole registers, so we may
1909 maintain just the lower part of scalar values in the proper format, leaving the
1910 upper part undefined. */
1911 m_ATHLON_K8,
1913 /* X86_TUNE_SSE_TYPELESS_STORES */
1914 m_AMD_MULTIPLE,
1916 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1917 m_PPRO | m_P4_NOCONA,
1919 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1920 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1922 /* X86_TUNE_PROLOGUE_USING_MOVE */
1923 m_PPRO | m_ATHLON_K8,
1925 /* X86_TUNE_EPILOGUE_USING_MOVE */
1926 m_PPRO | m_ATHLON_K8,
1928 /* X86_TUNE_SHIFT1 */
1929 ~m_486,
1931 /* X86_TUNE_USE_FFREEP */
1932 m_AMD_MULTIPLE,
1934 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC */
1935 ~(m_AMD_MULTIPLE | m_GENERIC),
1937 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC */
1938 ~m_ATHLON_K8,
1940 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1941 ~(m_AMDFAM10 | m_BDVER),
1943 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1944 than 4 branch instructions in the 16 byte window. */
1945 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1947 /* X86_TUNE_SCHEDULE */
1948 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1950 /* X86_TUNE_USE_BT */
1951 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1953 /* X86_TUNE_USE_INCDEC */
1954 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1956 /* X86_TUNE_PAD_RETURNS */
1957 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1959 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1960 m_ATOM,
1962 /* X86_TUNE_EXT_80387_CONSTANTS */
1963 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1965 /* X86_TUNE_AVOID_VECTOR_DECODE */
1966 m_CORE_ALL | m_K8 | m_GENERIC64,
1968 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1969 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1970 ~(m_386 | m_486),
1972 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1973 vector path on AMD machines. */
1974 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1976 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1977 machines. */
1978 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1980 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1981 than a MOV. */
1982 m_PENT,
1984 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1985 but one byte longer. */
1986 m_PENT,
1988 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
1989 operand that cannot be represented using a modRM byte. The XOR
1990 replacement is long decoded, so this split helps here as well. */
1991 m_K6,
1993 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1994 from FP to FP. */
1995 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1997 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1998 from integer to FP. */
1999 m_AMDFAM10,
2001 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2002 with a subsequent conditional jump instruction into a single
2003 compare-and-branch uop. */
2004 m_BDVER,
2006 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2007 will impact LEA instruction selection. */
2008 m_ATOM,
2010 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2011 instructions. */
2012 ~m_ATOM,
2014 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2015 at -O3. For the moment, the prefetching seems badly tuned for Intel
2016 chips. */
2017 m_K6_GEODE | m_AMD_MULTIPLE,
2019 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2020 the auto-vectorizer. */
2021 m_BDVER | m_BTVER2,
2023 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2024 during reassociation of integer computation. */
2025 m_ATOM,
2027 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2028 during reassociation of fp computation. */
2029 m_ATOM | m_HASWELL,
2031 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2032 regs instead of memory. */
2033 m_CORE_ALL,
2035 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2036 a conditional move. */
2037 m_ATOM
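/* Illustrative sketch (not part of the original file): ix86_tune_features is
   filled from the table above by testing each entry against the mask of the
   processor selected with -mtune, roughly as done later in
   ix86_option_override_internal.  */
#if 0
  ix86_tune_mask = 1u << ix86_tune;
  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
#endif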
2040 /* Feature tests against the various architecture variations. */
2041 unsigned char ix86_arch_features[X86_ARCH_LAST];
2043 /* Feature tests against the various architecture variations, used to create
2044 ix86_arch_features based on the processor mask. */
2045 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2046 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2047 ~(m_386 | m_486 | m_PENT | m_K6),
2049 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2050 ~m_386,
2052 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2053 ~(m_386 | m_486),
2055 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2056 ~m_386,
2058 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2059 ~m_386,
2062 static const unsigned int x86_accumulate_outgoing_args
2063 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2065 static const unsigned int x86_arch_always_fancy_math_387
2066 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2068 static const unsigned int x86_avx256_split_unaligned_load
2069 = m_COREI7 | m_GENERIC;
2071 static const unsigned int x86_avx256_split_unaligned_store
2072 = m_COREI7 | m_BDVER | m_GENERIC;
2074 /* In case the average insn count for a single function invocation is
2075 lower than this constant, emit fast (but longer) prologue and
2076 epilogue code. */
2077 #define FAST_PROLOGUE_INSN_COUNT 20
2079 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2080 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2081 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2082 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2084 /* Array of the smallest class containing reg number REGNO, indexed by
2085 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2087 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2089 /* ax, dx, cx, bx */
2090 AREG, DREG, CREG, BREG,
2091 /* si, di, bp, sp */
2092 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2093 /* FP registers */
2094 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2095 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2096 /* arg pointer */
2097 NON_Q_REGS,
2098 /* flags, fpsr, fpcr, frame */
2099 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2100 /* SSE registers */
2101 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2102 SSE_REGS, SSE_REGS,
2103 /* MMX registers */
2104 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2105 MMX_REGS, MMX_REGS,
2106 /* REX registers */
2107 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2108 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2109 /* SSE REX registers */
2110 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2111 SSE_REGS, SSE_REGS,
2114 /* The "default" register map used in 32bit mode. */
2116 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2118 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2119 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2120 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2121 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2122 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2123 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2124 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2127 /* The "default" register map used in 64bit mode. */
2129 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2131 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2132 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2133 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2134 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2135 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2136 8,9,10,11,12,13,14,15, /* extended integer registers */
2137 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2140 /* Define the register numbers to be used in Dwarf debugging information.
2141 The SVR4 reference port C compiler uses the following register numbers
2142 in its Dwarf output code:
2143 0 for %eax (gcc regno = 0)
2144 1 for %ecx (gcc regno = 2)
2145 2 for %edx (gcc regno = 1)
2146 3 for %ebx (gcc regno = 3)
2147 4 for %esp (gcc regno = 7)
2148 5 for %ebp (gcc regno = 6)
2149 6 for %esi (gcc regno = 4)
2150 7 for %edi (gcc regno = 5)
2151 The following three DWARF register numbers are never generated by
2152 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2153 believes these numbers have these meanings.
2154 8 for %eip (no gcc equivalent)
2155 9 for %eflags (gcc regno = 17)
2156 10 for %trapno (no gcc equivalent)
2157 It is not at all clear how we should number the FP stack registers
2158 for the x86 architecture. If the version of SDB on x86/svr4 were
2159 a bit less brain dead with respect to floating-point then we would
2160 have a precedent to follow with respect to DWARF register numbers
2161 for x86 FP registers, but the SDB on x86/svr4 is so completely
2162 broken with respect to FP registers that it is hardly worth thinking
2163 of it as something to strive for compatibility with.
2164 The version of x86/svr4 SDB I have at the moment does (partially)
2165 seem to believe that DWARF register number 11 is associated with
2166 the x86 register %st(0), but that's about all. Higher DWARF
2167 register numbers don't seem to be associated with anything in
2168 particular, and even for DWARF regno 11, SDB only seems to under-
2169 stand that it should say that a variable lives in %st(0) (when
2170 asked via an `=' command) if we said it was in DWARF regno 11,
2171 but SDB still prints garbage when asked for the value of the
2172 variable in question (via a `/' command).
2173 (Also note that the labels SDB prints for various FP stack regs
2174 when doing an `x' command are all wrong.)
2175 Note that these problems generally don't affect the native SVR4
2176 C compiler because it doesn't allow the use of -O with -g and
2177 because when it is *not* optimizing, it allocates a memory
2178 location for each floating-point variable, and the memory
2179 location is what gets described in the DWARF AT_location
2180 attribute for the variable in question.
2181 Regardless of the severe mental illness of the x86/svr4 SDB, we
2182 do something sensible here and we use the following DWARF
2183 register numbers. Note that these are all stack-top-relative
2184 numbers.
2185 11 for %st(0) (gcc regno = 8)
2186 12 for %st(1) (gcc regno = 9)
2187 13 for %st(2) (gcc regno = 10)
2188 14 for %st(3) (gcc regno = 11)
2189 15 for %st(4) (gcc regno = 12)
2190 16 for %st(5) (gcc regno = 13)
2191 17 for %st(6) (gcc regno = 14)
2192 18 for %st(7) (gcc regno = 15)
2194 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2196 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2197 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2198 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2199 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2200 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2201 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2202 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
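/* Illustrative note (not part of the original file): these maps are what the
   DBX_REGISTER_NUMBER macro in i386.h indexes to translate a gcc register
   number into the debugger/DWARF register number; which of the three tables
   applies depends on the target and configured debug format.  For example,
   svr4_dbx_register_map[6] == 5, matching the "%ebp -> 5" entry documented
   in the comment above.  A minimal sketch of the lookup:  */
#if 0
  int dwarf_regno = TARGET_64BIT ? dbx64_register_map[regno]
				 : dbx_register_map[regno];
#endif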
2205 /* Define parameter passing and return registers. */
2207 static int const x86_64_int_parameter_registers[6] =
2209 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2212 static int const x86_64_ms_abi_int_parameter_registers[4] =
2214 CX_REG, DX_REG, R8_REG, R9_REG
2217 static int const x86_64_int_return_registers[4] =
2219 AX_REG, DX_REG, DI_REG, SI_REG
2222 /* Additional registers that are clobbered by SYSV calls. */
2224 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2226 SI_REG, DI_REG,
2227 XMM6_REG, XMM7_REG,
2228 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2229 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2232 /* Define the structure for the machine field in struct function. */
2234 struct GTY(()) stack_local_entry {
2235 unsigned short mode;
2236 unsigned short n;
2237 rtx rtl;
2238 struct stack_local_entry *next;
2241 /* Structure describing stack frame layout.
2242 Stack grows downward:
2244 [arguments]
2245 <- ARG_POINTER
2246 saved pc
2248 saved static chain if ix86_static_chain_on_stack
2250 saved frame pointer if frame_pointer_needed
2251 <- HARD_FRAME_POINTER
2252 [saved regs]
2253 <- regs_save_offset
2254 [padding0]
2256 [saved SSE regs]
2257 <- sse_regs_save_offset
2258 [padding1] |
2259 | <- FRAME_POINTER
2260 [va_arg registers] |
2262 [frame] |
2264 [padding2] | = to_allocate
2265 <- STACK_POINTER
2267 struct ix86_frame
2269 int nsseregs;
2270 int nregs;
2271 int va_arg_size;
2272 int red_zone_size;
2273 int outgoing_arguments_size;
2275 /* The offsets relative to ARG_POINTER. */
2276 HOST_WIDE_INT frame_pointer_offset;
2277 HOST_WIDE_INT hard_frame_pointer_offset;
2278 HOST_WIDE_INT stack_pointer_offset;
2279 HOST_WIDE_INT hfp_save_offset;
2280 HOST_WIDE_INT reg_save_offset;
2281 HOST_WIDE_INT sse_reg_save_offset;
2283 /* When save_regs_using_mov is set, emit prologue using
2284 move instead of push instructions. */
2285 bool save_regs_using_mov;
2288 /* Which cpu are we scheduling for. */
2289 enum attr_cpu ix86_schedule;
2291 /* Which cpu are we optimizing for. */
2292 enum processor_type ix86_tune;
2294 /* Which instruction set architecture to use. */
2295 enum processor_type ix86_arch;
2297 /* True if processor has SSE prefetch instruction. */
2298 unsigned char x86_prefetch_sse;
2300 /* -mstackrealign option */
2301 static const char ix86_force_align_arg_pointer_string[]
2302 = "force_align_arg_pointer";
2304 static rtx (*ix86_gen_leave) (void);
2305 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2306 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2307 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2308 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2309 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2310 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2311 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2312 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2313 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2314 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2315 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
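/* Illustrative sketch (not part of the original file): these generator hooks
   are set once the word size is known, so the rest of the backend can emit
   the right pattern without checking TARGET_64BIT at every call site.  The
   pattern names below are only meant to show the idea; the actual assignments
   happen later during option override.  */
#if 0
  if (TARGET_64BIT)
    {
      ix86_gen_leave = gen_leave_rex64;
      ix86_gen_add3 = gen_adddi3;
      ix86_gen_sub3 = gen_subdi3;
    }
  else
    {
      ix86_gen_leave = gen_leave;
      ix86_gen_add3 = gen_addsi3;
      ix86_gen_sub3 = gen_subsi3;
    }
#endif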
2317 /* Preferred alignment for stack boundary in bits. */
2318 unsigned int ix86_preferred_stack_boundary;
2320 /* Alignment for incoming stack boundary in bits specified at
2321 command line. */
2322 static unsigned int ix86_user_incoming_stack_boundary;
2324 /* Default alignment for incoming stack boundary in bits. */
2325 static unsigned int ix86_default_incoming_stack_boundary;
2327 /* Alignment for incoming stack boundary in bits. */
2328 unsigned int ix86_incoming_stack_boundary;
2330 /* Calling abi specific va_list type nodes. */
2331 static GTY(()) tree sysv_va_list_type_node;
2332 static GTY(()) tree ms_va_list_type_node;
2334 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2335 char internal_label_prefix[16];
2336 int internal_label_prefix_len;
2338 /* Fence to use after loop using movnt. */
2339 tree x86_mfence;
2341 /* Register class used for passing a given 64bit part of the argument.
2342 These represent classes as documented by the psABI, with the exception
2343 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2344 uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2346 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2347 whenever possible (the upper half does contain padding). */
2348 enum x86_64_reg_class
2350 X86_64_NO_CLASS,
2351 X86_64_INTEGER_CLASS,
2352 X86_64_INTEGERSI_CLASS,
2353 X86_64_SSE_CLASS,
2354 X86_64_SSESF_CLASS,
2355 X86_64_SSEDF_CLASS,
2356 X86_64_SSEUP_CLASS,
2357 X86_64_X87_CLASS,
2358 X86_64_X87UP_CLASS,
2359 X86_64_COMPLEX_X87_CLASS,
2360 X86_64_MEMORY_CLASS
2363 #define MAX_CLASSES 4
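/* Illustrative sketch (not part of the original file): x86-64 argument
   passing classifies each eightbyte of an argument into one of the classes
   above, producing up to MAX_CLASSES entries per argument.  The helper
   classify_argument is defined later in this file; the sketch only shows how
   the classification result is consumed.  */
#if 0
  enum x86_64_reg_class classes[MAX_CLASSES];
  int n = classify_argument (mode, type, classes, 0);
  /* n == 0 means the argument is passed in memory; otherwise
     classes[0 .. n-1] describe which register file each 64-bit
     chunk of the argument is passed in.  */
#endif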
2365 /* Table of constants used by fldpi, fldln2, etc.... */
2366 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2367 static bool ext_80387_constants_init = 0;
2370 static struct machine_function * ix86_init_machine_status (void);
2371 static rtx ix86_function_value (const_tree, const_tree, bool);
2372 static bool ix86_function_value_regno_p (const unsigned int);
2373 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2374 const_tree);
2375 static rtx ix86_static_chain (const_tree, bool);
2376 static int ix86_function_regparm (const_tree, const_tree);
2377 static void ix86_compute_frame_layout (struct ix86_frame *);
2378 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2379 rtx, rtx, int);
2380 static void ix86_add_new_builtins (HOST_WIDE_INT);
2381 static tree ix86_canonical_va_list_type (tree);
2382 static void predict_jump (int);
2383 static unsigned int split_stack_prologue_scratch_regno (void);
2384 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2386 enum ix86_function_specific_strings
2388 IX86_FUNCTION_SPECIFIC_ARCH,
2389 IX86_FUNCTION_SPECIFIC_TUNE,
2390 IX86_FUNCTION_SPECIFIC_MAX
2393 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2394 const char *, enum fpmath_unit, bool);
2395 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2396 static void ix86_function_specific_save (struct cl_target_option *);
2397 static void ix86_function_specific_restore (struct cl_target_option *);
2398 static void ix86_function_specific_print (FILE *, int,
2399 struct cl_target_option *);
2400 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2401 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2402 struct gcc_options *);
2403 static bool ix86_can_inline_p (tree, tree);
2404 static void ix86_set_current_function (tree);
2405 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2407 static enum calling_abi ix86_function_abi (const_tree);
2410 #ifndef SUBTARGET32_DEFAULT_CPU
2411 #define SUBTARGET32_DEFAULT_CPU "i386"
2412 #endif
2414 /* Whether -mtune= or -march= were specified */
2415 static int ix86_tune_defaulted;
2416 static int ix86_arch_specified;
2418 /* Vectorization library interface and handlers. */
2419 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2421 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2422 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2424 /* Processor target table, indexed by processor number */
2425 struct ptt
2427 const struct processor_costs *cost; /* Processor costs */
2428 const int align_loop; /* Default alignments. */
2429 const int align_loop_max_skip;
2430 const int align_jump;
2431 const int align_jump_max_skip;
2432 const int align_func;
2435 static const struct ptt processor_target_table[PROCESSOR_max] =
2437 {&i386_cost, 4, 3, 4, 3, 4},
2438 {&i486_cost, 16, 15, 16, 15, 16},
2439 {&pentium_cost, 16, 7, 16, 7, 16},
2440 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2441 {&geode_cost, 0, 0, 0, 0, 0},
2442 {&k6_cost, 32, 7, 32, 7, 32},
2443 {&athlon_cost, 16, 7, 16, 7, 16},
2444 {&pentium4_cost, 0, 0, 0, 0, 0},
2445 {&k8_cost, 16, 7, 16, 7, 16},
2446 {&nocona_cost, 0, 0, 0, 0, 0},
2447 /* Core 2 */
2448 {&core_cost, 16, 10, 16, 10, 16},
2449 /* Core i7 */
2450 {&core_cost, 16, 10, 16, 10, 16},
2451 /* Core avx2 */
2452 {&core_cost, 16, 10, 16, 10, 16},
2453 {&generic32_cost, 16, 7, 16, 7, 16},
2454 {&generic64_cost, 16, 10, 16, 10, 16},
2455 {&amdfam10_cost, 32, 24, 32, 7, 32},
2456 {&bdver1_cost, 32, 24, 32, 7, 32},
2457 {&bdver2_cost, 32, 24, 32, 7, 32},
2458 {&bdver3_cost, 32, 24, 32, 7, 32},
2459 {&btver1_cost, 32, 24, 32, 7, 32},
2460 {&btver2_cost, 32, 24, 32, 7, 32},
2461 {&atom_cost, 16, 15, 16, 7, 16}
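/* Illustrative sketch (not part of the original file): the table above is
   indexed by the -mtune processor to pick default code alignments when the
   user did not specify -falign-* explicitly, roughly as done later during
   option override.  */
#if 0
  if (align_loops == 0)
    {
      align_loops = processor_target_table[ix86_tune].align_loop;
      align_loops_max_skip
	= processor_target_table[ix86_tune].align_loop_max_skip;
    }
#endif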
2464 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2466 "generic",
2467 "i386",
2468 "i486",
2469 "pentium",
2470 "pentium-mmx",
2471 "pentiumpro",
2472 "pentium2",
2473 "pentium3",
2474 "pentium4",
2475 "pentium-m",
2476 "prescott",
2477 "nocona",
2478 "core2",
2479 "corei7",
2480 "core-avx2",
2481 "atom",
2482 "geode",
2483 "k6",
2484 "k6-2",
2485 "k6-3",
2486 "athlon",
2487 "athlon-4",
2488 "k8",
2489 "amdfam10",
2490 "bdver1",
2491 "bdver2",
2492 "bdver3",
2493 "btver1",
2494 "btver2"
2497 static bool
2498 gate_insert_vzeroupper (void)
2500 return TARGET_VZEROUPPER;
2503 static unsigned int
2504 rest_of_handle_insert_vzeroupper (void)
2506 int i;
2508 /* vzeroupper instructions are inserted immediately after reload to
2509 account for possible spills from 256bit registers. The pass
2510 reuses the mode switching infrastructure by re-running the mode insertion
2511 pass, so disable entities that have already been processed. */
2512 for (i = 0; i < MAX_386_ENTITIES; i++)
2513 ix86_optimize_mode_switching[i] = 0;
2515 ix86_optimize_mode_switching[AVX_U128] = 1;
2517 /* Call optimize_mode_switching. */
2518 pass_mode_switching.pass.execute ();
2519 return 0;
2522 struct rtl_opt_pass pass_insert_vzeroupper =
2525 RTL_PASS,
2526 "vzeroupper", /* name */
2527 OPTGROUP_NONE, /* optinfo_flags */
2528 gate_insert_vzeroupper, /* gate */
2529 rest_of_handle_insert_vzeroupper, /* execute */
2530 NULL, /* sub */
2531 NULL, /* next */
2532 0, /* static_pass_number */
2533 TV_NONE, /* tv_id */
2534 0, /* properties_required */
2535 0, /* properties_provided */
2536 0, /* properties_destroyed */
2537 0, /* todo_flags_start */
2538 TODO_df_finish | TODO_verify_rtl_sharing |
2539 0, /* todo_flags_finish */
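/* Illustrative sketch (not part of the original file): an RTL pass like the
   one above is hooked into the pass pipeline with register_pass, placed right
   after reload so that 256-bit spill code is already in place.  The names
   below mirror how this is done later during option override; shown here only
   to make the pass wiring easier to follow.  */
#if 0
  struct register_pass_info insert_vzeroupper_info
    = { &pass_insert_vzeroupper.pass, "reload",
	1, PASS_POS_INSERT_AFTER };
  register_pass (&insert_vzeroupper_info);
#endif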
2543 /* Return true if a red-zone is in use. */
2545 static inline bool
2546 ix86_using_red_zone (void)
2548 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2551 /* Return a string that documents the current -m options. The caller is
2552 responsible for freeing the string. */
2554 static char *
2555 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2556 const char *tune, enum fpmath_unit fpmath,
2557 bool add_nl_p)
2559 struct ix86_target_opts
2561 const char *option; /* option string */
2562 HOST_WIDE_INT mask; /* isa mask options */
2565 /* This table is ordered so that options like -msse4.2 that imply
2566 preceding options are matched first. */
2567 static struct ix86_target_opts isa_opts[] =
2569 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2570 { "-mfma", OPTION_MASK_ISA_FMA },
2571 { "-mxop", OPTION_MASK_ISA_XOP },
2572 { "-mlwp", OPTION_MASK_ISA_LWP },
2573 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2574 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2575 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2576 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2577 { "-msse3", OPTION_MASK_ISA_SSE3 },
2578 { "-msse2", OPTION_MASK_ISA_SSE2 },
2579 { "-msse", OPTION_MASK_ISA_SSE },
2580 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2581 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2582 { "-mmmx", OPTION_MASK_ISA_MMX },
2583 { "-mabm", OPTION_MASK_ISA_ABM },
2584 { "-mbmi", OPTION_MASK_ISA_BMI },
2585 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2586 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2587 { "-mhle", OPTION_MASK_ISA_HLE },
2588 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2589 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2590 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2591 { "-madx", OPTION_MASK_ISA_ADX },
2592 { "-mtbm", OPTION_MASK_ISA_TBM },
2593 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2594 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2595 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2596 { "-maes", OPTION_MASK_ISA_AES },
2597 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2598 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2599 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2600 { "-mf16c", OPTION_MASK_ISA_F16C },
2601 { "-mrtm", OPTION_MASK_ISA_RTM },
2602 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2603 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2606 /* Flag options. */
2607 static struct ix86_target_opts flag_opts[] =
2609 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2610 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2611 { "-m80387", MASK_80387 },
2612 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2613 { "-malign-double", MASK_ALIGN_DOUBLE },
2614 { "-mcld", MASK_CLD },
2615 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2616 { "-mieee-fp", MASK_IEEE_FP },
2617 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2618 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2619 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2620 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2621 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2622 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2623 { "-mno-red-zone", MASK_NO_RED_ZONE },
2624 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2625 { "-mrecip", MASK_RECIP },
2626 { "-mrtd", MASK_RTD },
2627 { "-msseregparm", MASK_SSEREGPARM },
2628 { "-mstack-arg-probe", MASK_STACK_PROBE },
2629 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2630 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2631 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2632 { "-mvzeroupper", MASK_VZEROUPPER },
2633 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2634 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2635 { "-mprefer-avx128", MASK_PREFER_AVX128},
2638 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2640 char isa_other[40];
2641 char target_other[40];
2642 unsigned num = 0;
2643 unsigned i, j;
2644 char *ret;
2645 char *ptr;
2646 size_t len;
2647 size_t line_len;
2648 size_t sep_len;
2649 const char *abi;
2651 memset (opts, '\0', sizeof (opts));
2653 /* Add -march= option. */
2654 if (arch)
2656 opts[num][0] = "-march=";
2657 opts[num++][1] = arch;
2660 /* Add -mtune= option. */
2661 if (tune)
2663 opts[num][0] = "-mtune=";
2664 opts[num++][1] = tune;
2667 /* Add -m32/-m64/-mx32. */
2668 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2670 if ((isa & OPTION_MASK_ABI_64) != 0)
2671 abi = "-m64";
2672 else
2673 abi = "-mx32";
2674 isa &= ~ (OPTION_MASK_ISA_64BIT
2675 | OPTION_MASK_ABI_64
2676 | OPTION_MASK_ABI_X32);
2678 else
2679 abi = "-m32";
2680 opts[num++][0] = abi;
2682 /* Pick out the options in isa options. */
2683 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2685 if ((isa & isa_opts[i].mask) != 0)
2687 opts[num++][0] = isa_opts[i].option;
2688 isa &= ~ isa_opts[i].mask;
2692 if (isa && add_nl_p)
2694 opts[num++][0] = isa_other;
2695 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2696 isa);
2699 /* Add flag options. */
2700 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2702 if ((flags & flag_opts[i].mask) != 0)
2704 opts[num++][0] = flag_opts[i].option;
2705 flags &= ~ flag_opts[i].mask;
2709 if (flags && add_nl_p)
2711 opts[num++][0] = target_other;
2712 sprintf (target_other, "(other flags: %#x)", flags);
2715 /* Add -fpmath= option. */
2716 if (fpmath)
2718 opts[num][0] = "-mfpmath=";
2719 switch ((int) fpmath)
2721 case FPMATH_387:
2722 opts[num++][1] = "387";
2723 break;
2725 case FPMATH_SSE:
2726 opts[num++][1] = "sse";
2727 break;
2729 case FPMATH_387 | FPMATH_SSE:
2730 opts[num++][1] = "sse+387";
2731 break;
2733 default:
2734 gcc_unreachable ();
2738 /* Any options? */
2739 if (num == 0)
2740 return NULL;
2742 gcc_assert (num < ARRAY_SIZE (opts));
2744 /* Size the string. */
2745 len = 0;
2746 sep_len = (add_nl_p) ? 3 : 1;
2747 for (i = 0; i < num; i++)
2749 len += sep_len;
2750 for (j = 0; j < 2; j++)
2751 if (opts[i][j])
2752 len += strlen (opts[i][j]);
2755 /* Build the string. */
2756 ret = ptr = (char *) xmalloc (len);
2757 line_len = 0;
2759 for (i = 0; i < num; i++)
2761 size_t len2[2];
2763 for (j = 0; j < 2; j++)
2764 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2766 if (i != 0)
2768 *ptr++ = ' ';
2769 line_len++;
2771 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2773 *ptr++ = '\\';
2774 *ptr++ = '\n';
2775 line_len = 0;
2779 for (j = 0; j < 2; j++)
2780 if (opts[i][j])
2782 memcpy (ptr, opts[i][j], len2[j]);
2783 ptr += len2[j];
2784 line_len += len2[j];
2788 *ptr = '\0';
2789 gcc_assert (ret + len >= ptr);
2791 return ret;
2794 /* Return true if profiling code should be emitted before the
2795 prologue (i.e. when -mfentry is in use), and false otherwise.
2796 Note: x86 "hotfix"-style function patching relies on this. */
2797 static bool
2798 ix86_profile_before_prologue (void)
2800 return flag_fentry != 0;
2803 /* Function that is callable from the debugger to print the current
2804 options. */
2805 void
2806 ix86_debug_options (void)
2808 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2809 ix86_arch_string, ix86_tune_string,
2810 ix86_fpmath, true);
2812 if (opts)
2814 fprintf (stderr, "%s\n\n", opts);
2815 free (opts);
2817 else
2818 fputs ("<no options>\n\n", stderr);
2820 return;
2823 /* Override various settings based on options. If MAIN_ARGS_P, the
2824 options are from the command line, otherwise they are from
2825 attributes. */
2827 static void
2828 ix86_option_override_internal (bool main_args_p)
2830 int i;
2831 unsigned int ix86_arch_mask, ix86_tune_mask;
2832 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2833 const char *prefix;
2834 const char *suffix;
2835 const char *sw;
2837 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2838 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2839 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2840 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2841 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2842 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2843 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2844 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2845 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2846 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2847 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2848 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2849 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2850 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2851 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2852 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2853 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2854 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2855 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2856 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2857 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2858 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2859 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2860 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2861 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2862 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2863 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2864 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2865 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2866 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2867 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2868 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2869 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2870 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2871 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2872 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2873 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2874 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2875 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2876 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2878 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
2880 static struct pta
2882 const char *const name; /* processor name or nickname. */
2883 const enum processor_type processor;
2884 const enum attr_cpu schedule;
2885 const unsigned HOST_WIDE_INT flags;
2887 const processor_alias_table[] =
2889 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2890 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2891 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2892 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2893 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2894 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2895 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2896 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2897 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2898 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2899 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2900 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2901 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2902 PTA_MMX | PTA_SSE | PTA_FXSR},
2903 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2904 PTA_MMX | PTA_SSE | PTA_FXSR},
2905 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2906 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2907 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2908 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2909 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2910 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2911 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2912 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2913 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2914 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2915 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2916 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2917 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2918 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2919 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2920 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2921 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2922 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2923 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2924 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2925 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2926 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2927 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2928 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2929 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2930 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2931 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2932 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2933 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2934 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2935 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2936 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2937 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2938 | PTA_XSAVEOPT},
2939 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2940 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2941 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2942 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2943 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2944 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2945 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2946 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2947 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2948 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2949 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2950 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2951 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2952 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2953 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2954 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2955 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2956 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2957 {"x86-64", PROCESSOR_K8, CPU_K8,
2958 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2959 {"k8", PROCESSOR_K8, CPU_K8,
2960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2961 | PTA_SSE2 | PTA_NO_SAHF},
2962 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2964 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2965 {"opteron", PROCESSOR_K8, CPU_K8,
2966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2967 | PTA_SSE2 | PTA_NO_SAHF},
2968 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2970 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2971 {"athlon64", PROCESSOR_K8, CPU_K8,
2972 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2973 | PTA_SSE2 | PTA_NO_SAHF},
2974 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2975 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2976 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2977 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2978 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2979 | PTA_SSE2 | PTA_NO_SAHF},
2980 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2981 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2982 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2983 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2984 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2985 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2986 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2989 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2990 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2991 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2992 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2993 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2994 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2995 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2996 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2997 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2998 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2999 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3000 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3001 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3002 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3003 | PTA_XSAVEOPT},
3004 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3005 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3006 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3007 | PTA_FXSR | PTA_XSAVE},
3008 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3009 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3010 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3011 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3012 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3013 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3015 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3016 PTA_HLE /* flags are only used for -march switch. */ },
3017 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3018 PTA_64BIT
3019 | PTA_HLE /* flags are only used for -march switch. */ },
3022 /* -mrecip options. */
3023 static struct
3025 const char *string; /* option name */
3026 unsigned int mask; /* mask bits to set */
3028 const recip_options[] =
3030 { "all", RECIP_MASK_ALL },
3031 { "none", RECIP_MASK_NONE },
3032 { "div", RECIP_MASK_DIV },
3033 { "sqrt", RECIP_MASK_SQRT },
3034 { "vec-div", RECIP_MASK_VEC_DIV },
3035 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
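/* For example, -mrecip=all,!sqrt selects every estimate above and then
   clears the scalar sqrt bit again; the '!' inversion prefix and the
   "default" keyword are handled by the parsing loop further below.  */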
3038 int const pta_size = ARRAY_SIZE (processor_alias_table);
3040 /* Set up prefix/suffix so the error messages refer to either the command
3041 line argument, or the attribute(target). */
3042 if (main_args_p)
3044 prefix = "-m";
3045 suffix = "";
3046 sw = "switch";
3048 else
3050 prefix = "option(\"";
3051 suffix = "\")";
3052 sw = "attribute";
3055 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3056 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3057 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3058 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3059 #ifdef TARGET_BI_ARCH
3060 else
3062 #if TARGET_BI_ARCH == 1
3063 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3064 is on and OPTION_MASK_ABI_X32 is off. We turn off
3065 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3066 -mx32. */
3067 if (TARGET_X32)
3068 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3069 #else
3070 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3071 on and OPTION_MASK_ABI_64 is off. We turn off
3072 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3073 -m64. */
3074 if (TARGET_LP64)
3075 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3076 #endif
3078 #endif
3080 if (TARGET_X32)
3082 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3083 OPTION_MASK_ABI_64 for TARGET_X32. */
3084 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3085 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3087 else if (TARGET_LP64)
3089 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3090 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3091 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3092 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3095 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3096 SUBTARGET_OVERRIDE_OPTIONS;
3097 #endif
3099 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3100 SUBSUBTARGET_OVERRIDE_OPTIONS;
3101 #endif
3103 /* -fPIC is the default for x86_64. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers, we read -mtune=native
3113 as -mtune=generic.  With native compilers we won't see
3114 -mtune=native, as it has already been rewritten by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3136 else
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3171 if (global_options_set.x_ix86_pmode)
3173 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3174 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3175 error ("address mode %qs not supported in the %s bit mode",
3176 TARGET_64BIT ? "short" : "long",
3177 TARGET_64BIT ? "64" : "32");
3179 else
3180 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3182 if (!global_options_set.x_ix86_abi)
3183 ix86_abi = DEFAULT_ABI;
3185 if (global_options_set.x_ix86_cmodel)
3187 switch (ix86_cmodel)
3189 case CM_SMALL:
3190 case CM_SMALL_PIC:
3191 if (flag_pic)
3192 ix86_cmodel = CM_SMALL_PIC;
3193 if (!TARGET_64BIT)
3194 error ("code model %qs not supported in the %s bit mode",
3195 "small", "32");
3196 break;
3198 case CM_MEDIUM:
3199 case CM_MEDIUM_PIC:
3200 if (flag_pic)
3201 ix86_cmodel = CM_MEDIUM_PIC;
3202 if (!TARGET_64BIT)
3203 error ("code model %qs not supported in the %s bit mode",
3204 "medium", "32");
3205 else if (TARGET_X32)
3206 error ("code model %qs not supported in x32 mode",
3207 "medium");
3208 break;
3210 case CM_LARGE:
3211 case CM_LARGE_PIC:
3212 if (flag_pic)
3213 ix86_cmodel = CM_LARGE_PIC;
3214 if (!TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "large", "32");
3217 else if (TARGET_X32)
3218 error ("code model %qs not supported in x32 mode",
3219 "large");
3220 break;
3222 case CM_32:
3223 if (flag_pic)
3224 error ("code model %s does not support PIC mode", "32");
3225 if (TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "32", "64");
3228 break;
3230 case CM_KERNEL:
3231 if (flag_pic)
3233 error ("code model %s does not support PIC mode", "kernel");
3234 ix86_cmodel = CM_32;
3236 if (!TARGET_64BIT)
3237 error ("code model %qs not supported in the %s bit mode",
3238 "kernel", "32");
3239 break;
3241 default:
3242 gcc_unreachable ();
3245 else
3247 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3248 use of rip-relative addressing. This eliminates fixups that
3249 would otherwise be needed if this object is to be placed in a
3250 DLL, and is essentially just as efficient as direct addressing. */
3251 if (TARGET_64BIT && (TARGET_RDOS || DEFAULT_ABI == MS_ABI))
3252 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3253 else if (TARGET_64BIT)
3254 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3255 else
3256 ix86_cmodel = CM_32;
3258 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3260 error ("-masm=intel not supported in this configuration");
3261 ix86_asm_dialect = ASM_ATT;
3263 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3264 sorry ("%i-bit mode not compiled in",
3265 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3267 for (i = 0; i < pta_size; i++)
3268 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3270 ix86_schedule = processor_alias_table[i].schedule;
3271 ix86_arch = processor_alias_table[i].processor;
3272 /* Default cpu tuning to the architecture. */
3273 ix86_tune = ix86_arch;
3275 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3276 error ("CPU you selected does not support x86-64 "
3277 "instruction set");
3279 if (processor_alias_table[i].flags & PTA_MMX
3280 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3281 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3282 if (processor_alias_table[i].flags & PTA_3DNOW
3283 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3284 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3285 if (processor_alias_table[i].flags & PTA_3DNOW_A
3286 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3287 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3288 if (processor_alias_table[i].flags & PTA_SSE
3289 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3290 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3291 if (processor_alias_table[i].flags & PTA_SSE2
3292 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3293 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3294 if (processor_alias_table[i].flags & PTA_SSE3
3295 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3296 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3297 if (processor_alias_table[i].flags & PTA_SSSE3
3298 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3299 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3300 if (processor_alias_table[i].flags & PTA_SSE4_1
3301 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3302 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3303 if (processor_alias_table[i].flags & PTA_SSE4_2
3304 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3305 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3306 if (processor_alias_table[i].flags & PTA_AVX
3307 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3308 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3309 if (processor_alias_table[i].flags & PTA_AVX2
3310 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3311 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3312 if (processor_alias_table[i].flags & PTA_FMA
3313 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3314 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3315 if (processor_alias_table[i].flags & PTA_SSE4A
3316 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3317 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3318 if (processor_alias_table[i].flags & PTA_FMA4
3319 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3320 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3321 if (processor_alias_table[i].flags & PTA_XOP
3322 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3323 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3324 if (processor_alias_table[i].flags & PTA_LWP
3325 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3326 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3327 if (processor_alias_table[i].flags & PTA_ABM
3328 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3329 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3330 if (processor_alias_table[i].flags & PTA_BMI
3331 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3332 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3333 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3334 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3335 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3336 if (processor_alias_table[i].flags & PTA_TBM
3337 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3338 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3339 if (processor_alias_table[i].flags & PTA_BMI2
3340 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3341 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3342 if (processor_alias_table[i].flags & PTA_CX16
3343 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3344 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3345 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3346 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3347 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3348 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3349 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3350 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3351 if (processor_alias_table[i].flags & PTA_MOVBE
3352 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3353 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3354 if (processor_alias_table[i].flags & PTA_AES
3355 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3356 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3357 if (processor_alias_table[i].flags & PTA_PCLMUL
3358 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3359 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3360 if (processor_alias_table[i].flags & PTA_FSGSBASE
3361 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3362 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3363 if (processor_alias_table[i].flags & PTA_RDRND
3364 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3365 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3366 if (processor_alias_table[i].flags & PTA_F16C
3367 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3368 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3369 if (processor_alias_table[i].flags & PTA_RTM
3370 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3371 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3372 if (processor_alias_table[i].flags & PTA_HLE
3373 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3374 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3375 if (processor_alias_table[i].flags & PTA_PRFCHW
3376 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3377 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3378 if (processor_alias_table[i].flags & PTA_RDSEED
3379 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3380 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3381 if (processor_alias_table[i].flags & PTA_ADX
3382 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3383 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3384 if (processor_alias_table[i].flags & PTA_FXSR
3385 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3386 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3387 if (processor_alias_table[i].flags & PTA_XSAVE
3388 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3389 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3390 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3391 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3392 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3393 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3394 x86_prefetch_sse = true;
3396 break;
3399 if (!strcmp (ix86_arch_string, "generic"))
3400 error ("generic CPU can be used only for %stune=%s %s",
3401 prefix, suffix, sw);
3402 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3403 error ("bad value (%s) for %sarch=%s %s",
3404 ix86_arch_string, prefix, suffix, sw);
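/* The loop above thus makes e.g. -march=bdver2 imply -mavx, -mfma and
   -mxop via its PTA_* flags, while anything the user disabled explicitly
   (say -mno-avx) stays off because of the ix86_isa_flags_explicit checks.  */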
3406 ix86_arch_mask = 1u << ix86_arch;
3407 for (i = 0; i < X86_ARCH_LAST; ++i)
3408 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3410 for (i = 0; i < pta_size; i++)
3411 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3413 ix86_schedule = processor_alias_table[i].schedule;
3414 ix86_tune = processor_alias_table[i].processor;
3415 if (TARGET_64BIT)
3417 if (!(processor_alias_table[i].flags & PTA_64BIT))
3419 if (ix86_tune_defaulted)
3421 ix86_tune_string = "x86-64";
3422 for (i = 0; i < pta_size; i++)
3423 if (! strcmp (ix86_tune_string,
3424 processor_alias_table[i].name))
3425 break;
3426 ix86_schedule = processor_alias_table[i].schedule;
3427 ix86_tune = processor_alias_table[i].processor;
3429 else
3430 error ("CPU you selected does not support x86-64 "
3431 "instruction set");
3434 else
3436 /* Adjust tuning when compiling for 32-bit ABI. */
3437 switch (ix86_tune)
3439 case PROCESSOR_GENERIC64:
3440 ix86_tune = PROCESSOR_GENERIC32;
3441 ix86_schedule = CPU_PENTIUMPRO;
3442 break;
3444 default:
3445 break;
3448 /* Intel CPUs have always interpreted SSE prefetch instructions as
3449 NOPs; so, we can enable SSE prefetch instructions even when
3450 -mtune (rather than -march) points us to a processor that has them.
3451 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3452 higher processors. */
3453 if (TARGET_CMOV
3454 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3455 x86_prefetch_sse = true;
3456 break;
3459 if (ix86_tune_specified && i == pta_size)
3460 error ("bad value (%s) for %stune=%s %s",
3461 ix86_tune_string, prefix, suffix, sw);
3463 ix86_tune_mask = 1u << ix86_tune;
3464 for (i = 0; i < X86_TUNE_LAST; ++i)
3465 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3467 #ifndef USE_IX86_FRAME_POINTER
3468 #define USE_IX86_FRAME_POINTER 0
3469 #endif
3471 #ifndef USE_X86_64_FRAME_POINTER
3472 #define USE_X86_64_FRAME_POINTER 0
3473 #endif
3475 /* Set the default values for switches whose default depends on TARGET_64BIT
3476 in case they weren't overwritten by command line options. */
3477 if (TARGET_64BIT)
3479 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3480 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3481 if (flag_asynchronous_unwind_tables == 2)
3482 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3483 if (flag_pcc_struct_return == 2)
3484 flag_pcc_struct_return = 0;
3486 else
3488 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3489 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3490 if (flag_asynchronous_unwind_tables == 2)
3491 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3492 if (flag_pcc_struct_return == 2)
3493 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3496 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3497 if (optimize_size)
3498 ix86_cost = &ix86_size_cost;
3499 else
3500 ix86_cost = ix86_tune_cost;
3502 /* Arrange to set up i386_stack_locals for all functions. */
3503 init_machine_status = ix86_init_machine_status;
3505 /* Validate -mregparm= value. */
3506 if (global_options_set.x_ix86_regparm)
3508 if (TARGET_64BIT)
3509 warning (0, "-mregparm is ignored in 64-bit mode");
3510 if (ix86_regparm > REGPARM_MAX)
3512 error ("-mregparm=%d is not between 0 and %d",
3513 ix86_regparm, REGPARM_MAX);
3514 ix86_regparm = 0;
3517 if (TARGET_64BIT)
3518 ix86_regparm = REGPARM_MAX;
3520 /* Default align_* from the processor table. */
3521 if (align_loops == 0)
3523 align_loops = processor_target_table[ix86_tune].align_loop;
3524 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3526 if (align_jumps == 0)
3528 align_jumps = processor_target_table[ix86_tune].align_jump;
3529 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3531 if (align_functions == 0)
3533 align_functions = processor_target_table[ix86_tune].align_func;
3536 /* Provide default for -mbranch-cost= value. */
3537 if (!global_options_set.x_ix86_branch_cost)
3538 ix86_branch_cost = ix86_cost->branch_cost;
3540 if (TARGET_64BIT)
3542 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3544 /* Enable by default the SSE and MMX builtins. Do allow the user to
3545 explicitly disable any of these. In particular, disabling SSE and
3546 MMX for kernel code is extremely useful. */
3547 if (!ix86_arch_specified)
3548 ix86_isa_flags
3549 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3550 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3552 if (TARGET_RTD)
3553 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3555 else
3557 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3559 if (!ix86_arch_specified)
3560 ix86_isa_flags
3561 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3563 /* The i386 ABI does not specify a red zone.  It still makes sense to use
3564 it when the programmer takes care to keep the stack from being destroyed. */
3565 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3566 target_flags |= MASK_NO_RED_ZONE;
3569 /* Keep nonleaf frame pointers. */
3570 if (flag_omit_frame_pointer)
3571 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3572 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3573 flag_omit_frame_pointer = 1;
3575 /* If we're doing fast math, we don't care about comparison order
3576 wrt NaNs. This lets us use a shorter comparison sequence. */
3577 if (flag_finite_math_only)
3578 target_flags &= ~MASK_IEEE_FP;
3580 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3581 since the insns won't need emulation. */
3582 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3583 target_flags &= ~MASK_NO_FANCY_MATH_387;
3585 /* Likewise, if the target doesn't have a 387, or we've specified
3586 software floating point, don't use 387 inline intrinsics. */
3587 if (!TARGET_80387)
3588 target_flags |= MASK_NO_FANCY_MATH_387;
3590 /* Turn on MMX builtins for -msse. */
3591 if (TARGET_SSE)
3592 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3594 /* Enable SSE prefetch. */
3595 if (TARGET_SSE || TARGET_PRFCHW)
3596 x86_prefetch_sse = true;
3598 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3599 if (TARGET_SSE4_2 || TARGET_ABM)
3600 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3602 /* Turn on lzcnt instruction for -mabm. */
3603 if (TARGET_ABM)
3604 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3606 /* Validate -mpreferred-stack-boundary= value or default it to
3607 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3608 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3609 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3611 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3612 int max = (TARGET_SEH ? 4 : 12);
3614 if (ix86_preferred_stack_boundary_arg < min
3615 || ix86_preferred_stack_boundary_arg > max)
3617 if (min == max)
3618 error ("-mpreferred-stack-boundary is not supported "
3619 "for this target");
3620 else
3621 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3622 ix86_preferred_stack_boundary_arg, min, max);
3624 else
3625 ix86_preferred_stack_boundary
3626 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
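/* So e.g. -mpreferred-stack-boundary=4 requests (1 << 4) * 8 = 128 bits,
   i.e. 16-byte stack alignment.  */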
3629 /* Set the default value for -mstackrealign. */
3630 if (ix86_force_align_arg_pointer == -1)
3631 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3633 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3635 /* Validate -mincoming-stack-boundary= value or default it to
3636 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3637 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3638 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3640 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3641 || ix86_incoming_stack_boundary_arg > 12)
3642 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3643 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3644 else
3646 ix86_user_incoming_stack_boundary
3647 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3648 ix86_incoming_stack_boundary
3649 = ix86_user_incoming_stack_boundary;
3653 /* Accept -msseregparm only if at least SSE support is enabled. */
3654 if (TARGET_SSEREGPARM
3655 && ! TARGET_SSE)
3656 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3658 if (global_options_set.x_ix86_fpmath)
3660 if (ix86_fpmath & FPMATH_SSE)
3662 if (!TARGET_SSE)
3664 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3665 ix86_fpmath = FPMATH_387;
3667 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3669 warning (0, "387 instruction set disabled, using SSE arithmetics");
3670 ix86_fpmath = FPMATH_SSE;
3674 else
3675 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3677 /* If the i387 is disabled, then do not return values in it. */
3678 if (!TARGET_80387)
3679 target_flags &= ~MASK_FLOAT_RETURNS;
3681 /* Use external vectorized library in vectorizing intrinsics. */
3682 if (global_options_set.x_ix86_veclibabi_type)
3683 switch (ix86_veclibabi_type)
3685 case ix86_veclibabi_type_svml:
3686 ix86_veclib_handler = ix86_veclibabi_svml;
3687 break;
3689 case ix86_veclibabi_type_acml:
3690 ix86_veclib_handler = ix86_veclibabi_acml;
3691 break;
3693 default:
3694 gcc_unreachable ();
3697 if ((!USE_IX86_FRAME_POINTER
3698 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3699 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3700 && !optimize_size)
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3703 /* ??? Unwind info is not correct around the CFG unless either a frame
3704 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3705 unwind info generation to be aware of the CFG and propagating states
3706 around edges. */
3707 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3708 || flag_exceptions || flag_non_call_exceptions)
3709 && flag_omit_frame_pointer
3710 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3712 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3713 warning (0, "unwind tables currently require either a frame pointer "
3714 "or %saccumulate-outgoing-args%s for correctness",
3715 prefix, suffix);
3716 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3719 /* If stack probes are required, the space used for large function
3720 arguments on the stack must also be probed, so enable
3721 -maccumulate-outgoing-args so this happens in the prologue. */
3722 if (TARGET_STACK_PROBE
3723 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3725 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3726 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3727 "for correctness", prefix, suffix);
3728 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3731 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3733 char *p;
3734 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3735 p = strchr (internal_label_prefix, 'X');
3736 internal_label_prefix_len = p - internal_label_prefix;
3737 *p = '\0';
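/* On typical ELF configurations ASM_GENERATE_INTERNAL_LABEL produces
   something like "*.LX0" here, so the prefix recorded above is "*.L".  */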
3740 /* When the scheduling description is not available, disable the scheduler
3741 pass so it won't slow down compilation or make x87 code slower. */
3742 if (!TARGET_SCHEDULE)
3743 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3745 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3746 ix86_tune_cost->simultaneous_prefetches,
3747 global_options.x_param_values,
3748 global_options_set.x_param_values);
3749 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3750 ix86_tune_cost->prefetch_block,
3751 global_options.x_param_values,
3752 global_options_set.x_param_values);
3753 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3754 ix86_tune_cost->l1_cache_size,
3755 global_options.x_param_values,
3756 global_options_set.x_param_values);
3757 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3758 ix86_tune_cost->l2_cache_size,
3759 global_options.x_param_values,
3760 global_options_set.x_param_values);
3762 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3763 if (flag_prefetch_loop_arrays < 0
3764 && HAVE_prefetch
3765 && (optimize >= 3 || flag_profile_use)
3766 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3767 flag_prefetch_loop_arrays = 1;
3769 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3770 can be optimized to ap = __builtin_next_arg (0). */
3771 if (!TARGET_64BIT && !flag_split_stack)
3772 targetm.expand_builtin_va_start = NULL;
3774 if (TARGET_64BIT)
3776 ix86_gen_leave = gen_leave_rex64;
3777 if (Pmode == DImode)
3779 ix86_gen_monitor = gen_sse3_monitor64_di;
3780 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3781 ix86_gen_tls_local_dynamic_base_64
3782 = gen_tls_local_dynamic_base_64_di;
3784 else
3786 ix86_gen_monitor = gen_sse3_monitor64_si;
3787 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3788 ix86_gen_tls_local_dynamic_base_64
3789 = gen_tls_local_dynamic_base_64_si;
3792 else
3794 ix86_gen_leave = gen_leave;
3795 ix86_gen_monitor = gen_sse3_monitor;
3798 if (Pmode == DImode)
3800 ix86_gen_add3 = gen_adddi3;
3801 ix86_gen_sub3 = gen_subdi3;
3802 ix86_gen_sub3_carry = gen_subdi3_carry;
3803 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3804 ix86_gen_andsp = gen_anddi3;
3805 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3806 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3807 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3809 else
3811 ix86_gen_add3 = gen_addsi3;
3812 ix86_gen_sub3 = gen_subsi3;
3813 ix86_gen_sub3_carry = gen_subsi3_carry;
3814 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3815 ix86_gen_andsp = gen_andsi3;
3816 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3817 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3818 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3821 #ifdef USE_IX86_CLD
3822 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3823 if (!TARGET_64BIT)
3824 target_flags |= MASK_CLD & ~target_flags_explicit;
3825 #endif
3827 if (!TARGET_64BIT && flag_pic)
3829 if (flag_fentry > 0)
3830 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3831 "with -fpic");
3832 flag_fentry = 0;
3834 else if (TARGET_SEH)
3836 if (flag_fentry == 0)
3837 sorry ("-mno-fentry isn%'t compatible with SEH");
3838 flag_fentry = 1;
3840 else if (flag_fentry < 0)
3842 #if defined(PROFILE_BEFORE_PROLOGUE)
3843 flag_fentry = 1;
3844 #else
3845 flag_fentry = 0;
3846 #endif
3849 if (TARGET_AVX)
3851 /* When not optimizing for size, enable the vzeroupper optimization for
3852 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3853 AVX unaligned loads/stores. */
3854 if (!optimize_size)
3856 if (flag_expensive_optimizations
3857 && !(target_flags_explicit & MASK_VZEROUPPER))
3858 target_flags |= MASK_VZEROUPPER;
3859 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3860 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3861 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3862 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3863 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3864 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3865 /* Enable 128-bit AVX instruction generation
3866 for the auto-vectorizer. */
3867 if (TARGET_AVX128_OPTIMAL
3868 && !(target_flags_explicit & MASK_PREFER_AVX128))
3869 target_flags |= MASK_PREFER_AVX128;
3872 else
3874 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3875 target_flags &= ~MASK_VZEROUPPER;
3878 if (ix86_recip_name)
3880 char *p = ASTRDUP (ix86_recip_name);
3881 char *q;
3882 unsigned int mask, i;
3883 bool invert;
3885 while ((q = strtok (p, ",")) != NULL)
3887 p = NULL;
3888 if (*q == '!')
3890 invert = true;
3891 q++;
3893 else
3894 invert = false;
3896 if (!strcmp (q, "default"))
3897 mask = RECIP_MASK_ALL;
3898 else
3900 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3901 if (!strcmp (q, recip_options[i].string))
3903 mask = recip_options[i].mask;
3904 break;
3907 if (i == ARRAY_SIZE (recip_options))
3909 error ("unknown option for -mrecip=%s", q);
3910 invert = false;
3911 mask = RECIP_MASK_NONE;
3915 recip_mask_explicit |= mask;
3916 if (invert)
3917 recip_mask &= ~mask;
3918 else
3919 recip_mask |= mask;
3923 if (TARGET_RECIP)
3924 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3925 else if (target_flags_explicit & MASK_RECIP)
3926 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3928 /* Default long double to 64-bit for Bionic. */
3929 if (TARGET_HAS_BIONIC
3930 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3931 target_flags |= MASK_LONG_DOUBLE_64;
3933 /* Save the initial options in case the user does function specific
3934 options. */
3935 if (main_args_p)
3936 target_option_default_node = target_option_current_node
3937 = build_target_option_node ();
3939 /* Handle stack protector */
3940 if (!global_options_set.x_ix86_stack_protector_guard)
3941 ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
3944 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3946 static void
3947 ix86_option_override (void)
3949 static struct register_pass_info insert_vzeroupper_info
3950 = { &pass_insert_vzeroupper.pass, "reload",
3951 1, PASS_POS_INSERT_AFTER
3954 ix86_option_override_internal (true);
3957 /* This needs to be done at start up. It's convenient to do it here. */
3958 register_pass (&insert_vzeroupper_info);
3961 /* Update register usage after having seen the compiler flags. */
3963 static void
3964 ix86_conditional_register_usage (void)
3966 int i, c_mask;
3967 unsigned int j;
3969 /* The PIC register, if it exists, is fixed. */
3970 j = PIC_OFFSET_TABLE_REGNUM;
3971 if (j != INVALID_REGNUM)
3972 fixed_regs[j] = call_used_regs[j] = 1;
3974 /* For 32-bit targets, squash the REX registers. */
3975 if (! TARGET_64BIT)
3977 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3978 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3979 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3984 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3985 : TARGET_64BIT ? (1 << 2)
3986 : (1 << 1));
3988 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3990 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3992 /* Set/reset conditionally defined registers from
3993 CALL_USED_REGISTERS initializer. */
3994 if (call_used_regs[i] > 1)
3995 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3997 /* Collect the call-used registers of the GENERAL_REGS register set
3998 into the CLOBBERED_REGS register set. */
3999 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4000 && call_used_regs[i])
4001 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4004 /* If MMX is disabled, squash the registers. */
4005 if (! TARGET_MMX)
4006 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4007 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4008 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4010 /* If SSE is disabled, squash the registers. */
4011 if (! TARGET_SSE)
4012 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4013 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4014 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4016 /* If the FPU is disabled, squash the registers. */
4017 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4018 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4019 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4020 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4024 /* Save the current options */
4026 static void
4027 ix86_function_specific_save (struct cl_target_option *ptr)
4029 ptr->arch = ix86_arch;
4030 ptr->schedule = ix86_schedule;
4031 ptr->tune = ix86_tune;
4032 ptr->branch_cost = ix86_branch_cost;
4033 ptr->tune_defaulted = ix86_tune_defaulted;
4034 ptr->arch_specified = ix86_arch_specified;
4035 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4036 ptr->ix86_target_flags_explicit = target_flags_explicit;
4037 ptr->x_recip_mask_explicit = recip_mask_explicit;
4039 /* The fields are char but the variables are not; make sure the
4040 values fit in the fields. */
4041 gcc_assert (ptr->arch == ix86_arch);
4042 gcc_assert (ptr->schedule == ix86_schedule);
4043 gcc_assert (ptr->tune == ix86_tune);
4044 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4047 /* Restore the current options */
4049 static void
4050 ix86_function_specific_restore (struct cl_target_option *ptr)
4052 enum processor_type old_tune = ix86_tune;
4053 enum processor_type old_arch = ix86_arch;
4054 unsigned int ix86_arch_mask, ix86_tune_mask;
4055 int i;
4057 ix86_arch = (enum processor_type) ptr->arch;
4058 ix86_schedule = (enum attr_cpu) ptr->schedule;
4059 ix86_tune = (enum processor_type) ptr->tune;
4060 ix86_branch_cost = ptr->branch_cost;
4061 ix86_tune_defaulted = ptr->tune_defaulted;
4062 ix86_arch_specified = ptr->arch_specified;
4063 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4064 target_flags_explicit = ptr->ix86_target_flags_explicit;
4065 recip_mask_explicit = ptr->x_recip_mask_explicit;
4067 /* Recreate the arch feature tests if the arch changed */
4068 if (old_arch != ix86_arch)
4070 ix86_arch_mask = 1u << ix86_arch;
4071 for (i = 0; i < X86_ARCH_LAST; ++i)
4072 ix86_arch_features[i]
4073 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4076 /* Recreate the tune optimization tests */
4077 if (old_tune != ix86_tune)
4079 ix86_tune_mask = 1u << ix86_tune;
4080 for (i = 0; i < X86_TUNE_LAST; ++i)
4081 ix86_tune_features[i]
4082 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4086 /* Print the current options */
4088 static void
4089 ix86_function_specific_print (FILE *file, int indent,
4090 struct cl_target_option *ptr)
4092 char *target_string
4093 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4094 NULL, NULL, ptr->x_ix86_fpmath, false);
4096 fprintf (file, "%*sarch = %d (%s)\n",
4097 indent, "",
4098 ptr->arch,
4099 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4100 ? cpu_names[ptr->arch]
4101 : "<unknown>"));
4103 fprintf (file, "%*stune = %d (%s)\n",
4104 indent, "",
4105 ptr->tune,
4106 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4107 ? cpu_names[ptr->tune]
4108 : "<unknown>"));
4110 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4112 if (target_string)
4114 fprintf (file, "%*s%s\n", indent, "", target_string);
4115 free (target_string);
4120 /* Inner function to process the attribute((target(...))): take one argument
4121 and set the current options from it.  If we are given a list, recursively
4122 process each element of the list. */
4124 static bool
4125 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4126 struct gcc_options *enum_opts_set)
4128 char *next_optstr;
4129 bool ret = true;
4131 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4132 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4133 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4134 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4135 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4137 enum ix86_opt_type
4139 ix86_opt_unknown,
4140 ix86_opt_yes,
4141 ix86_opt_no,
4142 ix86_opt_str,
4143 ix86_opt_enum,
4144 ix86_opt_isa
4147 static const struct
4149 const char *string;
4150 size_t len;
4151 enum ix86_opt_type type;
4152 int opt;
4153 int mask;
4154 } attrs[] = {
4155 /* isa options */
4156 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4157 IX86_ATTR_ISA ("abm", OPT_mabm),
4158 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4159 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4160 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4161 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4162 IX86_ATTR_ISA ("aes", OPT_maes),
4163 IX86_ATTR_ISA ("avx", OPT_mavx),
4164 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4165 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4166 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4167 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4168 IX86_ATTR_ISA ("sse", OPT_msse),
4169 IX86_ATTR_ISA ("sse2", OPT_msse2),
4170 IX86_ATTR_ISA ("sse3", OPT_msse3),
4171 IX86_ATTR_ISA ("sse4", OPT_msse4),
4172 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4173 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4174 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4175 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4176 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4177 IX86_ATTR_ISA ("fma", OPT_mfma),
4178 IX86_ATTR_ISA ("xop", OPT_mxop),
4179 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4180 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4181 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4182 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4183 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4184 IX86_ATTR_ISA ("hle", OPT_mhle),
4185 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4186 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4187 IX86_ATTR_ISA ("adx", OPT_madx),
4188 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4189 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4190 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4192 /* enum options */
4193 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4195 /* string options */
4196 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4197 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4199 /* flag options */
4200 IX86_ATTR_YES ("cld",
4201 OPT_mcld,
4202 MASK_CLD),
4204 IX86_ATTR_NO ("fancy-math-387",
4205 OPT_mfancy_math_387,
4206 MASK_NO_FANCY_MATH_387),
4208 IX86_ATTR_YES ("ieee-fp",
4209 OPT_mieee_fp,
4210 MASK_IEEE_FP),
4212 IX86_ATTR_YES ("inline-all-stringops",
4213 OPT_minline_all_stringops,
4214 MASK_INLINE_ALL_STRINGOPS),
4216 IX86_ATTR_YES ("inline-stringops-dynamically",
4217 OPT_minline_stringops_dynamically,
4218 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4220 IX86_ATTR_NO ("align-stringops",
4221 OPT_mno_align_stringops,
4222 MASK_NO_ALIGN_STRINGOPS),
4224 IX86_ATTR_YES ("recip",
4225 OPT_mrecip,
4226 MASK_RECIP),
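/* As an illustration, attribute((target("sse4.2,no-avx,arch=bdver2")))
   walks this table three times: "sse4.2" and "no-avx" are ISA options
   (the "no-" prefix is stripped below and inverts the setting), while
   "arch=" is a string option whose value feeds the arch override later.  */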
4230 /* If this is a list, recurse to get the options. */
4231 if (TREE_CODE (args) == TREE_LIST)
4233 bool ret = true;
4235 for (; args; args = TREE_CHAIN (args))
4236 if (TREE_VALUE (args)
4237 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4238 p_strings, enum_opts_set))
4239 ret = false;
4241 return ret;
4244 else if (TREE_CODE (args) != STRING_CST)
4246 error ("attribute %<target%> argument not a string");
4247 return false;
4250 /* Handle multiple arguments separated by commas. */
4251 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4253 while (next_optstr && *next_optstr != '\0')
4255 char *p = next_optstr;
4256 char *orig_p = p;
4257 char *comma = strchr (next_optstr, ',');
4258 const char *opt_string;
4259 size_t len, opt_len;
4260 int opt;
4261 bool opt_set_p;
4262 char ch;
4263 unsigned i;
4264 enum ix86_opt_type type = ix86_opt_unknown;
4265 int mask = 0;
4267 if (comma)
4269 *comma = '\0';
4270 len = comma - next_optstr;
4271 next_optstr = comma + 1;
4273 else
4275 len = strlen (p);
4276 next_optstr = NULL;
4279 /* Recognize no-xxx. */
4280 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4282 opt_set_p = false;
4283 p += 3;
4284 len -= 3;
4286 else
4287 opt_set_p = true;
4289 /* Find the option. */
4290 ch = *p;
4291 opt = N_OPTS;
4292 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4294 type = attrs[i].type;
4295 opt_len = attrs[i].len;
4296 if (ch == attrs[i].string[0]
4297 && ((type != ix86_opt_str && type != ix86_opt_enum)
4298 ? len == opt_len
4299 : len > opt_len)
4300 && memcmp (p, attrs[i].string, opt_len) == 0)
4302 opt = attrs[i].opt;
4303 mask = attrs[i].mask;
4304 opt_string = attrs[i].string;
4305 break;
4309 /* Process the option. */
4310 if (opt == N_OPTS)
4312 error ("attribute(target(\"%s\")) is unknown", orig_p);
4313 ret = false;
4316 else if (type == ix86_opt_isa)
4318 struct cl_decoded_option decoded;
4320 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4321 ix86_handle_option (&global_options, &global_options_set,
4322 &decoded, input_location);
4325 else if (type == ix86_opt_yes || type == ix86_opt_no)
4327 if (type == ix86_opt_no)
4328 opt_set_p = !opt_set_p;
4330 if (opt_set_p)
4331 target_flags |= mask;
4332 else
4333 target_flags &= ~mask;
4336 else if (type == ix86_opt_str)
4338 if (p_strings[opt])
4340 error ("option(\"%s\") was already specified", opt_string);
4341 ret = false;
4343 else
4344 p_strings[opt] = xstrdup (p + opt_len);
4347 else if (type == ix86_opt_enum)
4349 bool arg_ok;
4350 int value;
4352 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4353 if (arg_ok)
4354 set_option (&global_options, enum_opts_set, opt, value,
4355 p + opt_len, DK_UNSPECIFIED, input_location,
4356 global_dc);
4357 else
4359 error ("attribute(target(\"%s\")) is unknown", orig_p);
4360 ret = false;
4364 else
4365 gcc_unreachable ();
4368 return ret;
4371 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4373 tree
4374 ix86_valid_target_attribute_tree (tree args)
4376 const char *orig_arch_string = ix86_arch_string;
4377 const char *orig_tune_string = ix86_tune_string;
4378 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4379 int orig_tune_defaulted = ix86_tune_defaulted;
4380 int orig_arch_specified = ix86_arch_specified;
4381 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4382 tree t = NULL_TREE;
4383 int i;
4384 struct cl_target_option *def
4385 = TREE_TARGET_OPTION (target_option_default_node);
4386 struct gcc_options enum_opts_set;
4388 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4390 /* Process each of the options on the chain. */
4391 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4392 &enum_opts_set))
4393 return error_mark_node;
4395 /* If the changed options are different from the default, rerun
4396 ix86_option_override_internal, and then save the options away.
4397 The string options are attribute options, and will be undone
4398 when we copy the save structure. */
4399 if (ix86_isa_flags != def->x_ix86_isa_flags
4400 || target_flags != def->x_target_flags
4401 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4402 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4403 || enum_opts_set.x_ix86_fpmath)
4405 /* If we are using the default tune= or arch=, undo the string assigned,
4406 and use the default. */
4407 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4408 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4409 else if (!orig_arch_specified)
4410 ix86_arch_string = NULL;
4412 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4413 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4414 else if (orig_tune_defaulted)
4415 ix86_tune_string = NULL;
4417 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4418 if (enum_opts_set.x_ix86_fpmath)
4419 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4420 else if (!TARGET_64BIT && TARGET_SSE)
4422 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4423 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4426 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4427 ix86_option_override_internal (false);
4429 /* Add any builtin functions with the new isa if any. */
4430 ix86_add_new_builtins (ix86_isa_flags);
4432 /* Save the current options unless we are validating options for
4433 #pragma. */
4434 t = build_target_option_node ();
4436 ix86_arch_string = orig_arch_string;
4437 ix86_tune_string = orig_tune_string;
4438 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4440 /* Free up memory allocated to hold the strings */
4441 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4442 free (option_strings[i]);
4445 return t;
4448 /* Hook to validate attribute((target("string"))). */
4450 static bool
4451 ix86_valid_target_attribute_p (tree fndecl,
4452 tree ARG_UNUSED (name),
4453 tree args,
4454 int ARG_UNUSED (flags))
4456 struct cl_target_option cur_target;
4457 bool ret = true;
4459 /* attribute((target("default"))) does nothing, beyond
4460 affecting multi-versioning. */
4461 if (TREE_VALUE (args)
4462 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4463 && TREE_CHAIN (args) == NULL_TREE
4464 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4465 return true;
4467 tree old_optimize = build_optimization_node ();
4468 tree new_target, new_optimize;
4469 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4471 /* If the function changed the optimization levels as well as setting target
4472 options, start with the optimizations specified. */
4473 if (func_optimize && func_optimize != old_optimize)
4474 cl_optimization_restore (&global_options,
4475 TREE_OPTIMIZATION (func_optimize));
4477 /* The target attributes may also change some optimization flags, so update
4478 the optimization options if necessary. */
4479 cl_target_option_save (&cur_target, &global_options);
4480 new_target = ix86_valid_target_attribute_tree (args);
4481 new_optimize = build_optimization_node ();
4483 if (new_target == error_mark_node)
4484 ret = false;
4486 else if (fndecl && new_target)
4488 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4490 if (old_optimize != new_optimize)
4491 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4494 cl_target_option_restore (&global_options, &cur_target);
4496 if (old_optimize != new_optimize)
4497 cl_optimization_restore (&global_options,
4498 TREE_OPTIMIZATION (old_optimize));
4500 return ret;
4504 /* Hook to determine if one function can safely inline another. */
4506 static bool
4507 ix86_can_inline_p (tree caller, tree callee)
4509 bool ret = false;
4510 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4511 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4513 /* If callee has no option attributes, then it is ok to inline. */
4514 if (!callee_tree)
4515 ret = true;
4517 /* If caller has no option attributes, but callee does then it is not ok to
4518 inline. */
4519 else if (!caller_tree)
4520 ret = false;
4522 else
4524 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4525 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4527 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4528 function can inline an SSE2 function, but an SSE2 function can't inline
4529 an SSE4 function. */
4530 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4531 != callee_opts->x_ix86_isa_flags)
4532 ret = false;
4534 /* See if we have the same non-isa options. */
4535 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4536 ret = false;
4538 /* See if arch, tune, etc. are the same. */
4539 else if (caller_opts->arch != callee_opts->arch)
4540 ret = false;
4542 else if (caller_opts->tune != callee_opts->tune)
4543 ret = false;
4545 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4546 ret = false;
4548 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4549 ret = false;
4551 else
4552 ret = true;
4555 return ret;
4559 /* Remember the last target of ix86_set_current_function. */
4560 static GTY(()) tree ix86_previous_fndecl;
4562 /* Establish appropriate back-end context for processing the function
4563 FNDECL. The argument might be NULL to indicate processing at top
4564 level, outside of any function scope. */
4565 static void
4566 ix86_set_current_function (tree fndecl)
4568 /* Only change the context if the function changes. This hook is called
4569 several times in the course of compiling a function, and we don't want to
4570 slow things down too much or call target_reinit when it isn't safe. */
4571 if (fndecl && fndecl != ix86_previous_fndecl)
4573 tree old_tree = (ix86_previous_fndecl
4574 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4575 : NULL_TREE);
4577 tree new_tree = (fndecl
4578 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4579 : NULL_TREE);
4581 ix86_previous_fndecl = fndecl;
4582 if (old_tree == new_tree)
4585 else if (new_tree)
4587 cl_target_option_restore (&global_options,
4588 TREE_TARGET_OPTION (new_tree));
4589 target_reinit ();
4592 else if (old_tree)
4594 struct cl_target_option *def
4595 = TREE_TARGET_OPTION (target_option_current_node);
4597 cl_target_option_restore (&global_options, def);
4598 target_reinit ();
4604 /* Return true if this goes in large data/bss. */
4606 static bool
4607 ix86_in_large_data_p (tree exp)
4609 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4610 return false;
4612 /* Functions are never large data. */
4613 if (TREE_CODE (exp) == FUNCTION_DECL)
4614 return false;
4616 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4618 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4619 if (strcmp (section, ".ldata") == 0
4620 || strcmp (section, ".lbss") == 0)
4621 return true;
4622 return false;
4624 else
4626 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4628 /* If this is an incomplete type with size 0, then we can't put it
4629 in data because it might be too big when completed. */
4630 if (!size || size > ix86_section_threshold)
4631 return true;
4634 return false;
4637 /* Switch to the appropriate section for output of DECL.
4638 DECL is either a `VAR_DECL' node or a constant of some sort.
4639 RELOC indicates whether forming the initial value of DECL requires
4640 link-time relocations. */
4642 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4643 ATTRIBUTE_UNUSED;
4645 static section *
4646 x86_64_elf_select_section (tree decl, int reloc,
4647 unsigned HOST_WIDE_INT align)
4649 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4650 && ix86_in_large_data_p (decl))
4652 const char *sname = NULL;
4653 unsigned int flags = SECTION_WRITE;
4654 switch (categorize_decl_for_section (decl, reloc))
4656 case SECCAT_DATA:
4657 sname = ".ldata";
4658 break;
4659 case SECCAT_DATA_REL:
4660 sname = ".ldata.rel";
4661 break;
4662 case SECCAT_DATA_REL_LOCAL:
4663 sname = ".ldata.rel.local";
4664 break;
4665 case SECCAT_DATA_REL_RO:
4666 sname = ".ldata.rel.ro";
4667 break;
4668 case SECCAT_DATA_REL_RO_LOCAL:
4669 sname = ".ldata.rel.ro.local";
4670 break;
4671 case SECCAT_BSS:
4672 sname = ".lbss";
4673 flags |= SECTION_BSS;
4674 break;
4675 case SECCAT_RODATA:
4676 case SECCAT_RODATA_MERGE_STR:
4677 case SECCAT_RODATA_MERGE_STR_INIT:
4678 case SECCAT_RODATA_MERGE_CONST:
4679 sname = ".lrodata";
4680 flags = 0;
4681 break;
4682 case SECCAT_SRODATA:
4683 case SECCAT_SDATA:
4684 case SECCAT_SBSS:
4685 gcc_unreachable ();
4686 case SECCAT_TEXT:
4687 case SECCAT_TDATA:
4688 case SECCAT_TBSS:
4689 /* We don't split these for the medium model.  Place them into
4690 the default sections and hope for the best. */
4691 break;
4693 if (sname)
4695 /* We might get called with string constants, but get_named_section
4696 doesn't like them as they are not DECLs. Also, we need to set
4697 flags in that case. */
4698 if (!DECL_P (decl))
4699 return get_section (sname, flags, NULL);
4700 return get_named_section (decl, sname, reloc);
4703 return default_elf_select_section (decl, reloc, align);
4706 /* Build up a unique section name, expressed as a
4707 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4708 RELOC indicates whether the initial value of EXP requires
4709 link-time relocations. */
4711 static void ATTRIBUTE_UNUSED
4712 x86_64_elf_unique_section (tree decl, int reloc)
4714 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4715 && ix86_in_large_data_p (decl))
4717 const char *prefix = NULL;
4718 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4719 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4721 switch (categorize_decl_for_section (decl, reloc))
4723 case SECCAT_DATA:
4724 case SECCAT_DATA_REL:
4725 case SECCAT_DATA_REL_LOCAL:
4726 case SECCAT_DATA_REL_RO:
4727 case SECCAT_DATA_REL_RO_LOCAL:
4728 prefix = one_only ? ".ld" : ".ldata";
4729 break;
4730 case SECCAT_BSS:
4731 prefix = one_only ? ".lb" : ".lbss";
4732 break;
4733 case SECCAT_RODATA:
4734 case SECCAT_RODATA_MERGE_STR:
4735 case SECCAT_RODATA_MERGE_STR_INIT:
4736 case SECCAT_RODATA_MERGE_CONST:
4737 prefix = one_only ? ".lr" : ".lrodata";
4738 break;
4739 case SECCAT_SRODATA:
4740 case SECCAT_SDATA:
4741 case SECCAT_SBSS:
4742 gcc_unreachable ();
4743 case SECCAT_TEXT:
4744 case SECCAT_TDATA:
4745 case SECCAT_TBSS:
4746 /* We don't split these for the medium model.  Place them into
4747 the default sections and hope for the best. */
4748 break;
4750 if (prefix)
4752 const char *name, *linkonce;
4753 char *string;
4755 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4756 name = targetm.strip_name_encoding (name);
4758 /* If we're using one_only, then there needs to be a .gnu.linkonce
4759 prefix to the section name. */
4760 linkonce = one_only ? ".gnu.linkonce" : "";
4762 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4764 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4765 return;
4768 default_unique_section (decl, reloc);
4771 #ifdef COMMON_ASM_OP
4772 /* This says how to output assembler code to declare an
4773 uninitialized external linkage data object.
4775 For medium-model x86-64 we need to use the .largecomm directive for
4776 large objects. */
4777 void
4778 x86_elf_aligned_common (FILE *file,
4779 const char *name, unsigned HOST_WIDE_INT size,
4780 int align)
4782 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4783 && size > (unsigned int)ix86_section_threshold)
4784 fputs (".largecomm\t", file);
4785 else
4786 fputs (COMMON_ASM_OP, file);
4787 assemble_name (file, name);
4788 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4789 size, align / BITS_PER_UNIT);
4791 #endif
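/* Illustrative sketch (hypothetical user code and hedged output, not taken
   from this file): with -mcmodel=medium and the default
   -mlarge-data-threshold=65536, a tentative definition such as

       char big_buffer[1 << 20];

   exceeds the threshold and is expected to be announced with

       .largecomm	big_buffer,1048576,<align>

   where <align> is the alignment in bytes chosen by the target, while
   objects at or below the threshold keep the ordinary COMMON_ASM_OP
   (".comm") directive emitted above.  */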
4793 /* Utility function for targets to use in implementing
4794 ASM_OUTPUT_ALIGNED_BSS. */
4796 void
4797 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4798 const char *name, unsigned HOST_WIDE_INT size,
4799 int align)
4801 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4802 && size > (unsigned int)ix86_section_threshold)
4803 switch_to_section (get_named_section (decl, ".lbss", 0));
4804 else
4805 switch_to_section (bss_section);
4806 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4807 #ifdef ASM_DECLARE_OBJECT_NAME
4808 last_assemble_variable_decl = decl;
4809 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4810 #else
4811 /* Standard thing is just output label for the object. */
4812 ASM_OUTPUT_LABEL (file, name);
4813 #endif /* ASM_DECLARE_OBJECT_NAME */
4814 ASM_OUTPUT_SKIP (file, size ? size : 1);
4817 /* Decide whether we must probe the stack before any space allocation
4818 on this target. It's essentially TARGET_STACK_PROBE except when
4819 -fstack-check causes the stack to be already probed differently. */
4821 bool
4822 ix86_target_stack_probe (void)
4824 /* Do not probe the stack twice if static stack checking is enabled. */
4825 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4826 return false;
4828 return TARGET_STACK_PROBE;
4831 /* Decide whether we can make a sibling call to a function. DECL is the
4832 declaration of the function being targeted by the call and EXP is the
4833 CALL_EXPR representing the call. */
4835 static bool
4836 ix86_function_ok_for_sibcall (tree decl, tree exp)
4838 tree type, decl_or_type;
4839 rtx a, b;
4841 /* If we are generating position-independent code, we cannot sibcall
4842 optimize any indirect call, or a direct call to a global function,
4843 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4844 if (!TARGET_MACHO
4845 && !TARGET_64BIT
4846 && flag_pic
4847 && (!decl || !targetm.binds_local_p (decl)))
4848 return false;
4850 /* If we need to align the outgoing stack, then sibcalling would
4851 unalign the stack, which may break the called function. */
4852 if (ix86_minimum_incoming_stack_boundary (true)
4853 < PREFERRED_STACK_BOUNDARY)
4854 return false;
4856 if (decl)
4858 decl_or_type = decl;
4859 type = TREE_TYPE (decl);
4861 else
4863 /* We're looking at the CALL_EXPR, we need the type of the function. */
4864 type = CALL_EXPR_FN (exp); /* pointer expression */
4865 type = TREE_TYPE (type); /* pointer type */
4866 type = TREE_TYPE (type); /* function type */
4867 decl_or_type = type;
4870 /* Check that the return value locations are the same. Like
4871 if we are returning floats on the 80387 register stack, we cannot
4872 make a sibcall from a function that doesn't return a float to a
4873 function that does or, conversely, from a function that does return
4874 a float to a function that doesn't; the necessary stack adjustment
4875 would not be executed. This is also the place we notice
4876 differences in the return value ABI. Note that it is ok for one
4877 of the functions to have void return type as long as the return
4878 value of the other is passed in a register. */
4879 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4880 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4881 cfun->decl, false);
4882 if (STACK_REG_P (a) || STACK_REG_P (b))
4884 if (!rtx_equal_p (a, b))
4885 return false;
4887 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4889 else if (!rtx_equal_p (a, b))
4890 return false;
4892 if (TARGET_64BIT)
4894 /* The SYSV ABI has more call-clobbered registers;
4895 disallow sibcalls from MS to SYSV. */
4896 if (cfun->machine->call_abi == MS_ABI
4897 && ix86_function_type_abi (type) == SYSV_ABI)
4898 return false;
4900 else
4902 /* If this call is indirect, we'll need to be able to use a
4903 call-clobbered register for the address of the target function.
4904 Make sure that all such registers are not used for passing
4905 parameters. Note that DLLIMPORT functions are indirect. */
4906 if (!decl
4907 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4909 if (ix86_function_regparm (type, NULL) >= 3)
4911 /* ??? Need to count the actual number of registers to be used,
4912 not the possible number of registers. Fix later. */
4913 return false;
4918 /* Otherwise okay. That also includes certain types of indirect calls. */
4919 return true;
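/* Illustrative sketch of the return-value check above (hypothetical user
   code, 32-bit with 80387 returns):

       extern double g (double);
       void f (double x) { g (x); }

   Here A is %st(0) because g returns on the fp stack, and B cannot be the
   same register for a void function, so the STACK_REG branch rejects the
   sibcall: if f jumped straight to g, f's caller would find an un-popped
   value left on the x87 stack.  */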
4922 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4923 and "sseregparm" calling convention attributes;
4924 arguments as in struct attribute_spec.handler. */
4926 static tree
4927 ix86_handle_cconv_attribute (tree *node, tree name,
4928 tree args,
4929 int flags ATTRIBUTE_UNUSED,
4930 bool *no_add_attrs)
4932 if (TREE_CODE (*node) != FUNCTION_TYPE
4933 && TREE_CODE (*node) != METHOD_TYPE
4934 && TREE_CODE (*node) != FIELD_DECL
4935 && TREE_CODE (*node) != TYPE_DECL)
4937 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4938 name);
4939 *no_add_attrs = true;
4940 return NULL_TREE;
4943 /* Can combine regparm with all attributes but fastcall and thiscall. */
4944 if (is_attribute_p ("regparm", name))
4946 tree cst;
4948 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4950 error ("fastcall and regparm attributes are not compatible");
4953 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4955 error ("regparm and thiscall attributes are not compatible");
4958 cst = TREE_VALUE (args);
4959 if (TREE_CODE (cst) != INTEGER_CST)
4961 warning (OPT_Wattributes,
4962 "%qE attribute requires an integer constant argument",
4963 name);
4964 *no_add_attrs = true;
4966 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4968 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4969 name, REGPARM_MAX);
4970 *no_add_attrs = true;
4973 return NULL_TREE;
4976 if (TARGET_64BIT)
4978 /* Do not warn when emulating the MS ABI. */
4979 if ((TREE_CODE (*node) != FUNCTION_TYPE
4980 && TREE_CODE (*node) != METHOD_TYPE)
4981 || ix86_function_type_abi (*node) != MS_ABI)
4982 warning (OPT_Wattributes, "%qE attribute ignored",
4983 name);
4984 *no_add_attrs = true;
4985 return NULL_TREE;
4988 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4989 if (is_attribute_p ("fastcall", name))
4991 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4993 error ("fastcall and cdecl attributes are not compatible");
4995 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4997 error ("fastcall and stdcall attributes are not compatible");
4999 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5001 error ("fastcall and regparm attributes are not compatible");
5003 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5005 error ("fastcall and thiscall attributes are not compatible");
5009 /* Can combine stdcall with fastcall (redundant), regparm and
5010 sseregparm. */
5011 else if (is_attribute_p ("stdcall", name))
5013 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and cdecl attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("stdcall and fastcall attributes are not compatible");
5021 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 error ("stdcall and thiscall attributes are not compatible");
5027 /* Can combine cdecl with regparm and sseregparm. */
5028 else if (is_attribute_p ("cdecl", name))
5030 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5032 error ("stdcall and cdecl attributes are not compatible");
5034 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5036 error ("fastcall and cdecl attributes are not compatible");
5038 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5040 error ("cdecl and thiscall attributes are not compatible");
5043 else if (is_attribute_p ("thiscall", name))
5045 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5046 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5047 name);
5048 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5050 error ("stdcall and thiscall attributes are not compatible");
5052 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5054 error ("fastcall and thiscall attributes are not compatible");
5056 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5058 error ("cdecl and thiscall attributes are not compatible");
5062 /* Can combine sseregparm with all attributes. */
5064 return NULL_TREE;
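/* Illustrative sketch of the calling-convention attributes validated above
   (hypothetical 32-bit user declarations):

       int __attribute__((fastcall))   f (int a, int b); -- a in %ecx, b in %edx
       int __attribute__((regparm(3))) g (int a, int b, int c);
                                                          -- %eax, %edx, %ecx
       int __attribute__((stdcall))    h (int a);         -- stack, callee pops

   Mixing incompatible pairs, e.g. fastcall together with regparm on one
   declaration, triggers the corresponding error above.  */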
5067 /* The transactional memory builtins are implicitly regparm or fastcall
5068 depending on the ABI. Override the generic do-nothing attribute that
5069 these builtins were declared with, and replace it with one of the two
5070 attributes that we expect elsewhere. */
5072 static tree
5073 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5074 tree args ATTRIBUTE_UNUSED,
5075 int flags ATTRIBUTE_UNUSED,
5076 bool *no_add_attrs)
5078 tree alt;
5080 /* In no case do we want to add the placeholder attribute. */
5081 *no_add_attrs = true;
5083 /* The 64-bit ABI is unchanged for transactional memory. */
5084 if (TARGET_64BIT)
5085 return NULL_TREE;
5087 /* ??? Is there a better way to validate 32-bit windows? We have
5088 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5089 if (CHECK_STACK_LIMIT > 0)
5090 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5091 else
5093 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5094 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5096 decl_attributes (node, alt, flags);
5098 return NULL_TREE;
5101 /* This function determines from TYPE the calling-convention. */
5103 unsigned int
5104 ix86_get_callcvt (const_tree type)
5106 unsigned int ret = 0;
5107 bool is_stdarg;
5108 tree attrs;
5110 if (TARGET_64BIT)
5111 return IX86_CALLCVT_CDECL;
5113 attrs = TYPE_ATTRIBUTES (type);
5114 if (attrs != NULL_TREE)
5116 if (lookup_attribute ("cdecl", attrs))
5117 ret |= IX86_CALLCVT_CDECL;
5118 else if (lookup_attribute ("stdcall", attrs))
5119 ret |= IX86_CALLCVT_STDCALL;
5120 else if (lookup_attribute ("fastcall", attrs))
5121 ret |= IX86_CALLCVT_FASTCALL;
5122 else if (lookup_attribute ("thiscall", attrs))
5123 ret |= IX86_CALLCVT_THISCALL;
5125 /* Regparm isn't allowed for thiscall and fastcall. */
5126 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5128 if (lookup_attribute ("regparm", attrs))
5129 ret |= IX86_CALLCVT_REGPARM;
5130 if (lookup_attribute ("sseregparm", attrs))
5131 ret |= IX86_CALLCVT_SSEREGPARM;
5134 if (IX86_BASE_CALLCVT(ret) != 0)
5135 return ret;
5138 is_stdarg = stdarg_p (type);
5139 if (TARGET_RTD && !is_stdarg)
5140 return IX86_CALLCVT_STDCALL | ret;
5142 if (ret != 0
5143 || is_stdarg
5144 || TREE_CODE (type) != METHOD_TYPE
5145 || ix86_function_type_abi (type) != MS_ABI)
5146 return IX86_CALLCVT_CDECL | ret;
5148 return IX86_CALLCVT_THISCALL;
5151 /* Return 0 if the attributes for two types are incompatible, 1 if they
5152 are compatible, and 2 if they are nearly compatible (which causes a
5153 warning to be generated). */
5155 static int
5156 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5158 unsigned int ccvt1, ccvt2;
5160 if (TREE_CODE (type1) != FUNCTION_TYPE
5161 && TREE_CODE (type1) != METHOD_TYPE)
5162 return 1;
5164 ccvt1 = ix86_get_callcvt (type1);
5165 ccvt2 = ix86_get_callcvt (type2);
5166 if (ccvt1 != ccvt2)
5167 return 0;
5168 if (ix86_function_regparm (type1, NULL)
5169 != ix86_function_regparm (type2, NULL))
5170 return 0;
5172 return 1;
5175 /* Return the regparm value for a function with the indicated TYPE and DECL.
5176 DECL may be NULL when calling function indirectly
5177 or considering a libcall. */
5179 static int
5180 ix86_function_regparm (const_tree type, const_tree decl)
5182 tree attr;
5183 int regparm;
5184 unsigned int ccvt;
5186 if (TARGET_64BIT)
5187 return (ix86_function_type_abi (type) == SYSV_ABI
5188 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5189 ccvt = ix86_get_callcvt (type);
5190 regparm = ix86_regparm;
5192 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5194 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5195 if (attr)
5197 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5198 return regparm;
5201 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5202 return 2;
5203 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5204 return 1;
5206 /* Use register calling convention for local functions when possible. */
5207 if (decl
5208 && TREE_CODE (decl) == FUNCTION_DECL
5209 && optimize
5210 && !(profile_flag && !flag_fentry))
5212 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5213 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5214 if (i && i->local && i->can_change_signature)
5216 int local_regparm, globals = 0, regno;
5218 /* Make sure no regparm register is taken by a
5219 fixed register variable. */
5220 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5221 if (fixed_regs[local_regparm])
5222 break;
5224 /* We don't want to use regparm(3) for nested functions as
5225 these use a static chain pointer in the third argument. */
5226 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5227 local_regparm = 2;
5229 /* In 32-bit mode save a register for the split stack. */
5230 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5231 local_regparm = 2;
5233 /* Each fixed register usage increases register pressure,
5234 so fewer registers should be used for argument passing.
5235 This functionality can be overridden by an explicit
5236 regparm value. */
5237 for (regno = AX_REG; regno <= DI_REG; regno++)
5238 if (fixed_regs[regno])
5239 globals++;
5241 local_regparm
5242 = globals < local_regparm ? local_regparm - globals : 0;
5244 if (local_regparm > regparm)
5245 regparm = local_regparm;
5249 return regparm;
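/* Illustrative sketch of the local-function promotion above: a function
   such as (hypothetical user code, -m32 -O2)

       static int add3 (int a, int b, int c) { return a + b + c; }

   whose address never escapes may be reported by cgraph as local with a
   changeable signature, and can then be given an implicit regparm
   convention (up to %eax/%edx/%ecx) even without any attribute; fixed
   registers, -fsplit-stack and profiling reduce that number as handled
   above.  */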
5252 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5253 DFmode (2) arguments in SSE registers for a function with the
5254 indicated TYPE and DECL. DECL may be NULL when calling function
5255 indirectly or considering a libcall. Otherwise return 0. */
5257 static int
5258 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5260 gcc_assert (!TARGET_64BIT);
5262 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5263 by the sseregparm attribute. */
5264 if (TARGET_SSEREGPARM
5265 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5267 if (!TARGET_SSE)
5269 if (warn)
5271 if (decl)
5272 error ("calling %qD with attribute sseregparm without "
5273 "SSE/SSE2 enabled", decl);
5274 else
5275 error ("calling %qT with attribute sseregparm without "
5276 "SSE/SSE2 enabled", type);
5278 return 0;
5281 return 2;
5284 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5285 (and DFmode for SSE2) arguments in SSE registers. */
5286 if (decl && TARGET_SSE_MATH && optimize
5287 && !(profile_flag && !flag_fentry))
5289 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5290 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5291 if (i && i->local && i->can_change_signature)
5292 return TARGET_SSE2 ? 2 : 1;
5295 return 0;
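/* Illustrative sketch (hypothetical user code, -m32 -msse2):

       double __attribute__((sseregparm)) scaled (double x, double y);

   x and y are expected to arrive in %xmm0/%xmm1 instead of on the stack;
   if SSE is disabled the error above is emitted and 0 is returned, so the
   normal stack convention is used.  */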
5298 /* Return true if EAX is live at the start of the function. Used by
5299 ix86_expand_prologue to determine if we need special help before
5300 calling allocate_stack_worker. */
5302 static bool
5303 ix86_eax_live_at_start_p (void)
5305 /* Cheat. Don't bother working forward from ix86_function_regparm
5306 to the function type to whether an actual argument is located in
5307 eax. Instead just look at cfg info, which is still close enough
5308 to correct at this point. This gives false positives for broken
5309 functions that might use uninitialized data that happens to be
5310 allocated in eax, but who cares? */
5311 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5314 static bool
5315 ix86_keep_aggregate_return_pointer (tree fntype)
5317 tree attr;
5319 if (!TARGET_64BIT)
5321 attr = lookup_attribute ("callee_pop_aggregate_return",
5322 TYPE_ATTRIBUTES (fntype));
5323 if (attr)
5324 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5326 /* For 32-bit MS-ABI the default is to keep aggregate
5327 return pointer. */
5328 if (ix86_function_type_abi (fntype) == MS_ABI)
5329 return true;
5331 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5334 /* Value is the number of bytes of arguments automatically
5335 popped when returning from a subroutine call.
5336 FUNDECL is the declaration node of the function (as a tree),
5337 FUNTYPE is the data type of the function (as a tree),
5338 or for a library call it is an identifier node for the subroutine name.
5339 SIZE is the number of bytes of arguments passed on the stack.
5341 On the 80386, the RTD insn may be used to pop them if the number
5342 of args is fixed, but if the number is variable then the caller
5343 must pop them all. RTD can't be used for library calls now
5344 because the library is compiled with the Unix compiler.
5345 Use of RTD is a selectable option, since it is incompatible with
5346 standard Unix calling sequences. If the option is not selected,
5347 the caller must always pop the args.
5349 The attribute stdcall is equivalent to RTD on a per module basis. */
5351 static int
5352 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5354 unsigned int ccvt;
5356 /* None of the 64-bit ABIs pop arguments. */
5357 if (TARGET_64BIT)
5358 return 0;
5360 ccvt = ix86_get_callcvt (funtype);
5362 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5363 | IX86_CALLCVT_THISCALL)) != 0
5364 && ! stdarg_p (funtype))
5365 return size;
5367 /* Lose any fake structure return argument if it is passed on the stack. */
5368 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5369 && !ix86_keep_aggregate_return_pointer (funtype))
5371 int nregs = ix86_function_regparm (funtype, fundecl);
5372 if (nregs == 0)
5373 return GET_MODE_SIZE (Pmode);
5376 return 0;
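/* Illustrative sketch (hypothetical user code, -m32):

       void __attribute__((stdcall)) cb (int a, int b);

   SIZE is 8 for this prototype, so the function above returns 8 and the
   callee ends with "ret $8"; with the default cdecl convention it would
   return 0 and the caller would adjust %esp after the call itself.  */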
5379 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5381 static bool
5382 ix86_legitimate_combined_insn (rtx insn)
5384 /* Check operand constraints in case hard registers were propagated
5385 into insn pattern. This check prevents combine pass from
5386 generating insn patterns with invalid hard register operands.
5387 These invalid insns can eventually confuse reload to error out
5388 with a spill failure. See also PRs 46829 and 46843. */
5389 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5391 int i;
5393 extract_insn (insn);
5394 preprocess_constraints ();
5396 for (i = 0; i < recog_data.n_operands; i++)
5398 rtx op = recog_data.operand[i];
5399 enum machine_mode mode = GET_MODE (op);
5400 struct operand_alternative *op_alt;
5401 int offset = 0;
5402 bool win;
5403 int j;
5405 /* A unary operator may be accepted by the predicate, but it
5406 is irrelevant for matching constraints. */
5407 if (UNARY_P (op))
5408 op = XEXP (op, 0);
5410 if (GET_CODE (op) == SUBREG)
5412 if (REG_P (SUBREG_REG (op))
5413 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5414 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5415 GET_MODE (SUBREG_REG (op)),
5416 SUBREG_BYTE (op),
5417 GET_MODE (op));
5418 op = SUBREG_REG (op);
5421 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5422 continue;
5424 op_alt = recog_op_alt[i];
5426 /* Operand has no constraints, anything is OK. */
5427 win = !recog_data.n_alternatives;
5429 for (j = 0; j < recog_data.n_alternatives; j++)
5431 if (op_alt[j].anything_ok
5432 || (op_alt[j].matches != -1
5433 && operands_match_p
5434 (recog_data.operand[i],
5435 recog_data.operand[op_alt[j].matches]))
5436 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5438 win = true;
5439 break;
5443 if (!win)
5444 return false;
5448 return true;
5451 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5453 static unsigned HOST_WIDE_INT
5454 ix86_asan_shadow_offset (void)
5456 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5457 : HOST_WIDE_INT_C (0x7fff8000))
5458 : (HOST_WIDE_INT_1 << 29);
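/* Illustrative sketch: AddressSanitizer maps application addresses to
   shadow bytes as  shadow = (addr >> 3) + ix86_asan_shadow_offset (),
   so on x86-64 Linux (LP64, non-Mach-O) the 8-byte granule at address A
   is described by the shadow byte at (A >> 3) + 0x7fff8000.  */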
5461 /* Argument support functions. */
5463 /* Return true when register may be used to pass function parameters. */
5464 bool
5465 ix86_function_arg_regno_p (int regno)
5467 int i;
5468 const int *parm_regs;
5470 if (!TARGET_64BIT)
5472 if (TARGET_MACHO)
5473 return (regno < REGPARM_MAX
5474 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5475 else
5476 return (regno < REGPARM_MAX
5477 || (TARGET_MMX && MMX_REGNO_P (regno)
5478 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5479 || (TARGET_SSE && SSE_REGNO_P (regno)
5480 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5483 if (TARGET_MACHO)
5485 if (SSE_REGNO_P (regno) && TARGET_SSE)
5486 return true;
5488 else
5490 if (TARGET_SSE && SSE_REGNO_P (regno)
5491 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5492 return true;
5495 /* TODO: The function should depend on current function ABI but
5496 builtins.c would need updating then. Therefore we use the
5497 default ABI. */
5499 /* RAX is used as hidden argument to va_arg functions. */
5500 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5501 return true;
5503 if (ix86_abi == MS_ABI)
5504 parm_regs = x86_64_ms_abi_int_parameter_registers;
5505 else
5506 parm_regs = x86_64_int_parameter_registers;
5507 for (i = 0; i < (ix86_abi == MS_ABI
5508 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5509 if (regno == parm_regs[i])
5510 return true;
5511 return false;
5514 /* Return if we do not know how to pass TYPE solely in registers. */
5516 static bool
5517 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5519 if (must_pass_in_stack_var_size_or_pad (mode, type))
5520 return true;
5522 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5523 The layout_type routine is crafty and tries to trick us into passing
5524 currently unsupported vector types on the stack by using TImode. */
5525 return (!TARGET_64BIT && mode == TImode
5526 && type && TREE_CODE (type) != VECTOR_TYPE);
5529 /* It returns the size, in bytes, of the area reserved for arguments passed
5530 in registers for the function represented by fndecl, depending on the
5531 ABI format used. */
5532 int
5533 ix86_reg_parm_stack_space (const_tree fndecl)
5535 enum calling_abi call_abi = SYSV_ABI;
5536 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5537 call_abi = ix86_function_abi (fndecl);
5538 else
5539 call_abi = ix86_function_type_abi (fndecl);
5540 if (TARGET_64BIT && call_abi == MS_ABI)
5541 return 32;
5542 return 0;
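/* Illustrative sketch: the 32 bytes returned for the 64-bit MS ABI are
   the "home" area for the four register parameters (%rcx, %rdx, %r8,
   %r9), which every caller must reserve on the stack even for a call
   with no arguments at all; SYSV callers reserve nothing.  */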
5545 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5546 call abi used. */
5547 enum calling_abi
5548 ix86_function_type_abi (const_tree fntype)
5550 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5552 enum calling_abi abi = ix86_abi;
5553 if (abi == SYSV_ABI)
5555 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5556 abi = MS_ABI;
5558 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5559 abi = SYSV_ABI;
5560 return abi;
5562 return ix86_abi;
5565 static bool
5566 ix86_function_ms_hook_prologue (const_tree fn)
5568 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5570 if (decl_function_context (fn) != NULL_TREE)
5571 error_at (DECL_SOURCE_LOCATION (fn),
5572 "ms_hook_prologue is not compatible with nested function");
5573 else
5574 return true;
5576 return false;
5579 static enum calling_abi
5580 ix86_function_abi (const_tree fndecl)
5582 if (! fndecl)
5583 return ix86_abi;
5584 return ix86_function_type_abi (TREE_TYPE (fndecl));
5587 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5588 call abi used. */
5589 enum calling_abi
5590 ix86_cfun_abi (void)
5592 if (! cfun)
5593 return ix86_abi;
5594 return cfun->machine->call_abi;
5597 /* Write the extra assembler code needed to declare a function properly. */
5599 void
5600 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5601 tree decl)
5603 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5605 if (is_ms_hook)
5607 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5608 unsigned int filler_cc = 0xcccccccc;
5610 for (i = 0; i < filler_count; i += 4)
5611 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5614 #ifdef SUBTARGET_ASM_UNWIND_INIT
5615 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5616 #endif
5618 ASM_OUTPUT_LABEL (asm_out_file, fname);
5620 /* Output magic byte marker, if hot-patch attribute is set. */
5621 if (is_ms_hook)
5623 if (TARGET_64BIT)
5625 /* leaq [%rsp + 0], %rsp */
5626 asm_fprintf (asm_out_file, ASM_BYTE
5627 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5629 else
5631 /* movl.s %edi, %edi
5632 push %ebp
5633 movl.s %esp, %ebp */
5634 asm_fprintf (asm_out_file, ASM_BYTE
5635 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5640 /* regclass.c */
5641 extern void init_regs (void);
5643 /* Implementation of call abi switching target hook. Specific to FNDECL
5644 the specific call register sets are set. See also
5645 ix86_conditional_register_usage for more details. */
5646 void
5647 ix86_call_abi_override (const_tree fndecl)
5649 if (fndecl == NULL_TREE)
5650 cfun->machine->call_abi = ix86_abi;
5651 else
5652 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5655 /* 64-bit MS and SYSV ABI have different set of call used registers. Avoid
5656 expensive re-initialization of init_regs each time we switch function context
5657 since this is needed only during RTL expansion. */
5658 static void
5659 ix86_maybe_switch_abi (void)
5661 if (TARGET_64BIT &&
5662 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5663 reinit_regs ();
5666 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5667 for a call to a function whose data type is FNTYPE.
5668 For a library call, FNTYPE is 0. */
5670 void
5671 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5672 tree fntype, /* tree ptr for function decl */
5673 rtx libname, /* SYMBOL_REF of library name or 0 */
5674 tree fndecl,
5675 int caller)
5677 struct cgraph_local_info *i;
5679 memset (cum, 0, sizeof (*cum));
5681 if (fndecl)
5683 i = cgraph_local_info (fndecl);
5684 cum->call_abi = ix86_function_abi (fndecl);
5686 else
5688 i = NULL;
5689 cum->call_abi = ix86_function_type_abi (fntype);
5692 cum->caller = caller;
5694 /* Set up the number of registers to use for passing arguments. */
5696 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5697 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5698 "or subtarget optimization implying it");
5699 cum->nregs = ix86_regparm;
5700 if (TARGET_64BIT)
5702 cum->nregs = (cum->call_abi == SYSV_ABI
5703 ? X86_64_REGPARM_MAX
5704 : X86_64_MS_REGPARM_MAX);
5706 if (TARGET_SSE)
5708 cum->sse_nregs = SSE_REGPARM_MAX;
5709 if (TARGET_64BIT)
5711 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5712 ? X86_64_SSE_REGPARM_MAX
5713 : X86_64_MS_SSE_REGPARM_MAX);
5716 if (TARGET_MMX)
5717 cum->mmx_nregs = MMX_REGPARM_MAX;
5718 cum->warn_avx = true;
5719 cum->warn_sse = true;
5720 cum->warn_mmx = true;
5722 /* Because type might mismatch in between caller and callee, we need to
5723 use actual type of function for local calls.
5724 FIXME: cgraph_analyze can be told to actually record if function uses
5725 va_start so for local functions maybe_vaarg can be made aggressive
5726 helping K&R code.
5727 FIXME: once the type system is fixed, we won't need this code anymore. */
5728 if (i && i->local && i->can_change_signature)
5729 fntype = TREE_TYPE (fndecl);
5730 cum->maybe_vaarg = (fntype
5731 ? (!prototype_p (fntype) || stdarg_p (fntype))
5732 : !libname);
5734 if (!TARGET_64BIT)
5736 /* If there are variable arguments, then we won't pass anything
5737 in registers in 32-bit mode. */
5738 if (stdarg_p (fntype))
5740 cum->nregs = 0;
5741 cum->sse_nregs = 0;
5742 cum->mmx_nregs = 0;
5743 cum->warn_avx = 0;
5744 cum->warn_sse = 0;
5745 cum->warn_mmx = 0;
5746 return;
5749 /* Use ecx and edx registers if function has fastcall attribute,
5750 else look for regparm information. */
5751 if (fntype)
5753 unsigned int ccvt = ix86_get_callcvt (fntype);
5754 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5756 cum->nregs = 1;
5757 cum->fastcall = 1; /* Same first register as in fastcall. */
5759 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5761 cum->nregs = 2;
5762 cum->fastcall = 1;
5764 else
5765 cum->nregs = ix86_function_regparm (fntype, fndecl);
5768 /* Set up the number of SSE registers used for passing SFmode
5769 and DFmode arguments. Warn for mismatching ABI. */
5770 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5774 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5775 But in the case of vector types, it is some vector mode.
5777 When we have only some of our vector isa extensions enabled, then there
5778 are some modes for which vector_mode_supported_p is false. For these
5779 modes, the generic vector support in gcc will choose some non-vector mode
5780 in order to implement the type. By computing the natural mode, we'll
5781 select the proper ABI location for the operand and not depend on whatever
5782 the middle-end decides to do with these vector types.
5784 The middle-end can't deal with vector types > 16 bytes. In this
5785 case, we return the original mode and warn about the ABI change if
5786 CUM isn't NULL. */
5788 static enum machine_mode
5789 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5791 enum machine_mode mode = TYPE_MODE (type);
5793 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5795 HOST_WIDE_INT size = int_size_in_bytes (type);
5796 if ((size == 8 || size == 16 || size == 32)
5797 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5798 && TYPE_VECTOR_SUBPARTS (type) > 1)
5800 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5802 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5803 mode = MIN_MODE_VECTOR_FLOAT;
5804 else
5805 mode = MIN_MODE_VECTOR_INT;
5807 /* Get the mode which has this inner mode and number of units. */
5808 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5809 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5810 && GET_MODE_INNER (mode) == innermode)
5812 if (size == 32 && !TARGET_AVX)
5814 static bool warnedavx;
5816 if (cum
5817 && !warnedavx
5818 && cum->warn_avx)
5820 warnedavx = true;
5821 warning (0, "AVX vector argument without AVX "
5822 "enabled changes the ABI");
5824 return TYPE_MODE (type);
5826 else if ((size == 8 || size == 16) && !TARGET_SSE)
5828 static bool warnedsse;
5830 if (cum
5831 && !warnedsse
5832 && cum->warn_sse)
5834 warnedsse = true;
5835 warning (0, "SSE vector argument without SSE "
5836 "enabled changes the ABI");
5838 return mode;
5840 else
5841 return mode;
5844 gcc_unreachable ();
5848 return mode;
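/* Illustrative sketch (hypothetical user code):

       typedef float v8sf __attribute__((vector_size (32)));
       void f (v8sf x);

   With -mavx the natural mode of x is V8SFmode and the argument gets the
   corresponding ABI slot; without -mavx the 32-byte case above warns once
   that the ABI changes and falls back to the original TYPE_MODE.  */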
5851 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5852 this may not agree with the mode that the type system has chosen for the
5853 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5854 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5856 static rtx
5857 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5858 unsigned int regno)
5860 rtx tmp;
5862 if (orig_mode != BLKmode)
5863 tmp = gen_rtx_REG (orig_mode, regno);
5864 else
5866 tmp = gen_rtx_REG (mode, regno);
5867 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5868 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5871 return tmp;
5874 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5875 of this code is to classify each 8bytes of incoming argument by the register
5876 class and assign registers accordingly. */
5878 /* Return the union class of CLASS1 and CLASS2.
5879 See the x86-64 PS ABI for details. */
5881 static enum x86_64_reg_class
5882 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5884 /* Rule #1: If both classes are equal, this is the resulting class. */
5885 if (class1 == class2)
5886 return class1;
5888 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5889 the other class. */
5890 if (class1 == X86_64_NO_CLASS)
5891 return class2;
5892 if (class2 == X86_64_NO_CLASS)
5893 return class1;
5895 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5896 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5897 return X86_64_MEMORY_CLASS;
5899 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5900 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5901 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5902 return X86_64_INTEGERSI_CLASS;
5903 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5904 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5905 return X86_64_INTEGER_CLASS;
5907 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5908 MEMORY is used. */
5909 if (class1 == X86_64_X87_CLASS
5910 || class1 == X86_64_X87UP_CLASS
5911 || class1 == X86_64_COMPLEX_X87_CLASS
5912 || class2 == X86_64_X87_CLASS
5913 || class2 == X86_64_X87UP_CLASS
5914 || class2 == X86_64_COMPLEX_X87_CLASS)
5915 return X86_64_MEMORY_CLASS;
5917 /* Rule #6: Otherwise class SSE is used. */
5918 return X86_64_SSE_CLASS;
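/* Illustrative sketch: for  union { int i; float f; }  the two member
   classes are X86_64_INTEGERSI_CLASS and X86_64_SSESF_CLASS; rule #4
   above merges them to X86_64_INTEGERSI_CLASS, so the union is passed
   in a general-purpose register rather than an SSE register.  */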
5921 /* Classify the argument of type TYPE and mode MODE.
5922 CLASSES will be filled by the register class used to pass each word
5923 of the operand. The number of words is returned. In case the parameter
5924 should be passed in memory, 0 is returned. As a special case for zero
5925 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5927 BIT_OFFSET is used internally for handling records and specifies the
5928 offset in bits modulo 256 to avoid overflow cases.
5930 See the x86-64 PS ABI for details.
5933 static int
5934 classify_argument (enum machine_mode mode, const_tree type,
5935 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5937 HOST_WIDE_INT bytes =
5938 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5939 int words
5940 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5942 /* Variable sized entities are always passed/returned in memory. */
5943 if (bytes < 0)
5944 return 0;
5946 if (mode != VOIDmode
5947 && targetm.calls.must_pass_in_stack (mode, type))
5948 return 0;
5950 if (type && AGGREGATE_TYPE_P (type))
5952 int i;
5953 tree field;
5954 enum x86_64_reg_class subclasses[MAX_CLASSES];
5956 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5957 if (bytes > 32)
5958 return 0;
5960 for (i = 0; i < words; i++)
5961 classes[i] = X86_64_NO_CLASS;
5963 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5964 signal the memory class, so handle it as a special case. */
5965 if (!words)
5967 classes[0] = X86_64_NO_CLASS;
5968 return 1;
5971 /* Classify each field of record and merge classes. */
5972 switch (TREE_CODE (type))
5974 case RECORD_TYPE:
5975 /* And now merge the fields of structure. */
5976 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5978 if (TREE_CODE (field) == FIELD_DECL)
5980 int num;
5982 if (TREE_TYPE (field) == error_mark_node)
5983 continue;
5985 /* Bitfields are always classified as integer. Handle them
5986 early, since later code would consider them to be
5987 misaligned integers. */
5988 if (DECL_BIT_FIELD (field))
5990 for (i = (int_bit_position (field)
5991 + (bit_offset % 64)) / 8 / 8;
5992 i < ((int_bit_position (field) + (bit_offset % 64))
5993 + tree_low_cst (DECL_SIZE (field), 0)
5994 + 63) / 8 / 8; i++)
5995 classes[i] =
5996 merge_classes (X86_64_INTEGER_CLASS,
5997 classes[i]);
5999 else
6001 int pos;
6003 type = TREE_TYPE (field);
6005 /* Flexible array member is ignored. */
6006 if (TYPE_MODE (type) == BLKmode
6007 && TREE_CODE (type) == ARRAY_TYPE
6008 && TYPE_SIZE (type) == NULL_TREE
6009 && TYPE_DOMAIN (type) != NULL_TREE
6010 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6011 == NULL_TREE))
6013 static bool warned;
6015 if (!warned && warn_psabi)
6017 warned = true;
6018 inform (input_location,
6019 "the ABI of passing struct with"
6020 " a flexible array member has"
6021 " changed in GCC 4.4");
6023 continue;
6025 num = classify_argument (TYPE_MODE (type), type,
6026 subclasses,
6027 (int_bit_position (field)
6028 + bit_offset) % 256);
6029 if (!num)
6030 return 0;
6031 pos = (int_bit_position (field)
6032 + (bit_offset % 64)) / 8 / 8;
6033 for (i = 0; i < num && (i + pos) < words; i++)
6034 classes[i + pos] =
6035 merge_classes (subclasses[i], classes[i + pos]);
6039 break;
6041 case ARRAY_TYPE:
6042 /* Arrays are handled as small records. */
6044 int num;
6045 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6046 TREE_TYPE (type), subclasses, bit_offset);
6047 if (!num)
6048 return 0;
6050 /* The partial classes are now full classes. */
6051 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6052 subclasses[0] = X86_64_SSE_CLASS;
6053 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6054 && !((bit_offset % 64) == 0 && bytes == 4))
6055 subclasses[0] = X86_64_INTEGER_CLASS;
6057 for (i = 0; i < words; i++)
6058 classes[i] = subclasses[i % num];
6060 break;
6062 case UNION_TYPE:
6063 case QUAL_UNION_TYPE:
6064 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6066 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6068 if (TREE_CODE (field) == FIELD_DECL)
6070 int num;
6072 if (TREE_TYPE (field) == error_mark_node)
6073 continue;
6075 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6076 TREE_TYPE (field), subclasses,
6077 bit_offset);
6078 if (!num)
6079 return 0;
6080 for (i = 0; i < num; i++)
6081 classes[i] = merge_classes (subclasses[i], classes[i]);
6084 break;
6086 default:
6087 gcc_unreachable ();
6090 if (words > 2)
6092 /* When size > 16 bytes, if the first one isn't
6093 X86_64_SSE_CLASS or any other ones aren't
6094 X86_64_SSEUP_CLASS, everything should be passed in
6095 memory. */
6096 if (classes[0] != X86_64_SSE_CLASS)
6097 return 0;
6099 for (i = 1; i < words; i++)
6100 if (classes[i] != X86_64_SSEUP_CLASS)
6101 return 0;
6104 /* Final merger cleanup. */
6105 for (i = 0; i < words; i++)
6107 /* If one class is MEMORY, everything should be passed in
6108 memory. */
6109 if (classes[i] == X86_64_MEMORY_CLASS)
6110 return 0;
6112 /* The X86_64_SSEUP_CLASS should be always preceded by
6113 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6114 if (classes[i] == X86_64_SSEUP_CLASS
6115 && classes[i - 1] != X86_64_SSE_CLASS
6116 && classes[i - 1] != X86_64_SSEUP_CLASS)
6118 /* The first one should never be X86_64_SSEUP_CLASS. */
6119 gcc_assert (i != 0);
6120 classes[i] = X86_64_SSE_CLASS;
6123 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6124 everything should be passed in memory. */
6125 if (classes[i] == X86_64_X87UP_CLASS
6126 && (classes[i - 1] != X86_64_X87_CLASS))
6128 static bool warned;
6130 /* The first one should never be X86_64_X87UP_CLASS. */
6131 gcc_assert (i != 0);
6132 if (!warned && warn_psabi)
6134 warned = true;
6135 inform (input_location,
6136 "the ABI of passing union with long double"
6137 " has changed in GCC 4.4");
6139 return 0;
6142 return words;
6145 /* Compute alignment needed. We align all types to natural boundaries with
6146 the exception of XFmode, which is aligned to 64 bits. */
6147 if (mode != VOIDmode && mode != BLKmode)
6149 int mode_alignment = GET_MODE_BITSIZE (mode);
6151 if (mode == XFmode)
6152 mode_alignment = 128;
6153 else if (mode == XCmode)
6154 mode_alignment = 256;
6155 if (COMPLEX_MODE_P (mode))
6156 mode_alignment /= 2;
6157 /* Misaligned fields are always returned in memory. */
6158 if (bit_offset % mode_alignment)
6159 return 0;
6162 /* for V1xx modes, just use the base mode */
6163 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6164 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6165 mode = GET_MODE_INNER (mode);
6167 /* Classification of atomic types. */
6168 switch (mode)
6170 case SDmode:
6171 case DDmode:
6172 classes[0] = X86_64_SSE_CLASS;
6173 return 1;
6174 case TDmode:
6175 classes[0] = X86_64_SSE_CLASS;
6176 classes[1] = X86_64_SSEUP_CLASS;
6177 return 2;
6178 case DImode:
6179 case SImode:
6180 case HImode:
6181 case QImode:
6182 case CSImode:
6183 case CHImode:
6184 case CQImode:
6186 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6188 if (size <= 32)
6190 classes[0] = X86_64_INTEGERSI_CLASS;
6191 return 1;
6193 else if (size <= 64)
6195 classes[0] = X86_64_INTEGER_CLASS;
6196 return 1;
6198 else if (size <= 64+32)
6200 classes[0] = X86_64_INTEGER_CLASS;
6201 classes[1] = X86_64_INTEGERSI_CLASS;
6202 return 2;
6204 else if (size <= 64+64)
6206 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6207 return 2;
6209 else
6210 gcc_unreachable ();
6212 case CDImode:
6213 case TImode:
6214 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6215 return 2;
6216 case COImode:
6217 case OImode:
6218 /* OImode shouldn't be used directly. */
6219 gcc_unreachable ();
6220 case CTImode:
6221 return 0;
6222 case SFmode:
6223 if (!(bit_offset % 64))
6224 classes[0] = X86_64_SSESF_CLASS;
6225 else
6226 classes[0] = X86_64_SSE_CLASS;
6227 return 1;
6228 case DFmode:
6229 classes[0] = X86_64_SSEDF_CLASS;
6230 return 1;
6231 case XFmode:
6232 classes[0] = X86_64_X87_CLASS;
6233 classes[1] = X86_64_X87UP_CLASS;
6234 return 2;
6235 case TFmode:
6236 classes[0] = X86_64_SSE_CLASS;
6237 classes[1] = X86_64_SSEUP_CLASS;
6238 return 2;
6239 case SCmode:
6240 classes[0] = X86_64_SSE_CLASS;
6241 if (!(bit_offset % 64))
6242 return 1;
6243 else
6245 static bool warned;
6247 if (!warned && warn_psabi)
6249 warned = true;
6250 inform (input_location,
6251 "the ABI of passing structure with complex float"
6252 " member has changed in GCC 4.4");
6254 classes[1] = X86_64_SSESF_CLASS;
6255 return 2;
6257 case DCmode:
6258 classes[0] = X86_64_SSEDF_CLASS;
6259 classes[1] = X86_64_SSEDF_CLASS;
6260 return 2;
6261 case XCmode:
6262 classes[0] = X86_64_COMPLEX_X87_CLASS;
6263 return 1;
6264 case TCmode:
6265 /* This mode is larger than 16 bytes. */
6266 return 0;
6267 case V8SFmode:
6268 case V8SImode:
6269 case V32QImode:
6270 case V16HImode:
6271 case V4DFmode:
6272 case V4DImode:
6273 classes[0] = X86_64_SSE_CLASS;
6274 classes[1] = X86_64_SSEUP_CLASS;
6275 classes[2] = X86_64_SSEUP_CLASS;
6276 classes[3] = X86_64_SSEUP_CLASS;
6277 return 4;
6278 case V4SFmode:
6279 case V4SImode:
6280 case V16QImode:
6281 case V8HImode:
6282 case V2DFmode:
6283 case V2DImode:
6284 classes[0] = X86_64_SSE_CLASS;
6285 classes[1] = X86_64_SSEUP_CLASS;
6286 return 2;
6287 case V1TImode:
6288 case V1DImode:
6289 case V2SFmode:
6290 case V2SImode:
6291 case V4HImode:
6292 case V8QImode:
6293 classes[0] = X86_64_SSE_CLASS;
6294 return 1;
6295 case BLKmode:
6296 case VOIDmode:
6297 return 0;
6298 default:
6299 gcc_assert (VECTOR_MODE_P (mode));
6301 if (bytes > 16)
6302 return 0;
6304 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6306 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6307 classes[0] = X86_64_INTEGERSI_CLASS;
6308 else
6309 classes[0] = X86_64_INTEGER_CLASS;
6310 classes[1] = X86_64_INTEGER_CLASS;
6311 return 1 + (bytes > 8);
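/* Illustrative worked example of the classification above (hypothetical
   user type):

       struct s { double d; int a; int b; };   -- 16 bytes, two eightbytes

   The first eightbyte (d) classifies as X86_64_SSEDF_CLASS, the second
   (a and b) merges to X86_64_INTEGER_CLASS, so classify_argument returns
   2 and, passed as the first argument, the struct is expected to occupy
   one SSE register and one integer register (%xmm0 and %rdi).  */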
6315 /* Examine the argument and return set number of register required in each
6316 class. Return 0 iff parameter should be passed in memory. */
6317 static int
6318 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6319 int *int_nregs, int *sse_nregs)
6321 enum x86_64_reg_class regclass[MAX_CLASSES];
6322 int n = classify_argument (mode, type, regclass, 0);
6324 *int_nregs = 0;
6325 *sse_nregs = 0;
6326 if (!n)
6327 return 0;
6328 for (n--; n >= 0; n--)
6329 switch (regclass[n])
6331 case X86_64_INTEGER_CLASS:
6332 case X86_64_INTEGERSI_CLASS:
6333 (*int_nregs)++;
6334 break;
6335 case X86_64_SSE_CLASS:
6336 case X86_64_SSESF_CLASS:
6337 case X86_64_SSEDF_CLASS:
6338 (*sse_nregs)++;
6339 break;
6340 case X86_64_NO_CLASS:
6341 case X86_64_SSEUP_CLASS:
6342 break;
6343 case X86_64_X87_CLASS:
6344 case X86_64_X87UP_CLASS:
6345 if (!in_return)
6346 return 0;
6347 break;
6348 case X86_64_COMPLEX_X87_CLASS:
6349 return in_return ? 2 : 0;
6350 case X86_64_MEMORY_CLASS:
6351 gcc_unreachable ();
6353 return 1;
6356 /* Construct container for the argument used by GCC interface. See
6357 FUNCTION_ARG for the detailed description. */
6359 static rtx
6360 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6361 const_tree type, int in_return, int nintregs, int nsseregs,
6362 const int *intreg, int sse_regno)
6364 /* The following variables hold the static issued_error state. */
6365 static bool issued_sse_arg_error;
6366 static bool issued_sse_ret_error;
6367 static bool issued_x87_ret_error;
6369 enum machine_mode tmpmode;
6370 int bytes =
6371 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6372 enum x86_64_reg_class regclass[MAX_CLASSES];
6373 int n;
6374 int i;
6375 int nexps = 0;
6376 int needed_sseregs, needed_intregs;
6377 rtx exp[MAX_CLASSES];
6378 rtx ret;
6380 n = classify_argument (mode, type, regclass, 0);
6381 if (!n)
6382 return NULL;
6383 if (!examine_argument (mode, type, in_return, &needed_intregs,
6384 &needed_sseregs))
6385 return NULL;
6386 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6387 return NULL;
6389 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6390 some less clueful developer tries to use floating-point anyway. */
6391 if (needed_sseregs && !TARGET_SSE)
6393 if (in_return)
6395 if (!issued_sse_ret_error)
6397 error ("SSE register return with SSE disabled");
6398 issued_sse_ret_error = true;
6401 else if (!issued_sse_arg_error)
6403 error ("SSE register argument with SSE disabled");
6404 issued_sse_arg_error = true;
6406 return NULL;
6409 /* Likewise, error if the ABI requires us to return values in the
6410 x87 registers and the user specified -mno-80387. */
6411 if (!TARGET_80387 && in_return)
6412 for (i = 0; i < n; i++)
6413 if (regclass[i] == X86_64_X87_CLASS
6414 || regclass[i] == X86_64_X87UP_CLASS
6415 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6417 if (!issued_x87_ret_error)
6419 error ("x87 register return with x87 disabled");
6420 issued_x87_ret_error = true;
6422 return NULL;
6425 /* First construct simple cases. Avoid SCmode, since we want to use
6426 single register to pass this type. */
6427 if (n == 1 && mode != SCmode)
6428 switch (regclass[0])
6430 case X86_64_INTEGER_CLASS:
6431 case X86_64_INTEGERSI_CLASS:
6432 return gen_rtx_REG (mode, intreg[0]);
6433 case X86_64_SSE_CLASS:
6434 case X86_64_SSESF_CLASS:
6435 case X86_64_SSEDF_CLASS:
6436 if (mode != BLKmode)
6437 return gen_reg_or_parallel (mode, orig_mode,
6438 SSE_REGNO (sse_regno));
6439 break;
6440 case X86_64_X87_CLASS:
6441 case X86_64_COMPLEX_X87_CLASS:
6442 return gen_rtx_REG (mode, FIRST_STACK_REG);
6443 case X86_64_NO_CLASS:
6444 /* Zero sized array, struct or class. */
6445 return NULL;
6446 default:
6447 gcc_unreachable ();
6449 if (n == 2
6450 && regclass[0] == X86_64_SSE_CLASS
6451 && regclass[1] == X86_64_SSEUP_CLASS
6452 && mode != BLKmode)
6453 return gen_reg_or_parallel (mode, orig_mode,
6454 SSE_REGNO (sse_regno));
6455 if (n == 4
6456 && regclass[0] == X86_64_SSE_CLASS
6457 && regclass[1] == X86_64_SSEUP_CLASS
6458 && regclass[2] == X86_64_SSEUP_CLASS
6459 && regclass[3] == X86_64_SSEUP_CLASS
6460 && mode != BLKmode)
6461 return gen_reg_or_parallel (mode, orig_mode,
6462 SSE_REGNO (sse_regno));
6463 if (n == 2
6464 && regclass[0] == X86_64_X87_CLASS
6465 && regclass[1] == X86_64_X87UP_CLASS)
6466 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6468 if (n == 2
6469 && regclass[0] == X86_64_INTEGER_CLASS
6470 && regclass[1] == X86_64_INTEGER_CLASS
6471 && (mode == CDImode || mode == TImode || mode == TFmode)
6472 && intreg[0] + 1 == intreg[1])
6473 return gen_rtx_REG (mode, intreg[0]);
6475 /* Otherwise figure out the entries of the PARALLEL. */
6476 for (i = 0; i < n; i++)
6478 int pos;
6480 switch (regclass[i])
6482 case X86_64_NO_CLASS:
6483 break;
6484 case X86_64_INTEGER_CLASS:
6485 case X86_64_INTEGERSI_CLASS:
6486 /* Merge TImodes on aligned occasions here too. */
6487 if (i * 8 + 8 > bytes)
6488 tmpmode
6489 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6490 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6491 tmpmode = SImode;
6492 else
6493 tmpmode = DImode;
6494 /* We've requested a size for which we
6495 don't have a suitable integer mode. Use DImode. */
6496 if (tmpmode == BLKmode)
6497 tmpmode = DImode;
6498 exp [nexps++]
6499 = gen_rtx_EXPR_LIST (VOIDmode,
6500 gen_rtx_REG (tmpmode, *intreg),
6501 GEN_INT (i*8));
6502 intreg++;
6503 break;
6504 case X86_64_SSESF_CLASS:
6505 exp [nexps++]
6506 = gen_rtx_EXPR_LIST (VOIDmode,
6507 gen_rtx_REG (SFmode,
6508 SSE_REGNO (sse_regno)),
6509 GEN_INT (i*8));
6510 sse_regno++;
6511 break;
6512 case X86_64_SSEDF_CLASS:
6513 exp [nexps++]
6514 = gen_rtx_EXPR_LIST (VOIDmode,
6515 gen_rtx_REG (DFmode,
6516 SSE_REGNO (sse_regno)),
6517 GEN_INT (i*8));
6518 sse_regno++;
6519 break;
6520 case X86_64_SSE_CLASS:
6521 pos = i;
6522 switch (n)
6524 case 1:
6525 tmpmode = DImode;
6526 break;
6527 case 2:
6528 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6530 tmpmode = TImode;
6531 i++;
6533 else
6534 tmpmode = DImode;
6535 break;
6536 case 4:
6537 gcc_assert (i == 0
6538 && regclass[1] == X86_64_SSEUP_CLASS
6539 && regclass[2] == X86_64_SSEUP_CLASS
6540 && regclass[3] == X86_64_SSEUP_CLASS);
6541 tmpmode = OImode;
6542 i += 3;
6543 break;
6544 default:
6545 gcc_unreachable ();
6547 exp [nexps++]
6548 = gen_rtx_EXPR_LIST (VOIDmode,
6549 gen_rtx_REG (tmpmode,
6550 SSE_REGNO (sse_regno)),
6551 GEN_INT (pos*8));
6552 sse_regno++;
6553 break;
6554 default:
6555 gcc_unreachable ();
6559 /* Empty aligned struct, union or class. */
6560 if (nexps == 0)
6561 return NULL;
6563 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6564 for (i = 0; i < nexps; i++)
6565 XVECEXP (ret, 0, i) = exp [i];
6566 return ret;
6569 /* Update the data in CUM to advance over an argument of mode MODE
6570 and data type TYPE. (TYPE is null for libcalls where that information
6571 may not be available.) */
6573 static void
6574 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6575 const_tree type, HOST_WIDE_INT bytes,
6576 HOST_WIDE_INT words)
6578 switch (mode)
6580 default:
6581 break;
6583 case BLKmode:
6584 if (bytes < 0)
6585 break;
6586 /* FALLTHRU */
6588 case DImode:
6589 case SImode:
6590 case HImode:
6591 case QImode:
6592 cum->words += words;
6593 cum->nregs -= words;
6594 cum->regno += words;
6596 if (cum->nregs <= 0)
6598 cum->nregs = 0;
6599 cum->regno = 0;
6601 break;
6603 case OImode:
6604 /* OImode shouldn't be used directly. */
6605 gcc_unreachable ();
6607 case DFmode:
6608 if (cum->float_in_sse < 2)
6609 break;
6610 case SFmode:
6611 if (cum->float_in_sse < 1)
6612 break;
6613 /* FALLTHRU */
6615 case V8SFmode:
6616 case V8SImode:
6617 case V32QImode:
6618 case V16HImode:
6619 case V4DFmode:
6620 case V4DImode:
6621 case TImode:
6622 case V16QImode:
6623 case V8HImode:
6624 case V4SImode:
6625 case V2DImode:
6626 case V4SFmode:
6627 case V2DFmode:
6628 if (!type || !AGGREGATE_TYPE_P (type))
6630 cum->sse_words += words;
6631 cum->sse_nregs -= 1;
6632 cum->sse_regno += 1;
6633 if (cum->sse_nregs <= 0)
6635 cum->sse_nregs = 0;
6636 cum->sse_regno = 0;
6639 break;
6641 case V8QImode:
6642 case V4HImode:
6643 case V2SImode:
6644 case V2SFmode:
6645 case V1TImode:
6646 case V1DImode:
6647 if (!type || !AGGREGATE_TYPE_P (type))
6649 cum->mmx_words += words;
6650 cum->mmx_nregs -= 1;
6651 cum->mmx_regno += 1;
6652 if (cum->mmx_nregs <= 0)
6654 cum->mmx_nregs = 0;
6655 cum->mmx_regno = 0;
6658 break;
6662 static void
6663 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6664 const_tree type, HOST_WIDE_INT words, bool named)
6666 int int_nregs, sse_nregs;
6668 /* Unnamed 256bit vector mode parameters are passed on stack. */
6669 if (!named && VALID_AVX256_REG_MODE (mode))
6670 return;
6672 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6673 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6675 cum->nregs -= int_nregs;
6676 cum->sse_nregs -= sse_nregs;
6677 cum->regno += int_nregs;
6678 cum->sse_regno += sse_nregs;
6680 else
6682 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6683 cum->words = (cum->words + align - 1) & ~(align - 1);
6684 cum->words += words;
6688 static void
6689 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6690 HOST_WIDE_INT words)
6692 /* Otherwise, this should be passed indirect. */
6693 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6695 cum->words += words;
6696 if (cum->nregs > 0)
6698 cum->nregs -= 1;
6699 cum->regno += 1;
6703 /* Update the data in CUM to advance over an argument of mode MODE and
6704 data type TYPE. (TYPE is null for libcalls where that information
6705 may not be available.) */
6707 static void
6708 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6709 const_tree type, bool named)
6711 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6712 HOST_WIDE_INT bytes, words;
6714 if (mode == BLKmode)
6715 bytes = int_size_in_bytes (type);
6716 else
6717 bytes = GET_MODE_SIZE (mode);
6718 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6720 if (type)
6721 mode = type_natural_mode (type, NULL);
6723 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6724 function_arg_advance_ms_64 (cum, bytes, words);
6725 else if (TARGET_64BIT)
6726 function_arg_advance_64 (cum, mode, type, words, named);
6727 else
6728 function_arg_advance_32 (cum, mode, type, bytes, words);
6731 /* Define where to put the arguments to a function.
6732 Value is zero to push the argument on the stack,
6733 or a hard register in which to store the argument.
6735 MODE is the argument's machine mode.
6736 TYPE is the data type of the argument (as a tree).
6737 This is null for libcalls where that information may
6738 not be available.
6739 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6740 the preceding args and about the function being called.
6741 NAMED is nonzero if this argument is a named parameter
6742 (otherwise it is an extra parameter matching an ellipsis). */
6744 static rtx
6745 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6746 enum machine_mode orig_mode, const_tree type,
6747 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6749 static bool warnedsse, warnedmmx;
6751 /* Avoid the AL settings for the Unix64 ABI. */
6752 if (mode == VOIDmode)
6753 return constm1_rtx;
6755 switch (mode)
6757 default:
6758 break;
6760 case BLKmode:
6761 if (bytes < 0)
6762 break;
6763 /* FALLTHRU */
6764 case DImode:
6765 case SImode:
6766 case HImode:
6767 case QImode:
6768 if (words <= cum->nregs)
6770 int regno = cum->regno;
6772 /* Fastcall allocates the first two DWORD (SImode) or
6773 smaller arguments to ECX and EDX if it isn't an
6774 aggregate type . */
6775 if (cum->fastcall)
6777 if (mode == BLKmode
6778 || mode == DImode
6779 || (type && AGGREGATE_TYPE_P (type)))
6780 break;
6782 /* ECX not EAX is the first allocated register. */
6783 if (regno == AX_REG)
6784 regno = CX_REG;
6786 return gen_rtx_REG (mode, regno);
6788 break;
6790 case DFmode:
6791 if (cum->float_in_sse < 2)
6792 break;
6793 case SFmode:
6794 if (cum->float_in_sse < 1)
6795 break;
6796 /* FALLTHRU */
6797 case TImode:
6798 /* In 32bit, we pass TImode in xmm registers. */
6799 case V16QImode:
6800 case V8HImode:
6801 case V4SImode:
6802 case V2DImode:
6803 case V4SFmode:
6804 case V2DFmode:
6805 if (!type || !AGGREGATE_TYPE_P (type))
6807 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6809 warnedsse = true;
6810 warning (0, "SSE vector argument without SSE enabled "
6811 "changes the ABI");
6813 if (cum->sse_nregs)
6814 return gen_reg_or_parallel (mode, orig_mode,
6815 cum->sse_regno + FIRST_SSE_REG);
6817 break;
6819 case OImode:
6820 /* OImode shouldn't be used directly. */
6821 gcc_unreachable ();
6823 case V8SFmode:
6824 case V8SImode:
6825 case V32QImode:
6826 case V16HImode:
6827 case V4DFmode:
6828 case V4DImode:
6829 if (!type || !AGGREGATE_TYPE_P (type))
6831 if (cum->sse_nregs)
6832 return gen_reg_or_parallel (mode, orig_mode,
6833 cum->sse_regno + FIRST_SSE_REG);
6835 break;
6837 case V8QImode:
6838 case V4HImode:
6839 case V2SImode:
6840 case V2SFmode:
6841 case V1TImode:
6842 case V1DImode:
6843 if (!type || !AGGREGATE_TYPE_P (type))
6845 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6847 warnedmmx = true;
6848 warning (0, "MMX vector argument without MMX enabled "
6849 "changes the ABI");
6851 if (cum->mmx_nregs)
6852 return gen_reg_or_parallel (mode, orig_mode,
6853 cum->mmx_regno + FIRST_MMX_REG);
6855 break;
6858 return NULL_RTX;
6861 static rtx
6862 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6863 enum machine_mode orig_mode, const_tree type, bool named)
6865 /* Handle a hidden AL argument containing number of registers
6866 for varargs x86-64 functions. */
6867 if (mode == VOIDmode)
6868 return GEN_INT (cum->maybe_vaarg
6869 ? (cum->sse_nregs < 0
6870 ? X86_64_SSE_REGPARM_MAX
6871 : cum->sse_regno)
6872 : -1);
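/* Example (SysV x86-64 varargs convention): for a call such as
   printf ("%f", 1.0) the caller sets %al to the number of vector
   registers used, here 1, so the callee knows how many SSE registers
   may need to be spilled.  Illustrative values only.  */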
6874 switch (mode)
6876 default:
6877 break;
6879 case V8SFmode:
6880 case V8SImode:
6881 case V32QImode:
6882 case V16HImode:
6883 case V4DFmode:
6884 case V4DImode:
6885 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6886 if (!named)
6887 return NULL;
6888 break;
6891 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6892 cum->sse_nregs,
6893 &x86_64_int_parameter_registers [cum->regno],
6894 cum->sse_regno);
6897 static rtx
6898 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6899 enum machine_mode orig_mode, bool named,
6900 HOST_WIDE_INT bytes)
6902 unsigned int regno;
6904 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6905 We use a value of -2 to specify that the current function call is MS ABI. */
6906 if (mode == VOIDmode)
6907 return GEN_INT (-2);
6909 /* If we've run out of registers, it goes on the stack. */
6910 if (cum->nregs == 0)
6911 return NULL_RTX;
6913 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6915 /* Only floating point modes are passed in anything but integer regs. */
6916 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6918 if (named)
6919 regno = cum->regno + FIRST_SSE_REG;
6920 else
6922 rtx t1, t2;
6924 /* Unnamed floating parameters are passed in both the
6925 SSE and integer registers. */
6926 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6927 t2 = gen_rtx_REG (mode, regno);
6928 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6929 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6930 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6933 /* Handle aggregate types passed in registers. */
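/* For example (MS x64 convention): a 4-byte aggregate is loaded as SImode
   and an 8-byte aggregate as DImode before being placed in the parameter
   register; aggregates of other sizes are typically already forced through
   ix86_pass_by_reference.  Example for illustration only.  */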
6934 if (orig_mode == BLKmode)
6936 if (bytes > 0 && bytes <= 8)
6937 mode = (bytes > 4 ? DImode : SImode);
6938 if (mode == BLKmode)
6939 mode = DImode;
6942 return gen_reg_or_parallel (mode, orig_mode, regno);
6945 /* Return where to put the arguments to a function.
6946 Return zero to push the argument on the stack, or a hard register
6947 in which to store the argument.
6948 MODE is the argument's machine mode. TYPE is the data type of the
6949 argument. It is null for libcalls where that information may not be
6950 available. CUM gives information about the preceding args and about
6951 the function being called. NAMED is nonzero if this argument is a
6952 named parameter (otherwise it is an extra parameter matching an
6953 ellipsis). */
6955 static rtx
6956 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6957 const_tree type, bool named)
6959 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6960 enum machine_mode mode = omode;
6961 HOST_WIDE_INT bytes, words;
6962 rtx arg;
6964 if (mode == BLKmode)
6965 bytes = int_size_in_bytes (type);
6966 else
6967 bytes = GET_MODE_SIZE (mode);
6968 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6970 /* To simplify the code below, represent vector types with a vector mode
6971 even if MMX/SSE are not active. */
6972 if (type && TREE_CODE (type) == VECTOR_TYPE)
6973 mode = type_natural_mode (type, cum);
6975 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6976 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6977 else if (TARGET_64BIT)
6978 arg = function_arg_64 (cum, mode, omode, type, named);
6979 else
6980 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6982 return arg;
6985 /* A C expression that indicates when an argument must be passed by
6986 reference. If nonzero for an argument, a copy of that argument is
6987 made in memory and a pointer to the argument is passed instead of
6988 the argument itself. The pointer is passed in whatever way is
6989 appropriate for passing a pointer to that type. */
6991 static bool
6992 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6993 enum machine_mode mode ATTRIBUTE_UNUSED,
6994 const_tree type, bool named ATTRIBUTE_UNUSED)
6996 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6998 /* See Windows x64 Software Convention. */
6999 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7001 int msize = (int) GET_MODE_SIZE (mode);
7002 if (type)
7004 /* Arrays are passed by reference. */
7005 if (TREE_CODE (type) == ARRAY_TYPE)
7006 return true;
7008 if (AGGREGATE_TYPE_P (type))
7010 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7011 are passed by reference. */
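/* E.g. a 3-byte or 16-byte struct is passed by reference, while 1-, 2-,
   4- and 8-byte structs are passed by value; example sizes illustrating
   the rule stated above.  */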
7012 msize = int_size_in_bytes (type);
7016 /* __m128 is passed by reference. */
7017 switch (msize) {
7018 case 1: case 2: case 4: case 8:
7019 break;
7020 default:
7021 return true;
7024 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7025 return 1;
7027 return 0;
7030 /* Return true when TYPE should be 128bit aligned for 32bit argument
7031 passing ABI. XXX: This function is obsolete and is only used for
7032 checking psABI compatibility with previous versions of GCC. */
7034 static bool
7035 ix86_compat_aligned_value_p (const_tree type)
7037 enum machine_mode mode = TYPE_MODE (type);
7038 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7039 || mode == TDmode
7040 || mode == TFmode
7041 || mode == TCmode)
7042 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7043 return true;
7044 if (TYPE_ALIGN (type) < 128)
7045 return false;
7047 if (AGGREGATE_TYPE_P (type))
7049 /* Walk the aggregates recursively. */
7050 switch (TREE_CODE (type))
7052 case RECORD_TYPE:
7053 case UNION_TYPE:
7054 case QUAL_UNION_TYPE:
7056 tree field;
7058 /* Walk all the structure fields. */
7059 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7061 if (TREE_CODE (field) == FIELD_DECL
7062 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7063 return true;
7065 break;
7068 case ARRAY_TYPE:
7069 /* Just in case some languages pass arrays by value. */
7070 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7071 return true;
7072 break;
7074 default:
7075 gcc_unreachable ();
7078 return false;
7081 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7082 XXX: This function is obsolete and is only used for checking psABI
7083 compatibility with previous versions of GCC. */
7085 static unsigned int
7086 ix86_compat_function_arg_boundary (enum machine_mode mode,
7087 const_tree type, unsigned int align)
7089 /* In 32-bit mode, only _Decimal128 and __float128 are aligned to their
7090 natural boundaries. */
7091 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7093 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7094 make an exception for SSE modes since these require 128bit
7095 alignment.
7097 The handling here differs from field_alignment. ICC aligns MMX
7098 arguments to 4 byte boundaries, while structure fields are aligned
7099 to 8 byte boundaries. */
7100 if (!type)
7102 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7103 align = PARM_BOUNDARY;
7105 else
7107 if (!ix86_compat_aligned_value_p (type))
7108 align = PARM_BOUNDARY;
7111 if (align > BIGGEST_ALIGNMENT)
7112 align = BIGGEST_ALIGNMENT;
7113 return align;
7116 /* Return true when TYPE should be 128bit aligned for 32bit argument
7117 passing ABI. */
7119 static bool
7120 ix86_contains_aligned_value_p (const_tree type)
7122 enum machine_mode mode = TYPE_MODE (type);
7124 if (mode == XFmode || mode == XCmode)
7125 return false;
7127 if (TYPE_ALIGN (type) < 128)
7128 return false;
7130 if (AGGREGATE_TYPE_P (type))
7132 /* Walk the aggregates recursively. */
7133 switch (TREE_CODE (type))
7135 case RECORD_TYPE:
7136 case UNION_TYPE:
7137 case QUAL_UNION_TYPE:
7139 tree field;
7141 /* Walk all the structure fields. */
7142 for (field = TYPE_FIELDS (type);
7143 field;
7144 field = DECL_CHAIN (field))
7146 if (TREE_CODE (field) == FIELD_DECL
7147 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7148 return true;
7150 break;
7153 case ARRAY_TYPE:
7154 /* Just in case some languages pass arrays by value. */
7155 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7156 return true;
7157 break;
7159 default:
7160 gcc_unreachable ();
7163 else
7164 return TYPE_ALIGN (type) >= 128;
7166 return false;
7169 /* Gives the alignment boundary, in bits, of an argument with the
7170 specified mode and type. */
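/* For illustration: a plain int argument typically yields PARM_BOUNDARY,
   while a type declared with __attribute__((aligned (16))) can yield 128,
   subject to the 32-bit psABI handling below.  Example only; the exact
   result depends on the checks in the function body.  */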
7172 static unsigned int
7173 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7175 unsigned int align;
7176 if (type)
7178 /* Since the main variant type is used for the call, convert TYPE
7179 to its main variant. */
7180 type = TYPE_MAIN_VARIANT (type);
7181 align = TYPE_ALIGN (type);
7183 else
7184 align = GET_MODE_ALIGNMENT (mode);
7185 if (align < PARM_BOUNDARY)
7186 align = PARM_BOUNDARY;
7187 else
7189 static bool warned;
7190 unsigned int saved_align = align;
7192 if (!TARGET_64BIT)
7194 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7195 if (!type)
7197 if (mode == XFmode || mode == XCmode)
7198 align = PARM_BOUNDARY;
7200 else if (!ix86_contains_aligned_value_p (type))
7201 align = PARM_BOUNDARY;
7203 if (align < 128)
7204 align = PARM_BOUNDARY;
7207 if (warn_psabi
7208 && !warned
7209 && align != ix86_compat_function_arg_boundary (mode, type,
7210 saved_align))
7212 warned = true;
7213 inform (input_location,
7214 "The ABI for passing parameters with %d-byte"
7215 " alignment has changed in GCC 4.6",
7216 align / BITS_PER_UNIT);
7220 return align;
7223 /* Return true if N is a possible register number of function value. */
7225 static bool
7226 ix86_function_value_regno_p (const unsigned int regno)
7228 switch (regno)
7230 case AX_REG:
7231 return true;
7233 case FIRST_FLOAT_REG:
7234 /* TODO: The function should depend on the current function ABI, but
7235 builtins.c would need updating then. Therefore we use the
7236 default ABI. */
7237 if (TARGET_64BIT && ix86_abi == MS_ABI)
7238 return false;
7239 return TARGET_FLOAT_RETURNS_IN_80387;
7241 case FIRST_SSE_REG:
7242 return TARGET_SSE;
7244 case FIRST_MMX_REG:
7245 if (TARGET_MACHO || TARGET_64BIT)
7246 return false;
7247 return TARGET_MMX;
7250 return false;
7253 /* Define how to find the value returned by a function.
7254 VALTYPE is the data type of the value (as a tree).
7255 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7256 otherwise, FUNC is 0. */
7258 static rtx
7259 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7260 const_tree fntype, const_tree fn)
7262 unsigned int regno;
7264 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7265 we normally prevent this case when mmx is not available. However
7266 some ABIs may require the result to be returned like DImode. */
7267 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7268 regno = FIRST_MMX_REG;
7270 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7271 we prevent this case when sse is not available. However some ABIs
7272 may require the result to be returned like integer TImode. */
7273 else if (mode == TImode
7274 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7275 regno = FIRST_SSE_REG;
7277 /* 32-byte vector modes in %ymm0. */
7278 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7279 regno = FIRST_SSE_REG;
7281 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7282 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7283 regno = FIRST_FLOAT_REG;
7284 else
7285 /* Most things go in %eax. */
7286 regno = AX_REG;
7288 /* Override FP return register with %xmm0 for local functions when
7289 SSE math is enabled or for functions with sseregparm attribute. */
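/* Illustrative: a locally-bound function compiled with SSE math enabled,
   or one carrying the sseregparm attribute, may have its float/double
   return value placed in %xmm0 instead of %st(0).  Sketch of the effect
   only; the precise conditions are those computed by
   ix86_function_sseregparm.  */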
7290 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7292 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7293 if ((sse_level >= 1 && mode == SFmode)
7294 || (sse_level == 2 && mode == DFmode))
7295 regno = FIRST_SSE_REG;
7298 /* OImode shouldn't be used directly. */
7299 gcc_assert (mode != OImode);
7301 return gen_rtx_REG (orig_mode, regno);
7304 static rtx
7305 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7306 const_tree valtype)
7308 rtx ret;
7310 /* Handle libcalls, which don't provide a type node. */
7311 if (valtype == NULL)
7313 unsigned int regno;
7315 switch (mode)
7317 case SFmode:
7318 case SCmode:
7319 case DFmode:
7320 case DCmode:
7321 case TFmode:
7322 case SDmode:
7323 case DDmode:
7324 case TDmode:
7325 regno = FIRST_SSE_REG;
7326 break;
7327 case XFmode:
7328 case XCmode:
7329 regno = FIRST_FLOAT_REG;
7330 break;
7331 case TCmode:
7332 return NULL;
7333 default:
7334 regno = AX_REG;
7337 return gen_rtx_REG (mode, regno);
7339 else if (POINTER_TYPE_P (valtype))
7341 /* Pointers are always returned in word_mode. */
7342 mode = word_mode;
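/* E.g. under the x32 ABI (ILP32 on x86-64) a pointer value is 32 bits
   wide but is still returned in a 64-bit register, hence the promotion
   to word_mode here.  Illustrative note.  */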
7345 ret = construct_container (mode, orig_mode, valtype, 1,
7346 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7347 x86_64_int_return_registers, 0);
7349 /* For zero-sized structures, construct_container returns NULL, but we
7350 need to keep the rest of the compiler happy by returning a meaningful value. */
7351 if (!ret)
7352 ret = gen_rtx_REG (orig_mode, AX_REG);
7354 return ret;
7357 static rtx
7358 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7359 const_tree valtype)
7361 unsigned int regno = AX_REG;
7363 if (TARGET_SSE)
7365 switch (GET_MODE_SIZE (mode))
7367 case 16:
7368 if (valtype != NULL_TREE
7369 && !VECTOR_INTEGER_TYPE_P (valtype)
7371 && !INTEGRAL_TYPE_P (valtype)
7372 && !VECTOR_FLOAT_TYPE_P (valtype))
7373 break;
7374 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7375 && !COMPLEX_MODE_P (mode))
7376 regno = FIRST_SSE_REG;
7377 break;
7378 case 8:
7379 case 4:
7380 if (mode == SFmode || mode == DFmode)
7381 regno = FIRST_SSE_REG;
7382 break;
7383 default:
7384 break;
7387 return gen_rtx_REG (orig_mode, regno);
7390 static rtx
7391 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7392 enum machine_mode orig_mode, enum machine_mode mode)
7394 const_tree fn, fntype;
7396 fn = NULL_TREE;
7397 if (fntype_or_decl && DECL_P (fntype_or_decl))
7398 fn = fntype_or_decl;
7399 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7401 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7402 return function_value_ms_64 (orig_mode, mode, valtype);
7403 else if (TARGET_64BIT)
7404 return function_value_64 (orig_mode, mode, valtype);
7405 else
7406 return function_value_32 (orig_mode, mode, fntype, fn);
7409 static rtx
7410 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7411 bool outgoing ATTRIBUTE_UNUSED)
7413 enum machine_mode mode, orig_mode;
7415 orig_mode = TYPE_MODE (valtype);
7416 mode = type_natural_mode (valtype, NULL);
7417 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7420 /* Pointer function arguments and return values are promoted to
7421 word_mode. */
7423 static enum machine_mode
7424 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7425 int *punsignedp, const_tree fntype,
7426 int for_return)
7428 if (type != NULL_TREE && POINTER_TYPE_P (type))
7430 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7431 return word_mode;
7433 return default_promote_function_mode (type, mode, punsignedp, fntype,
7434 for_return);
7437 /* Return true if a structure, union or array with MODE containing FIELD
7438 should be accessed using BLKmode. */
7440 static bool
7441 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7443 /* Union with XFmode must be in BLKmode. */
7444 return (mode == XFmode
7445 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7446 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7450 ix86_libcall_value (enum machine_mode mode)
7452 return ix86_function_value_1 (NULL, NULL, mode, mode);
7455 /* Return true iff type is returned in memory. */
7457 static bool ATTRIBUTE_UNUSED
7458 return_in_memory_32 (const_tree type, enum machine_mode mode)
7460 HOST_WIDE_INT size;
7462 if (mode == BLKmode)
7463 return true;
7465 size = int_size_in_bytes (type);
7467 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7468 return false;
7470 if (VECTOR_MODE_P (mode) || mode == TImode)
7472 /* User-created vectors small enough to fit in EAX. */
7473 if (size < 8)
7474 return false;
7476 /* MMX/3dNow values are returned in MM0,
7477 except when it doesn't exist or the ABI prescribes otherwise. */
7478 if (size == 8)
7479 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7481 /* SSE values are returned in XMM0, except when it doesn't exist. */
7482 if (size == 16)
7483 return !TARGET_SSE;
7485 /* AVX values are returned in YMM0, except when it doesn't exist. */
7486 if (size == 32)
7487 return !TARGET_AVX;
7490 if (mode == XFmode)
7491 return false;
7493 if (size > 12)
7494 return true;
7496 /* OImode shouldn't be used directly. */
7497 gcc_assert (mode != OImode);
7499 return false;
7502 static bool ATTRIBUTE_UNUSED
7503 return_in_memory_64 (const_tree type, enum machine_mode mode)
7505 int needed_intregs, needed_sseregs;
7506 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7509 static bool ATTRIBUTE_UNUSED
7510 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7512 HOST_WIDE_INT size = int_size_in_bytes (type);
7514 /* __m128 is returned in xmm0. */
7515 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7516 || VECTOR_FLOAT_TYPE_P (type))
7517 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7518 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7519 return false;
7521 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7522 return size != 1 && size != 2 && size != 4 && size != 8;
7525 static bool
7526 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7528 #ifdef SUBTARGET_RETURN_IN_MEMORY
7529 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7530 #else
7531 const enum machine_mode mode = type_natural_mode (type, NULL);
7533 if (TARGET_64BIT)
7535 if (ix86_function_type_abi (fntype) == MS_ABI)
7536 return return_in_memory_ms_64 (type, mode);
7537 else
7538 return return_in_memory_64 (type, mode);
7540 else
7541 return return_in_memory_32 (type, mode);
7542 #endif
7545 /* When returning SSE vector types, we have a choice of either
7546 (1) being ABI incompatible with a -march switch, or
7547 (2) generating an error.
7548 Given no good solution, I think the safest thing is one warning.
7549 The user won't be able to use -Werror, but....
7551 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7552 called in response to actually generating a caller or callee that
7553 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7554 via aggregate_value_p for general type probing from tree-ssa. */
7556 static rtx
7557 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7559 static bool warnedsse, warnedmmx;
7561 if (!TARGET_64BIT && type)
7563 /* Look at the return type of the function, not the function type. */
7564 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7566 if (!TARGET_SSE && !warnedsse)
7568 if (mode == TImode
7569 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7571 warnedsse = true;
7572 warning (0, "SSE vector return without SSE enabled "
7573 "changes the ABI");
7577 if (!TARGET_MMX && !warnedmmx)
7579 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7581 warnedmmx = true;
7582 warning (0, "MMX vector return without MMX enabled "
7583 "changes the ABI");
7588 return NULL;
7592 /* Create the va_list data type. */
7594 /* Returns the calling-convention-specific va_list data type.
7595 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
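/* The 64-bit SysV record built below corresponds to the familiar va_list
   layout, roughly:
     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
   shown here only as an illustration of the fields created below.  */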
7597 static tree
7598 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7600 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7602 /* For i386 we use plain pointer to argument area. */
7603 if (!TARGET_64BIT || abi == MS_ABI)
7604 return build_pointer_type (char_type_node);
7606 record = lang_hooks.types.make_type (RECORD_TYPE);
7607 type_decl = build_decl (BUILTINS_LOCATION,
7608 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7610 f_gpr = build_decl (BUILTINS_LOCATION,
7611 FIELD_DECL, get_identifier ("gp_offset"),
7612 unsigned_type_node);
7613 f_fpr = build_decl (BUILTINS_LOCATION,
7614 FIELD_DECL, get_identifier ("fp_offset"),
7615 unsigned_type_node);
7616 f_ovf = build_decl (BUILTINS_LOCATION,
7617 FIELD_DECL, get_identifier ("overflow_arg_area"),
7618 ptr_type_node);
7619 f_sav = build_decl (BUILTINS_LOCATION,
7620 FIELD_DECL, get_identifier ("reg_save_area"),
7621 ptr_type_node);
7623 va_list_gpr_counter_field = f_gpr;
7624 va_list_fpr_counter_field = f_fpr;
7626 DECL_FIELD_CONTEXT (f_gpr) = record;
7627 DECL_FIELD_CONTEXT (f_fpr) = record;
7628 DECL_FIELD_CONTEXT (f_ovf) = record;
7629 DECL_FIELD_CONTEXT (f_sav) = record;
7631 TYPE_STUB_DECL (record) = type_decl;
7632 TYPE_NAME (record) = type_decl;
7633 TYPE_FIELDS (record) = f_gpr;
7634 DECL_CHAIN (f_gpr) = f_fpr;
7635 DECL_CHAIN (f_fpr) = f_ovf;
7636 DECL_CHAIN (f_ovf) = f_sav;
7638 layout_type (record);
7640 /* The correct type is an array type of one element. */
7641 return build_array_type (record, build_index_type (size_zero_node));
7644 /* Set up the builtin va_list data type and, for 64-bit, the additional
7645 calling-convention-specific va_list data types. */
7647 static tree
7648 ix86_build_builtin_va_list (void)
7650 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7652 /* Initialize ABI-specific va_list builtin types. */
7653 if (TARGET_64BIT)
7655 tree t;
7656 if (ix86_abi == MS_ABI)
7658 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7659 if (TREE_CODE (t) != RECORD_TYPE)
7660 t = build_variant_type_copy (t);
7661 sysv_va_list_type_node = t;
7663 else
7665 t = ret;
7666 if (TREE_CODE (t) != RECORD_TYPE)
7667 t = build_variant_type_copy (t);
7668 sysv_va_list_type_node = t;
7670 if (ix86_abi != MS_ABI)
7672 t = ix86_build_builtin_va_list_abi (MS_ABI);
7673 if (TREE_CODE (t) != RECORD_TYPE)
7674 t = build_variant_type_copy (t);
7675 ms_va_list_type_node = t;
7677 else
7679 t = ret;
7680 if (TREE_CODE (t) != RECORD_TYPE)
7681 t = build_variant_type_copy (t);
7682 ms_va_list_type_node = t;
7686 return ret;
7689 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7691 static void
7692 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7694 rtx save_area, mem;
7695 alias_set_type set;
7696 int i, max;
7698 /* GPR size of varargs save area. */
7699 if (cfun->va_list_gpr_size)
7700 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7701 else
7702 ix86_varargs_gpr_size = 0;
7704 /* FPR size of varargs save area. We don't need it if we don't pass
7705 anything in SSE registers. */
7706 if (TARGET_SSE && cfun->va_list_fpr_size)
7707 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7708 else
7709 ix86_varargs_fpr_size = 0;
7711 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7712 return;
7714 save_area = frame_pointer_rtx;
7715 set = get_varargs_alias_set ();
7717 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7718 if (max > X86_64_REGPARM_MAX)
7719 max = X86_64_REGPARM_MAX;
7721 for (i = cum->regno; i < max; i++)
7723 mem = gen_rtx_MEM (word_mode,
7724 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7725 MEM_NOTRAP_P (mem) = 1;
7726 set_mem_alias_set (mem, set);
7727 emit_move_insn (mem,
7728 gen_rtx_REG (word_mode,
7729 x86_64_int_parameter_registers[i]));
7732 if (ix86_varargs_fpr_size)
7734 enum machine_mode smode;
7735 rtx label, test;
7737 /* Now emit code to save SSE registers. The AX parameter contains number
7738 of SSE parameter registers used to call this function, though all we
7739 actually check here is the zero/non-zero status. */
7741 label = gen_label_rtx ();
7742 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7743 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7744 label));
7746 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7747 we used movdqa (i.e. TImode) instead? Perhaps even better would
7748 be if we could determine the real mode of the data, via a hook
7749 into pass_stdarg. Ignore all that for now. */
7750 smode = V4SFmode;
7751 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7752 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7754 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7755 if (max > X86_64_SSE_REGPARM_MAX)
7756 max = X86_64_SSE_REGPARM_MAX;
7758 for (i = cum->sse_regno; i < max; ++i)
7760 mem = plus_constant (Pmode, save_area,
7761 i * 16 + ix86_varargs_gpr_size);
7762 mem = gen_rtx_MEM (smode, mem);
7763 MEM_NOTRAP_P (mem) = 1;
7764 set_mem_alias_set (mem, set);
7765 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7767 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7770 emit_label (label);
7774 static void
7775 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7777 alias_set_type set = get_varargs_alias_set ();
7778 int i;
7780 /* Reset to zero, as a SysV va_arg might have been used
7781 before. */
7782 ix86_varargs_gpr_size = 0;
7783 ix86_varargs_fpr_size = 0;
7785 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7787 rtx reg, mem;
7789 mem = gen_rtx_MEM (Pmode,
7790 plus_constant (Pmode, virtual_incoming_args_rtx,
7791 i * UNITS_PER_WORD));
7792 MEM_NOTRAP_P (mem) = 1;
7793 set_mem_alias_set (mem, set);
7795 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7796 emit_move_insn (mem, reg);
7800 static void
7801 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7802 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7803 int no_rtl)
7805 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7806 CUMULATIVE_ARGS next_cum;
7807 tree fntype;
7809 /* This argument doesn't appear to be used anymore. Which is good,
7810 because the old code here didn't suppress rtl generation. */
7811 gcc_assert (!no_rtl);
7813 if (!TARGET_64BIT)
7814 return;
7816 fntype = TREE_TYPE (current_function_decl);
7818 /* For varargs, we do not want to skip the dummy va_dcl argument.
7819 For stdargs, we do want to skip the last named argument. */
7820 next_cum = *cum;
7821 if (stdarg_p (fntype))
7822 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7823 true);
7825 if (cum->call_abi == MS_ABI)
7826 setup_incoming_varargs_ms_64 (&next_cum);
7827 else
7828 setup_incoming_varargs_64 (&next_cum);
7831 /* Checks if TYPE is of kind va_list char *. */
7833 static bool
7834 is_va_list_char_pointer (tree type)
7836 tree canonic;
7838 /* For 32-bit it is always true. */
7839 if (!TARGET_64BIT)
7840 return true;
7841 canonic = ix86_canonical_va_list_type (type);
7842 return (canonic == ms_va_list_type_node
7843 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7846 /* Implement va_start. */
7848 static void
7849 ix86_va_start (tree valist, rtx nextarg)
7851 HOST_WIDE_INT words, n_gpr, n_fpr;
7852 tree f_gpr, f_fpr, f_ovf, f_sav;
7853 tree gpr, fpr, ovf, sav, t;
7854 tree type;
7855 rtx ovf_rtx;
7857 if (flag_split_stack
7858 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7860 unsigned int scratch_regno;
7862 /* When we are splitting the stack, we can't refer to the stack
7863 arguments using internal_arg_pointer, because they may be on
7864 the old stack. The split stack prologue will arrange to
7865 leave a pointer to the old stack arguments in a scratch
7866 register, which we here copy to a pseudo-register. The split
7867 stack prologue can't set the pseudo-register directly because
7868 it (the prologue) runs before any registers have been saved. */
7870 scratch_regno = split_stack_prologue_scratch_regno ();
7871 if (scratch_regno != INVALID_REGNUM)
7873 rtx reg, seq;
7875 reg = gen_reg_rtx (Pmode);
7876 cfun->machine->split_stack_varargs_pointer = reg;
7878 start_sequence ();
7879 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7880 seq = get_insns ();
7881 end_sequence ();
7883 push_topmost_sequence ();
7884 emit_insn_after (seq, entry_of_function ());
7885 pop_topmost_sequence ();
7889 /* Only the 64-bit target needs something special. */
7890 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7892 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7893 std_expand_builtin_va_start (valist, nextarg);
7894 else
7896 rtx va_r, next;
7898 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7899 next = expand_binop (ptr_mode, add_optab,
7900 cfun->machine->split_stack_varargs_pointer,
7901 crtl->args.arg_offset_rtx,
7902 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7903 convert_move (va_r, next, 0);
7905 return;
7908 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7909 f_fpr = DECL_CHAIN (f_gpr);
7910 f_ovf = DECL_CHAIN (f_fpr);
7911 f_sav = DECL_CHAIN (f_ovf);
7913 valist = build_simple_mem_ref (valist);
7914 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7915 /* The following should be folded into the MEM_REF offset. */
7916 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7917 f_gpr, NULL_TREE);
7918 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7919 f_fpr, NULL_TREE);
7920 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7921 f_ovf, NULL_TREE);
7922 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7923 f_sav, NULL_TREE);
7925 /* Count number of gp and fp argument registers used. */
7926 words = crtl->args.info.words;
7927 n_gpr = crtl->args.info.regno;
7928 n_fpr = crtl->args.info.sse_regno;
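/* Illustration of the offsets set below: gp_offset counts 8-byte GP slots
   (so at most X86_64_REGPARM_MAX * 8 == 48), while fp_offset starts past
   the GP area at 8 * X86_64_REGPARM_MAX and advances in 16-byte SSE slots.
   E.g. after two named integer and one named SSE argument, gp_offset is 16
   and fp_offset is 64.  Example values only.  */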
7930 if (cfun->va_list_gpr_size)
7932 type = TREE_TYPE (gpr);
7933 t = build2 (MODIFY_EXPR, type,
7934 gpr, build_int_cst (type, n_gpr * 8));
7935 TREE_SIDE_EFFECTS (t) = 1;
7936 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7939 if (TARGET_SSE && cfun->va_list_fpr_size)
7941 type = TREE_TYPE (fpr);
7942 t = build2 (MODIFY_EXPR, type, fpr,
7943 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7944 TREE_SIDE_EFFECTS (t) = 1;
7945 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7948 /* Find the overflow area. */
7949 type = TREE_TYPE (ovf);
7950 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7951 ovf_rtx = crtl->args.internal_arg_pointer;
7952 else
7953 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7954 t = make_tree (type, ovf_rtx);
7955 if (words != 0)
7956 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7957 t = build2 (MODIFY_EXPR, type, ovf, t);
7958 TREE_SIDE_EFFECTS (t) = 1;
7959 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7961 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7963 /* Find the register save area.
7964 The function prologue saves it right above the stack frame. */
7965 type = TREE_TYPE (sav);
7966 t = make_tree (type, frame_pointer_rtx);
7967 if (!ix86_varargs_gpr_size)
7968 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7969 t = build2 (MODIFY_EXPR, type, sav, t);
7970 TREE_SIDE_EFFECTS (t) = 1;
7971 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7975 /* Implement va_arg. */
7977 static tree
7978 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7979 gimple_seq *post_p)
7981 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7982 tree f_gpr, f_fpr, f_ovf, f_sav;
7983 tree gpr, fpr, ovf, sav, t;
7984 int size, rsize;
7985 tree lab_false, lab_over = NULL_TREE;
7986 tree addr, t2;
7987 rtx container;
7988 int indirect_p = 0;
7989 tree ptrtype;
7990 enum machine_mode nat_mode;
7991 unsigned int arg_boundary;
7993 /* Only the 64-bit target needs something special. */
7994 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7995 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7997 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7998 f_fpr = DECL_CHAIN (f_gpr);
7999 f_ovf = DECL_CHAIN (f_fpr);
8000 f_sav = DECL_CHAIN (f_ovf);
8002 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8003 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8004 valist = build_va_arg_indirect_ref (valist);
8005 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8006 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8007 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8009 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8010 if (indirect_p)
8011 type = build_pointer_type (type);
8012 size = int_size_in_bytes (type);
8013 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8015 nat_mode = type_natural_mode (type, NULL);
8016 switch (nat_mode)
8018 case V8SFmode:
8019 case V8SImode:
8020 case V32QImode:
8021 case V16HImode:
8022 case V4DFmode:
8023 case V4DImode:
8024 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
8025 if (!TARGET_64BIT_MS_ABI)
8027 container = NULL;
8028 break;
8031 default:
8032 container = construct_container (nat_mode, TYPE_MODE (type),
8033 type, 0, X86_64_REGPARM_MAX,
8034 X86_64_SSE_REGPARM_MAX, intreg,
8036 break;
8039 /* Pull the value out of the saved registers. */
8041 addr = create_tmp_var (ptr_type_node, "addr");
8043 if (container)
8045 int needed_intregs, needed_sseregs;
8046 bool need_temp;
8047 tree int_addr, sse_addr;
8049 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8050 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8052 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8054 need_temp = (!REG_P (container)
8055 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8056 || TYPE_ALIGN (type) > 128));
8058 /* In case we are passing a structure, verify that it is a consecutive block
8059 in the register save area. If not, we need to do moves. */
8060 if (!need_temp && !REG_P (container))
8062 /* Verify that all registers are strictly consecutive. */
8063 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8065 int i;
8067 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8069 rtx slot = XVECEXP (container, 0, i);
8070 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8071 || INTVAL (XEXP (slot, 1)) != i * 16)
8072 need_temp = 1;
8075 else
8077 int i;
8079 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8081 rtx slot = XVECEXP (container, 0, i);
8082 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8083 || INTVAL (XEXP (slot, 1)) != i * 8)
8084 need_temp = 1;
8088 if (!need_temp)
8090 int_addr = addr;
8091 sse_addr = addr;
8093 else
8095 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8096 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8099 /* First ensure that we fit completely in registers. */
8100 if (needed_intregs)
8102 t = build_int_cst (TREE_TYPE (gpr),
8103 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8104 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8105 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8106 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8107 gimplify_and_add (t, pre_p);
8109 if (needed_sseregs)
8111 t = build_int_cst (TREE_TYPE (fpr),
8112 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8113 + X86_64_REGPARM_MAX * 8);
8114 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8115 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8116 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8117 gimplify_and_add (t, pre_p);
8120 /* Compute index to start of area used for integer regs. */
8121 if (needed_intregs)
8123 /* int_addr = gpr + sav; */
8124 t = fold_build_pointer_plus (sav, gpr);
8125 gimplify_assign (int_addr, t, pre_p);
8127 if (needed_sseregs)
8129 /* sse_addr = fpr + sav; */
8130 t = fold_build_pointer_plus (sav, fpr);
8131 gimplify_assign (sse_addr, t, pre_p);
8133 if (need_temp)
8135 int i, prev_size = 0;
8136 tree temp = create_tmp_var (type, "va_arg_tmp");
8138 /* addr = &temp; */
8139 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8140 gimplify_assign (addr, t, pre_p);
8142 for (i = 0; i < XVECLEN (container, 0); i++)
8144 rtx slot = XVECEXP (container, 0, i);
8145 rtx reg = XEXP (slot, 0);
8146 enum machine_mode mode = GET_MODE (reg);
8147 tree piece_type;
8148 tree addr_type;
8149 tree daddr_type;
8150 tree src_addr, src;
8151 int src_offset;
8152 tree dest_addr, dest;
8153 int cur_size = GET_MODE_SIZE (mode);
8155 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8156 prev_size = INTVAL (XEXP (slot, 1));
8157 if (prev_size + cur_size > size)
8159 cur_size = size - prev_size;
8160 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8161 if (mode == BLKmode)
8162 mode = QImode;
8164 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8165 if (mode == GET_MODE (reg))
8166 addr_type = build_pointer_type (piece_type);
8167 else
8168 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8169 true);
8170 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8171 true);
8173 if (SSE_REGNO_P (REGNO (reg)))
8175 src_addr = sse_addr;
8176 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8178 else
8180 src_addr = int_addr;
8181 src_offset = REGNO (reg) * 8;
8183 src_addr = fold_convert (addr_type, src_addr);
8184 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8186 dest_addr = fold_convert (daddr_type, addr);
8187 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8188 if (cur_size == GET_MODE_SIZE (mode))
8190 src = build_va_arg_indirect_ref (src_addr);
8191 dest = build_va_arg_indirect_ref (dest_addr);
8193 gimplify_assign (dest, src, pre_p);
8195 else
8197 tree copy
8198 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8199 3, dest_addr, src_addr,
8200 size_int (cur_size));
8201 gimplify_and_add (copy, pre_p);
8203 prev_size += cur_size;
8207 if (needed_intregs)
8209 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8210 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8211 gimplify_assign (gpr, t, pre_p);
8214 if (needed_sseregs)
8216 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8217 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8218 gimplify_assign (fpr, t, pre_p);
8221 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8223 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8226 /* ... otherwise out of the overflow area. */
8228 /* When we align a parameter on the stack for the caller, if the
8229 parameter alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will
8230 be aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8231 with the caller. */
8232 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8233 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8234 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8236 /* Care for on-stack alignment if needed. */
8237 if (arg_boundary <= 64 || size == 0)
8238 t = ovf;
8239 else
8241 HOST_WIDE_INT align = arg_boundary / 8;
8242 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8243 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8244 build_int_cst (TREE_TYPE (t), -align));
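/* E.g. for a 32-byte-aligned type, align == 32, so the overflow pointer
   is rounded up as (ovf + 31) & -32 by the two statements above.
   Worked example only.  */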
8247 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8248 gimplify_assign (addr, t, pre_p);
8250 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8251 gimplify_assign (unshare_expr (ovf), t, pre_p);
8253 if (container)
8254 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8256 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8257 addr = fold_convert (ptrtype, addr);
8259 if (indirect_p)
8260 addr = build_va_arg_indirect_ref (addr);
8261 return build_va_arg_indirect_ref (addr);
8264 /* Return true if OPNUM's MEM should be matched
8265 in movabs* patterns. */
8267 bool
8268 ix86_check_movabs (rtx insn, int opnum)
8270 rtx set, mem;
8272 set = PATTERN (insn);
8273 if (GET_CODE (set) == PARALLEL)
8274 set = XVECEXP (set, 0, 0);
8275 gcc_assert (GET_CODE (set) == SET);
8276 mem = XEXP (set, opnum);
8277 while (GET_CODE (mem) == SUBREG)
8278 mem = SUBREG_REG (mem);
8279 gcc_assert (MEM_P (mem));
8280 return volatile_ok || !MEM_VOLATILE_P (mem);
8283 /* Initialize the table of extra 80387 mathematical constants. */
8285 static void
8286 init_ext_80387_constants (void)
8288 static const char * cst[5] =
8290 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8291 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8292 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8293 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8294 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8296 int i;
8298 for (i = 0; i < 5; i++)
8300 real_from_string (&ext_80387_constants_table[i], cst[i]);
8301 /* Ensure each constant is rounded to XFmode precision. */
8302 real_convert (&ext_80387_constants_table[i],
8303 XFmode, &ext_80387_constants_table[i]);
8306 ext_80387_constants_init = 1;
8309 /* Return non-zero if the constant is something that
8310 can be loaded with a special instruction. */
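/* The return value maps to a load instruction roughly as follows
   (see standard_80387_constant_opcode below):
     1 -> fldz    2 -> fld1    3 -> fldlg2   4 -> fldln2
     5 -> fldl2e  6 -> fldl2t  7 -> fldpi    8/9 -> fldz/fld1 plus fchs.  */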
8313 standard_80387_constant_p (rtx x)
8315 enum machine_mode mode = GET_MODE (x);
8317 REAL_VALUE_TYPE r;
8319 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8320 return -1;
8322 if (x == CONST0_RTX (mode))
8323 return 1;
8324 if (x == CONST1_RTX (mode))
8325 return 2;
8327 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8329 /* For XFmode constants, try to find a special 80387 instruction when
8330 optimizing for size or on those CPUs that benefit from them. */
8331 if (mode == XFmode
8332 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8334 int i;
8336 if (! ext_80387_constants_init)
8337 init_ext_80387_constants ();
8339 for (i = 0; i < 5; i++)
8340 if (real_identical (&r, &ext_80387_constants_table[i]))
8341 return i + 3;
8344 /* A load of the constant -0.0 or -1.0 will be split into an
8345 fldz;fchs or fld1;fchs sequence. */
8346 if (real_isnegzero (&r))
8347 return 8;
8348 if (real_identical (&r, &dconstm1))
8349 return 9;
8351 return 0;
8354 /* Return the opcode of the special instruction to be used to load
8355 the constant X. */
8357 const char *
8358 standard_80387_constant_opcode (rtx x)
8360 switch (standard_80387_constant_p (x))
8362 case 1:
8363 return "fldz";
8364 case 2:
8365 return "fld1";
8366 case 3:
8367 return "fldlg2";
8368 case 4:
8369 return "fldln2";
8370 case 5:
8371 return "fldl2e";
8372 case 6:
8373 return "fldl2t";
8374 case 7:
8375 return "fldpi";
8376 case 8:
8377 case 9:
8378 return "#";
8379 default:
8380 gcc_unreachable ();
8384 /* Return the CONST_DOUBLE representing the 80387 constant that is
8385 loaded by the specified special instruction. The argument IDX
8386 matches the return value from standard_80387_constant_p. */
8389 standard_80387_constant_rtx (int idx)
8391 int i;
8393 if (! ext_80387_constants_init)
8394 init_ext_80387_constants ();
8396 switch (idx)
8398 case 3:
8399 case 4:
8400 case 5:
8401 case 6:
8402 case 7:
8403 i = idx - 3;
8404 break;
8406 default:
8407 gcc_unreachable ();
8410 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8411 XFmode);
8414 /* Return 1 if X is all 0s and 2 if X is all 1s
8415 in a supported SSE/AVX vector mode. */
8418 standard_sse_constant_p (rtx x)
8420 enum machine_mode mode = GET_MODE (x);
8422 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8423 return 1;
8424 if (vector_all_ones_operand (x, mode))
8425 switch (mode)
8427 case V16QImode:
8428 case V8HImode:
8429 case V4SImode:
8430 case V2DImode:
8431 if (TARGET_SSE2)
8432 return 2;
8433 case V32QImode:
8434 case V16HImode:
8435 case V8SImode:
8436 case V4DImode:
8437 if (TARGET_AVX2)
8438 return 2;
8439 default:
8440 break;
8443 return 0;
8446 /* Return the opcode of the special instruction to be used to load
8447 the constant X. */
8449 const char *
8450 standard_sse_constant_opcode (rtx insn, rtx x)
8452 switch (standard_sse_constant_p (x))
8454 case 1:
8455 switch (get_attr_mode (insn))
8457 case MODE_TI:
8458 return "%vpxor\t%0, %d0";
8459 case MODE_V2DF:
8460 return "%vxorpd\t%0, %d0";
8461 case MODE_V4SF:
8462 return "%vxorps\t%0, %d0";
8464 case MODE_OI:
8465 return "vpxor\t%x0, %x0, %x0";
8466 case MODE_V4DF:
8467 return "vxorpd\t%x0, %x0, %x0";
8468 case MODE_V8SF:
8469 return "vxorps\t%x0, %x0, %x0";
8471 default:
8472 break;
8475 case 2:
8476 if (TARGET_AVX)
8477 return "vpcmpeqd\t%0, %0, %0";
8478 else
8479 return "pcmpeqd\t%0, %0";
8481 default:
8482 break;
8484 gcc_unreachable ();
8487 /* Returns true if OP contains a symbol reference. */
8489 bool
8490 symbolic_reference_mentioned_p (rtx op)
8492 const char *fmt;
8493 int i;
8495 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8496 return true;
8498 fmt = GET_RTX_FORMAT (GET_CODE (op));
8499 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8501 if (fmt[i] == 'E')
8503 int j;
8505 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8506 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8507 return true;
8510 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8511 return true;
8514 return false;
8517 /* Return true if it is appropriate to emit `ret' instructions in the
8518 body of a function. Do this only if the epilogue is simple, needing a
8519 couple of insns. Prior to reloading, we can't tell how many registers
8520 must be saved, so return false then. Return false if there is no frame
8521 marker to de-allocate. */
8523 bool
8524 ix86_can_use_return_insn_p (void)
8526 struct ix86_frame frame;
8528 if (! reload_completed || frame_pointer_needed)
8529 return 0;
8531 /* Don't allow more than 32k pop, since that's all we can do
8532 with one instruction. */
8533 if (crtl->args.pops_args && crtl->args.size >= 32768)
8534 return 0;
8536 ix86_compute_frame_layout (&frame);
8537 return (frame.stack_pointer_offset == UNITS_PER_WORD
8538 && (frame.nregs + frame.nsseregs) == 0);
8541 /* Value should be nonzero if functions must have frame pointers.
8542 Zero means the frame pointer need not be set up (and parms may
8543 be accessed via the stack pointer) in functions that seem suitable. */
8545 static bool
8546 ix86_frame_pointer_required (void)
8548 /* If we accessed previous frames, then the generated code expects
8549 to be able to access the saved ebp value in our frame. */
8550 if (cfun->machine->accesses_prev_frame)
8551 return true;
8553 /* Several x86 OSes need a frame pointer for other reasons,
8554 usually pertaining to setjmp. */
8555 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8556 return true;
8558 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8559 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8560 return true;
8562 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8563 stack allocation is 4GB. */
8564 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8565 return true;
8567 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8568 turns off the frame pointer by default. Turn it back on now if
8569 we've not got a leaf function. */
8570 if (TARGET_OMIT_LEAF_FRAME_POINTER
8571 && (!crtl->is_leaf
8572 || ix86_current_function_calls_tls_descriptor))
8573 return true;
8575 if (crtl->profile && !flag_fentry)
8576 return true;
8578 return false;
8581 /* Record that the current function accesses previous call frames. */
8583 void
8584 ix86_setup_frame_addresses (void)
8586 cfun->machine->accesses_prev_frame = 1;
8589 #ifndef USE_HIDDEN_LINKONCE
8590 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8591 # define USE_HIDDEN_LINKONCE 1
8592 # else
8593 # define USE_HIDDEN_LINKONCE 0
8594 # endif
8595 #endif
8597 static int pic_labels_used;
8599 /* Fills in the label name that should be used for a pc thunk for
8600 the given register. */
8602 static void
8603 get_pc_thunk_name (char name[32], unsigned int regno)
8605 gcc_assert (!TARGET_64BIT);
8607 if (USE_HIDDEN_LINKONCE)
8608 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8609 else
8610 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8614 /* This function generates code for -fpic that loads %ebx with
8615 the return address of the caller and then returns. */
8617 static void
8618 ix86_code_end (void)
8620 rtx xops[2];
8621 int regno;
8623 for (regno = AX_REG; regno <= SP_REG; regno++)
8625 char name[32];
8626 tree decl;
8628 if (!(pic_labels_used & (1 << regno)))
8629 continue;
8631 get_pc_thunk_name (name, regno);
8633 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8634 get_identifier (name),
8635 build_function_type_list (void_type_node, NULL_TREE));
8636 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8637 NULL_TREE, void_type_node);
8638 TREE_PUBLIC (decl) = 1;
8639 TREE_STATIC (decl) = 1;
8640 DECL_IGNORED_P (decl) = 1;
8642 #if TARGET_MACHO
8643 if (TARGET_MACHO)
8645 switch_to_section (darwin_sections[text_coal_section]);
8646 fputs ("\t.weak_definition\t", asm_out_file);
8647 assemble_name (asm_out_file, name);
8648 fputs ("\n\t.private_extern\t", asm_out_file);
8649 assemble_name (asm_out_file, name);
8650 putc ('\n', asm_out_file);
8651 ASM_OUTPUT_LABEL (asm_out_file, name);
8652 DECL_WEAK (decl) = 1;
8654 else
8655 #endif
8656 if (USE_HIDDEN_LINKONCE)
8658 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8660 targetm.asm_out.unique_section (decl, 0);
8661 switch_to_section (get_named_section (decl, NULL, 0));
8663 targetm.asm_out.globalize_label (asm_out_file, name);
8664 fputs ("\t.hidden\t", asm_out_file);
8665 assemble_name (asm_out_file, name);
8666 putc ('\n', asm_out_file);
8667 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8669 else
8671 switch_to_section (text_section);
8672 ASM_OUTPUT_LABEL (asm_out_file, name);
8675 DECL_INITIAL (decl) = make_node (BLOCK);
8676 current_function_decl = decl;
8677 init_function_start (decl);
8678 first_function_block_is_cold = false;
8679 /* Make sure unwind info is emitted for the thunk if needed. */
8680 final_start_function (emit_barrier (), asm_out_file, 1);
8682 /* Pad stack IP move with 4 instructions (two NOPs count
8683 as one instruction). */
8684 if (TARGET_PAD_SHORT_FUNCTION)
8686 int i = 8;
8688 while (i--)
8689 fputs ("\tnop\n", asm_out_file);
8692 xops[0] = gen_rtx_REG (Pmode, regno);
8693 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8694 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8695 fputs ("\tret\n", asm_out_file);
8696 final_end_function ();
8697 init_insn_lengths ();
8698 free_after_compilation (cfun);
8699 set_cfun (NULL);
8700 current_function_decl = NULL;
8703 if (flag_split_stack)
8704 file_end_indicate_split_stack ();
8707 /* Emit code for the SET_GOT patterns. */
8709 const char *
8710 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8712 rtx xops[3];
8714 xops[0] = dest;
8716 if (TARGET_VXWORKS_RTP && flag_pic)
8718 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8719 xops[2] = gen_rtx_MEM (Pmode,
8720 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8721 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8723 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8724 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8725 an unadorned address. */
8726 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8727 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8728 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8729 return "";
8732 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8734 if (!flag_pic)
8736 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8738 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8740 #if TARGET_MACHO
8741 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8742 is what will be referenced by the Mach-O PIC subsystem. */
8743 if (!label)
8744 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8745 #endif
8747 targetm.asm_out.internal_label (asm_out_file, "L",
8748 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8750 else
8752 char name[32];
8753 get_pc_thunk_name (name, REGNO (dest));
8754 pic_labels_used |= 1 << REGNO (dest);
8756 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8757 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8758 output_asm_insn ("call\t%X2", xops);
8759 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8760 is what will be referenced by the Mach-O PIC subsystem. */
8761 #if TARGET_MACHO
8762 if (!label)
8763 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8764 else
8765 targetm.asm_out.internal_label (asm_out_file, "L",
8766 CODE_LABEL_NUMBER (label));
8767 #endif
8770 if (!TARGET_MACHO)
8771 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8773 return "";
8776 /* Generate a "push" pattern for input ARG. */
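/* Sketch of the RTL produced (assuming a 64-bit target where word_mode
   and Pmode are DImode):
     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))
   i.e. a push that also bumps the tracked CFA/SP offsets below.  */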
8778 static rtx
8779 gen_push (rtx arg)
8781 struct machine_function *m = cfun->machine;
8783 if (m->fs.cfa_reg == stack_pointer_rtx)
8784 m->fs.cfa_offset += UNITS_PER_WORD;
8785 m->fs.sp_offset += UNITS_PER_WORD;
8787 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8788 arg = gen_rtx_REG (word_mode, REGNO (arg));
8790 return gen_rtx_SET (VOIDmode,
8791 gen_rtx_MEM (word_mode,
8792 gen_rtx_PRE_DEC (Pmode,
8793 stack_pointer_rtx)),
8794 arg);
8797 /* Generate a "pop" pattern for input ARG. */
8799 static rtx
8800 gen_pop (rtx arg)
8802 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8803 arg = gen_rtx_REG (word_mode, REGNO (arg));
8805 return gen_rtx_SET (VOIDmode,
8806 arg,
8807 gen_rtx_MEM (word_mode,
8808 gen_rtx_POST_INC (Pmode,
8809 stack_pointer_rtx)));
8812 /* Return >= 0 if there is an unused call-clobbered register available
8813 for the entire function. */
8815 static unsigned int
8816 ix86_select_alt_pic_regnum (void)
8818 if (crtl->is_leaf
8819 && !crtl->profile
8820 && !ix86_current_function_calls_tls_descriptor)
8822 int i, drap;
8823 /* Can't use the same register for both PIC and DRAP. */
8824 if (crtl->drap_reg)
8825 drap = REGNO (crtl->drap_reg);
8826 else
8827 drap = -1;
8828 for (i = 2; i >= 0; --i)
8829 if (i != drap && !df_regs_ever_live_p (i))
8830 return i;
8833 return INVALID_REGNUM;
8836 /* Return TRUE if we need to save REGNO. */
8838 static bool
8839 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8841 if (pic_offset_table_rtx
8842 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8843 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8844 || crtl->profile
8845 || crtl->calls_eh_return
8846 || crtl->uses_const_pool))
8847 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8849 if (crtl->calls_eh_return && maybe_eh_return)
8851 unsigned i;
8852 for (i = 0; ; i++)
8854 unsigned test = EH_RETURN_DATA_REGNO (i);
8855 if (test == INVALID_REGNUM)
8856 break;
8857 if (test == regno)
8858 return true;
8862 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8863 return true;
8865 return (df_regs_ever_live_p (regno)
8866 && !call_used_regs[regno]
8867 && !fixed_regs[regno]
8868 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8871 /* Return the number of saved general purpose registers. */
8873 static int
8874 ix86_nsaved_regs (void)
8876 int nregs = 0;
8877 int regno;
8879 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8880 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8881 nregs ++;
8882 return nregs;
8885 /* Return the number of saved SSE registers. */
8887 static int
8888 ix86_nsaved_sseregs (void)
8890 int nregs = 0;
8891 int regno;
8893 if (!TARGET_64BIT_MS_ABI)
8894 return 0;
8895 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8896 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8897 nregs ++;
8898 return nregs;
8901 /* Given FROM and TO register numbers, say whether this elimination is
8902 allowed. If stack alignment is needed, we can only replace argument
8903 pointer with hard frame pointer, or replace frame pointer with stack
8904 pointer. Otherwise, frame pointer elimination is automatically
8905 handled and all other eliminations are valid. */
8907 static bool
8908 ix86_can_eliminate (const int from, const int to)
8910 if (stack_realign_fp)
8911 return ((from == ARG_POINTER_REGNUM
8912 && to == HARD_FRAME_POINTER_REGNUM)
8913 || (from == FRAME_POINTER_REGNUM
8914 && to == STACK_POINTER_REGNUM));
8915 else
8916 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8919 /* Return the offset between two registers, one to be eliminated, and the other
8920 its replacement, at the start of a routine. */
8922 HOST_WIDE_INT
8923 ix86_initial_elimination_offset (int from, int to)
8925 struct ix86_frame frame;
8926 ix86_compute_frame_layout (&frame);
8928 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8929 return frame.hard_frame_pointer_offset;
8930 else if (from == FRAME_POINTER_REGNUM
8931 && to == HARD_FRAME_POINTER_REGNUM)
8932 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8933 else
8935 gcc_assert (to == STACK_POINTER_REGNUM);
8937 if (from == ARG_POINTER_REGNUM)
8938 return frame.stack_pointer_offset;
8940 gcc_assert (from == FRAME_POINTER_REGNUM);
8941 return frame.stack_pointer_offset - frame.frame_pointer_offset;
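/* Illustrative sketch only -- not part of the original sources.  The four
   offsets returned above are mutually consistent: both sides of the
   assertion below reduce to frame.frame_pointer_offset.  */

static inline void
sketch_check_elimination_offsets (void)
{
  HOST_WIDE_INT argp_to_sp
    = ix86_initial_elimination_offset (ARG_POINTER_REGNUM,
				       STACK_POINTER_REGNUM);
  HOST_WIDE_INT fp_to_sp
    = ix86_initial_elimination_offset (FRAME_POINTER_REGNUM,
				       STACK_POINTER_REGNUM);
  HOST_WIDE_INT argp_to_hfp
    = ix86_initial_elimination_offset (ARG_POINTER_REGNUM,
				       HARD_FRAME_POINTER_REGNUM);
  HOST_WIDE_INT fp_to_hfp
    = ix86_initial_elimination_offset (FRAME_POINTER_REGNUM,
				       HARD_FRAME_POINTER_REGNUM);

  /* Each difference equals frame.frame_pointer_offset.  */
  gcc_checking_assert (argp_to_sp - fp_to_sp == argp_to_hfp - fp_to_hfp);
}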
8945 /* In a dynamically-aligned function, we can't know the offset from
8946 stack pointer to frame pointer, so we must ensure that setjmp
8947 eliminates fp against the hard fp (%ebp) rather than trying to
8948 index from %esp up to the top of the frame across a gap that is
8949 of unknown (at compile-time) size. */
8950 static rtx
8951 ix86_builtin_setjmp_frame_value (void)
8953 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8956 /* When using -fsplit-stack, the allocation routines set a field in
8957 the TCB to the bottom of the stack plus this much space, measured
8958 in bytes. */
8960 #define SPLIT_STACK_AVAILABLE 256
8962 /* Fill structure ix86_frame describing the frame of the currently compiled function. */
8964 static void
8965 ix86_compute_frame_layout (struct ix86_frame *frame)
8967 unsigned HOST_WIDE_INT stack_alignment_needed;
8968 HOST_WIDE_INT offset;
8969 unsigned HOST_WIDE_INT preferred_alignment;
8970 HOST_WIDE_INT size = get_frame_size ();
8971 HOST_WIDE_INT to_allocate;
8973 frame->nregs = ix86_nsaved_regs ();
8974 frame->nsseregs = ix86_nsaved_sseregs ();
8976 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8977 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8979 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8980 in function prologues and leaf functions. */
8981 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8982 && (!crtl->is_leaf || cfun->calls_alloca != 0
8983 || ix86_current_function_calls_tls_descriptor))
8985 preferred_alignment = 16;
8986 stack_alignment_needed = 16;
8987 crtl->preferred_stack_boundary = 128;
8988 crtl->stack_alignment_needed = 128;
8991 gcc_assert (!size || stack_alignment_needed);
8992 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8993 gcc_assert (preferred_alignment <= stack_alignment_needed);
8995 /* For SEH we have to limit the amount of code movement into the prologue.
8996 At present we do this via a BLOCKAGE, at which point there's very little
8997 scheduling that can be done, which means that there's very little point
8998 in doing anything except PUSHs. */
8999 if (TARGET_SEH)
9000 cfun->machine->use_fast_prologue_epilogue = false;
9002 /* During reload iterations the number of registers saved can change.
9003 Recompute the value as needed. Do not recompute when the number of registers
9004 didn't change, as reload makes multiple calls to this function and does not
9005 expect the decision to change within a single iteration. */
9006 else if (!optimize_function_for_size_p (cfun)
9007 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9009 int count = frame->nregs;
9010 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9012 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9014 /* The fast prologue uses move instead of push to save registers. This
9015 is significantly longer, but also executes faster as modern hardware
9016 can execute the moves in parallel, but can't do that for push/pop.
9018 Be careful about choosing what prologue to emit: When function takes
9019 many instructions to execute we may use slow version as well as in
9020 case function is known to be outside hot spot (this is known with
9021 feedback only). Weight the size of function by number of registers
9022 to save as it is cheap to use one or two push instructions but very
9023 slow to use many of them. */
9024 if (count)
9025 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9026 if (node->frequency < NODE_FREQUENCY_NORMAL
9027 || (flag_branch_probabilities
9028 && node->frequency < NODE_FREQUENCY_HOT))
9029 cfun->machine->use_fast_prologue_epilogue = false;
9030 else
9031 cfun->machine->use_fast_prologue_epilogue
9032 = !expensive_function_p (count);
9035 frame->save_regs_using_mov
9036 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9037 /* If static stack checking is enabled and done with probes,
9038 the registers need to be saved before allocating the frame. */
9039 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9041 /* Skip return address. */
9042 offset = UNITS_PER_WORD;
9044 /* Skip pushed static chain. */
9045 if (ix86_static_chain_on_stack)
9046 offset += UNITS_PER_WORD;
9048 /* Skip saved base pointer. */
9049 if (frame_pointer_needed)
9050 offset += UNITS_PER_WORD;
9051 frame->hfp_save_offset = offset;
9053 /* The traditional frame pointer location is at the top of the frame. */
9054 frame->hard_frame_pointer_offset = offset;
9056 /* Register save area */
9057 offset += frame->nregs * UNITS_PER_WORD;
9058 frame->reg_save_offset = offset;
9060 /* On SEH target, registers are pushed just before the frame pointer
9061 location. */
9062 if (TARGET_SEH)
9063 frame->hard_frame_pointer_offset = offset;
9065 /* Align and set SSE register save area. */
9066 if (frame->nsseregs)
9068 /* The only ABI that has saved SSE registers (Win64) also has a
9069 16-byte aligned default stack, and thus we don't need to be
9070 within the re-aligned local stack frame to save them. */
9071 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9072 offset = (offset + 16 - 1) & -16;
9073 offset += frame->nsseregs * 16;
9075 frame->sse_reg_save_offset = offset;
9077 /* The re-aligned stack starts here. Values before this point are not
9078 directly comparable with values below this point. In order to make
9079 sure that no value happens to be the same before and after, force
9080 the alignment computation below to add a non-zero value. */
9081 if (stack_realign_fp)
9082 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9084 /* Va-arg area */
9085 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9086 offset += frame->va_arg_size;
9088 /* Align start of frame for local function. */
9089 if (stack_realign_fp
9090 || offset != frame->sse_reg_save_offset
9091 || size != 0
9092 || !crtl->is_leaf
9093 || cfun->calls_alloca
9094 || ix86_current_function_calls_tls_descriptor)
9095 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9097 /* Frame pointer points here. */
9098 frame->frame_pointer_offset = offset;
9100 offset += size;
9102 /* Add the outgoing arguments area. This can be skipped if we eliminated
9103 all the function calls as dead code.
9104 Skipping is however impossible when the function calls alloca, since the
9105 alloca expander assumes that the last crtl->outgoing_args_size bytes
9106 of the stack frame are unused. */
9107 if (ACCUMULATE_OUTGOING_ARGS
9108 && (!crtl->is_leaf || cfun->calls_alloca
9109 || ix86_current_function_calls_tls_descriptor))
9111 offset += crtl->outgoing_args_size;
9112 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9114 else
9115 frame->outgoing_arguments_size = 0;
9117 /* Align stack boundary. Only needed if we're calling another function
9118 or using alloca. */
9119 if (!crtl->is_leaf || cfun->calls_alloca
9120 || ix86_current_function_calls_tls_descriptor)
9121 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9123 /* We've reached end of stack frame. */
9124 frame->stack_pointer_offset = offset;
9126 /* Size prologue needs to allocate. */
9127 to_allocate = offset - frame->sse_reg_save_offset;
9129 if ((!to_allocate && frame->nregs <= 1)
9130 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9131 frame->save_regs_using_mov = false;
9133 if (ix86_using_red_zone ()
9134 && crtl->sp_is_unchanging
9135 && crtl->is_leaf
9136 && !ix86_current_function_calls_tls_descriptor)
9138 frame->red_zone_size = to_allocate;
9139 if (frame->save_regs_using_mov)
9140 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9141 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9142 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9144 else
9145 frame->red_zone_size = 0;
9146 frame->stack_pointer_offset -= frame->red_zone_size;
9148 /* The SEH frame pointer location is near the bottom of the frame.
9149 This is enforced by the fact that the difference between the
9150 stack pointer and the frame pointer is limited to 240 bytes in
9151 the unwind data structure. */
9152 if (TARGET_SEH)
9154 HOST_WIDE_INT diff;
9156 /* If we can leave the frame pointer where it is, do so. Also, this returns
9157 the establisher frame for __builtin_frame_address (0). */
9158 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9159 if (diff <= SEH_MAX_FRAME_SIZE
9160 && (diff > 240 || (diff & 15) != 0)
9161 && !crtl->accesses_prior_frames)
9163 /* Ideally we'd determine what portion of the local stack frame
9164 (within the constraint of the lowest 240) is most heavily used.
9165 But without that complication, simply bias the frame pointer
9166 by 128 bytes so as to maximize the amount of the local stack
9167 frame that is addressable with 8-bit offsets. */
9168 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
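/* Illustrative sketch only -- not part of the original sources.  The
   rounding idiom used repeatedly in ix86_compute_frame_layout,
   (X + ALIGN - 1) & -ALIGN, rounds X up to the next multiple of ALIGN,
   assuming ALIGN is a power of two; e.g. 44 rounds up to 48 for ALIGN 16.  */

static inline HOST_WIDE_INT
sketch_round_up_to_alignment (HOST_WIDE_INT x, HOST_WIDE_INT align)
{
  /* ALIGN must be a power of two for the mask trick to work.  */
  return (x + align - 1) & -align;
}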
9173 /* This is semi-inlined memory_address_length, but simplified
9174 since we know that we're always dealing with reg+offset, and
9175 to avoid having to create and discard all that rtl. */
9177 static inline int
9178 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9180 int len = 4;
9182 if (offset == 0)
9184 /* EBP and R13 cannot be encoded without an offset. */
9185 len = (regno == BP_REG || regno == R13_REG);
9187 else if (IN_RANGE (offset, -128, 127))
9188 len = 1;
9190 /* ESP and R12 must be encoded with a SIB byte. */
9191 if (regno == SP_REG || regno == R12_REG)
9192 len++;
9194 return len;
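/* Illustrative sketch only -- not part of the original sources.  A few
   example encoding lengths produced by the rules above.  */

static inline void
sketch_baseaddr_len_examples (void)
{
  gcc_checking_assert (choose_baseaddr_len (SP_REG, 0) == 1);	 /* SIB only */
  gcc_checking_assert (choose_baseaddr_len (BP_REG, 0) == 1);	 /* disp8 of 0 */
  gcc_checking_assert (choose_baseaddr_len (SP_REG, 8) == 2);	 /* SIB + disp8 */
  gcc_checking_assert (choose_baseaddr_len (BP_REG, 200) == 4); /* disp32 */
}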
9197 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9198 The valid base registers are taken from CFUN->MACHINE->FS. */
9200 static rtx
9201 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9203 const struct machine_function *m = cfun->machine;
9204 rtx base_reg = NULL;
9205 HOST_WIDE_INT base_offset = 0;
9207 if (m->use_fast_prologue_epilogue)
9209 /* Choose the base register most likely to allow the most scheduling
9210 opportunities. Generally FP is valid throughout the function,
9211 while DRAP must be reloaded within the epilogue. But choose either
9212 over the SP due to increased encoding size. */
9214 if (m->fs.fp_valid)
9216 base_reg = hard_frame_pointer_rtx;
9217 base_offset = m->fs.fp_offset - cfa_offset;
9219 else if (m->fs.drap_valid)
9221 base_reg = crtl->drap_reg;
9222 base_offset = 0 - cfa_offset;
9224 else if (m->fs.sp_valid)
9226 base_reg = stack_pointer_rtx;
9227 base_offset = m->fs.sp_offset - cfa_offset;
9230 else
9232 HOST_WIDE_INT toffset;
9233 int len = 16, tlen;
9235 /* Choose the base register with the smallest address encoding.
9236 With a tie, choose FP > DRAP > SP. */
9237 if (m->fs.sp_valid)
9239 base_reg = stack_pointer_rtx;
9240 base_offset = m->fs.sp_offset - cfa_offset;
9241 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9243 if (m->fs.drap_valid)
9245 toffset = 0 - cfa_offset;
9246 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9247 if (tlen <= len)
9249 base_reg = crtl->drap_reg;
9250 base_offset = toffset;
9251 len = tlen;
9254 if (m->fs.fp_valid)
9256 toffset = m->fs.fp_offset - cfa_offset;
9257 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9258 if (tlen <= len)
9260 base_reg = hard_frame_pointer_rtx;
9261 base_offset = toffset;
9262 len = tlen;
9266 gcc_assert (base_reg != NULL);
9268 return plus_constant (Pmode, base_reg, base_offset);
9271 /* Emit code to save registers in the prologue. */
9273 static void
9274 ix86_emit_save_regs (void)
9276 unsigned int regno;
9277 rtx insn;
9279 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9280 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9282 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9283 RTX_FRAME_RELATED_P (insn) = 1;
9287 /* Emit a single register save at CFA - CFA_OFFSET. */
9289 static void
9290 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9291 HOST_WIDE_INT cfa_offset)
9293 struct machine_function *m = cfun->machine;
9294 rtx reg = gen_rtx_REG (mode, regno);
9295 rtx mem, addr, base, insn;
9297 addr = choose_baseaddr (cfa_offset);
9298 mem = gen_frame_mem (mode, addr);
9300 /* For SSE saves, we need to indicate the 128-bit alignment. */
9301 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9303 insn = emit_move_insn (mem, reg);
9304 RTX_FRAME_RELATED_P (insn) = 1;
9306 base = addr;
9307 if (GET_CODE (base) == PLUS)
9308 base = XEXP (base, 0);
9309 gcc_checking_assert (REG_P (base));
9311 /* When saving registers into a re-aligned local stack frame, avoid
9312 any tricky guessing by dwarf2out. */
9313 if (m->fs.realigned)
9315 gcc_checking_assert (stack_realign_drap);
9317 if (regno == REGNO (crtl->drap_reg))
9319 /* A bit of a hack. We force the DRAP register to be saved in
9320 the re-aligned stack frame, which provides us with a copy
9321 of the CFA that will last past the prologue. Install it. */
9322 gcc_checking_assert (cfun->machine->fs.fp_valid);
9323 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9324 cfun->machine->fs.fp_offset - cfa_offset);
9325 mem = gen_rtx_MEM (mode, addr);
9326 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9328 else
9330 /* The frame pointer is a stable reference within the
9331 aligned frame. Use it. */
9332 gcc_checking_assert (cfun->machine->fs.fp_valid);
9333 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9334 cfun->machine->fs.fp_offset - cfa_offset);
9335 mem = gen_rtx_MEM (mode, addr);
9336 add_reg_note (insn, REG_CFA_EXPRESSION,
9337 gen_rtx_SET (VOIDmode, mem, reg));
9341 /* The memory may not be relative to the current CFA register,
9342 which means that we may need to generate a new pattern for
9343 use by the unwind info. */
9344 else if (base != m->fs.cfa_reg)
9346 addr = plus_constant (Pmode, m->fs.cfa_reg,
9347 m->fs.cfa_offset - cfa_offset);
9348 mem = gen_rtx_MEM (mode, addr);
9349 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9353 /* Emit code to save registers using MOV insns.
9354 First register is stored at CFA - CFA_OFFSET. */
9355 static void
9356 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9358 unsigned int regno;
9360 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9361 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9363 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9364 cfa_offset -= UNITS_PER_WORD;
9368 /* Emit code to save SSE registers using MOV insns.
9369 First register is stored at CFA - CFA_OFFSET. */
9370 static void
9371 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9373 unsigned int regno;
9375 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9376 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9378 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9379 cfa_offset -= 16;
9383 static GTY(()) rtx queued_cfa_restores;
9385 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9386 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9387 Don't add the note if the previously saved value will be left untouched
9388 within the stack red zone till return, as unwinders can find the same value
9389 in the register and on the stack. */
9391 static void
9392 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9394 if (!crtl->shrink_wrapped
9395 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9396 return;
9398 if (insn)
9400 add_reg_note (insn, REG_CFA_RESTORE, reg);
9401 RTX_FRAME_RELATED_P (insn) = 1;
9403 else
9404 queued_cfa_restores
9405 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9408 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9410 static void
9411 ix86_add_queued_cfa_restore_notes (rtx insn)
9413 rtx last;
9414 if (!queued_cfa_restores)
9415 return;
9416 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9418 XEXP (last, 1) = REG_NOTES (insn);
9419 REG_NOTES (insn) = queued_cfa_restores;
9420 queued_cfa_restores = NULL_RTX;
9421 RTX_FRAME_RELATED_P (insn) = 1;
9424 /* Expand prologue or epilogue stack adjustment.
9425 The pattern exists to put a dependency on all ebp-based memory accesses.
9426 STYLE should be negative if instructions should be marked as frame related,
9427 zero if the %r11 register is live and cannot be freely used, and positive
9428 otherwise. */
9430 static void
9431 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9432 int style, bool set_cfa)
9434 struct machine_function *m = cfun->machine;
9435 rtx insn;
9436 bool add_frame_related_expr = false;
9438 if (Pmode == SImode)
9439 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9440 else if (x86_64_immediate_operand (offset, DImode))
9441 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9442 else
9444 rtx tmp;
9445 /* r11 is used by indirect sibcall return as well, set before the
9446 epilogue and used after the epilogue. */
9447 if (style)
9448 tmp = gen_rtx_REG (DImode, R11_REG);
9449 else
9451 gcc_assert (src != hard_frame_pointer_rtx
9452 && dest != hard_frame_pointer_rtx);
9453 tmp = hard_frame_pointer_rtx;
9455 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9456 if (style < 0)
9457 add_frame_related_expr = true;
9459 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9462 insn = emit_insn (insn);
9463 if (style >= 0)
9464 ix86_add_queued_cfa_restore_notes (insn);
9466 if (set_cfa)
9468 rtx r;
9470 gcc_assert (m->fs.cfa_reg == src);
9471 m->fs.cfa_offset += INTVAL (offset);
9472 m->fs.cfa_reg = dest;
9474 r = gen_rtx_PLUS (Pmode, src, offset);
9475 r = gen_rtx_SET (VOIDmode, dest, r);
9476 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9477 RTX_FRAME_RELATED_P (insn) = 1;
9479 else if (style < 0)
9481 RTX_FRAME_RELATED_P (insn) = 1;
9482 if (add_frame_related_expr)
9484 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9485 r = gen_rtx_SET (VOIDmode, dest, r);
9486 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9490 if (dest == stack_pointer_rtx)
9492 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9493 bool valid = m->fs.sp_valid;
9495 if (src == hard_frame_pointer_rtx)
9497 valid = m->fs.fp_valid;
9498 ooffset = m->fs.fp_offset;
9500 else if (src == crtl->drap_reg)
9502 valid = m->fs.drap_valid;
9503 ooffset = 0;
9505 else
9507 /* Else there are two possibilities: SP itself, which we set
9508 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9509 taken care of by hand along the eh_return path. */
9510 gcc_checking_assert (src == stack_pointer_rtx
9511 || offset == const0_rtx);
9514 m->fs.sp_offset = ooffset - INTVAL (offset);
9515 m->fs.sp_valid = valid;
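/* Typical use, as in ix86_expand_prologue below: allocate the local frame
   by subtracting ALLOCATE bytes from the stack pointer, marking the insn
   frame related and updating the CFA only while the stack pointer is still
   the CFA register:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-allocate), -1,
				m->fs.cfa_reg == stack_pointer_rtx);  */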
9519 /* Find an available register to be used as the dynamic realign argument
9520 pointer register. Such a register will be written in the prologue and
9521 used at the beginning of the body, so it must not be
9522 1. parameter passing register.
9523 2. GOT pointer.
9524 We reuse static-chain register if it is available. Otherwise, we
9525 use DI for i386 and R13 for x86-64. We chose R13 since it has
9526 shorter encoding.
9528 Return: the regno of chosen register. */
9530 static unsigned int
9531 find_drap_reg (void)
9533 tree decl = cfun->decl;
9535 if (TARGET_64BIT)
9537 /* Use R13 for a nested function or a function that needs a static chain.
9538 Since a function with a tail call may use any caller-saved
9539 registers in the epilogue, DRAP must not use a caller-saved
9540 register in that case. */
9541 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9542 return R13_REG;
9544 return R10_REG;
9546 else
9548 /* Use DI for a nested function or a function that needs a static chain.
9549 Since a function with a tail call may use any caller-saved
9550 registers in the epilogue, DRAP must not use a caller-saved
9551 register in that case. */
9552 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9553 return DI_REG;
9555 /* Reuse static chain register if it isn't used for parameter
9556 passing. */
9557 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9559 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9560 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9561 return CX_REG;
9563 return DI_REG;
9567 /* Return minimum incoming stack alignment. */
9569 static unsigned int
9570 ix86_minimum_incoming_stack_boundary (bool sibcall)
9572 unsigned int incoming_stack_boundary;
9574 /* Prefer the one specified at command line. */
9575 if (ix86_user_incoming_stack_boundary)
9576 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9577 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9578 if -mstackrealign is used, this isn't a sibcall check, and the
9579 estimated stack alignment is 128 bits. */
9580 else if (!sibcall
9581 && !TARGET_64BIT
9582 && ix86_force_align_arg_pointer
9583 && crtl->stack_alignment_estimated == 128)
9584 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9585 else
9586 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9588 /* Incoming stack alignment can be changed on individual functions
9589 via force_align_arg_pointer attribute. We use the smallest
9590 incoming stack boundary. */
9591 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9592 && lookup_attribute (ix86_force_align_arg_pointer_string,
9593 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9594 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9596 /* The incoming stack frame has to be aligned at least at
9597 parm_stack_boundary. */
9598 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9599 incoming_stack_boundary = crtl->parm_stack_boundary;
9601 /* Stack at entrance of main is aligned by runtime. We use the
9602 smallest incoming stack boundary. */
9603 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9604 && DECL_NAME (current_function_decl)
9605 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9606 && DECL_FILE_SCOPE_P (current_function_decl))
9607 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9609 return incoming_stack_boundary;
9612 /* Update incoming stack boundary and estimated stack alignment. */
9614 static void
9615 ix86_update_stack_boundary (void)
9617 ix86_incoming_stack_boundary
9618 = ix86_minimum_incoming_stack_boundary (false);
9620 /* x86_64 varargs need 16-byte stack alignment for the register save
9621 area. */
9622 if (TARGET_64BIT
9623 && cfun->stdarg
9624 && crtl->stack_alignment_estimated < 128)
9625 crtl->stack_alignment_estimated = 128;
9628 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9629 needed or an rtx for DRAP otherwise. */
9631 static rtx
9632 ix86_get_drap_rtx (void)
9634 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9635 crtl->need_drap = true;
9637 if (stack_realign_drap)
9639 /* Assign DRAP to vDRAP and return vDRAP. */
9640 unsigned int regno = find_drap_reg ();
9641 rtx drap_vreg;
9642 rtx arg_ptr;
9643 rtx seq, insn;
9645 arg_ptr = gen_rtx_REG (Pmode, regno);
9646 crtl->drap_reg = arg_ptr;
9648 start_sequence ();
9649 drap_vreg = copy_to_reg (arg_ptr);
9650 seq = get_insns ();
9651 end_sequence ();
9653 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9654 if (!optimize)
9656 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9657 RTX_FRAME_RELATED_P (insn) = 1;
9659 return drap_vreg;
9661 else
9662 return NULL;
9665 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9667 static rtx
9668 ix86_internal_arg_pointer (void)
9670 return virtual_incoming_args_rtx;
9673 struct scratch_reg {
9674 rtx reg;
9675 bool saved;
9678 /* Return a short-lived scratch register for use on function entry.
9679 In 32-bit mode, it is valid only after the registers are saved
9680 in the prologue. This register must be released by means of
9681 release_scratch_register_on_entry once it is dead. */
9683 static void
9684 get_scratch_register_on_entry (struct scratch_reg *sr)
9686 int regno;
9688 sr->saved = false;
9690 if (TARGET_64BIT)
9692 /* We always use R11 in 64-bit mode. */
9693 regno = R11_REG;
9695 else
9697 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9698 bool fastcall_p
9699 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9700 bool thiscall_p
9701 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9702 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9703 int regparm = ix86_function_regparm (fntype, decl);
9704 int drap_regno
9705 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9707 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9708 for the static chain register. */
9709 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9710 && drap_regno != AX_REG)
9711 regno = AX_REG;
9712 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9713 for the static chain register. */
9714 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9715 regno = AX_REG;
9716 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9717 regno = DX_REG;
9718 /* ecx is the static chain register. */
9719 else if (regparm < 3 && !fastcall_p && !thiscall_p
9720 && !static_chain_p
9721 && drap_regno != CX_REG)
9722 regno = CX_REG;
9723 else if (ix86_save_reg (BX_REG, true))
9724 regno = BX_REG;
9725 /* esi is the static chain register. */
9726 else if (!(regparm == 3 && static_chain_p)
9727 && ix86_save_reg (SI_REG, true))
9728 regno = SI_REG;
9729 else if (ix86_save_reg (DI_REG, true))
9730 regno = DI_REG;
9731 else
9733 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9734 sr->saved = true;
9738 sr->reg = gen_rtx_REG (Pmode, regno);
9739 if (sr->saved)
9741 rtx insn = emit_insn (gen_push (sr->reg));
9742 RTX_FRAME_RELATED_P (insn) = 1;
9746 /* Release a scratch register obtained from the preceding function. */
9748 static void
9749 release_scratch_register_on_entry (struct scratch_reg *sr)
9751 if (sr->saved)
9753 struct machine_function *m = cfun->machine;
9754 rtx x, insn = emit_insn (gen_pop (sr->reg));
9756 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9757 RTX_FRAME_RELATED_P (insn) = 1;
9758 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9759 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9760 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9761 m->fs.sp_offset -= UNITS_PER_WORD;
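/* Typical pairing, as in the stack probing routines below: obtain a scratch
   register on entry, use it as a temporary, then release it so that any
   push emitted to preserve it is undone:

     struct scratch_reg sr;
     get_scratch_register_on_entry (&sr);
     ...emit code using sr.reg as a temporary...
     release_scratch_register_on_entry (&sr);  */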
9765 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9767 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9769 static void
9770 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9772 /* We skip the probe for the first interval + a small dope of 4 words and
9773 probe that many bytes past the specified size to maintain a protection
9774 area at the bottom of the stack. */
9775 const int dope = 4 * UNITS_PER_WORD;
9776 rtx size_rtx = GEN_INT (size), last;
9778 /* See if we have a constant small number of probes to generate. If so,
9779 that's the easy case. The run-time loop is made up of 11 insns in the
9780 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9781 for n # of intervals. */
9782 if (size <= 5 * PROBE_INTERVAL)
9784 HOST_WIDE_INT i, adjust;
9785 bool first_probe = true;
9787 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9788 values of N from 1 until it exceeds SIZE. If only one probe is
9789 needed, this will not generate any code. Then adjust and probe
9790 to PROBE_INTERVAL + SIZE. */
9791 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9793 if (first_probe)
9795 adjust = 2 * PROBE_INTERVAL + dope;
9796 first_probe = false;
9798 else
9799 adjust = PROBE_INTERVAL;
9801 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9802 plus_constant (Pmode, stack_pointer_rtx,
9803 -adjust)));
9804 emit_stack_probe (stack_pointer_rtx);
9807 if (first_probe)
9808 adjust = size + PROBE_INTERVAL + dope;
9809 else
9810 adjust = size + PROBE_INTERVAL - i;
9812 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9813 plus_constant (Pmode, stack_pointer_rtx,
9814 -adjust)));
9815 emit_stack_probe (stack_pointer_rtx);
9817 /* Adjust back to account for the additional first interval. */
9818 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9819 plus_constant (Pmode, stack_pointer_rtx,
9820 PROBE_INTERVAL + dope)));
9823 /* Otherwise, do the same as above, but in a loop. Note that we must be
9824 extra careful with variables wrapping around because we might be at
9825 the very top (or the very bottom) of the address space and we have
9826 to be able to handle this case properly; in particular, we use an
9827 equality test for the loop condition. */
9828 else
9830 HOST_WIDE_INT rounded_size;
9831 struct scratch_reg sr;
9833 get_scratch_register_on_entry (&sr);
9836 /* Step 1: round SIZE to the previous multiple of the interval. */
9838 rounded_size = size & -PROBE_INTERVAL;
9841 /* Step 2: compute initial and final value of the loop counter. */
9843 /* SP = SP_0 + PROBE_INTERVAL. */
9844 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9845 plus_constant (Pmode, stack_pointer_rtx,
9846 - (PROBE_INTERVAL + dope))));
9848 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9849 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9850 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9851 gen_rtx_PLUS (Pmode, sr.reg,
9852 stack_pointer_rtx)));
9855 /* Step 3: the loop
9857 while (SP != LAST_ADDR)
9859 SP = SP + PROBE_INTERVAL
9860 probe at SP
9863 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9864 values of N from 1 until it is equal to ROUNDED_SIZE. */
9866 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9869 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9870 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9872 if (size != rounded_size)
9874 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9875 plus_constant (Pmode, stack_pointer_rtx,
9876 rounded_size - size)));
9877 emit_stack_probe (stack_pointer_rtx);
9880 /* Adjust back to account for the additional first interval. */
9881 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9882 plus_constant (Pmode, stack_pointer_rtx,
9883 PROBE_INTERVAL + dope)));
9885 release_scratch_register_on_entry (&sr);
9888 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9890 /* Even if the stack pointer isn't the CFA register, we need to correctly
9891 describe the adjustments made to it, in particular differentiate the
9892 frame-related ones from the frame-unrelated ones. */
9893 if (size > 0)
9895 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9896 XVECEXP (expr, 0, 0)
9897 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9898 plus_constant (Pmode, stack_pointer_rtx, -size));
9899 XVECEXP (expr, 0, 1)
9900 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9901 plus_constant (Pmode, stack_pointer_rtx,
9902 PROBE_INTERVAL + dope + size));
9903 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9904 RTX_FRAME_RELATED_P (last) = 1;
9906 cfun->machine->fs.sp_offset += size;
9909 /* Make sure nothing is scheduled before we are done. */
9910 emit_insn (gen_blockage ());
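/* Illustrative sketch only -- not part of the original sources.  In the
   unrolled case above (size <= 5 * PROBE_INTERVAL), one probe is emitted
   per full interval below SIZE plus a final probe at PROBE_INTERVAL + SIZE;
   this helper counts those probes.  */

static inline int
sketch_count_unrolled_probes (HOST_WIDE_INT size)
{
  int nprobes = 1;	/* The final probe at PROBE_INTERVAL + SIZE.  */
  HOST_WIDE_INT i;

  for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
    nprobes++;		/* One probe per full interval below SIZE.  */

  return nprobes;
}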
9913 /* Adjust the stack pointer up to REG while probing it. */
9915 const char *
9916 output_adjust_stack_and_probe (rtx reg)
9918 static int labelno = 0;
9919 char loop_lab[32], end_lab[32];
9920 rtx xops[2];
9922 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9923 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9925 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9927 /* Jump to END_LAB if SP == LAST_ADDR. */
9928 xops[0] = stack_pointer_rtx;
9929 xops[1] = reg;
9930 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9931 fputs ("\tje\t", asm_out_file);
9932 assemble_name_raw (asm_out_file, end_lab);
9933 fputc ('\n', asm_out_file);
9935 /* SP = SP + PROBE_INTERVAL. */
9936 xops[1] = GEN_INT (PROBE_INTERVAL);
9937 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9939 /* Probe at SP. */
9940 xops[1] = const0_rtx;
9941 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9943 fprintf (asm_out_file, "\tjmp\t");
9944 assemble_name_raw (asm_out_file, loop_lab);
9945 fputc ('\n', asm_out_file);
9947 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9949 return "";
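/* For illustration only (assumed 32-bit output with %eax as the LAST_ADDR
   register and the default 4096-byte PROBE_INTERVAL); the templates above
   produce an AT&T loop along these lines:

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:
*/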
9952 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9953 inclusive. These are offsets from the current stack pointer. */
9955 static void
9956 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9958 /* See if we have a constant small number of probes to generate. If so,
9959 that's the easy case. The run-time loop is made up of 7 insns in the
9960 generic case while the compile-time loop is made up of n insns for n #
9961 of intervals. */
9962 if (size <= 7 * PROBE_INTERVAL)
9964 HOST_WIDE_INT i;
9966 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9967 it exceeds SIZE. If only one probe is needed, this will not
9968 generate any code. Then probe at FIRST + SIZE. */
9969 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9970 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9971 -(first + i)));
9973 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9974 -(first + size)));
9977 /* Otherwise, do the same as above, but in a loop. Note that we must be
9978 extra careful with variables wrapping around because we might be at
9979 the very top (or the very bottom) of the address space and we have
9980 to be able to handle this case properly; in particular, we use an
9981 equality test for the loop condition. */
9982 else
9984 HOST_WIDE_INT rounded_size, last;
9985 struct scratch_reg sr;
9987 get_scratch_register_on_entry (&sr);
9990 /* Step 1: round SIZE to the previous multiple of the interval. */
9992 rounded_size = size & -PROBE_INTERVAL;
9995 /* Step 2: compute initial and final value of the loop counter. */
9997 /* TEST_OFFSET = FIRST. */
9998 emit_move_insn (sr.reg, GEN_INT (-first));
10000 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10001 last = first + rounded_size;
10004 /* Step 3: the loop
10006 while (TEST_ADDR != LAST_ADDR)
10008 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10009 probe at TEST_ADDR
10012 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10013 until it is equal to ROUNDED_SIZE. */
10015 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10018 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10019 that SIZE is equal to ROUNDED_SIZE. */
10021 if (size != rounded_size)
10022 emit_stack_probe (plus_constant (Pmode,
10023 gen_rtx_PLUS (Pmode,
10024 stack_pointer_rtx,
10025 sr.reg),
10026 rounded_size - size));
10028 release_scratch_register_on_entry (&sr);
10031 /* Make sure nothing is scheduled before we are done. */
10032 emit_insn (gen_blockage ());
10035 /* Probe a range of stack addresses from REG to END, inclusive. These are
10036 offsets from the current stack pointer. */
10038 const char *
10039 output_probe_stack_range (rtx reg, rtx end)
10041 static int labelno = 0;
10042 char loop_lab[32], end_lab[32];
10043 rtx xops[3];
10045 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10046 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10048 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10050 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10051 xops[0] = reg;
10052 xops[1] = end;
10053 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10054 fputs ("\tje\t", asm_out_file);
10055 assemble_name_raw (asm_out_file, end_lab);
10056 fputc ('\n', asm_out_file);
10058 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10059 xops[1] = GEN_INT (PROBE_INTERVAL);
10060 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10062 /* Probe at TEST_ADDR. */
10063 xops[0] = stack_pointer_rtx;
10064 xops[1] = reg;
10065 xops[2] = const0_rtx;
10066 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10068 fprintf (asm_out_file, "\tjmp\t");
10069 assemble_name_raw (asm_out_file, loop_lab);
10070 fputc ('\n', asm_out_file);
10072 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10074 return "";
10077 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10078 to be generated in correct form. */
10079 static void
10080 ix86_finalize_stack_realign_flags (void)
10082 /* Check whether stack realignment is really needed after reload, and
10083 store the result in cfun. */
10084 unsigned int incoming_stack_boundary
10085 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10086 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10087 unsigned int stack_realign = (incoming_stack_boundary
10088 < (crtl->is_leaf
10089 ? crtl->max_used_stack_slot_alignment
10090 : crtl->stack_alignment_needed));
10092 if (crtl->stack_realign_finalized)
10094 /* After stack_realign_needed is finalized, we can no longer
10095 change it. */
10096 gcc_assert (crtl->stack_realign_needed == stack_realign);
10097 return;
10100 /* If the only reason for frame_pointer_needed is that we conservatively
10101 assumed stack realignment might be needed, but in the end nothing that
10102 needed the stack alignment had been spilled, clear frame_pointer_needed
10103 and say we don't need stack realignment. */
10104 if (stack_realign
10105 && !crtl->need_drap
10106 && frame_pointer_needed
10107 && crtl->is_leaf
10108 && flag_omit_frame_pointer
10109 && crtl->sp_is_unchanging
10110 && !ix86_current_function_calls_tls_descriptor
10111 && !crtl->accesses_prior_frames
10112 && !cfun->calls_alloca
10113 && !crtl->calls_eh_return
10114 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10115 && !ix86_frame_pointer_required ()
10116 && get_frame_size () == 0
10117 && ix86_nsaved_sseregs () == 0
10118 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10120 HARD_REG_SET set_up_by_prologue, prologue_used;
10121 basic_block bb;
10123 CLEAR_HARD_REG_SET (prologue_used);
10124 CLEAR_HARD_REG_SET (set_up_by_prologue);
10125 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10126 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10127 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10128 HARD_FRAME_POINTER_REGNUM);
10129 FOR_EACH_BB (bb)
10131 rtx insn;
10132 FOR_BB_INSNS (bb, insn)
10133 if (NONDEBUG_INSN_P (insn)
10134 && requires_stack_frame_p (insn, prologue_used,
10135 set_up_by_prologue))
10137 crtl->stack_realign_needed = stack_realign;
10138 crtl->stack_realign_finalized = true;
10139 return;
10143 frame_pointer_needed = false;
10144 stack_realign = false;
10145 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10146 crtl->stack_alignment_needed = incoming_stack_boundary;
10147 crtl->stack_alignment_estimated = incoming_stack_boundary;
10148 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10149 crtl->preferred_stack_boundary = incoming_stack_boundary;
10150 df_finish_pass (true);
10151 df_scan_alloc (NULL);
10152 df_scan_blocks ();
10153 df_compute_regs_ever_live (true);
10154 df_analyze ();
10157 crtl->stack_realign_needed = stack_realign;
10158 crtl->stack_realign_finalized = true;
10161 /* Expand the prologue into a bunch of separate insns. */
10163 void
10164 ix86_expand_prologue (void)
10166 struct machine_function *m = cfun->machine;
10167 rtx insn, t;
10168 bool pic_reg_used;
10169 struct ix86_frame frame;
10170 HOST_WIDE_INT allocate;
10171 bool int_registers_saved;
10172 bool sse_registers_saved;
10174 ix86_finalize_stack_realign_flags ();
10176 /* DRAP should not coexist with stack_realign_fp */
10177 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10179 memset (&m->fs, 0, sizeof (m->fs));
10181 /* Initialize CFA state for before the prologue. */
10182 m->fs.cfa_reg = stack_pointer_rtx;
10183 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10185 /* Track SP offset to the CFA. We continue tracking this after we've
10186 swapped the CFA register away from SP. In the case of re-alignment
10187 this is fudged; we're interested in offsets within the local frame. */
10188 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10189 m->fs.sp_valid = true;
10191 ix86_compute_frame_layout (&frame);
10193 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10195 /* We should have already generated an error for any use of
10196 ms_hook on a nested function. */
10197 gcc_checking_assert (!ix86_static_chain_on_stack);
10199 /* Check if profiling is active and we shall use profiling before
10200 prologue variant. If so sorry. */
10201 if (crtl->profile && flag_fentry != 0)
10202 sorry ("ms_hook_prologue attribute isn%'t compatible "
10203 "with -mfentry for 32-bit");
10205 /* In ix86_asm_output_function_label we emitted:
10206 8b ff movl.s %edi,%edi
10207 55 push %ebp
10208 8b ec movl.s %esp,%ebp
10210 This matches the hookable function prologue in Win32 API
10211 functions in Microsoft Windows XP Service Pack 2 and newer.
10212 Wine uses this to enable Windows apps to hook the Win32 API
10213 functions provided by Wine.
10215 What that means is that we've already set up the frame pointer. */
10217 if (frame_pointer_needed
10218 && !(crtl->drap_reg && crtl->stack_realign_needed))
10220 rtx push, mov;
10222 /* We've decided to use the frame pointer already set up.
10223 Describe this to the unwinder by pretending that both
10224 push and mov insns happen right here.
10226 Putting the unwind info here at the end of the ms_hook
10227 is done so that we can make absolutely certain we get
10228 the required byte sequence at the start of the function,
10229 rather than relying on an assembler that can produce
10230 the exact encoding required.
10232 However it does mean (in the unpatched case) that we have
10233 a 1 insn window where the asynchronous unwind info is
10234 incorrect. However, if we placed the unwind info at
10235 its correct location we would have incorrect unwind info
10236 in the patched case. Which is probably all moot since
10237 I don't expect Wine generates dwarf2 unwind info for the
10238 system libraries that use this feature. */
10240 insn = emit_insn (gen_blockage ());
10242 push = gen_push (hard_frame_pointer_rtx);
10243 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10244 stack_pointer_rtx);
10245 RTX_FRAME_RELATED_P (push) = 1;
10246 RTX_FRAME_RELATED_P (mov) = 1;
10248 RTX_FRAME_RELATED_P (insn) = 1;
10249 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10250 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10252 /* Note that gen_push incremented m->fs.cfa_offset, even
10253 though we didn't emit the push insn here. */
10254 m->fs.cfa_reg = hard_frame_pointer_rtx;
10255 m->fs.fp_offset = m->fs.cfa_offset;
10256 m->fs.fp_valid = true;
10258 else
10260 /* The frame pointer is not needed so pop %ebp again.
10261 This leaves us with a pristine state. */
10262 emit_insn (gen_pop (hard_frame_pointer_rtx));
10266 /* The first insn of a function that accepts its static chain on the
10267 stack is to push the register that would be filled in by a direct
10268 call. This insn will be skipped by the trampoline. */
10269 else if (ix86_static_chain_on_stack)
10271 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10272 emit_insn (gen_blockage ());
10274 /* We don't want to interpret this push insn as a register save,
10275 only as a stack adjustment. The real copy of the register as
10276 a save will be done later, if needed. */
10277 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10278 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10279 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10280 RTX_FRAME_RELATED_P (insn) = 1;
10283 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10284 DRAP is needed and stack realignment is really needed after reload. */
10285 if (stack_realign_drap)
10287 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10289 /* Only need to push parameter pointer reg if it is caller saved. */
10290 if (!call_used_regs[REGNO (crtl->drap_reg)])
10292 /* Push arg pointer reg */
10293 insn = emit_insn (gen_push (crtl->drap_reg));
10294 RTX_FRAME_RELATED_P (insn) = 1;
10297 /* Grab the argument pointer. */
10298 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10299 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10300 RTX_FRAME_RELATED_P (insn) = 1;
10301 m->fs.cfa_reg = crtl->drap_reg;
10302 m->fs.cfa_offset = 0;
10304 /* Align the stack. */
10305 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10306 stack_pointer_rtx,
10307 GEN_INT (-align_bytes)));
10308 RTX_FRAME_RELATED_P (insn) = 1;
10310 /* Replicate the return address on the stack so that return
10311 address can be reached via (argp - 1) slot. This is needed
10312 to implement macro RETURN_ADDR_RTX and intrinsic function
10313 expand_builtin_return_addr etc. */
10314 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10315 t = gen_frame_mem (word_mode, t);
10316 insn = emit_insn (gen_push (t));
10317 RTX_FRAME_RELATED_P (insn) = 1;
10319 /* For the purposes of frame and register save area addressing,
10320 we've started over with a new frame. */
10321 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10322 m->fs.realigned = true;
10325 int_registers_saved = (frame.nregs == 0);
10326 sse_registers_saved = (frame.nsseregs == 0);
10328 if (frame_pointer_needed && !m->fs.fp_valid)
10330 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10331 slower on all targets. Also sdb doesn't like it. */
10332 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10333 RTX_FRAME_RELATED_P (insn) = 1;
10335 /* Push registers now, before setting the frame pointer
10336 on SEH target. */
10337 if (!int_registers_saved
10338 && TARGET_SEH
10339 && !frame.save_regs_using_mov)
10341 ix86_emit_save_regs ();
10342 int_registers_saved = true;
10343 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10346 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10348 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10349 RTX_FRAME_RELATED_P (insn) = 1;
10351 if (m->fs.cfa_reg == stack_pointer_rtx)
10352 m->fs.cfa_reg = hard_frame_pointer_rtx;
10353 m->fs.fp_offset = m->fs.sp_offset;
10354 m->fs.fp_valid = true;
10358 if (!int_registers_saved)
10360 /* If saving registers via PUSH, do so now. */
10361 if (!frame.save_regs_using_mov)
10363 ix86_emit_save_regs ();
10364 int_registers_saved = true;
10365 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10368 /* When using red zone we may start register saving before allocating
10369 the stack frame saving one cycle of the prologue. However, avoid
10370 doing this if we have to probe the stack; at least on x86_64 the
10371 stack probe can turn into a call that clobbers a red zone location. */
10372 else if (ix86_using_red_zone ()
10373 && (! TARGET_STACK_PROBE
10374 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10376 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10377 int_registers_saved = true;
10381 if (stack_realign_fp)
10383 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10384 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10386 /* The computation of the size of the re-aligned stack frame means
10387 that we must allocate the size of the register save area before
10388 performing the actual alignment. Otherwise we cannot guarantee
10389 that there's enough storage above the realignment point. */
10390 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10391 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10392 GEN_INT (m->fs.sp_offset
10393 - frame.sse_reg_save_offset),
10394 -1, false);
10396 /* Align the stack. */
10397 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10398 stack_pointer_rtx,
10399 GEN_INT (-align_bytes)));
10401 /* For the purposes of register save area addressing, the stack
10402 pointer is no longer valid. As for the value of sp_offset,
10403 see ix86_compute_frame_layout, which we need to match in order
10404 to pass verification of stack_pointer_offset at the end. */
10405 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10406 m->fs.sp_valid = false;
10409 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10411 if (flag_stack_usage_info)
10413 /* We start to count from ARG_POINTER. */
10414 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10416 /* If it was realigned, take into account the fake frame. */
10417 if (stack_realign_drap)
10419 if (ix86_static_chain_on_stack)
10420 stack_size += UNITS_PER_WORD;
10422 if (!call_used_regs[REGNO (crtl->drap_reg)])
10423 stack_size += UNITS_PER_WORD;
10425 /* This over-estimates by 1 minimal-stack-alignment-unit but
10426 mitigates that by counting in the new return address slot. */
10427 current_function_dynamic_stack_size
10428 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10431 current_function_static_stack_size = stack_size;
10434 /* On SEH target with very large frame size, allocate an area to save
10435 SSE registers (as the very large allocation won't be described). */
10436 if (TARGET_SEH
10437 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10438 && !sse_registers_saved)
10440 HOST_WIDE_INT sse_size =
10441 frame.sse_reg_save_offset - frame.reg_save_offset;
10443 gcc_assert (int_registers_saved);
10445 /* No need to do stack checking as the area will be immediately
10446 written. */
10447 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10448 GEN_INT (-sse_size), -1,
10449 m->fs.cfa_reg == stack_pointer_rtx);
10450 allocate -= sse_size;
10451 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10452 sse_registers_saved = true;
10455 /* The stack has already been decremented by the instruction calling us
10456 so probe if the size is non-negative to preserve the protection area. */
10457 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10459 /* We expect the registers to be saved when probes are used. */
10460 gcc_assert (int_registers_saved);
10462 if (STACK_CHECK_MOVING_SP)
10464 ix86_adjust_stack_and_probe (allocate);
10465 allocate = 0;
10467 else
10469 HOST_WIDE_INT size = allocate;
10471 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10472 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10474 if (TARGET_STACK_PROBE)
10475 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10476 else
10477 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10481 if (allocate == 0)
10483 else if (!ix86_target_stack_probe ()
10484 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10486 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10487 GEN_INT (-allocate), -1,
10488 m->fs.cfa_reg == stack_pointer_rtx);
10490 else
10492 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10493 rtx r10 = NULL;
10494 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10495 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10496 bool eax_live = false;
10497 bool r10_live = false;
10499 if (TARGET_64BIT)
10500 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10501 if (!TARGET_64BIT_MS_ABI)
10502 eax_live = ix86_eax_live_at_start_p ();
10504 /* Note that SEH directives need to continue tracking the stack
10505 pointer even after the frame pointer has been set up. */
10506 if (eax_live)
10508 insn = emit_insn (gen_push (eax));
10509 allocate -= UNITS_PER_WORD;
10510 if (sp_is_cfa_reg || TARGET_SEH)
10512 if (sp_is_cfa_reg)
10513 m->fs.cfa_offset += UNITS_PER_WORD;
10514 RTX_FRAME_RELATED_P (insn) = 1;
10518 if (r10_live)
10520 r10 = gen_rtx_REG (Pmode, R10_REG);
10521 insn = emit_insn (gen_push (r10));
10522 allocate -= UNITS_PER_WORD;
10523 if (sp_is_cfa_reg || TARGET_SEH)
10525 if (sp_is_cfa_reg)
10526 m->fs.cfa_offset += UNITS_PER_WORD;
10527 RTX_FRAME_RELATED_P (insn) = 1;
10531 emit_move_insn (eax, GEN_INT (allocate));
10532 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10534 /* Use the fact that AX still contains ALLOCATE. */
10535 adjust_stack_insn = (Pmode == DImode
10536 ? gen_pro_epilogue_adjust_stack_di_sub
10537 : gen_pro_epilogue_adjust_stack_si_sub);
10539 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10540 stack_pointer_rtx, eax));
10542 if (sp_is_cfa_reg || TARGET_SEH)
10544 if (sp_is_cfa_reg)
10545 m->fs.cfa_offset += allocate;
10546 RTX_FRAME_RELATED_P (insn) = 1;
10547 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10548 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10549 plus_constant (Pmode, stack_pointer_rtx,
10550 -allocate)));
10552 m->fs.sp_offset += allocate;
10554 if (r10_live && eax_live)
10556 t = choose_baseaddr (m->fs.sp_offset - allocate);
10557 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10558 gen_frame_mem (word_mode, t));
10559 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10560 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10561 gen_frame_mem (word_mode, t));
10563 else if (eax_live || r10_live)
10565 t = choose_baseaddr (m->fs.sp_offset - allocate);
10566 emit_move_insn (gen_rtx_REG (word_mode,
10567 (eax_live ? AX_REG : R10_REG)),
10568 gen_frame_mem (word_mode, t));
10571 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10573 /* If we haven't already set up the frame pointer, do so now. */
10574 if (frame_pointer_needed && !m->fs.fp_valid)
10576 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10577 GEN_INT (frame.stack_pointer_offset
10578 - frame.hard_frame_pointer_offset));
10579 insn = emit_insn (insn);
10580 RTX_FRAME_RELATED_P (insn) = 1;
10581 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10583 if (m->fs.cfa_reg == stack_pointer_rtx)
10584 m->fs.cfa_reg = hard_frame_pointer_rtx;
10585 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10586 m->fs.fp_valid = true;
10589 if (!int_registers_saved)
10590 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10591 if (!sse_registers_saved)
10592 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10594 pic_reg_used = false;
10595 /* We don't use pic-register for pe-coff target. */
10596 if (pic_offset_table_rtx
10597 && DEFAULT_ABI != MS_ABI
10598 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10599 || crtl->profile))
10601 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10603 if (alt_pic_reg_used != INVALID_REGNUM)
10604 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10606 pic_reg_used = true;
10609 if (pic_reg_used)
10611 if (TARGET_64BIT)
10613 if (ix86_cmodel == CM_LARGE_PIC)
10615 rtx label, tmp_reg;
10617 gcc_assert (Pmode == DImode);
10618 label = gen_label_rtx ();
10619 emit_label (label);
10620 LABEL_PRESERVE_P (label) = 1;
10621 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10622 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10623 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10624 label));
10625 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10626 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10627 pic_offset_table_rtx, tmp_reg));
10629 else
10630 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10632 else
10634 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10635 RTX_FRAME_RELATED_P (insn) = 1;
10636 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10640 /* In the pic_reg_used case, make sure that the got load isn't deleted
10641 when mcount needs it. Blockage to avoid call movement across mcount
10642 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10643 note. */
10644 if (crtl->profile && !flag_fentry && pic_reg_used)
10645 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10647 if (crtl->drap_reg && !crtl->stack_realign_needed)
10649 /* vDRAP is set up, but after reload it turns out stack realignment
10650 isn't necessary; here we emit prologue code to set up DRAP
10651 without the stack realignment adjustment. */
10652 t = choose_baseaddr (0);
10653 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10656 /* Prevent instructions from being scheduled into register save push
10657 sequence when access to the redzone area is done through frame pointer.
10658 The offset between the frame pointer and the stack pointer is calculated
10659 relative to the value of the stack pointer at the end of the function
10660 prologue, and moving instructions that access redzone area via frame
10661 pointer inside push sequence violates this assumption. */
10662 if (frame_pointer_needed && frame.red_zone_size)
10663 emit_insn (gen_memory_blockage ());
10665 /* Emit cld instruction if stringops are used in the function. */
10666 if (TARGET_CLD && ix86_current_function_needs_cld)
10667 emit_insn (gen_cld ());
10669 /* SEH requires that the prologue end within 256 bytes of the start of
10670 the function. Prevent instruction schedules that would extend that.
10671 Further, prevent alloca modifications to the stack pointer from being
10672 combined with prologue modifications. */
10673 if (TARGET_SEH)
10674 emit_insn (gen_prologue_use (stack_pointer_rtx));
10677 /* Emit code to restore REG using a POP insn. */
10679 static void
10680 ix86_emit_restore_reg_using_pop (rtx reg)
10682 struct machine_function *m = cfun->machine;
10683 rtx insn = emit_insn (gen_pop (reg));
10685 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10686 m->fs.sp_offset -= UNITS_PER_WORD;
10688 if (m->fs.cfa_reg == crtl->drap_reg
10689 && REGNO (reg) == REGNO (crtl->drap_reg))
10691 /* Previously we'd represented the CFA as an expression
10692 like *(%ebp - 8). We've just popped that value from
10693 the stack, which means we need to reset the CFA to
10694 the drap register. This will remain until we restore
10695 the stack pointer. */
10696 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10697 RTX_FRAME_RELATED_P (insn) = 1;
10699 /* This means that the DRAP register is valid for addressing too. */
10700 m->fs.drap_valid = true;
10701 return;
10704 if (m->fs.cfa_reg == stack_pointer_rtx)
10706 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10707 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10708 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10709 RTX_FRAME_RELATED_P (insn) = 1;
10711 m->fs.cfa_offset -= UNITS_PER_WORD;
10714 /* When the frame pointer is the CFA, and we pop it, we are
10715 swapping back to the stack pointer as the CFA. This happens
10716 for stack frames that don't allocate other data, so we assume
10717 the stack pointer is now pointing at the return address, i.e.
10718 the function entry state, which makes the offset be 1 word. */
10719 if (reg == hard_frame_pointer_rtx)
10721 m->fs.fp_valid = false;
10722 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10724 m->fs.cfa_reg = stack_pointer_rtx;
10725 m->fs.cfa_offset -= UNITS_PER_WORD;
10727 add_reg_note (insn, REG_CFA_DEF_CFA,
10728 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10729 GEN_INT (m->fs.cfa_offset)));
10730 RTX_FRAME_RELATED_P (insn) = 1;
10735 /* Emit code to restore saved registers using POP insns. */
10737 static void
10738 ix86_emit_restore_regs_using_pop (void)
10740 unsigned int regno;
10742 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10743 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10744 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10747 /* Emit code and notes for the LEAVE instruction. */
10749 static void
10750 ix86_emit_leave (void)
10752 struct machine_function *m = cfun->machine;
10753 rtx insn = emit_insn (ix86_gen_leave ());
10755 ix86_add_queued_cfa_restore_notes (insn);
10757 gcc_assert (m->fs.fp_valid);
10758 m->fs.sp_valid = true;
10759 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10760 m->fs.fp_valid = false;
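/* (A sketch of why: "leave" is effectively mov %sp, %fp followed by pop %fp,
   so afterwards the stack pointer sits one word above where the frame
   pointer pointed, hence the fp_offset - UNITS_PER_WORD above.) */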
10762 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10764 m->fs.cfa_reg = stack_pointer_rtx;
10765 m->fs.cfa_offset = m->fs.sp_offset;
10767 add_reg_note (insn, REG_CFA_DEF_CFA,
10768 plus_constant (Pmode, stack_pointer_rtx,
10769 m->fs.sp_offset));
10770 RTX_FRAME_RELATED_P (insn) = 1;
10772 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10773 m->fs.fp_offset);
10776 /* Emit code to restore saved registers using MOV insns.
10777 First register is restored from CFA - CFA_OFFSET. */
10778 static void
10779 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10780 bool maybe_eh_return)
10782 struct machine_function *m = cfun->machine;
10783 unsigned int regno;
10785 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10786 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10788 rtx reg = gen_rtx_REG (word_mode, regno);
10789 rtx insn, mem;
10791 mem = choose_baseaddr (cfa_offset);
10792 mem = gen_frame_mem (word_mode, mem);
10793 insn = emit_move_insn (reg, mem);
10795 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10797 /* Previously we'd represented the CFA as an expression
10798 like *(%ebp - 8). We've just loaded that value from
10799 the stack, which means we need to reset the CFA to
10800 the drap register. This will remain until we restore
10801 the stack pointer. */
10802 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10803 RTX_FRAME_RELATED_P (insn) = 1;
10805 /* This means that the DRAP register is valid for addressing. */
10806 m->fs.drap_valid = true;
10808 else
10809 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10811 cfa_offset -= UNITS_PER_WORD;
10815 /* Emit code to restore saved registers using MOV insns.
10816 First register is restored from CFA - CFA_OFFSET. */
10817 static void
10818 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10819 bool maybe_eh_return)
10821 unsigned int regno;
10823 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10824 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10826 rtx reg = gen_rtx_REG (V4SFmode, regno);
10827 rtx mem;
10829 mem = choose_baseaddr (cfa_offset);
10830 mem = gen_rtx_MEM (V4SFmode, mem);
10831 set_mem_align (mem, 128);
10832 emit_move_insn (reg, mem);
10834 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10836 cfa_offset -= 16;
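/* (Each SSE save slot is 16 bytes, the size of a V4SFmode register, which is
   why the MEM is marked 128-bit aligned and cfa_offset steps by 16.) */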
10840 /* Restore function stack, frame, and registers. */
10842 void
10843 ix86_expand_epilogue (int style)
10845 struct machine_function *m = cfun->machine;
10846 struct machine_frame_state frame_state_save = m->fs;
10847 struct ix86_frame frame;
10848 bool restore_regs_via_mov;
10849 bool using_drap;
10851 ix86_finalize_stack_realign_flags ();
10852 ix86_compute_frame_layout (&frame);
10854 m->fs.sp_valid = (!frame_pointer_needed
10855 || (crtl->sp_is_unchanging
10856 && !stack_realign_fp));
10857 gcc_assert (!m->fs.sp_valid
10858 || m->fs.sp_offset == frame.stack_pointer_offset);
10860 /* The FP must be valid if the frame pointer is present. */
10861 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10862 gcc_assert (!m->fs.fp_valid
10863 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10865 /* We must have *some* valid pointer to the stack frame. */
10866 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10868 /* The DRAP is never valid at this point. */
10869 gcc_assert (!m->fs.drap_valid);
10871 /* See the comment about red zone and frame
10872 pointer usage in ix86_expand_prologue. */
10873 if (frame_pointer_needed && frame.red_zone_size)
10874 emit_insn (gen_memory_blockage ());
10876 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10877 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10879 /* Determine the CFA offset of the end of the red-zone. */
10880 m->fs.red_zone_offset = 0;
10881 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10883 /* The red-zone begins below the return address. */
10884 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10886 /* When the register save area is in the aligned portion of
10887 the stack, determine the maximum runtime displacement that
10888 matches up with the aligned frame. */
10889 if (stack_realign_drap)
10890 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10891 + UNITS_PER_WORD);
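/* As an illustration, with the usual x86-64 values (RED_ZONE_SIZE == 128,
   UNITS_PER_WORD == 8) the end of the red zone sits 136 bytes below the CFA
   before any stack_realign_drap adjustment. */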
10894 /* Special care must be taken for the normal return case of a function
10895 using eh_return: the eax and edx registers are marked as saved, but
10896 not restored along this path. Adjust the save location to match. */
10897 if (crtl->calls_eh_return && style != 2)
10898 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10900 /* EH_RETURN requires the use of moves to function properly. */
10901 if (crtl->calls_eh_return)
10902 restore_regs_via_mov = true;
10903 /* SEH requires the use of pops to identify the epilogue. */
10904 else if (TARGET_SEH)
10905 restore_regs_via_mov = false;
10906 /* If we're only restoring one register and sp is not valid, then
10907 use a move instruction to restore the register, since it's
10908 less work than reloading sp and popping the register. */
10909 else if (!m->fs.sp_valid && frame.nregs <= 1)
10910 restore_regs_via_mov = true;
10911 else if (TARGET_EPILOGUE_USING_MOVE
10912 && cfun->machine->use_fast_prologue_epilogue
10913 && (frame.nregs > 1
10914 || m->fs.sp_offset != frame.reg_save_offset))
10915 restore_regs_via_mov = true;
10916 else if (frame_pointer_needed
10917 && !frame.nregs
10918 && m->fs.sp_offset != frame.reg_save_offset)
10919 restore_regs_via_mov = true;
10920 else if (frame_pointer_needed
10921 && TARGET_USE_LEAVE
10922 && cfun->machine->use_fast_prologue_epilogue
10923 && frame.nregs == 1)
10924 restore_regs_via_mov = true;
10925 else
10926 restore_regs_via_mov = false;
10928 if (restore_regs_via_mov || frame.nsseregs)
10930 /* Ensure that the entire register save area is addressable via
10931 the stack pointer, if we will restore via sp. */
10932 if (TARGET_64BIT
10933 && m->fs.sp_offset > 0x7fffffff
10934 && !(m->fs.fp_valid || m->fs.drap_valid)
10935 && (frame.nsseregs + frame.nregs) != 0)
10937 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10938 GEN_INT (m->fs.sp_offset
10939 - frame.sse_reg_save_offset),
10940 style,
10941 m->fs.cfa_reg == stack_pointer_rtx);
10945 /* If there are any SSE registers to restore, then we have to do it
10946 via moves, since there's obviously no pop for SSE regs. */
10947 if (frame.nsseregs)
10948 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10949 style == 2);
10951 if (restore_regs_via_mov)
10953 rtx t;
10955 if (frame.nregs)
10956 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10958 /* eh_return epilogues need %ecx added to the stack pointer. */
10959 if (style == 2)
10961 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10963 /* Stack align doesn't work with eh_return. */
10964 gcc_assert (!stack_realign_drap);
10965 /* Neither do regparm nested functions. */
10966 gcc_assert (!ix86_static_chain_on_stack);
10968 if (frame_pointer_needed)
10970 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10971 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10972 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10974 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10975 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10977 /* Note that we use SA as a temporary CFA, as the return
10978 address is at the proper place relative to it. We
10979 pretend this happens at the FP restore insn because
10980 prior to this insn the FP would be stored at the wrong
10981 offset relative to SA, and after this insn we have no
10982 other reasonable register to use for the CFA. We don't
10983 bother resetting the CFA to the SP for the duration of
10984 the return insn. */
10985 add_reg_note (insn, REG_CFA_DEF_CFA,
10986 plus_constant (Pmode, sa, UNITS_PER_WORD));
10987 ix86_add_queued_cfa_restore_notes (insn);
10988 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10989 RTX_FRAME_RELATED_P (insn) = 1;
10991 m->fs.cfa_reg = sa;
10992 m->fs.cfa_offset = UNITS_PER_WORD;
10993 m->fs.fp_valid = false;
10995 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10996 const0_rtx, style, false);
10998 else
11000 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11001 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11002 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11003 ix86_add_queued_cfa_restore_notes (insn);
11005 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11006 if (m->fs.cfa_offset != UNITS_PER_WORD)
11008 m->fs.cfa_offset = UNITS_PER_WORD;
11009 add_reg_note (insn, REG_CFA_DEF_CFA,
11010 plus_constant (Pmode, stack_pointer_rtx,
11011 UNITS_PER_WORD));
11012 RTX_FRAME_RELATED_P (insn) = 1;
11015 m->fs.sp_offset = UNITS_PER_WORD;
11016 m->fs.sp_valid = true;
11019 else
11021 /* SEH requires that the function end with (1) a stack adjustment
11022 if necessary, (2) a sequence of pops, and (3) a return or
11023 jump instruction. Prevent insns from the function body from
11024 being scheduled into this sequence. */
11025 if (TARGET_SEH)
11027 /* Prevent a catch region from being adjacent to the standard
11028 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11029 several other flags that would be interesting to test are
11030 set up yet. */
11031 if (flag_non_call_exceptions)
11032 emit_insn (gen_nops (const1_rtx));
11033 else
11034 emit_insn (gen_blockage ());
11037 /* The first step is to deallocate the stack frame so that we can
11038 pop the registers. Also do this on SEH targets for very large
11039 frames, as the emitted instructions aren't allowed by the ABI in
11040 epilogues. */
11041 if (!m->fs.sp_valid
11042 || (TARGET_SEH
11043 && (m->fs.sp_offset - frame.reg_save_offset
11044 >= SEH_MAX_FRAME_SIZE)))
11046 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11047 GEN_INT (m->fs.fp_offset
11048 - frame.reg_save_offset),
11049 style, false);
11051 else if (m->fs.sp_offset != frame.reg_save_offset)
11053 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11054 GEN_INT (m->fs.sp_offset
11055 - frame.reg_save_offset),
11056 style,
11057 m->fs.cfa_reg == stack_pointer_rtx);
11060 ix86_emit_restore_regs_using_pop ();
11063 /* If we used a frame pointer and haven't already got rid of it,
11064 then do so now. */
11065 if (m->fs.fp_valid)
11067 /* If the stack pointer is valid and pointing at the frame
11068 pointer store address, then we only need a pop. */
11069 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11070 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11071 /* Leave results in shorter dependency chains on CPUs that are
11072 able to grok it fast. */
11073 else if (TARGET_USE_LEAVE
11074 || optimize_function_for_size_p (cfun)
11075 || !cfun->machine->use_fast_prologue_epilogue)
11076 ix86_emit_leave ();
11077 else
11079 pro_epilogue_adjust_stack (stack_pointer_rtx,
11080 hard_frame_pointer_rtx,
11081 const0_rtx, style, !using_drap);
11082 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11086 if (using_drap)
11088 int param_ptr_offset = UNITS_PER_WORD;
11089 rtx insn;
11091 gcc_assert (stack_realign_drap);
11093 if (ix86_static_chain_on_stack)
11094 param_ptr_offset += UNITS_PER_WORD;
11095 if (!call_used_regs[REGNO (crtl->drap_reg)])
11096 param_ptr_offset += UNITS_PER_WORD;
11098 insn = emit_insn (gen_rtx_SET
11099 (VOIDmode, stack_pointer_rtx,
11100 gen_rtx_PLUS (Pmode,
11101 crtl->drap_reg,
11102 GEN_INT (-param_ptr_offset))));
11103 m->fs.cfa_reg = stack_pointer_rtx;
11104 m->fs.cfa_offset = param_ptr_offset;
11105 m->fs.sp_offset = param_ptr_offset;
11106 m->fs.realigned = false;
11108 add_reg_note (insn, REG_CFA_DEF_CFA,
11109 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11110 GEN_INT (param_ptr_offset)));
11111 RTX_FRAME_RELATED_P (insn) = 1;
11113 if (!call_used_regs[REGNO (crtl->drap_reg)])
11114 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11117 /* At this point the stack pointer must be valid, and we must have
11118 restored all of the registers. We may not have deallocated the
11119 entire stack frame. We've delayed this until now because it may
11120 be possible to merge the local stack deallocation with the
11121 deallocation forced by ix86_static_chain_on_stack. */
11122 gcc_assert (m->fs.sp_valid);
11123 gcc_assert (!m->fs.fp_valid);
11124 gcc_assert (!m->fs.realigned);
11125 if (m->fs.sp_offset != UNITS_PER_WORD)
11127 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11128 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11129 style, true);
11131 else
11132 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11134 /* Sibcall epilogues don't want a return instruction. */
11135 if (style == 0)
11137 m->fs = frame_state_save;
11138 return;
11141 if (crtl->args.pops_args && crtl->args.size)
11143 rtx popc = GEN_INT (crtl->args.pops_args);
11145 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11146 address, do an explicit add, and jump indirectly to the caller. */
11148 if (crtl->args.pops_args >= 65536)
11150 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11151 rtx insn;
11153 /* There is no "pascal" calling convention in any 64bit ABI. */
11154 gcc_assert (!TARGET_64BIT);
11156 insn = emit_insn (gen_pop (ecx));
11157 m->fs.cfa_offset -= UNITS_PER_WORD;
11158 m->fs.sp_offset -= UNITS_PER_WORD;
11160 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11161 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11162 add_reg_note (insn, REG_CFA_REGISTER,
11163 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11164 RTX_FRAME_RELATED_P (insn) = 1;
11166 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11167 popc, -1, true);
11168 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11170 else
11171 emit_jump_insn (gen_simple_return_pop_internal (popc));
11173 else
11174 emit_jump_insn (gen_simple_return_internal ());
11176 /* Restore the state back to the state from the prologue,
11177 so that it's correct for the next epilogue. */
11178 m->fs = frame_state_save;
11181 /* Reset from the function's potential modifications. */
11183 static void
11184 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11185 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11187 if (pic_offset_table_rtx)
11188 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11189 #if TARGET_MACHO
11190 /* Mach-O doesn't support labels at the end of objects, so if
11191 it looks like we might want one, insert a NOP. */
11193 rtx insn = get_last_insn ();
11194 rtx deleted_debug_label = NULL_RTX;
11195 while (insn
11196 && NOTE_P (insn)
11197 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11199 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11200 notes only; instead set their CODE_LABEL_NUMBER to -1,
11201 otherwise there would be code generation differences
11202 between -g and -g0. */
11203 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11204 deleted_debug_label = insn;
11205 insn = PREV_INSN (insn);
11207 if (insn
11208 && (LABEL_P (insn)
11209 || (NOTE_P (insn)
11210 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11211 fputs ("\tnop\n", file);
11212 else if (deleted_debug_label)
11213 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11214 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11215 CODE_LABEL_NUMBER (insn) = -1;
11217 #endif
11221 /* Return a scratch register to use in the split stack prologue. The
11222 split stack prologue is used for -fsplit-stack. It consists of the first
11223 instructions in the function, even before the regular prologue.
11224 The scratch register can be any caller-saved register which is not
11225 used for parameters or for the static chain. */
11227 static unsigned int
11228 split_stack_prologue_scratch_regno (void)
11230 if (TARGET_64BIT)
11231 return R11_REG;
11232 else
11234 bool is_fastcall, is_thiscall;
11235 int regparm;
11237 is_fastcall = (lookup_attribute ("fastcall",
11238 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11239 != NULL);
11240 is_thiscall = (lookup_attribute ("thiscall",
11241 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11242 != NULL);
11243 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11245 if (is_fastcall)
11247 if (DECL_STATIC_CHAIN (cfun->decl))
11249 sorry ("-fsplit-stack does not support fastcall with "
11250 "nested function");
11251 return INVALID_REGNUM;
11253 return AX_REG;
11255 else if (is_thiscall)
11257 if (!DECL_STATIC_CHAIN (cfun->decl))
11258 return DX_REG;
11259 return AX_REG;
11261 else if (regparm < 3)
11263 if (!DECL_STATIC_CHAIN (cfun->decl))
11264 return CX_REG;
11265 else
11267 if (regparm >= 2)
11269 sorry ("-fsplit-stack does not support 2 register "
11270 " parameters for a nested function");
11271 return INVALID_REGNUM;
11273 return DX_REG;
11276 else
11278 /* FIXME: We could make this work by pushing a register
11279 around the addition and comparison. */
11280 sorry ("-fsplit-stack does not support 3 register parameters");
11281 return INVALID_REGNUM;
11286 /* A SYMBOL_REF for the function which allocates new stack space for
11287 -fsplit-stack. */
11289 static GTY(()) rtx split_stack_fn;
11291 /* A SYMBOL_REF for the variant of __morestack used with the large
11292 code model. */
11294 static GTY(()) rtx split_stack_fn_large;
11296 /* Handle -fsplit-stack. These are the first instructions in the
11297 function, even before the regular prologue. */
11299 void
11300 ix86_expand_split_stack_prologue (void)
11302 struct ix86_frame frame;
11303 HOST_WIDE_INT allocate;
11304 unsigned HOST_WIDE_INT args_size;
11305 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11306 rtx scratch_reg = NULL_RTX;
11307 rtx varargs_label = NULL_RTX;
11308 rtx fn;
11310 gcc_assert (flag_split_stack && reload_completed);
11312 ix86_finalize_stack_realign_flags ();
11313 ix86_compute_frame_layout (&frame);
11314 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11316 /* This is the label we will branch to if we have enough stack
11317 space. We expect the basic block reordering pass to reverse this
11318 branch if optimizing, so that we branch in the unlikely case. */
11319 label = gen_label_rtx ();
11321 /* We need to compare the stack pointer minus the frame size with
11322 the stack boundary in the TCB. The stack boundary always gives
11323 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11324 can compare directly. Otherwise we need to do an addition. */
11326 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11327 UNSPEC_STACK_CHECK);
11328 limit = gen_rtx_CONST (Pmode, limit);
11329 limit = gen_rtx_MEM (Pmode, limit);
11330 if (allocate < SPLIT_STACK_AVAILABLE)
11331 current = stack_pointer_rtx;
11332 else
11334 unsigned int scratch_regno;
11335 rtx offset;
11337 /* We need a scratch register to hold the stack pointer minus
11338 the required frame size. Since this is the very start of the
11339 function, the scratch register can be any caller-saved
11340 register which is not used for parameters. */
11341 offset = GEN_INT (- allocate);
11342 scratch_regno = split_stack_prologue_scratch_regno ();
11343 if (scratch_regno == INVALID_REGNUM)
11344 return;
11345 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11346 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11348 /* We don't use ix86_gen_add3 in this case because it will
11349 want to split to lea, but when not optimizing the insn
11350 will not be split after this point. */
11351 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11352 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11353 offset)));
11355 else
11357 emit_move_insn (scratch_reg, offset);
11358 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11359 stack_pointer_rtx));
11361 current = scratch_reg;
11364 ix86_expand_branch (GEU, current, limit, label);
11365 jump_insn = get_last_insn ();
11366 JUMP_LABEL (jump_insn) = label;
11368 /* Mark the jump as very likely to be taken. */
11369 add_reg_note (jump_insn, REG_BR_PROB,
11370 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
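/* (With REG_BR_PROB_BASE == 10000 this is 9900, i.e. roughly a 99%
   probability that the branch to the "enough stack" label is taken.) */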
11372 if (split_stack_fn == NULL_RTX)
11373 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11374 fn = split_stack_fn;
11376 /* Get more stack space. We pass in the desired stack space and the
11377 size of the arguments to copy to the new stack. In 32-bit mode
11378 we push the parameters; __morestack will return on a new stack
11379 anyhow. In 64-bit mode we pass the parameters in r10 and
11380 r11. */
11381 allocate_rtx = GEN_INT (allocate);
11382 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11383 call_fusage = NULL_RTX;
11384 if (TARGET_64BIT)
11386 rtx reg10, reg11;
11388 reg10 = gen_rtx_REG (Pmode, R10_REG);
11389 reg11 = gen_rtx_REG (Pmode, R11_REG);
11391 /* If this function uses a static chain, it will be in %r10.
11392 Preserve it across the call to __morestack. */
11393 if (DECL_STATIC_CHAIN (cfun->decl))
11395 rtx rax;
11397 rax = gen_rtx_REG (word_mode, AX_REG);
11398 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11399 use_reg (&call_fusage, rax);
11402 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11403 && DEFAULT_ABI != MS_ABI)
11405 HOST_WIDE_INT argval;
11407 gcc_assert (Pmode == DImode);
11408 /* When using the large model we need to load the address
11409 into a register, and we've run out of registers. So we
11410 switch to a different calling convention, and we call a
11411 different function: __morestack_large_model. We pass the
11412 argument size in the upper 32 bits of r10 and pass the
11413 frame size in the lower 32 bits. */
11414 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11415 gcc_assert ((args_size & 0xffffffff) == args_size);
11417 if (split_stack_fn_large == NULL_RTX)
11418 split_stack_fn_large =
11419 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11421 if (ix86_cmodel == CM_LARGE_PIC)
11423 rtx label, x;
11425 label = gen_label_rtx ();
11426 emit_label (label);
11427 LABEL_PRESERVE_P (label) = 1;
11428 emit_insn (gen_set_rip_rex64 (reg10, label));
11429 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11430 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11431 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11432 UNSPEC_GOT);
11433 x = gen_rtx_CONST (Pmode, x);
11434 emit_move_insn (reg11, x);
11435 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11436 x = gen_const_mem (Pmode, x);
11437 emit_move_insn (reg11, x);
11439 else
11440 emit_move_insn (reg11, split_stack_fn_large);
11442 fn = reg11;
11444 argval = ((args_size << 16) << 16) + allocate;
11445 emit_move_insn (reg10, GEN_INT (argval));
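/* As a purely illustrative example: with args_size == 0x18 and
   allocate == 0x400, argval is 0x0000001800000400, so the argument size
   ends up in the upper 32 bits of %r10 and the frame size in the lower
   32 bits, as described above. */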
11447 else
11449 emit_move_insn (reg10, allocate_rtx);
11450 emit_move_insn (reg11, GEN_INT (args_size));
11451 use_reg (&call_fusage, reg11);
11454 use_reg (&call_fusage, reg10);
11456 else
11458 emit_insn (gen_push (GEN_INT (args_size)));
11459 emit_insn (gen_push (allocate_rtx));
11461 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11462 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11463 NULL_RTX, false);
11464 add_function_usage_to (call_insn, call_fusage);
11466 /* In order to make call/return prediction work right, we now need
11467 to execute a return instruction. See
11468 libgcc/config/i386/morestack.S for the details on how this works.
11470 For flow purposes gcc must not see this as a return
11471 instruction--we need control flow to continue at the subsequent
11472 label. Therefore, we use an unspec. */
11473 gcc_assert (crtl->args.pops_args < 65536);
11474 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11476 /* If we are in 64-bit mode and this function uses a static chain,
11477 we saved %r10 in %rax before calling __morestack. */
11478 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11479 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11480 gen_rtx_REG (word_mode, AX_REG));
11482 /* If this function calls va_start, we need to store a pointer to
11483 the arguments on the old stack, because they may not have been
11484 all copied to the new stack. At this point the old stack can be
11485 found at the frame pointer value used by __morestack, because
11486 __morestack has set that up before calling back to us. Here we
11487 store that pointer in a scratch register, and in
11488 ix86_expand_prologue we store the scratch register in a stack
11489 slot. */
11490 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11492 unsigned int scratch_regno;
11493 rtx frame_reg;
11494 int words;
11496 scratch_regno = split_stack_prologue_scratch_regno ();
11497 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11498 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11500 /* 64-bit:
11501 fp -> old fp value
11502 return address within this function
11503 return address of caller of this function
11504 stack arguments
11505 So we add three words to get to the stack arguments.
11507 32-bit:
11508 fp -> old fp value
11509 return address within this function
11510 first argument to __morestack
11511 second argument to __morestack
11512 return address of caller of this function
11513 stack arguments
11514 So we add five words to get to the stack arguments. */
11516 words = TARGET_64BIT ? 3 : 5;
11517 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11518 gen_rtx_PLUS (Pmode, frame_reg,
11519 GEN_INT (words * UNITS_PER_WORD))));
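/* (On x86-64 this is %rbp + 24, i.e. three 8-byte words, matching the
   layout sketched above.) */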
11521 varargs_label = gen_label_rtx ();
11522 emit_jump_insn (gen_jump (varargs_label));
11523 JUMP_LABEL (get_last_insn ()) = varargs_label;
11525 emit_barrier ();
11528 emit_label (label);
11529 LABEL_NUSES (label) = 1;
11531 /* If this function calls va_start, we now have to set the scratch
11532 register for the case where we do not call __morestack. In this
11533 case we need to set it based on the stack pointer. */
11534 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11536 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11537 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11538 GEN_INT (UNITS_PER_WORD))));
11540 emit_label (varargs_label);
11541 LABEL_NUSES (varargs_label) = 1;
11545 /* We may have to tell the dataflow pass that the split stack prologue
11546 is initializing a scratch register. */
11548 static void
11549 ix86_live_on_entry (bitmap regs)
11551 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11553 gcc_assert (flag_split_stack);
11554 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11558 /* Determine if OP is a suitable SUBREG RTX for an address. */
11560 static bool
11561 ix86_address_subreg_operand (rtx op)
11563 enum machine_mode mode;
11565 if (!REG_P (op))
11566 return false;
11568 mode = GET_MODE (op);
11570 if (GET_MODE_CLASS (mode) != MODE_INT)
11571 return false;
11573 /* Don't allow SUBREGs that span more than a word. They can lead to spill
11574 failures when the register is one word out of a two-word structure. */
11575 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11576 return false;
11578 /* Allow only SUBREGs of non-eliminable hard registers. */
11579 return register_no_elim_operand (op, mode);
11582 /* Extract the parts of an RTL expression that is a valid memory address
11583 for an instruction. Return 0 if the structure of the address is
11584 grossly off. Return -1 if the address contains ASHIFT, so it is not
11585 strictly valid, but still used for computing length of lea instruction. */
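/* A small illustrative example of the decomposition: an address such as
   (plus (plus (reg %ebx) (mult (reg %ecx) (const_int 4))) (const_int 12))
   yields base = %ebx, index = %ecx, scale = 4, disp = 12 and seg = SEG_DEFAULT. */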
11588 ix86_decompose_address (rtx addr, struct ix86_address *out)
11590 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11591 rtx base_reg, index_reg;
11592 HOST_WIDE_INT scale = 1;
11593 rtx scale_rtx = NULL_RTX;
11594 rtx tmp;
11595 int retval = 1;
11596 enum ix86_address_seg seg = SEG_DEFAULT;
11598 /* Allow zero-extended SImode addresses,
11599 they will be emitted with addr32 prefix. */
11600 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11602 if (GET_CODE (addr) == ZERO_EXTEND
11603 && GET_MODE (XEXP (addr, 0)) == SImode)
11605 addr = XEXP (addr, 0);
11606 if (CONST_INT_P (addr))
11607 return 0;
11609 else if (GET_CODE (addr) == AND
11610 && const_32bit_mask (XEXP (addr, 1), DImode))
11612 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11613 if (addr == NULL_RTX)
11614 return 0;
11616 if (CONST_INT_P (addr))
11617 return 0;
11621 /* Allow SImode subregs of DImode addresses,
11622 they will be emitted with addr32 prefix. */
11623 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11625 if (GET_CODE (addr) == SUBREG
11626 && GET_MODE (SUBREG_REG (addr)) == DImode)
11628 addr = SUBREG_REG (addr);
11629 if (CONST_INT_P (addr))
11630 return 0;
11634 if (REG_P (addr))
11635 base = addr;
11636 else if (GET_CODE (addr) == SUBREG)
11638 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11639 base = addr;
11640 else
11641 return 0;
11643 else if (GET_CODE (addr) == PLUS)
11645 rtx addends[4], op;
11646 int n = 0, i;
11648 op = addr;
11651 if (n >= 4)
11652 return 0;
11653 addends[n++] = XEXP (op, 1);
11654 op = XEXP (op, 0);
11656 while (GET_CODE (op) == PLUS);
11657 if (n >= 4)
11658 return 0;
11659 addends[n] = op;
11661 for (i = n; i >= 0; --i)
11663 op = addends[i];
11664 switch (GET_CODE (op))
11666 case MULT:
11667 if (index)
11668 return 0;
11669 index = XEXP (op, 0);
11670 scale_rtx = XEXP (op, 1);
11671 break;
11673 case ASHIFT:
11674 if (index)
11675 return 0;
11676 index = XEXP (op, 0);
11677 tmp = XEXP (op, 1);
11678 if (!CONST_INT_P (tmp))
11679 return 0;
11680 scale = INTVAL (tmp);
11681 if ((unsigned HOST_WIDE_INT) scale > 3)
11682 return 0;
11683 scale = 1 << scale;
11684 break;
11686 case ZERO_EXTEND:
11687 op = XEXP (op, 0);
11688 if (GET_CODE (op) != UNSPEC)
11689 return 0;
11690 /* FALLTHRU */
11692 case UNSPEC:
11693 if (XINT (op, 1) == UNSPEC_TP
11694 && TARGET_TLS_DIRECT_SEG_REFS
11695 && seg == SEG_DEFAULT)
11696 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11697 else
11698 return 0;
11699 break;
11701 case SUBREG:
11702 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11703 return 0;
11704 /* FALLTHRU */
11706 case REG:
11707 if (!base)
11708 base = op;
11709 else if (!index)
11710 index = op;
11711 else
11712 return 0;
11713 break;
11715 case CONST:
11716 case CONST_INT:
11717 case SYMBOL_REF:
11718 case LABEL_REF:
11719 if (disp)
11720 return 0;
11721 disp = op;
11722 break;
11724 default:
11725 return 0;
11729 else if (GET_CODE (addr) == MULT)
11731 index = XEXP (addr, 0); /* index*scale */
11732 scale_rtx = XEXP (addr, 1);
11734 else if (GET_CODE (addr) == ASHIFT)
11736 /* We're called for lea too, which implements ashift on occasion. */
11737 index = XEXP (addr, 0);
11738 tmp = XEXP (addr, 1);
11739 if (!CONST_INT_P (tmp))
11740 return 0;
11741 scale = INTVAL (tmp);
11742 if ((unsigned HOST_WIDE_INT) scale > 3)
11743 return 0;
11744 scale = 1 << scale;
11745 retval = -1;
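/* (E.g. (ashift (reg) (const_int 2)) is index*4; the -1 return value marks
   the form as not strictly valid, but still usable for computing the
   length of an lea, as noted in the function comment.) */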
11747 else if (CONST_INT_P (addr))
11749 if (!x86_64_immediate_operand (addr, VOIDmode))
11750 return 0;
11752 /* Constant addresses are sign-extended to 64 bits; we have to
11753 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11754 if (TARGET_X32
11755 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11756 return 0;
11758 disp = addr;
11760 else
11761 disp = addr; /* displacement */
11763 if (index)
11765 if (REG_P (index))
11767 else if (GET_CODE (index) == SUBREG
11768 && ix86_address_subreg_operand (SUBREG_REG (index)))
11770 else
11771 return 0;
11774 /* Address override works only on the (%reg) part of %fs:(%reg). */
11775 if (seg != SEG_DEFAULT
11776 && ((base && GET_MODE (base) != word_mode)
11777 || (index && GET_MODE (index) != word_mode)))
11778 return 0;
11780 /* Extract the integral value of scale. */
11781 if (scale_rtx)
11783 if (!CONST_INT_P (scale_rtx))
11784 return 0;
11785 scale = INTVAL (scale_rtx);
11788 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11789 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11791 /* Avoid useless 0 displacement. */
11792 if (disp == const0_rtx && (base || index))
11793 disp = NULL_RTX;
11795 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11796 if (base_reg && index_reg && scale == 1
11797 && (index_reg == arg_pointer_rtx
11798 || index_reg == frame_pointer_rtx
11799 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11801 rtx tmp;
11802 tmp = base, base = index, index = tmp;
11803 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11806 /* Special case: %ebp cannot be encoded as a base without a displacement.
11807 Similarly %r13. */
11808 if (!disp
11809 && base_reg
11810 && (base_reg == hard_frame_pointer_rtx
11811 || base_reg == frame_pointer_rtx
11812 || base_reg == arg_pointer_rtx
11813 || (REG_P (base_reg)
11814 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11815 || REGNO (base_reg) == R13_REG))))
11816 disp = const0_rtx;
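/* (In the ModRM/SIB encoding, mod == 00 with base field 101 means "no base,
   disp32" (or RIP-relative in 64-bit mode), so a plain [%ebp] or [%r13] has
   to be emitted with an explicit zero displacement instead.) */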
11818 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11819 Avoid this by transforming to [%esi+0].
11820 Reload calls address legitimization without cfun defined, so we need
11821 to test cfun for being non-NULL. */
11822 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11823 && base_reg && !index_reg && !disp
11824 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11825 disp = const0_rtx;
11827 /* Special case: encode reg+reg instead of reg*2. */
11828 if (!base && index && scale == 2)
11829 base = index, base_reg = index_reg, scale = 1;
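/* (Without a base, a scaled index needs a 32-bit displacement in the SIB
   encoding, so base+index is the shorter form; compare the next special
   case below.) */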
11831 /* Special case: scaling cannot be encoded without base or displacement. */
11832 if (!base && !disp && index && scale != 1)
11833 disp = const0_rtx;
11835 out->base = base;
11836 out->index = index;
11837 out->disp = disp;
11838 out->scale = scale;
11839 out->seg = seg;
11841 return retval;
11844 /* Return cost of the memory address x.
11845 For i386, it is better to use a complex address than let gcc copy
11846 the address into a reg and make a new pseudo. But not if the address
11847 requires two regs - that would mean more pseudos with longer
11848 lifetimes. */
11849 static int
11850 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11851 addr_space_t as ATTRIBUTE_UNUSED,
11852 bool speed ATTRIBUTE_UNUSED)
11854 struct ix86_address parts;
11855 int cost = 1;
11856 int ok = ix86_decompose_address (x, &parts);
11858 gcc_assert (ok);
11860 if (parts.base && GET_CODE (parts.base) == SUBREG)
11861 parts.base = SUBREG_REG (parts.base);
11862 if (parts.index && GET_CODE (parts.index) == SUBREG)
11863 parts.index = SUBREG_REG (parts.index);
11865 /* Attempt to minimize number of registers in the address. */
11866 if ((parts.base
11867 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11868 || (parts.index
11869 && (!REG_P (parts.index)
11870 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11871 cost++;
11873 if (parts.base
11874 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11875 && parts.index
11876 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11877 && parts.base != parts.index)
11878 cost++;
11880 /* The AMD K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11881 since its predecode logic can't detect the length of such instructions
11882 and decoding degenerates to the vector decoder. Increase the cost of such
11883 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11884 to split such addresses or even refuse them altogether.
11886 The following addressing modes are affected:
11887 [base+scale*index]
11888 [scale*index+disp]
11889 [base+index]
11891 The first and last cases may be avoidable by explicitly coding the zero in
11892 the memory address, but I don't have an AMD K6 machine handy to check this
11893 theory. */
11895 if (TARGET_K6
11896 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11897 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11898 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11899 cost += 10;
11901 return cost;
11904 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11905 this is used to form addresses of local data when -fPIC is in
11906 use. */
11908 static bool
11909 darwin_local_data_pic (rtx disp)
11911 return (GET_CODE (disp) == UNSPEC
11912 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11915 /* Determine if a given RTX is a valid constant. We already know this
11916 satisfies CONSTANT_P. */
11918 static bool
11919 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11921 switch (GET_CODE (x))
11923 case CONST:
11924 x = XEXP (x, 0);
11926 if (GET_CODE (x) == PLUS)
11928 if (!CONST_INT_P (XEXP (x, 1)))
11929 return false;
11930 x = XEXP (x, 0);
11933 if (TARGET_MACHO && darwin_local_data_pic (x))
11934 return true;
11936 /* Only some unspecs are valid as "constants". */
11937 if (GET_CODE (x) == UNSPEC)
11938 switch (XINT (x, 1))
11940 case UNSPEC_GOT:
11941 case UNSPEC_GOTOFF:
11942 case UNSPEC_PLTOFF:
11943 return TARGET_64BIT;
11944 case UNSPEC_TPOFF:
11945 case UNSPEC_NTPOFF:
11946 x = XVECEXP (x, 0, 0);
11947 return (GET_CODE (x) == SYMBOL_REF
11948 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11949 case UNSPEC_DTPOFF:
11950 x = XVECEXP (x, 0, 0);
11951 return (GET_CODE (x) == SYMBOL_REF
11952 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11953 default:
11954 return false;
11957 /* We must have drilled down to a symbol. */
11958 if (GET_CODE (x) == LABEL_REF)
11959 return true;
11960 if (GET_CODE (x) != SYMBOL_REF)
11961 return false;
11962 /* FALLTHRU */
11964 case SYMBOL_REF:
11965 /* TLS symbols are never valid. */
11966 if (SYMBOL_REF_TLS_MODEL (x))
11967 return false;
11969 /* DLLIMPORT symbols are never valid. */
11970 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11971 && SYMBOL_REF_DLLIMPORT_P (x))
11972 return false;
11974 #if TARGET_MACHO
11975 /* mdynamic-no-pic */
11976 if (MACHO_DYNAMIC_NO_PIC_P)
11977 return machopic_symbol_defined_p (x);
11978 #endif
11979 break;
11981 case CONST_DOUBLE:
11982 if (GET_MODE (x) == TImode
11983 && x != CONST0_RTX (TImode)
11984 && !TARGET_64BIT)
11985 return false;
11986 break;
11988 case CONST_VECTOR:
11989 if (!standard_sse_constant_p (x))
11990 return false;
11992 default:
11993 break;
11996 /* Otherwise we handle everything else in the move patterns. */
11997 return true;
12000 /* Determine if it's legal to put X into the constant pool. This
12001 is not possible for the address of thread-local symbols, which
12002 is checked above. */
12004 static bool
12005 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12007 /* We can always put integral constants and vectors in memory. */
12008 switch (GET_CODE (x))
12010 case CONST_INT:
12011 case CONST_DOUBLE:
12012 case CONST_VECTOR:
12013 return false;
12015 default:
12016 break;
12018 return !ix86_legitimate_constant_p (mode, x);
12021 /* Return true if the symbol is marked as dllimport, or as a stub variable;
12022 otherwise return false. */
12024 static bool
12025 is_imported_p (rtx x)
12027 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12028 || GET_CODE (x) != SYMBOL_REF)
12029 return false;
12031 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12035 /* Nonzero if the constant value X is a legitimate general operand
12036 when generating PIC code. It is given that flag_pic is on and
12037 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12039 bool
12040 legitimate_pic_operand_p (rtx x)
12042 rtx inner;
12044 switch (GET_CODE (x))
12046 case CONST:
12047 inner = XEXP (x, 0);
12048 if (GET_CODE (inner) == PLUS
12049 && CONST_INT_P (XEXP (inner, 1)))
12050 inner = XEXP (inner, 0);
12052 /* Only some unspecs are valid as "constants". */
12053 if (GET_CODE (inner) == UNSPEC)
12054 switch (XINT (inner, 1))
12056 case UNSPEC_GOT:
12057 case UNSPEC_GOTOFF:
12058 case UNSPEC_PLTOFF:
12059 return TARGET_64BIT;
12060 case UNSPEC_TPOFF:
12061 x = XVECEXP (inner, 0, 0);
12062 return (GET_CODE (x) == SYMBOL_REF
12063 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12064 case UNSPEC_MACHOPIC_OFFSET:
12065 return legitimate_pic_address_disp_p (x);
12066 default:
12067 return false;
12069 /* FALLTHRU */
12071 case SYMBOL_REF:
12072 case LABEL_REF:
12073 return legitimate_pic_address_disp_p (x);
12075 default:
12076 return true;
12080 /* Determine if a given CONST RTX is a valid memory displacement
12081 in PIC mode. */
12083 bool
12084 legitimate_pic_address_disp_p (rtx disp)
12086 bool saw_plus;
12088 /* In 64bit mode we can allow direct addresses of symbols and labels
12089 when they are not dynamic symbols. */
12090 if (TARGET_64BIT)
12092 rtx op0 = disp, op1;
12094 switch (GET_CODE (disp))
12096 case LABEL_REF:
12097 return true;
12099 case CONST:
12100 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12101 break;
12102 op0 = XEXP (XEXP (disp, 0), 0);
12103 op1 = XEXP (XEXP (disp, 0), 1);
12104 if (!CONST_INT_P (op1)
12105 || INTVAL (op1) >= 16*1024*1024
12106 || INTVAL (op1) < -16*1024*1024)
12107 break;
12108 if (GET_CODE (op0) == LABEL_REF)
12109 return true;
12110 if (GET_CODE (op0) == CONST
12111 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12112 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12113 return true;
12114 if (GET_CODE (op0) == UNSPEC
12115 && XINT (op0, 1) == UNSPEC_PCREL)
12116 return true;
12117 if (GET_CODE (op0) != SYMBOL_REF)
12118 break;
12119 /* FALLTHRU */
12121 case SYMBOL_REF:
12122 /* TLS references should always be enclosed in an UNSPEC.
12123 A dllimported symbol always needs to be resolved. */
12124 if (SYMBOL_REF_TLS_MODEL (op0)
12125 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12126 return false;
12128 if (DEFAULT_ABI == MS_ABI)
12130 if (is_imported_p (op0))
12131 return true;
12133 if (SYMBOL_REF_FAR_ADDR_P (op0)
12134 || !SYMBOL_REF_LOCAL_P (op0))
12135 break;
12137 /* Function symbols need to be resolved only for
12138 the large model.
12139 For the small model we don't need to resolve anything
12140 here. */
12141 if ((ix86_cmodel != CM_LARGE_PIC
12142 && SYMBOL_REF_FUNCTION_P (op0))
12143 || ix86_cmodel == CM_SMALL_PIC)
12144 return true;
12145 /* Non-external symbols don't need to be resolved for
12146 the large and medium models. */
12147 if ((ix86_cmodel == CM_LARGE_PIC
12148 || ix86_cmodel == CM_MEDIUM_PIC)
12149 && !SYMBOL_REF_EXTERNAL_P (op0))
12150 return true;
12152 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12153 && SYMBOL_REF_LOCAL_P (op0)
12154 && ix86_cmodel != CM_LARGE_PIC)
12155 return true;
12156 break;
12158 default:
12159 break;
12162 if (GET_CODE (disp) != CONST)
12163 return false;
12164 disp = XEXP (disp, 0);
12166 if (TARGET_64BIT)
12168 /* It is unsafe to allow PLUS expressions; this would limit the allowed
12169 distance of GOT tables. We should not need these anyway. */
12170 if (GET_CODE (disp) != UNSPEC
12171 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12172 && XINT (disp, 1) != UNSPEC_GOTOFF
12173 && XINT (disp, 1) != UNSPEC_PCREL
12174 && XINT (disp, 1) != UNSPEC_PLTOFF))
12175 return false;
12177 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12178 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12179 return false;
12180 return true;
12183 saw_plus = false;
12184 if (GET_CODE (disp) == PLUS)
12186 if (!CONST_INT_P (XEXP (disp, 1)))
12187 return false;
12188 disp = XEXP (disp, 0);
12189 saw_plus = true;
12192 if (TARGET_MACHO && darwin_local_data_pic (disp))
12193 return true;
12195 if (GET_CODE (disp) != UNSPEC)
12196 return false;
12198 switch (XINT (disp, 1))
12200 case UNSPEC_GOT:
12201 if (saw_plus)
12202 return false;
12203 /* We need to check for both symbols and labels because VxWorks loads
12204 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12205 details. */
12206 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12207 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12208 case UNSPEC_GOTOFF:
12209 /* Refuse GOTOFF in 64-bit mode since it is always 64 bits wide when used.
12210 While the ABI also specifies a 32-bit relocation, we don't produce it in
12211 the small PIC model at all. */
12212 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12213 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12214 && !TARGET_64BIT)
12215 return DEFAULT_ABI != MS_ABI && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12216 return false;
12217 case UNSPEC_GOTTPOFF:
12218 case UNSPEC_GOTNTPOFF:
12219 case UNSPEC_INDNTPOFF:
12220 if (saw_plus)
12221 return false;
12222 disp = XVECEXP (disp, 0, 0);
12223 return (GET_CODE (disp) == SYMBOL_REF
12224 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12225 case UNSPEC_NTPOFF:
12226 disp = XVECEXP (disp, 0, 0);
12227 return (GET_CODE (disp) == SYMBOL_REF
12228 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12229 case UNSPEC_DTPOFF:
12230 disp = XVECEXP (disp, 0, 0);
12231 return (GET_CODE (disp) == SYMBOL_REF
12232 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12235 return false;
12238 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Reloads the invalid
12239 parts of the memory address X in place, if possible. Returns true if
12240 something was reloaded, in which case the calling macro should goto WIN,
12241 and false if it should not. */
12243 bool
12244 ix86_legitimize_reload_address (rtx x,
12245 enum machine_mode mode ATTRIBUTE_UNUSED,
12246 int opnum, int type,
12247 int ind_levels ATTRIBUTE_UNUSED)
12249 /* Reload can generate:
12251 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12252 (reg:DI 97))
12253 (reg:DI 2 cx))
12255 This RTX is rejected by ix86_legitimate_address_p due to
12256 non-strictness of base register 97. Following this rejection,
12257 reload pushes all three components into separate registers,
12258 creating an invalid memory address RTX.
12260 The following code reloads only the invalid parts of the
12261 memory address RTX. */
12263 if (GET_CODE (x) == PLUS
12264 && REG_P (XEXP (x, 1))
12265 && GET_CODE (XEXP (x, 0)) == PLUS
12266 && REG_P (XEXP (XEXP (x, 0), 1)))
12268 rtx base, index;
12269 bool something_reloaded = false;
12271 base = XEXP (XEXP (x, 0), 1);
12272 if (!REG_OK_FOR_BASE_STRICT_P (base))
12274 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12275 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12276 opnum, (enum reload_type) type);
12277 something_reloaded = true;
12280 index = XEXP (x, 1);
12281 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12283 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12284 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12285 opnum, (enum reload_type) type);
12286 something_reloaded = true;
12289 gcc_assert (something_reloaded);
12290 return true;
12293 return false;
12296 /* Recognizes RTL expressions that are valid memory addresses for an
12297 instruction. The MODE argument is the machine mode for the MEM
12298 expression that wants to use this address.
12300 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12301 convert common non-canonical forms to canonical form so that they will
12302 be recognized. */
12304 static bool
12305 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12306 rtx addr, bool strict)
12308 struct ix86_address parts;
12309 rtx base, index, disp;
12310 HOST_WIDE_INT scale;
12312 if (ix86_decompose_address (addr, &parts) <= 0)
12313 /* Decomposition failed. */
12314 return false;
12316 base = parts.base;
12317 index = parts.index;
12318 disp = parts.disp;
12319 scale = parts.scale;
12321 /* Validate base register. */
12322 if (base)
12324 rtx reg;
12326 if (REG_P (base))
12327 reg = base;
12328 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12329 reg = SUBREG_REG (base);
12330 else
12331 /* Base is not a register. */
12332 return false;
12334 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12335 return false;
12337 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12338 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12339 /* Base is not valid. */
12340 return false;
12343 /* Validate index register. */
12344 if (index)
12346 rtx reg;
12348 if (REG_P (index))
12349 reg = index;
12350 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12351 reg = SUBREG_REG (index);
12352 else
12353 /* Index is not a register. */
12354 return false;
12356 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12357 return false;
12359 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12360 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12361 /* Index is not valid. */
12362 return false;
12365 /* Index and base should have the same mode. */
12366 if (base && index
12367 && GET_MODE (base) != GET_MODE (index))
12368 return false;
12370 /* Validate scale factor. */
12371 if (scale != 1)
12373 if (!index)
12374 /* Scale without index. */
12375 return false;
12377 if (scale != 2 && scale != 4 && scale != 8)
12378 /* Scale is not a valid multiplier. */
12379 return false;
12382 /* Validate displacement. */
12383 if (disp)
12385 if (GET_CODE (disp) == CONST
12386 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12387 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12388 switch (XINT (XEXP (disp, 0), 1))
12390 /* Refuse GOTOFF and GOT in 64-bit mode since they are always 64 bits wide
12391 when used. While the ABI also specifies 32-bit relocations, we don't
12392 produce them at all and use IP-relative addressing instead. */
12393 case UNSPEC_GOT:
12394 case UNSPEC_GOTOFF:
12395 gcc_assert (flag_pic);
12396 if (!TARGET_64BIT)
12397 goto is_legitimate_pic;
12399 /* 64bit address unspec. */
12400 return false;
12402 case UNSPEC_GOTPCREL:
12403 case UNSPEC_PCREL:
12404 gcc_assert (flag_pic);
12405 goto is_legitimate_pic;
12407 case UNSPEC_GOTTPOFF:
12408 case UNSPEC_GOTNTPOFF:
12409 case UNSPEC_INDNTPOFF:
12410 case UNSPEC_NTPOFF:
12411 case UNSPEC_DTPOFF:
12412 break;
12414 case UNSPEC_STACK_CHECK:
12415 gcc_assert (flag_split_stack);
12416 break;
12418 default:
12419 /* Invalid address unspec. */
12420 return false;
12423 else if (SYMBOLIC_CONST (disp)
12424 && (flag_pic
12425 || (TARGET_MACHO
12426 #if TARGET_MACHO
12427 && MACHOPIC_INDIRECT
12428 && !machopic_operand_p (disp)
12429 #endif
12433 is_legitimate_pic:
12434 if (TARGET_64BIT && (index || base))
12436 /* foo@dtpoff(%rX) is ok. */
12437 if (GET_CODE (disp) != CONST
12438 || GET_CODE (XEXP (disp, 0)) != PLUS
12439 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12440 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12441 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12442 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12443 /* Non-constant pic memory reference. */
12444 return false;
12446 else if ((!TARGET_MACHO || flag_pic)
12447 && ! legitimate_pic_address_disp_p (disp))
12448 /* Displacement is an invalid pic construct. */
12449 return false;
12450 #if TARGET_MACHO
12451 else if (MACHO_DYNAMIC_NO_PIC_P
12452 && !ix86_legitimate_constant_p (Pmode, disp))
12453 /* displacement must be referenced via a non_lazy_pointer */
12454 return false;
12455 #endif
12457 /* This code used to verify that a symbolic pic displacement
12458 includes the pic_offset_table_rtx register.
12460 While this is a good idea, unfortunately these constructs may
12461 be created by the "adds using lea" optimization for incorrect
12462 code like:
12464 int a;
12465 int foo(int i)
12467 return *(&a+i);
12470 This code is nonsensical, but results in addressing the
12471 GOT table with a pic_offset_table_rtx base. We can't
12472 just refuse it easily, since it gets matched by the
12473 "addsi3" pattern, which later gets split to an lea when the
12474 output register differs from the input. While this
12475 could be handled by a separate addsi pattern for this case
12476 that never results in an lea, disabling this test seems to be
12477 the easier and correct fix for the crash. */
12479 else if (GET_CODE (disp) != LABEL_REF
12480 && !CONST_INT_P (disp)
12481 && (GET_CODE (disp) != CONST
12482 || !ix86_legitimate_constant_p (Pmode, disp))
12483 && (GET_CODE (disp) != SYMBOL_REF
12484 || !ix86_legitimate_constant_p (Pmode, disp)))
12485 /* Displacement is not constant. */
12486 return false;
12487 else if (TARGET_64BIT
12488 && !x86_64_immediate_operand (disp, VOIDmode))
12489 /* Displacement is out of range. */
12490 return false;
12493 /* Everything looks valid. */
12494 return true;
12497 /* Determine if a given RTX is a valid constant address. */
12499 bool
12500 constant_address_p (rtx x)
12502 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12505 /* Return a unique alias set for the GOT. */
12507 static alias_set_type
12508 ix86_GOT_alias_set (void)
12510 static alias_set_type set = -1;
12511 if (set == -1)
12512 set = new_alias_set ();
12513 return set;
12516 /* Return a legitimate reference for ORIG (an address) using the
12517 register REG. If REG is 0, a new pseudo is generated.
12519 There are two types of references that must be handled:
12521 1. Global data references must load the address from the GOT, via
12522 the PIC reg. An insn is emitted to do this load, and the reg is
12523 returned.
12525 2. Static data references, constant pool addresses, and code labels
12526 compute the address as an offset from the GOT, whose base is in
12527 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12528 differentiate them from global data objects. The returned
12529 address is the PIC reg + an unspec constant.
12531 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12532 reg also appears in the address. */
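/* A rough sketch of the two cases, assuming the @GOT/@GOTOFF forms built
   below: a global symbol "y" becomes a load such as
     (mem (plus pic_reg (const (unspec [y] UNSPEC_GOT))))
   while a local symbol "x" is addressed directly as
     (plus pic_reg (const (unspec [x] UNSPEC_GOTOFF))). */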
12534 static rtx
12535 legitimize_pic_address (rtx orig, rtx reg)
12537 rtx addr = orig;
12538 rtx new_rtx = orig;
12540 #if TARGET_MACHO
12541 if (TARGET_MACHO && !TARGET_64BIT)
12543 if (reg == 0)
12544 reg = gen_reg_rtx (Pmode);
12545 /* Use the generic Mach-O PIC machinery. */
12546 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12548 #endif
12550 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12552 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12553 if (tmp)
12554 return tmp;
12557 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12558 new_rtx = addr;
12559 else if (TARGET_64BIT && DEFAULT_ABI != MS_ABI
12560 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12562 rtx tmpreg;
12563 /* This symbol may be referenced via a displacement from the PIC
12564 base address (@GOTOFF). */
12566 if (reload_in_progress)
12567 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12568 if (GET_CODE (addr) == CONST)
12569 addr = XEXP (addr, 0);
12570 if (GET_CODE (addr) == PLUS)
12572 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12573 UNSPEC_GOTOFF);
12574 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12576 else
12577 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12578 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12579 if (!reg)
12580 tmpreg = gen_reg_rtx (Pmode);
12581 else
12582 tmpreg = reg;
12583 emit_move_insn (tmpreg, new_rtx);
12585 if (reg != 0)
12587 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12588 tmpreg, 1, OPTAB_DIRECT);
12589 new_rtx = reg;
12591 else
12592 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12594 else if (!TARGET_64BIT && DEFAULT_ABI != MS_ABI && gotoff_operand (addr, Pmode))
12596 /* This symbol may be referenced via a displacement from the PIC
12597 base address (@GOTOFF). */
12599 if (reload_in_progress)
12600 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12601 if (GET_CODE (addr) == CONST)
12602 addr = XEXP (addr, 0);
12603 if (GET_CODE (addr) == PLUS)
12605 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12606 UNSPEC_GOTOFF);
12607 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12609 else
12610 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12611 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12612 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12614 if (reg != 0)
12616 emit_move_insn (reg, new_rtx);
12617 new_rtx = reg;
12620 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12621 /* We can't use @GOTOFF for text labels on VxWorks;
12622 see gotoff_operand. */
12623 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12625 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12626 if (tmp)
12627 return tmp;
12629 /* For x64 PE-COFF there is no GOT table, so we use the address
12630 directly. */
12631 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12633 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12634 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12636 if (reg == 0)
12637 reg = gen_reg_rtx (Pmode);
12638 emit_move_insn (reg, new_rtx);
12639 new_rtx = reg;
12641 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12643 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12644 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12645 new_rtx = gen_const_mem (Pmode, new_rtx);
12646 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12648 if (reg == 0)
12649 reg = gen_reg_rtx (Pmode);
12650 /* Use gen_movsi directly; otherwise the address is loaded
12651 into a register for CSE. We don't want to CSE these addresses;
12652 instead we CSE addresses from the GOT table, so skip this. */
12653 emit_insn (gen_movsi (reg, new_rtx));
12654 new_rtx = reg;
12656 else
12658 /* This symbol must be referenced via a load from the
12659 Global Offset Table (@GOT). */
12661 if (reload_in_progress)
12662 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12663 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12664 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12665 if (TARGET_64BIT)
12666 new_rtx = force_reg (Pmode, new_rtx);
12667 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12668 new_rtx = gen_const_mem (Pmode, new_rtx);
12669 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12671 if (reg == 0)
12672 reg = gen_reg_rtx (Pmode);
12673 emit_move_insn (reg, new_rtx);
12674 new_rtx = reg;
12677 else
12679 if (CONST_INT_P (addr)
12680 && !x86_64_immediate_operand (addr, VOIDmode))
12682 if (reg)
12684 emit_move_insn (reg, addr);
12685 new_rtx = reg;
12687 else
12688 new_rtx = force_reg (Pmode, addr);
12690 else if (GET_CODE (addr) == CONST)
12692 addr = XEXP (addr, 0);
12694 /* We must match stuff we generate before. Assume the only
12695 unspecs that can get here are ours. Not that we could do
12696 anything with them anyway.... */
12697 if (GET_CODE (addr) == UNSPEC
12698 || (GET_CODE (addr) == PLUS
12699 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12700 return orig;
12701 gcc_assert (GET_CODE (addr) == PLUS);
12703 if (GET_CODE (addr) == PLUS)
12705 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12707 /* Check first to see if this is a constant offset from a @GOTOFF
12708 symbol reference. */
12709 if (DEFAULT_ABI != MS_ABI && gotoff_operand (op0, Pmode)
12710 && CONST_INT_P (op1))
12712 if (!TARGET_64BIT)
12714 if (reload_in_progress)
12715 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12716 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12717 UNSPEC_GOTOFF);
12718 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12719 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12720 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12722 if (reg != 0)
12724 emit_move_insn (reg, new_rtx);
12725 new_rtx = reg;
12728 else
12730 if (INTVAL (op1) < -16*1024*1024
12731 || INTVAL (op1) >= 16*1024*1024)
12733 if (!x86_64_immediate_operand (op1, Pmode))
12734 op1 = force_reg (Pmode, op1);
12735 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12739 else
12741 rtx base = legitimize_pic_address (op0, reg);
12742 enum machine_mode mode = GET_MODE (base);
12743 new_rtx
12744 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12746 if (CONST_INT_P (new_rtx))
12748 if (INTVAL (new_rtx) < -16*1024*1024
12749 || INTVAL (new_rtx) >= 16*1024*1024)
12751 if (!x86_64_immediate_operand (new_rtx, mode))
12752 new_rtx = force_reg (mode, new_rtx);
12753 new_rtx
12754 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12756 else
12757 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12759 else
12761 if (GET_CODE (new_rtx) == PLUS
12762 && CONSTANT_P (XEXP (new_rtx, 1)))
12764 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12765 new_rtx = XEXP (new_rtx, 1);
12767 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12772 return new_rtx;
12775 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12777 static rtx
12778 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12780 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12782 if (GET_MODE (tp) != tp_mode)
12784 gcc_assert (GET_MODE (tp) == SImode);
12785 gcc_assert (tp_mode == DImode);
12787 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12790 if (to_reg)
12791 tp = copy_to_mode_reg (tp_mode, tp);
12793 return tp;
12796 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12798 static GTY(()) rtx ix86_tls_symbol;
12800 static rtx
12801 ix86_tls_get_addr (void)
12803 if (!ix86_tls_symbol)
12805 const char *sym
12806 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12807 ? "___tls_get_addr" : "__tls_get_addr");
12809 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12812 return ix86_tls_symbol;
12815 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12817 static GTY(()) rtx ix86_tls_module_base_symbol;
12820 ix86_tls_module_base (void)
12822 if (!ix86_tls_module_base_symbol)
12824 ix86_tls_module_base_symbol
12825 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12827 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12828 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12831 return ix86_tls_module_base_symbol;
12834 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12835 false if we expect this to be used for a memory address and true if
12836 we expect to load the address into a register. */
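/* Roughly, for the GNU/ELF flavour of TLS the models correspond to the
   usual code sequences (shown here for 32 bits; 64-bit code is
   analogous, using %fs and the matching relocations):

     global-dynamic:  leal  x@tlsgd(,%ebx,1), %eax
                      call  ___tls_get_addr@PLT
     local-dynamic:   one ___tls_get_addr call for the module base,
                      then x@dtpoff offsets from that base
     initial-exec:    movl  x@gotntpoff(%ebx), %ecx
                      movl  %gs:(%ecx), %eax
     local-exec:      movl  %gs:x@ntpoff, %eax

   The UNSPEC_DTPOFF / UNSPEC_*TPOFF wrappers built below correspond to
   those relocations.  */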
12838 static rtx
12839 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12841 rtx dest, base, off;
12842 rtx pic = NULL_RTX, tp = NULL_RTX;
12843 enum machine_mode tp_mode = Pmode;
12844 int type;
12846 switch (model)
12848 case TLS_MODEL_GLOBAL_DYNAMIC:
12849 dest = gen_reg_rtx (Pmode);
12851 if (!TARGET_64BIT)
12853 if (flag_pic && DEFAULT_ABI != MS_ABI)
12854 pic = pic_offset_table_rtx;
12855 else
12857 pic = gen_reg_rtx (Pmode);
12858 emit_insn (gen_set_got (pic));
12862 if (TARGET_GNU2_TLS)
12864 if (TARGET_64BIT)
12865 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12866 else
12867 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12869 tp = get_thread_pointer (Pmode, true);
12870 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12872 if (GET_MODE (x) != Pmode)
12873 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12875 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12877 else
12879 rtx caddr = ix86_tls_get_addr ();
12881 if (TARGET_64BIT)
12883 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12884 rtx insns;
12886 start_sequence ();
12887 emit_call_insn
12888 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12889 insns = get_insns ();
12890 end_sequence ();
12892 if (GET_MODE (x) != Pmode)
12893 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12895 RTL_CONST_CALL_P (insns) = 1;
12896 emit_libcall_block (insns, dest, rax, x);
12898 else
12899 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12901 break;
12903 case TLS_MODEL_LOCAL_DYNAMIC:
12904 base = gen_reg_rtx (Pmode);
12906 if (!TARGET_64BIT)
12908 if (flag_pic)
12909 pic = pic_offset_table_rtx;
12910 else
12912 pic = gen_reg_rtx (Pmode);
12913 emit_insn (gen_set_got (pic));
12917 if (TARGET_GNU2_TLS)
12919 rtx tmp = ix86_tls_module_base ();
12921 if (TARGET_64BIT)
12922 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12923 else
12924 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12926 tp = get_thread_pointer (Pmode, true);
12927 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12928 gen_rtx_MINUS (Pmode, tmp, tp));
12930 else
12932 rtx caddr = ix86_tls_get_addr ();
12934 if (TARGET_64BIT)
12936 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12937 rtx insns, eqv;
12939 start_sequence ();
12940 emit_call_insn
12941 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12942 insns = get_insns ();
12943 end_sequence ();
12945 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12946 share the LD_BASE result with other LD model accesses. */
12947 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12948 UNSPEC_TLS_LD_BASE);
12950 RTL_CONST_CALL_P (insns) = 1;
12951 emit_libcall_block (insns, base, rax, eqv);
12953 else
12954 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12957 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12958 off = gen_rtx_CONST (Pmode, off);
12960 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12962 if (TARGET_GNU2_TLS)
12964 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12966 if (GET_MODE (x) != Pmode)
12967 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12969 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12971 break;
12973 case TLS_MODEL_INITIAL_EXEC:
12974 if (TARGET_64BIT)
12976 if (TARGET_SUN_TLS && !TARGET_X32)
12978 /* The Sun linker took the AMD64 TLS spec literally
12979 and can only handle %rax as the destination of the
12980 initial-exec code sequence. */
12982 dest = gen_reg_rtx (DImode);
12983 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12984 return dest;
12987 /* Generate DImode references to avoid %fs:(%reg32)
12988 problems and a linker IE->LE relaxation bug. */
12989 tp_mode = DImode;
12990 pic = NULL;
12991 type = UNSPEC_GOTNTPOFF;
12993 else if (flag_pic)
12995 if (reload_in_progress)
12996 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12997 pic = pic_offset_table_rtx;
12998 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13000 else if (!TARGET_ANY_GNU_TLS)
13002 pic = gen_reg_rtx (Pmode);
13003 emit_insn (gen_set_got (pic));
13004 type = UNSPEC_GOTTPOFF;
13006 else
13008 pic = NULL;
13009 type = UNSPEC_INDNTPOFF;
13012 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13013 off = gen_rtx_CONST (tp_mode, off);
13014 if (pic)
13015 off = gen_rtx_PLUS (tp_mode, pic, off);
13016 off = gen_const_mem (tp_mode, off);
13017 set_mem_alias_set (off, ix86_GOT_alias_set ());
13019 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13021 base = get_thread_pointer (tp_mode,
13022 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13023 off = force_reg (tp_mode, off);
13024 return gen_rtx_PLUS (tp_mode, base, off);
13026 else
13028 base = get_thread_pointer (Pmode, true);
13029 dest = gen_reg_rtx (Pmode);
13030 emit_insn (ix86_gen_sub3 (dest, base, off));
13032 break;
13034 case TLS_MODEL_LOCAL_EXEC:
13035 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13036 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13037 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13038 off = gen_rtx_CONST (Pmode, off);
13040 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13042 base = get_thread_pointer (Pmode,
13043 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13044 return gen_rtx_PLUS (Pmode, base, off);
13046 else
13048 base = get_thread_pointer (Pmode, true);
13049 dest = gen_reg_rtx (Pmode);
13050 emit_insn (ix86_gen_sub3 (dest, base, off));
13052 break;
13054 default:
13055 gcc_unreachable ();
13058 return dest;
13061 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13062 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13063 unique refptr-DECL symbol corresponding to symbol DECL. */
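/* As a sketch of the effect: a reference to a dllimport'ed "foo" is
   redirected through its import-table slot, so a use of foo becomes,
   roughly,

	movl	__imp__foo, %eax
	movl	(%eax), %edx

   (the prefix is __imp_ or __imp__ depending on the user label
   prefix).  The refptr-DECL case works the same way but goes through
   a .refptr. stub, which is used for external data under the medium
   and large code models handled below.  */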
13065 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13066 htab_t dllimport_map;
13068 static tree
13069 get_dllimport_decl (tree decl, bool beimport)
13071 struct tree_map *h, in;
13072 void **loc;
13073 const char *name;
13074 const char *prefix;
13075 size_t namelen, prefixlen;
13076 char *imp_name;
13077 tree to;
13078 rtx rtl;
13080 if (!dllimport_map)
13081 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13083 in.hash = htab_hash_pointer (decl);
13084 in.base.from = decl;
13085 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13086 h = (struct tree_map *) *loc;
13087 if (h)
13088 return h->to;
13090 *loc = h = ggc_alloc_tree_map ();
13091 h->hash = in.hash;
13092 h->base.from = decl;
13093 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13094 VAR_DECL, NULL, ptr_type_node);
13095 DECL_ARTIFICIAL (to) = 1;
13096 DECL_IGNORED_P (to) = 1;
13097 DECL_EXTERNAL (to) = 1;
13098 TREE_READONLY (to) = 1;
13100 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13101 name = targetm.strip_name_encoding (name);
13102 if (beimport)
13103 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13104 ? "*__imp_" : "*__imp__";
13105 else
13106 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13107 namelen = strlen (name);
13108 prefixlen = strlen (prefix);
13109 imp_name = (char *) alloca (namelen + prefixlen + 1);
13110 memcpy (imp_name, prefix, prefixlen);
13111 memcpy (imp_name + prefixlen, name, namelen + 1);
13113 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13114 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13115 SET_SYMBOL_REF_DECL (rtl, to);
13116 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13117 if (!beimport)
13119 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13120 #ifdef SUB_TARGET_RECORD_STUB
13121 SUB_TARGET_RECORD_STUB (name);
13122 #endif
13125 rtl = gen_const_mem (Pmode, rtl);
13126 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13128 SET_DECL_RTL (to, rtl);
13129 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13131 return to;
13134 /* Expand SYMBOL into its corresponding far-address symbol.
13135 WANT_REG is true if we require the result to be a register. */
13137 static rtx
13138 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13140 tree imp_decl;
13141 rtx x;
13143 gcc_assert (SYMBOL_REF_DECL (symbol));
13144 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13146 x = DECL_RTL (imp_decl);
13147 if (want_reg)
13148 x = force_reg (Pmode, x);
13149 return x;
13152 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13153 true if we require the result to be a register. */
13155 static rtx
13156 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13158 tree imp_decl;
13159 rtx x;
13161 gcc_assert (SYMBOL_REF_DECL (symbol));
13162 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13164 x = DECL_RTL (imp_decl);
13165 if (want_reg)
13166 x = force_reg (Pmode, x);
13167 return x;
13170 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13171 is true if we require the result to be a register. */
13173 static rtx
13174 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13176 if (DEFAULT_ABI != MS_ABI)
13177 return NULL_RTX;
13179 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13181 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13182 return legitimize_dllimport_symbol (addr, inreg);
13183 if (GET_CODE (addr) == CONST
13184 && GET_CODE (XEXP (addr, 0)) == PLUS
13185 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13186 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13188 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13189 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13193 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13194 return NULL_RTX;
13195 if (GET_CODE (addr) == SYMBOL_REF
13196 && !is_imported_p (addr)
13197 && SYMBOL_REF_EXTERNAL_P (addr)
13198 && SYMBOL_REF_DECL (addr))
13199 return legitimize_pe_coff_extern_decl (addr, inreg);
13201 if (GET_CODE (addr) == CONST
13202 && GET_CODE (XEXP (addr, 0)) == PLUS
13203 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13204 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13205 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13206 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13208 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13209 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13211 return NULL_RTX;
13214 /* Try machine-dependent ways of modifying an illegitimate address
13215 to be legitimate. If we find one, return the new, valid address.
13216 This macro is used in only one place: `memory_address' in explow.c.
13218 OLDX is the address as it was before break_out_memory_refs was called.
13219 In some cases it is useful to look at this to decide what needs to be done.
13221 It is always safe for this macro to do nothing. It exists to recognize
13222 opportunities to optimize the output.
13224 For the 80386, we handle X+REG by loading X into a register R and
13225 using R+REG. R will go in a general reg and indexing will be used.
13226 However, if REG is a broken-out memory address or multiplication,
13227 nothing needs to be done because REG can certainly go in a general reg.
13229 When -fpic is used, special handling is needed for symbolic references.
13230 See comments by legitimize_pic_address in i386.c for details. */
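/* A couple of the transformations performed below, in RTL terms:

     (plus (ashift r 2) r')                ->  (plus (mult r 4) r')
     (plus (mult r 4) (plus r' (const_int 8)))
                                           ->  (plus (plus (mult r 4) r')
                                                     (const_int 8))

   i.e. shifts by 0..3 are rewritten as multiplies so the result
   matches the scaled-index form of the addressing modes, and nested
   constant displacements are pulled out to the top level.  */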
13232 static rtx
13233 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13234 enum machine_mode mode)
13236 int changed = 0;
13237 unsigned log;
13239 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13240 if (log)
13241 return legitimize_tls_address (x, (enum tls_model) log, false);
13242 if (GET_CODE (x) == CONST
13243 && GET_CODE (XEXP (x, 0)) == PLUS
13244 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13245 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13247 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13248 (enum tls_model) log, false);
13249 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13252 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13254 rtx tmp = legitimize_pe_coff_symbol (x, true);
13255 if (tmp)
13256 return tmp;
13259 if (flag_pic && SYMBOLIC_CONST (x))
13260 return legitimize_pic_address (x, 0);
13262 #if TARGET_MACHO
13263 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13264 return machopic_indirect_data_reference (x, 0);
13265 #endif
13267 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13268 if (GET_CODE (x) == ASHIFT
13269 && CONST_INT_P (XEXP (x, 1))
13270 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13272 changed = 1;
13273 log = INTVAL (XEXP (x, 1));
13274 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13275 GEN_INT (1 << log));
13278 if (GET_CODE (x) == PLUS)
13280 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13282 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13283 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13284 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13286 changed = 1;
13287 log = INTVAL (XEXP (XEXP (x, 0), 1));
13288 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13289 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13290 GEN_INT (1 << log));
13293 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13294 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13295 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13297 changed = 1;
13298 log = INTVAL (XEXP (XEXP (x, 1), 1));
13299 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13300 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13301 GEN_INT (1 << log));
13304 /* Put multiply first if it isn't already. */
13305 if (GET_CODE (XEXP (x, 1)) == MULT)
13307 rtx tmp = XEXP (x, 0);
13308 XEXP (x, 0) = XEXP (x, 1);
13309 XEXP (x, 1) = tmp;
13310 changed = 1;
13313 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13314 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13315 created by virtual register instantiation, register elimination, and
13316 similar optimizations. */
13317 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13319 changed = 1;
13320 x = gen_rtx_PLUS (Pmode,
13321 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13322 XEXP (XEXP (x, 1), 0)),
13323 XEXP (XEXP (x, 1), 1));
13326 /* Canonicalize
13327 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13328 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13329 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13330 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13331 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13332 && CONSTANT_P (XEXP (x, 1)))
13334 rtx constant;
13335 rtx other = NULL_RTX;
13337 if (CONST_INT_P (XEXP (x, 1)))
13339 constant = XEXP (x, 1);
13340 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13342 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13344 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13345 other = XEXP (x, 1);
13347 else
13348 constant = 0;
13350 if (constant)
13352 changed = 1;
13353 x = gen_rtx_PLUS (Pmode,
13354 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13355 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13356 plus_constant (Pmode, other,
13357 INTVAL (constant)));
13361 if (changed && ix86_legitimate_address_p (mode, x, false))
13362 return x;
13364 if (GET_CODE (XEXP (x, 0)) == MULT)
13366 changed = 1;
13367 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13370 if (GET_CODE (XEXP (x, 1)) == MULT)
13372 changed = 1;
13373 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13376 if (changed
13377 && REG_P (XEXP (x, 1))
13378 && REG_P (XEXP (x, 0)))
13379 return x;
13381 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13383 changed = 1;
13384 x = legitimize_pic_address (x, 0);
13387 if (changed && ix86_legitimate_address_p (mode, x, false))
13388 return x;
13390 if (REG_P (XEXP (x, 0)))
13392 rtx temp = gen_reg_rtx (Pmode);
13393 rtx val = force_operand (XEXP (x, 1), temp);
13394 if (val != temp)
13396 val = convert_to_mode (Pmode, val, 1);
13397 emit_move_insn (temp, val);
13400 XEXP (x, 1) = temp;
13401 return x;
13404 else if (REG_P (XEXP (x, 1)))
13406 rtx temp = gen_reg_rtx (Pmode);
13407 rtx val = force_operand (XEXP (x, 0), temp);
13408 if (val != temp)
13410 val = convert_to_mode (Pmode, val, 1);
13411 emit_move_insn (temp, val);
13414 XEXP (x, 0) = temp;
13415 return x;
13419 return x;
13422 /* Print an integer constant expression in assembler syntax. Addition
13423 and subtraction are the only arithmetic that may appear in these
13424 expressions. FILE is the stdio stream to write to, X is the rtx, and
13425 CODE is the operand print code from the output string. */
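/* For example, depending on the UNSPEC wrapping the symbol, this
   prints things like "foo@GOT", "foo@GOTOFF", "foo@GOTPCREL(%rip)",
   "foo@dtpoff" or "foo@ntpoff"; with the 'P' operand code a non-local
   symbol additionally gets an "@PLT" suffix.  */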
13427 static void
13428 output_pic_addr_const (FILE *file, rtx x, int code)
13430 char buf[256];
13432 switch (GET_CODE (x))
13434 case PC:
13435 gcc_assert (flag_pic);
13436 putc ('.', file);
13437 break;
13439 case SYMBOL_REF:
13440 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13441 output_addr_const (file, x);
13442 else
13444 const char *name = XSTR (x, 0);
13446 /* Mark the decl as referenced so that cgraph will
13447 output the function. */
13448 if (SYMBOL_REF_DECL (x))
13449 mark_decl_referenced (SYMBOL_REF_DECL (x));
13451 #if TARGET_MACHO
13452 if (MACHOPIC_INDIRECT
13453 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13454 name = machopic_indirection_name (x, /*stub_p=*/true);
13455 #endif
13456 assemble_name (file, name);
13458 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13459 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13460 fputs ("@PLT", file);
13461 break;
13463 case LABEL_REF:
13464 x = XEXP (x, 0);
13465 /* FALLTHRU */
13466 case CODE_LABEL:
13467 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13468 assemble_name (asm_out_file, buf);
13469 break;
13471 case CONST_INT:
13472 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13473 break;
13475 case CONST:
13476 /* This used to output parentheses around the expression,
13477 but that does not work on the 386 (either ATT or BSD assembler). */
13478 output_pic_addr_const (file, XEXP (x, 0), code);
13479 break;
13481 case CONST_DOUBLE:
13482 if (GET_MODE (x) == VOIDmode)
13484 /* We can use %d if the number is <32 bits and positive. */
13485 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13486 fprintf (file, "0x%lx%08lx",
13487 (unsigned long) CONST_DOUBLE_HIGH (x),
13488 (unsigned long) CONST_DOUBLE_LOW (x));
13489 else
13490 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13492 else
13493 /* We can't handle floating point constants;
13494 TARGET_PRINT_OPERAND must handle them. */
13495 output_operand_lossage ("floating constant misused");
13496 break;
13498 case PLUS:
13499 /* Some assemblers need integer constants to appear first. */
13500 if (CONST_INT_P (XEXP (x, 0)))
13502 output_pic_addr_const (file, XEXP (x, 0), code);
13503 putc ('+', file);
13504 output_pic_addr_const (file, XEXP (x, 1), code);
13506 else
13508 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13509 output_pic_addr_const (file, XEXP (x, 1), code);
13510 putc ('+', file);
13511 output_pic_addr_const (file, XEXP (x, 0), code);
13513 break;
13515 case MINUS:
13516 if (!TARGET_MACHO)
13517 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13518 output_pic_addr_const (file, XEXP (x, 0), code);
13519 putc ('-', file);
13520 output_pic_addr_const (file, XEXP (x, 1), code);
13521 if (!TARGET_MACHO)
13522 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13523 break;
13525 case UNSPEC:
13526 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13528 bool f = i386_asm_output_addr_const_extra (file, x);
13529 gcc_assert (f);
13530 break;
13533 gcc_assert (XVECLEN (x, 0) == 1);
13534 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13535 switch (XINT (x, 1))
13537 case UNSPEC_GOT:
13538 fputs ("@GOT", file);
13539 break;
13540 case UNSPEC_GOTOFF:
13541 fputs ("@GOTOFF", file);
13542 break;
13543 case UNSPEC_PLTOFF:
13544 fputs ("@PLTOFF", file);
13545 break;
13546 case UNSPEC_PCREL:
13547 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13548 "(%rip)" : "[rip]", file);
13549 break;
13550 case UNSPEC_GOTPCREL:
13551 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13552 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13553 break;
13554 case UNSPEC_GOTTPOFF:
13555 /* FIXME: This might be @TPOFF in Sun ld too. */
13556 fputs ("@gottpoff", file);
13557 break;
13558 case UNSPEC_TPOFF:
13559 fputs ("@tpoff", file);
13560 break;
13561 case UNSPEC_NTPOFF:
13562 if (TARGET_64BIT)
13563 fputs ("@tpoff", file);
13564 else
13565 fputs ("@ntpoff", file);
13566 break;
13567 case UNSPEC_DTPOFF:
13568 fputs ("@dtpoff", file);
13569 break;
13570 case UNSPEC_GOTNTPOFF:
13571 if (TARGET_64BIT)
13572 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13573 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13574 else
13575 fputs ("@gotntpoff", file);
13576 break;
13577 case UNSPEC_INDNTPOFF:
13578 fputs ("@indntpoff", file);
13579 break;
13580 #if TARGET_MACHO
13581 case UNSPEC_MACHOPIC_OFFSET:
13582 putc ('-', file);
13583 machopic_output_function_base_name (file);
13584 break;
13585 #endif
13586 default:
13587 output_operand_lossage ("invalid UNSPEC as operand");
13588 break;
13590 break;
13592 default:
13593 output_operand_lossage ("invalid expression as operand");
13597 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13598 We need to emit DTP-relative relocations. */
13600 static void ATTRIBUTE_UNUSED
13601 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13603 fputs (ASM_LONG, file);
13604 output_addr_const (file, x);
13605 fputs ("@dtpoff", file);
13606 switch (size)
13608 case 4:
13609 break;
13610 case 8:
13611 fputs (", 0", file);
13612 break;
13613 default:
13614 gcc_unreachable ();
13618 /* Return true if X is a representation of the PIC register. This copes
13619 with calls from ix86_find_base_term, where the register might have
13620 been replaced by a cselib value. */
13622 static bool
13623 ix86_pic_register_p (rtx x)
13625 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13626 return (pic_offset_table_rtx
13627 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13628 else
13629 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13632 /* Helper function for ix86_delegitimize_address.
13633 Attempt to delegitimize TLS local-exec accesses. */
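/* E.g. a local-exec access %gs:x@ntpoff (or %fs:... in 64-bit mode)
   reaches us as a MEM whose address decomposes into seg == SEG_GS/FS
   with a displacement of (const (unspec [x] UNSPEC_NTPOFF)); we
   rebuild an equivalent address in terms of the bare SYMBOL_REF x.  */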
13635 static rtx
13636 ix86_delegitimize_tls_address (rtx orig_x)
13638 rtx x = orig_x, unspec;
13639 struct ix86_address addr;
13641 if (!TARGET_TLS_DIRECT_SEG_REFS)
13642 return orig_x;
13643 if (MEM_P (x))
13644 x = XEXP (x, 0);
13645 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13646 return orig_x;
13647 if (ix86_decompose_address (x, &addr) == 0
13648 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13649 || addr.disp == NULL_RTX
13650 || GET_CODE (addr.disp) != CONST)
13651 return orig_x;
13652 unspec = XEXP (addr.disp, 0);
13653 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13654 unspec = XEXP (unspec, 0);
13655 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13656 return orig_x;
13657 x = XVECEXP (unspec, 0, 0);
13658 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13659 if (unspec != XEXP (addr.disp, 0))
13660 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13661 if (addr.index)
13663 rtx idx = addr.index;
13664 if (addr.scale != 1)
13665 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13666 x = gen_rtx_PLUS (Pmode, idx, x);
13668 if (addr.base)
13669 x = gen_rtx_PLUS (Pmode, addr.base, x);
13670 if (MEM_P (orig_x))
13671 x = replace_equiv_address_nv (orig_x, x);
13672 return x;
13675 /* In the name of slightly smaller debug output, and to cater to
13676 general assembler lossage, recognize PIC+GOTOFF and turn it back
13677 into a direct symbol reference.
13679 On Darwin, this is necessary to avoid a crash, because Darwin
13680 has a different PIC label for each routine but the DWARF debugging
13681 information is not associated with any particular routine, so it's
13682 necessary to remove references to the PIC label from RTL stored by
13683 the DWARF output code. */
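/* For instance, the 32-bit form foo@GOTOFF(%ebx), i.e.
   (plus pic_reg (const (unspec [foo] UNSPEC_GOTOFF))), is turned back
   into the plain SYMBOL_REF foo, and a 64-bit GOT slot
   (mem (const (unspec [foo] UNSPEC_GOTPCREL))) likewise becomes foo
   again when the original reference was itself a MEM.  */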
13685 static rtx
13686 ix86_delegitimize_address (rtx x)
13688 rtx orig_x = delegitimize_mem_from_attrs (x);
13689 /* addend is NULL or some rtx if x is something+GOTOFF where
13690 something doesn't include the PIC register. */
13691 rtx addend = NULL_RTX;
13692 /* reg_addend is NULL or a multiple of some register. */
13693 rtx reg_addend = NULL_RTX;
13694 /* const_addend is NULL or a const_int. */
13695 rtx const_addend = NULL_RTX;
13696 /* This is the result, or NULL. */
13697 rtx result = NULL_RTX;
13699 x = orig_x;
13701 if (MEM_P (x))
13702 x = XEXP (x, 0);
13704 if (TARGET_64BIT)
13706 if (GET_CODE (x) == CONST
13707 && GET_CODE (XEXP (x, 0)) == PLUS
13708 && GET_MODE (XEXP (x, 0)) == Pmode
13709 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13710 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13711 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13713 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13714 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13715 if (MEM_P (orig_x))
13716 x = replace_equiv_address_nv (orig_x, x);
13717 return x;
13719 if (GET_CODE (x) != CONST
13720 || GET_CODE (XEXP (x, 0)) != UNSPEC
13721 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13722 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13723 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13724 return ix86_delegitimize_tls_address (orig_x);
13725 x = XVECEXP (XEXP (x, 0), 0, 0);
13726 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13728 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13729 GET_MODE (x), 0);
13730 if (x == NULL_RTX)
13731 return orig_x;
13733 return x;
13736 if (GET_CODE (x) != PLUS
13737 || GET_CODE (XEXP (x, 1)) != CONST)
13738 return ix86_delegitimize_tls_address (orig_x);
13740 if (ix86_pic_register_p (XEXP (x, 0)))
13741 /* %ebx + GOT/GOTOFF */
13743 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13745 /* %ebx + %reg * scale + GOT/GOTOFF */
13746 reg_addend = XEXP (x, 0);
13747 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13748 reg_addend = XEXP (reg_addend, 1);
13749 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13750 reg_addend = XEXP (reg_addend, 0);
13751 else
13753 reg_addend = NULL_RTX;
13754 addend = XEXP (x, 0);
13757 else
13758 addend = XEXP (x, 0);
13760 x = XEXP (XEXP (x, 1), 0);
13761 if (GET_CODE (x) == PLUS
13762 && CONST_INT_P (XEXP (x, 1)))
13764 const_addend = XEXP (x, 1);
13765 x = XEXP (x, 0);
13768 if (GET_CODE (x) == UNSPEC
13769 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13770 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13771 result = XVECEXP (x, 0, 0);
13773 if (TARGET_MACHO && darwin_local_data_pic (x)
13774 && !MEM_P (orig_x))
13775 result = XVECEXP (x, 0, 0);
13777 if (! result)
13778 return ix86_delegitimize_tls_address (orig_x);
13780 if (const_addend)
13781 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13782 if (reg_addend)
13783 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13784 if (addend)
13786 /* If the rest of original X doesn't involve the PIC register, add
13787 addend and subtract pic_offset_table_rtx. This can happen e.g.
13788 for code like:
13789 leal (%ebx, %ecx, 4), %ecx
13791 movl foo@GOTOFF(%ecx), %edx
13792 in which case we return (%ecx - %ebx) + foo. */
13793 if (pic_offset_table_rtx)
13794 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13795 pic_offset_table_rtx),
13796 result);
13797 else
13798 return orig_x;
13800 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13802 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13803 if (result == NULL_RTX)
13804 return orig_x;
13806 return result;
13809 /* If X is a machine-specific address (i.e. a symbol or label being
13810 referenced as a displacement from the GOT implemented using an
13811 UNSPEC), then return the base term. Otherwise return X. */
13814 ix86_find_base_term (rtx x)
13816 rtx term;
13818 if (TARGET_64BIT)
13820 if (GET_CODE (x) != CONST)
13821 return x;
13822 term = XEXP (x, 0);
13823 if (GET_CODE (term) == PLUS
13824 && (CONST_INT_P (XEXP (term, 1))
13825 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13826 term = XEXP (term, 0);
13827 if (GET_CODE (term) != UNSPEC
13828 || (XINT (term, 1) != UNSPEC_GOTPCREL
13829 && XINT (term, 1) != UNSPEC_PCREL))
13830 return x;
13832 return XVECEXP (term, 0, 0);
13835 return ix86_delegitimize_address (x);
13838 static void
13839 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13840 bool fp, FILE *file)
13842 const char *suffix;
13844 if (mode == CCFPmode || mode == CCFPUmode)
13846 code = ix86_fp_compare_code_to_integer (code);
13847 mode = CCmode;
13849 if (reverse)
13850 code = reverse_condition (code);
13852 switch (code)
13854 case EQ:
13855 switch (mode)
13857 case CCAmode:
13858 suffix = "a";
13859 break;
13861 case CCCmode:
13862 suffix = "c";
13863 break;
13865 case CCOmode:
13866 suffix = "o";
13867 break;
13869 case CCSmode:
13870 suffix = "s";
13871 break;
13873 default:
13874 suffix = "e";
13876 break;
13877 case NE:
13878 switch (mode)
13880 case CCAmode:
13881 suffix = "na";
13882 break;
13884 case CCCmode:
13885 suffix = "nc";
13886 break;
13888 case CCOmode:
13889 suffix = "no";
13890 break;
13892 case CCSmode:
13893 suffix = "ns";
13894 break;
13896 default:
13897 suffix = "ne";
13899 break;
13900 case GT:
13901 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13902 suffix = "g";
13903 break;
13904 case GTU:
13905 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13906 Those same assemblers have the same but opposite lossage on cmov. */
13907 if (mode == CCmode)
13908 suffix = fp ? "nbe" : "a";
13909 else if (mode == CCCmode)
13910 suffix = "b";
13911 else
13912 gcc_unreachable ();
13913 break;
13914 case LT:
13915 switch (mode)
13917 case CCNOmode:
13918 case CCGOCmode:
13919 suffix = "s";
13920 break;
13922 case CCmode:
13923 case CCGCmode:
13924 suffix = "l";
13925 break;
13927 default:
13928 gcc_unreachable ();
13930 break;
13931 case LTU:
13932 gcc_assert (mode == CCmode || mode == CCCmode);
13933 suffix = "b";
13934 break;
13935 case GE:
13936 switch (mode)
13938 case CCNOmode:
13939 case CCGOCmode:
13940 suffix = "ns";
13941 break;
13943 case CCmode:
13944 case CCGCmode:
13945 suffix = "ge";
13946 break;
13948 default:
13949 gcc_unreachable ();
13951 break;
13952 case GEU:
13953 /* ??? As above. */
13954 gcc_assert (mode == CCmode || mode == CCCmode);
13955 suffix = fp ? "nb" : "ae";
13956 break;
13957 case LE:
13958 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13959 suffix = "le";
13960 break;
13961 case LEU:
13962 /* ??? As above. */
13963 if (mode == CCmode)
13964 suffix = "be";
13965 else if (mode == CCCmode)
13966 suffix = fp ? "nb" : "ae";
13967 else
13968 gcc_unreachable ();
13969 break;
13970 case UNORDERED:
13971 suffix = fp ? "u" : "p";
13972 break;
13973 case ORDERED:
13974 suffix = fp ? "nu" : "np";
13975 break;
13976 default:
13977 gcc_unreachable ();
13979 fputs (suffix, file);
13982 /* Print the name of register X to FILE based on its machine mode and number.
13983 If CODE is 'w', pretend the mode is HImode.
13984 If CODE is 'b', pretend the mode is QImode.
13985 If CODE is 'k', pretend the mode is SImode.
13986 If CODE is 'q', pretend the mode is DImode.
13987 If CODE is 'x', pretend the mode is V4SFmode.
13988 If CODE is 't', pretend the mode is V8SFmode.
13989 If CODE is 'h', pretend the reg is the 'high' byte register.
13990 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13991 If CODE is 'd', duplicate the operand for an AVX instruction.
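/* For example, if operand X is the DImode register %rax:

     code 'q'  ->  "rax"       code 'w'  ->  "ax"
     code 'k'  ->  "eax"       code 'b'  ->  "al"
                               code 'h'  ->  "ah"

   In AT&T syntax a '%' is prepended first, giving e.g. "%eax".  */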
13994 void
13995 print_reg (rtx x, int code, FILE *file)
13997 const char *reg;
13998 unsigned int regno;
13999 bool duplicated = code == 'd' && TARGET_AVX;
14001 if (ASSEMBLER_DIALECT == ASM_ATT)
14002 putc ('%', file);
14004 if (x == pc_rtx)
14006 gcc_assert (TARGET_64BIT);
14007 fputs ("rip", file);
14008 return;
14011 regno = true_regnum (x);
14012 gcc_assert (regno != ARG_POINTER_REGNUM
14013 && regno != FRAME_POINTER_REGNUM
14014 && regno != FLAGS_REG
14015 && regno != FPSR_REG
14016 && regno != FPCR_REG);
14018 if (code == 'w' || MMX_REG_P (x))
14019 code = 2;
14020 else if (code == 'b')
14021 code = 1;
14022 else if (code == 'k')
14023 code = 4;
14024 else if (code == 'q')
14025 code = 8;
14026 else if (code == 'y')
14027 code = 3;
14028 else if (code == 'h')
14029 code = 0;
14030 else if (code == 'x')
14031 code = 16;
14032 else if (code == 't')
14033 code = 32;
14034 else
14035 code = GET_MODE_SIZE (GET_MODE (x));
14037 /* Irritatingly, the AMD extended registers use a different naming
14038 convention from the normal registers: "r%d[bwd]". */
14039 if (REX_INT_REGNO_P (regno))
14041 gcc_assert (TARGET_64BIT);
14042 putc ('r', file);
14043 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14044 switch (code)
14046 case 0:
14047 error ("extended registers have no high halves");
14048 break;
14049 case 1:
14050 putc ('b', file);
14051 break;
14052 case 2:
14053 putc ('w', file);
14054 break;
14055 case 4:
14056 putc ('d', file);
14057 break;
14058 case 8:
14059 /* no suffix */
14060 break;
14061 default:
14062 error ("unsupported operand size for extended register");
14063 break;
14065 return;
14068 reg = NULL;
14069 switch (code)
14071 case 3:
14072 if (STACK_TOP_P (x))
14074 reg = "st(0)";
14075 break;
14077 /* FALLTHRU */
14078 case 8:
14079 case 4:
14080 case 12:
14081 if (! ANY_FP_REG_P (x))
14082 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14083 /* FALLTHRU */
14084 case 16:
14085 case 2:
14086 normal:
14087 reg = hi_reg_name[regno];
14088 break;
14089 case 1:
14090 if (regno >= ARRAY_SIZE (qi_reg_name))
14091 goto normal;
14092 reg = qi_reg_name[regno];
14093 break;
14094 case 0:
14095 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14096 goto normal;
14097 reg = qi_high_reg_name[regno];
14098 break;
14099 case 32:
14100 if (SSE_REG_P (x))
14102 gcc_assert (!duplicated);
14103 putc ('y', file);
14104 fputs (hi_reg_name[regno] + 1, file);
14105 return;
14107 break;
14108 default:
14109 gcc_unreachable ();
14112 fputs (reg, file);
14113 if (duplicated)
14115 if (ASSEMBLER_DIALECT == ASM_ATT)
14116 fprintf (file, ", %%%s", reg);
14117 else
14118 fprintf (file, ", %s", reg);
14122 /* Locate some local-dynamic symbol still in use by this function
14123 so that we can print its name in some tls_local_dynamic_base
14124 pattern. */
14126 static int
14127 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14129 rtx x = *px;
14131 if (GET_CODE (x) == SYMBOL_REF
14132 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14134 cfun->machine->some_ld_name = XSTR (x, 0);
14135 return 1;
14138 return 0;
14141 static const char *
14142 get_some_local_dynamic_name (void)
14144 rtx insn;
14146 if (cfun->machine->some_ld_name)
14147 return cfun->machine->some_ld_name;
14149 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14150 if (NONDEBUG_INSN_P (insn)
14151 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14152 return cfun->machine->some_ld_name;
14154 return NULL;
14157 /* Meaning of CODE:
14158 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14159 C -- print opcode suffix for set/cmov insn.
14160 c -- like C, but print reversed condition
14161 F,f -- likewise, but for floating-point.
14162 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14163 otherwise nothing
14164 R -- print the prefix for register names.
14165 z -- print the opcode suffix for the size of the current operand.
14166 Z -- likewise, with special suffixes for x87 instructions.
14167 * -- print a star (in certain assembler syntax)
14168 A -- print an absolute memory reference.
14169 E -- print address with DImode register names if TARGET_64BIT.
14170 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14171 s -- print a shift double count, followed by the assembler's argument
14172 delimiter.
14173 b -- print the QImode name of the register for the indicated operand.
14174 %b0 would print %al if operands[0] is reg 0.
14175 w -- likewise, print the HImode name of the register.
14176 k -- likewise, print the SImode name of the register.
14177 q -- likewise, print the DImode name of the register.
14178 x -- likewise, print the V4SFmode name of the register.
14179 t -- likewise, print the V8SFmode name of the register.
14180 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14181 y -- print "st(0)" instead of "st" as a register.
14182 d -- print duplicated register operand for AVX instruction.
14183 D -- print condition for SSE cmp instruction.
14184 P -- if PIC, print an @PLT suffix.
14185 p -- print raw symbol name.
14186 X -- don't print any sort of PIC '@' suffix for a symbol.
14187 & -- print some in-use local-dynamic symbol name.
14188 H -- print a memory address offset by 8; used for sse high-parts
14189 Y -- print condition for XOP pcom* instruction.
14190 + -- print a branch hint as 'cs' or 'ds' prefix
14191 ; -- print a semicolon (after prefixes due to bug in older gas).
14192 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14193 @ -- print a segment register of thread base pointer load
14194 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
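/* For example, in a hypothetical output template such as

     "add%z0\t{%2, %0|%0, %2}"

   %z0 prints the size suffix derived from operand 0 (an 'l' for an
   SImode operand in AT&T syntax, nothing in Intel syntax), %k1 prints
   the SImode name of operand 1's register, and %P0 on a symbol prints
   it with an @PLT suffix where one is needed.  */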
14197 void
14198 ix86_print_operand (FILE *file, rtx x, int code)
14200 if (code)
14202 switch (code)
14204 case 'A':
14205 switch (ASSEMBLER_DIALECT)
14207 case ASM_ATT:
14208 putc ('*', file);
14209 break;
14211 case ASM_INTEL:
14212 /* Intel syntax. For absolute addresses, registers should not
14213 be surrounded by brackets. */
14214 if (!REG_P (x))
14216 putc ('[', file);
14217 ix86_print_operand (file, x, 0);
14218 putc (']', file);
14219 return;
14221 break;
14223 default:
14224 gcc_unreachable ();
14227 ix86_print_operand (file, x, 0);
14228 return;
14230 case 'E':
14231 /* Wrap address in an UNSPEC to declare special handling. */
14232 if (TARGET_64BIT)
14233 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14235 output_address (x);
14236 return;
14238 case 'L':
14239 if (ASSEMBLER_DIALECT == ASM_ATT)
14240 putc ('l', file);
14241 return;
14243 case 'W':
14244 if (ASSEMBLER_DIALECT == ASM_ATT)
14245 putc ('w', file);
14246 return;
14248 case 'B':
14249 if (ASSEMBLER_DIALECT == ASM_ATT)
14250 putc ('b', file);
14251 return;
14253 case 'Q':
14254 if (ASSEMBLER_DIALECT == ASM_ATT)
14255 putc ('l', file);
14256 return;
14258 case 'S':
14259 if (ASSEMBLER_DIALECT == ASM_ATT)
14260 putc ('s', file);
14261 return;
14263 case 'T':
14264 if (ASSEMBLER_DIALECT == ASM_ATT)
14265 putc ('t', file);
14266 return;
14268 case 'O':
14269 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14270 if (ASSEMBLER_DIALECT != ASM_ATT)
14271 return;
14273 switch (GET_MODE_SIZE (GET_MODE (x)))
14275 case 2:
14276 putc ('w', file);
14277 break;
14279 case 4:
14280 putc ('l', file);
14281 break;
14283 case 8:
14284 putc ('q', file);
14285 break;
14287 default:
14288 output_operand_lossage
14289 ("invalid operand size for operand code 'O'");
14290 return;
14293 putc ('.', file);
14294 #endif
14295 return;
14297 case 'z':
14298 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14300 /* Opcodes don't get size suffixes when using Intel syntax. */
14301 if (ASSEMBLER_DIALECT == ASM_INTEL)
14302 return;
14304 switch (GET_MODE_SIZE (GET_MODE (x)))
14306 case 1:
14307 putc ('b', file);
14308 return;
14310 case 2:
14311 putc ('w', file);
14312 return;
14314 case 4:
14315 putc ('l', file);
14316 return;
14318 case 8:
14319 putc ('q', file);
14320 return;
14322 default:
14323 output_operand_lossage
14324 ("invalid operand size for operand code 'z'");
14325 return;
14329 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14330 warning
14331 (0, "non-integer operand used with operand code 'z'");
14332 /* FALLTHRU */
14334 case 'Z':
14335 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14336 if (ASSEMBLER_DIALECT == ASM_INTEL)
14337 return;
14339 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14341 switch (GET_MODE_SIZE (GET_MODE (x)))
14343 case 2:
14344 #ifdef HAVE_AS_IX86_FILDS
14345 putc ('s', file);
14346 #endif
14347 return;
14349 case 4:
14350 putc ('l', file);
14351 return;
14353 case 8:
14354 #ifdef HAVE_AS_IX86_FILDQ
14355 putc ('q', file);
14356 #else
14357 fputs ("ll", file);
14358 #endif
14359 return;
14361 default:
14362 break;
14365 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14367 /* 387 opcodes don't get size suffixes
14368 if the operands are registers. */
14369 if (STACK_REG_P (x))
14370 return;
14372 switch (GET_MODE_SIZE (GET_MODE (x)))
14374 case 4:
14375 putc ('s', file);
14376 return;
14378 case 8:
14379 putc ('l', file);
14380 return;
14382 case 12:
14383 case 16:
14384 putc ('t', file);
14385 return;
14387 default:
14388 break;
14391 else
14393 output_operand_lossage
14394 ("invalid operand type used with operand code 'Z'");
14395 return;
14398 output_operand_lossage
14399 ("invalid operand size for operand code 'Z'");
14400 return;
14402 case 'd':
14403 case 'b':
14404 case 'w':
14405 case 'k':
14406 case 'q':
14407 case 'h':
14408 case 't':
14409 case 'y':
14410 case 'x':
14411 case 'X':
14412 case 'P':
14413 case 'p':
14414 break;
14416 case 's':
14417 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14419 ix86_print_operand (file, x, 0);
14420 fputs (", ", file);
14422 return;
14424 case 'Y':
14425 switch (GET_CODE (x))
14427 case NE:
14428 fputs ("neq", file);
14429 break;
14430 case EQ:
14431 fputs ("eq", file);
14432 break;
14433 case GE:
14434 case GEU:
14435 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14436 break;
14437 case GT:
14438 case GTU:
14439 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14440 break;
14441 case LE:
14442 case LEU:
14443 fputs ("le", file);
14444 break;
14445 case LT:
14446 case LTU:
14447 fputs ("lt", file);
14448 break;
14449 case UNORDERED:
14450 fputs ("unord", file);
14451 break;
14452 case ORDERED:
14453 fputs ("ord", file);
14454 break;
14455 case UNEQ:
14456 fputs ("ueq", file);
14457 break;
14458 case UNGE:
14459 fputs ("nlt", file);
14460 break;
14461 case UNGT:
14462 fputs ("nle", file);
14463 break;
14464 case UNLE:
14465 fputs ("ule", file);
14466 break;
14467 case UNLT:
14468 fputs ("ult", file);
14469 break;
14470 case LTGT:
14471 fputs ("une", file);
14472 break;
14473 default:
14474 output_operand_lossage ("operand is not a condition code, "
14475 "invalid operand code 'Y'");
14476 return;
14478 return;
14480 case 'D':
14481 /* A little bit of braindamage here. The SSE compare instructions
14482 use completely different names for the comparisons than the
14483 fp conditional moves do. */
14484 switch (GET_CODE (x))
14486 case UNEQ:
14487 if (TARGET_AVX)
14489 fputs ("eq_us", file);
14490 break;
14492 case EQ:
14493 fputs ("eq", file);
14494 break;
14495 case UNLT:
14496 if (TARGET_AVX)
14498 fputs ("nge", file);
14499 break;
14501 case LT:
14502 fputs ("lt", file);
14503 break;
14504 case UNLE:
14505 if (TARGET_AVX)
14507 fputs ("ngt", file);
14508 break;
14510 case LE:
14511 fputs ("le", file);
14512 break;
14513 case UNORDERED:
14514 fputs ("unord", file);
14515 break;
14516 case LTGT:
14517 if (TARGET_AVX)
14519 fputs ("neq_oq", file);
14520 break;
14522 case NE:
14523 fputs ("neq", file);
14524 break;
14525 case GE:
14526 if (TARGET_AVX)
14528 fputs ("ge", file);
14529 break;
14531 case UNGE:
14532 fputs ("nlt", file);
14533 break;
14534 case GT:
14535 if (TARGET_AVX)
14537 fputs ("gt", file);
14538 break;
14540 case UNGT:
14541 fputs ("nle", file);
14542 break;
14543 case ORDERED:
14544 fputs ("ord", file);
14545 break;
14546 default:
14547 output_operand_lossage ("operand is not a condition code, "
14548 "invalid operand code 'D'");
14549 return;
14551 return;
14553 case 'F':
14554 case 'f':
14555 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14556 if (ASSEMBLER_DIALECT == ASM_ATT)
14557 putc ('.', file);
14558 #endif
14560 case 'C':
14561 case 'c':
14562 if (!COMPARISON_P (x))
14564 output_operand_lossage ("operand is not a condition code, "
14565 "invalid operand code '%c'", code);
14566 return;
14568 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14569 code == 'c' || code == 'f',
14570 code == 'F' || code == 'f',
14571 file);
14572 return;
14574 case 'H':
14575 if (!offsettable_memref_p (x))
14577 output_operand_lossage ("operand is not an offsettable memory "
14578 "reference, invalid operand code 'H'");
14579 return;
14581 /* It doesn't actually matter what mode we use here, as we're
14582 only going to use this for printing. */
14583 x = adjust_address_nv (x, DImode, 8);
14584 break;
14586 case 'K':
14587 gcc_assert (CONST_INT_P (x));
14589 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14590 #ifdef HAVE_AS_IX86_HLE
14591 fputs ("xacquire ", file);
14592 #else
14593 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14594 #endif
14595 else if (INTVAL (x) & IX86_HLE_RELEASE)
14596 #ifdef HAVE_AS_IX86_HLE
14597 fputs ("xrelease ", file);
14598 #else
14599 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14600 #endif
14601 /* We do not want to print the value of the operand. */
14602 return;
14604 case '*':
14605 if (ASSEMBLER_DIALECT == ASM_ATT)
14606 putc ('*', file);
14607 return;
14609 case '&':
14611 const char *name = get_some_local_dynamic_name ();
14612 if (name == NULL)
14613 output_operand_lossage ("'%%&' used without any "
14614 "local dynamic TLS references");
14615 else
14616 assemble_name (file, name);
14617 return;
14620 case '+':
14622 rtx x;
14624 if (!optimize
14625 || optimize_function_for_size_p (cfun)
14626 || !TARGET_BRANCH_PREDICTION_HINTS)
14627 return;
14629 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14630 if (x)
14632 int pred_val = INTVAL (XEXP (x, 0));
14634 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14635 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14637 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14638 bool cputaken
14639 = final_forward_branch_p (current_output_insn) == 0;
14641 /* Emit hints only when the default branch prediction
14642 heuristics would fail. */
14643 if (taken != cputaken)
14645 /* We use 3e (DS) prefix for taken branches and
14646 2e (CS) prefix for not taken branches. */
14647 if (taken)
14648 fputs ("ds ; ", file);
14649 else
14650 fputs ("cs ; ", file);
14654 return;
14657 case ';':
14658 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14659 putc (';', file);
14660 #endif
14661 return;
14663 case '@':
14664 if (ASSEMBLER_DIALECT == ASM_ATT)
14665 putc ('%', file);
14667 /* The kernel uses a different segment register for performance
14668 reasons: this way a system call does not have to trash the userspace
14669 segment register, which would be expensive. */
14670 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14671 fputs ("fs", file);
14672 else
14673 fputs ("gs", file);
14674 return;
14676 case '~':
14677 putc (TARGET_AVX2 ? 'i' : 'f', file);
14678 return;
14680 case '^':
14681 if (TARGET_64BIT && Pmode != word_mode)
14682 fputs ("addr32 ", file);
14683 return;
14685 default:
14686 output_operand_lossage ("invalid operand code '%c'", code);
14690 if (REG_P (x))
14691 print_reg (x, code, file);
14693 else if (MEM_P (x))
14695 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14696 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14697 && GET_MODE (x) != BLKmode)
14699 const char * size;
14700 switch (GET_MODE_SIZE (GET_MODE (x)))
14702 case 1: size = "BYTE"; break;
14703 case 2: size = "WORD"; break;
14704 case 4: size = "DWORD"; break;
14705 case 8: size = "QWORD"; break;
14706 case 12: size = "TBYTE"; break;
14707 case 16:
14708 if (GET_MODE (x) == XFmode)
14709 size = "TBYTE";
14710 else
14711 size = "XMMWORD";
14712 break;
14713 case 32: size = "YMMWORD"; break;
14714 default:
14715 gcc_unreachable ();
14718 /* Check for explicit size override (codes 'b', 'w', 'k',
14719 'q' and 'x') */
14720 if (code == 'b')
14721 size = "BYTE";
14722 else if (code == 'w')
14723 size = "WORD";
14724 else if (code == 'k')
14725 size = "DWORD";
14726 else if (code == 'q')
14727 size = "QWORD";
14728 else if (code == 'x')
14729 size = "XMMWORD";
14731 fputs (size, file);
14732 fputs (" PTR ", file);
14735 x = XEXP (x, 0);
14736 /* Avoid (%rip) for call operands. */
14737 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14738 && !CONST_INT_P (x))
14739 output_addr_const (file, x);
14740 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14741 output_operand_lossage ("invalid constraints for operand");
14742 else
14743 output_address (x);
14746 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14748 REAL_VALUE_TYPE r;
14749 long l;
14751 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14752 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14754 if (ASSEMBLER_DIALECT == ASM_ATT)
14755 putc ('$', file);
14756 /* Sign-extend the 32-bit SFmode immediate to 8 bytes. */
14757 if (code == 'q')
14758 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14759 (unsigned long long) (int) l);
14760 else
14761 fprintf (file, "0x%08x", (unsigned int) l);
14764 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14766 REAL_VALUE_TYPE r;
14767 long l[2];
14769 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14770 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14772 if (ASSEMBLER_DIALECT == ASM_ATT)
14773 putc ('$', file);
14774 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14777 /* These float cases don't actually occur as immediate operands. */
14778 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14780 char dstr[30];
14782 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14783 fputs (dstr, file);
14786 else
14788 /* We have patterns that allow zero sets of memory, for instance.
14789 In 64-bit mode, we should probably support all 8-byte vectors,
14790 since we can in fact encode that into an immediate. */
14791 if (GET_CODE (x) == CONST_VECTOR)
14793 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14794 x = const0_rtx;
14797 if (code != 'P' && code != 'p')
14799 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14801 if (ASSEMBLER_DIALECT == ASM_ATT)
14802 putc ('$', file);
14804 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14805 || GET_CODE (x) == LABEL_REF)
14807 if (ASSEMBLER_DIALECT == ASM_ATT)
14808 putc ('$', file);
14809 else
14810 fputs ("OFFSET FLAT:", file);
14813 if (CONST_INT_P (x))
14814 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14815 else if (flag_pic || MACHOPIC_INDIRECT)
14816 output_pic_addr_const (file, x, code);
14817 else
14818 output_addr_const (file, x);
14822 static bool
14823 ix86_print_operand_punct_valid_p (unsigned char code)
14825 return (code == '@' || code == '*' || code == '+' || code == '&'
14826 || code == ';' || code == '~' || code == '^');
14829 /* Print a memory operand whose address is ADDR. */
14831 static void
14832 ix86_print_operand_address (FILE *file, rtx addr)
14834 struct ix86_address parts;
14835 rtx base, index, disp;
14836 int scale;
14837 int ok;
14838 bool vsib = false;
14839 int code = 0;
14841 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14843 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14844 gcc_assert (parts.index == NULL_RTX);
14845 parts.index = XVECEXP (addr, 0, 1);
14846 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14847 addr = XVECEXP (addr, 0, 0);
14848 vsib = true;
14850 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14852 gcc_assert (TARGET_64BIT);
14853 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14854 code = 'q';
14856 else
14857 ok = ix86_decompose_address (addr, &parts);
14859 gcc_assert (ok);
14861 base = parts.base;
14862 index = parts.index;
14863 disp = parts.disp;
14864 scale = parts.scale;
14866 switch (parts.seg)
14868 case SEG_DEFAULT:
14869 break;
14870 case SEG_FS:
14871 case SEG_GS:
14872 if (ASSEMBLER_DIALECT == ASM_ATT)
14873 putc ('%', file);
14874 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14875 break;
14876 default:
14877 gcc_unreachable ();
14880 /* Use one byte shorter RIP relative addressing for 64bit mode. */
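/* For illustration: a RIP-relative reference such as "movl foo(%rip), %eax"
   encodes its address as ModRM + disp32, while the 32-bit absolute form in
   64-bit mode additionally needs a SIB byte, which is where the one-byte
   saving comes from.  */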
14881 if (TARGET_64BIT && !base && !index)
14883 rtx symbol = disp;
14885 if (GET_CODE (disp) == CONST
14886 && GET_CODE (XEXP (disp, 0)) == PLUS
14887 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14888 symbol = XEXP (XEXP (disp, 0), 0);
14890 if (GET_CODE (symbol) == LABEL_REF
14891 || (GET_CODE (symbol) == SYMBOL_REF
14892 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14893 base = pc_rtx;
14895 if (!base && !index)
14897 /* A displacement-only address requires special attention. */
14899 if (CONST_INT_P (disp))
14901 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14902 fputs ("ds:", file);
14903 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14905 else if (flag_pic)
14906 output_pic_addr_const (file, disp, 0);
14907 else
14908 output_addr_const (file, disp);
14910 else
14912 /* Print SImode register names to force addr32 prefix. */
14913 if (SImode_address_operand (addr, VOIDmode))
14915 #ifdef ENABLE_CHECKING
14916 gcc_assert (TARGET_64BIT);
14917 switch (GET_CODE (addr))
14919 case SUBREG:
14920 gcc_assert (GET_MODE (addr) == SImode);
14921 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14922 break;
14923 case ZERO_EXTEND:
14924 case AND:
14925 gcc_assert (GET_MODE (addr) == DImode);
14926 break;
14927 default:
14928 gcc_unreachable ();
14930 #endif
14931 gcc_assert (!code);
14932 code = 'k';
14934 else if (code == 0
14935 && TARGET_X32
14936 && disp
14937 && CONST_INT_P (disp)
14938 && INTVAL (disp) < -16*1024*1024)
14940 /* X32 runs in 64-bit mode, where displacement, DISP, in
14941 address DISP(%r64), is encoded as 32-bit immediate sign-
14942 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14943 address is %r64 + 0xffffffffbffffd00. When %r64 <
14944 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14945 which is invalid for x32. The correct address is %r64
14946 - 0x40000300 == 0xf7ffdd64. To properly encode
14947 -0x40000300(%r64) for x32, we zero-extend negative
14948 displacements by forcing the addr32 prefix, which truncates
14949 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14950 zero-extend all negative displacements, including -1(%rsp).
14951 However, for small negative displacements, sign-extension
14952 won't cause overflow. We only zero-extend negative
14953 displacements if they are < -16*1024*1024, which is also used
14954 to check legitimate address displacements for PIC. */
14955 code = 'k';
14958 if (ASSEMBLER_DIALECT == ASM_ATT)
14960 if (disp)
14962 if (flag_pic)
14963 output_pic_addr_const (file, disp, 0);
14964 else if (GET_CODE (disp) == LABEL_REF)
14965 output_asm_label (disp);
14966 else
14967 output_addr_const (file, disp);
14970 putc ('(', file);
14971 if (base)
14972 print_reg (base, code, file);
14973 if (index)
14975 putc (',', file);
14976 print_reg (index, vsib ? 0 : code, file);
14977 if (scale != 1 || vsib)
14978 fprintf (file, ",%d", scale);
14980 putc (')', file);
14982 else
14984 rtx offset = NULL_RTX;
14986 if (disp)
14988 /* Pull out the offset of a symbol; print any symbol itself. */
14989 if (GET_CODE (disp) == CONST
14990 && GET_CODE (XEXP (disp, 0)) == PLUS
14991 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14993 offset = XEXP (XEXP (disp, 0), 1);
14994 disp = gen_rtx_CONST (VOIDmode,
14995 XEXP (XEXP (disp, 0), 0));
14998 if (flag_pic)
14999 output_pic_addr_const (file, disp, 0);
15000 else if (GET_CODE (disp) == LABEL_REF)
15001 output_asm_label (disp);
15002 else if (CONST_INT_P (disp))
15003 offset = disp;
15004 else
15005 output_addr_const (file, disp);
15008 putc ('[', file);
15009 if (base)
15011 print_reg (base, code, file);
15012 if (offset)
15014 if (INTVAL (offset) >= 0)
15015 putc ('+', file);
15016 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15019 else if (offset)
15020 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15021 else
15022 putc ('0', file);
15024 if (index)
15026 putc ('+', file);
15027 print_reg (index, vsib ? 0 : code, file);
15028 if (scale != 1 || vsib)
15029 fprintf (file, "*%d", scale);
15031 putc (']', file);
15036 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15038 static bool
15039 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15041 rtx op;
15043 if (GET_CODE (x) != UNSPEC)
15044 return false;
15046 op = XVECEXP (x, 0, 0);
15047 switch (XINT (x, 1))
15049 case UNSPEC_GOTTPOFF:
15050 output_addr_const (file, op);
15051 /* FIXME: This might be @TPOFF in Sun ld. */
15052 fputs ("@gottpoff", file);
15053 break;
15054 case UNSPEC_TPOFF:
15055 output_addr_const (file, op);
15056 fputs ("@tpoff", file);
15057 break;
15058 case UNSPEC_NTPOFF:
15059 output_addr_const (file, op);
15060 if (TARGET_64BIT)
15061 fputs ("@tpoff", file);
15062 else
15063 fputs ("@ntpoff", file);
15064 break;
15065 case UNSPEC_DTPOFF:
15066 output_addr_const (file, op);
15067 fputs ("@dtpoff", file);
15068 break;
15069 case UNSPEC_GOTNTPOFF:
15070 output_addr_const (file, op);
15071 if (TARGET_64BIT)
15072 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15073 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15074 else
15075 fputs ("@gotntpoff", file);
15076 break;
15077 case UNSPEC_INDNTPOFF:
15078 output_addr_const (file, op);
15079 fputs ("@indntpoff", file);
15080 break;
15081 #if TARGET_MACHO
15082 case UNSPEC_MACHOPIC_OFFSET:
15083 output_addr_const (file, op);
15084 putc ('-', file);
15085 machopic_output_function_base_name (file);
15086 break;
15087 #endif
15089 case UNSPEC_STACK_CHECK:
15091 int offset;
15093 gcc_assert (flag_split_stack);
15095 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15096 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15097 #else
15098 gcc_unreachable ();
15099 #endif
15101 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15103 break;
15105 default:
15106 return false;
15109 return true;
15112 /* Split one or more double-mode RTL references into pairs of half-mode
15113 references. The RTL can be REG, offsettable MEM, integer constant, or
15114 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15115 split and "num" is its length. lo_half and hi_half are output arrays
15116 that parallel "operands". */
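/* For example, a DImode MEM is split into two SImode MEMs at offsets 0
   and 4, while a DImode REG or constant is split with simplify_gen_subreg
   into its low and high SImode halves.  */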
15118 void
15119 split_double_mode (enum machine_mode mode, rtx operands[],
15120 int num, rtx lo_half[], rtx hi_half[])
15122 enum machine_mode half_mode;
15123 unsigned int byte;
15125 switch (mode)
15127 case TImode:
15128 half_mode = DImode;
15129 break;
15130 case DImode:
15131 half_mode = SImode;
15132 break;
15133 default:
15134 gcc_unreachable ();
15137 byte = GET_MODE_SIZE (half_mode);
15139 while (num--)
15141 rtx op = operands[num];
15143 /* simplify_subreg refuses to split volatile memory addresses,
15144 but we still have to handle them. */
15145 if (MEM_P (op))
15147 lo_half[num] = adjust_address (op, half_mode, 0);
15148 hi_half[num] = adjust_address (op, half_mode, byte);
15150 else
15152 lo_half[num] = simplify_gen_subreg (half_mode, op,
15153 GET_MODE (op) == VOIDmode
15154 ? mode : GET_MODE (op), 0);
15155 hi_half[num] = simplify_gen_subreg (half_mode, op,
15156 GET_MODE (op) == VOIDmode
15157 ? mode : GET_MODE (op), byte);
15162 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15163 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15164 is the expression of the binary operation. The output may either be
15165 emitted here, or returned to the caller, like all output_* functions.
15167 There is no guarantee that the operands are in the same mode, as they
15168 might be within FLOAT or FLOAT_EXTEND expressions. */
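/* For example, an SFmode PLUS on SSE registers is emitted as
   "addss\t{%2, %0|%0, %2}" (or "vaddss\t{%2, %1, %0|%0, %1, %2}" under
   AVX), while the x87 path assembles one of the fadd/fiadd forms chosen
   below.  */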
15170 #ifndef SYSV386_COMPAT
15171 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15172 wants to fix the assemblers because that causes incompatibility
15173 with gcc. No-one wants to fix gcc because that causes
15174 incompatibility with assemblers... You can use the option of
15175 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15176 #define SYSV386_COMPAT 1
15177 #endif
15179 const char *
15180 output_387_binary_op (rtx insn, rtx *operands)
15182 static char buf[40];
15183 const char *p;
15184 const char *ssep;
15185 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15187 #ifdef ENABLE_CHECKING
15188 /* Even if we do not want to check the inputs, this documents the input
15189 constraints, which helps in understanding the following code. */
15190 if (STACK_REG_P (operands[0])
15191 && ((REG_P (operands[1])
15192 && REGNO (operands[0]) == REGNO (operands[1])
15193 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15194 || (REG_P (operands[2])
15195 && REGNO (operands[0]) == REGNO (operands[2])
15196 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15197 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15198 ; /* ok */
15199 else
15200 gcc_assert (is_sse);
15201 #endif
15203 switch (GET_CODE (operands[3]))
15205 case PLUS:
15206 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15207 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15208 p = "fiadd";
15209 else
15210 p = "fadd";
15211 ssep = "vadd";
15212 break;
15214 case MINUS:
15215 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15216 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15217 p = "fisub";
15218 else
15219 p = "fsub";
15220 ssep = "vsub";
15221 break;
15223 case MULT:
15224 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15225 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15226 p = "fimul";
15227 else
15228 p = "fmul";
15229 ssep = "vmul";
15230 break;
15232 case DIV:
15233 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15234 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15235 p = "fidiv";
15236 else
15237 p = "fdiv";
15238 ssep = "vdiv";
15239 break;
15241 default:
15242 gcc_unreachable ();
15245 if (is_sse)
15247 if (TARGET_AVX)
15249 strcpy (buf, ssep);
15250 if (GET_MODE (operands[0]) == SFmode)
15251 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15252 else
15253 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15255 else
15257 strcpy (buf, ssep + 1);
15258 if (GET_MODE (operands[0]) == SFmode)
15259 strcat (buf, "ss\t{%2, %0|%0, %2}");
15260 else
15261 strcat (buf, "sd\t{%2, %0|%0, %2}");
15263 return buf;
15265 strcpy (buf, p);
15267 switch (GET_CODE (operands[3]))
15269 case MULT:
15270 case PLUS:
15271 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15273 rtx temp = operands[2];
15274 operands[2] = operands[1];
15275 operands[1] = temp;
15278 /* We know operands[0] == operands[1]. */
15280 if (MEM_P (operands[2]))
15282 p = "%Z2\t%2";
15283 break;
15286 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15288 if (STACK_TOP_P (operands[0]))
15289 /* How is it that we are storing to a dead operand[2]?
15290 Well, presumably operands[1] is dead too. We can't
15291 store the result to st(0) as st(0) gets popped on this
15292 instruction. Instead store to operands[2] (which I
15293 think has to be st(1)). st(1) will be popped later.
15294 gcc <= 2.8.1 didn't have this check and generated
15295 assembly code that the Unixware assembler rejected. */
15296 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15297 else
15298 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15299 break;
15302 if (STACK_TOP_P (operands[0]))
15303 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15304 else
15305 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15306 break;
15308 case MINUS:
15309 case DIV:
15310 if (MEM_P (operands[1]))
15312 p = "r%Z1\t%1";
15313 break;
15316 if (MEM_P (operands[2]))
15318 p = "%Z2\t%2";
15319 break;
15322 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15324 #if SYSV386_COMPAT
15325 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15326 derived assemblers, confusingly reverse the direction of
15327 the operation for fsub{r} and fdiv{r} when the
15328 destination register is not st(0). The Intel assembler
15329 doesn't have this brain damage. Read !SYSV386_COMPAT to
15330 figure out what the hardware really does. */
15331 if (STACK_TOP_P (operands[0]))
15332 p = "{p\t%0, %2|rp\t%2, %0}";
15333 else
15334 p = "{rp\t%2, %0|p\t%0, %2}";
15335 #else
15336 if (STACK_TOP_P (operands[0]))
15337 /* As above for fmul/fadd, we can't store to st(0). */
15338 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15339 else
15340 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15341 #endif
15342 break;
15345 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15347 #if SYSV386_COMPAT
15348 if (STACK_TOP_P (operands[0]))
15349 p = "{rp\t%0, %1|p\t%1, %0}";
15350 else
15351 p = "{p\t%1, %0|rp\t%0, %1}";
15352 #else
15353 if (STACK_TOP_P (operands[0]))
15354 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15355 else
15356 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15357 #endif
15358 break;
15361 if (STACK_TOP_P (operands[0]))
15363 if (STACK_TOP_P (operands[1]))
15364 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15365 else
15366 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15367 break;
15369 else if (STACK_TOP_P (operands[1]))
15371 #if SYSV386_COMPAT
15372 p = "{\t%1, %0|r\t%0, %1}";
15373 #else
15374 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15375 #endif
15377 else
15379 #if SYSV386_COMPAT
15380 p = "{r\t%2, %0|\t%0, %2}";
15381 #else
15382 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15383 #endif
15385 break;
15387 default:
15388 gcc_unreachable ();
15391 strcat (buf, p);
15392 return buf;
15395 /* Check if a 256bit AVX register is referenced inside of EXP. */
15397 static int
15398 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15400 rtx exp = *pexp;
15402 if (GET_CODE (exp) == SUBREG)
15403 exp = SUBREG_REG (exp);
15405 if (REG_P (exp)
15406 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15407 return 1;
15409 return 0;
15412 /* Return needed mode for entity in optimize_mode_switching pass. */
15414 static int
15415 ix86_avx_u128_mode_needed (rtx insn)
15417 if (CALL_P (insn))
15419 rtx link;
15421 /* Needed mode is set to AVX_U128_CLEAN if there are
15422 no 256bit modes used in function arguments. */
15423 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15424 link;
15425 link = XEXP (link, 1))
15427 if (GET_CODE (XEXP (link, 0)) == USE)
15429 rtx arg = XEXP (XEXP (link, 0), 0);
15431 if (ix86_check_avx256_register (&arg, NULL))
15432 return AVX_U128_ANY;
15436 return AVX_U128_CLEAN;
15439 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15440 changes state only when a 256bit register is written to, but we need
15441 to prevent the compiler from moving the optimal insertion point above
15442 an eventual read from a 256bit register. */
15443 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15444 return AVX_U128_DIRTY;
15446 return AVX_U128_ANY;
15449 /* Return mode that i387 must be switched into
15450 prior to the execution of insn. */
15452 static int
15453 ix86_i387_mode_needed (int entity, rtx insn)
15455 enum attr_i387_cw mode;
15457 /* The mode UNINITIALIZED is used to store the control word after a
15458 function call or ASM pattern. The mode ANY specifies that the function
15459 has no requirements on the control word and makes no changes to the
15460 bits we are interested in. */
15462 if (CALL_P (insn)
15463 || (NONJUMP_INSN_P (insn)
15464 && (asm_noperands (PATTERN (insn)) >= 0
15465 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15466 return I387_CW_UNINITIALIZED;
15468 if (recog_memoized (insn) < 0)
15469 return I387_CW_ANY;
15471 mode = get_attr_i387_cw (insn);
15473 switch (entity)
15475 case I387_TRUNC:
15476 if (mode == I387_CW_TRUNC)
15477 return mode;
15478 break;
15480 case I387_FLOOR:
15481 if (mode == I387_CW_FLOOR)
15482 return mode;
15483 break;
15485 case I387_CEIL:
15486 if (mode == I387_CW_CEIL)
15487 return mode;
15488 break;
15490 case I387_MASK_PM:
15491 if (mode == I387_CW_MASK_PM)
15492 return mode;
15493 break;
15495 default:
15496 gcc_unreachable ();
15499 return I387_CW_ANY;
15502 /* Return mode that entity must be switched into
15503 prior to the execution of insn. */
15505 int
15506 ix86_mode_needed (int entity, rtx insn)
15508 switch (entity)
15510 case AVX_U128:
15511 return ix86_avx_u128_mode_needed (insn);
15512 case I387_TRUNC:
15513 case I387_FLOOR:
15514 case I387_CEIL:
15515 case I387_MASK_PM:
15516 return ix86_i387_mode_needed (entity, insn);
15517 default:
15518 gcc_unreachable ();
15520 return 0;
15523 /* Check if a 256bit AVX register is referenced in stores. */
15525 static void
15526 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15528 if (ix86_check_avx256_register (&dest, NULL))
15530 bool *used = (bool *) data;
15531 *used = true;
15535 /* Calculate mode of upper 128bit AVX registers after the insn. */
15537 static int
15538 ix86_avx_u128_mode_after (int mode, rtx insn)
15540 rtx pat = PATTERN (insn);
15542 if (vzeroupper_operation (pat, VOIDmode)
15543 || vzeroall_operation (pat, VOIDmode))
15544 return AVX_U128_CLEAN;
15546 /* We know that the state is clean after a CALL insn if no 256bit
15547 register is used for the function return value. */
15548 if (CALL_P (insn))
15550 bool avx_reg256_found = false;
15551 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15552 if (!avx_reg256_found)
15553 return AVX_U128_CLEAN;
15556 /* Otherwise, return current mode. Remember that if insn
15557 references AVX 256bit registers, the mode was already changed
15558 to DIRTY from MODE_NEEDED. */
15559 return mode;
15562 /* Return the mode that an insn results in. */
15564 int
15565 ix86_mode_after (int entity, int mode, rtx insn)
15567 switch (entity)
15569 case AVX_U128:
15570 return ix86_avx_u128_mode_after (mode, insn);
15571 case I387_TRUNC:
15572 case I387_FLOOR:
15573 case I387_CEIL:
15574 case I387_MASK_PM:
15575 return mode;
15576 default:
15577 gcc_unreachable ();
15581 static int
15582 ix86_avx_u128_mode_entry (void)
15584 tree arg;
15586 /* Entry mode is set to AVX_U128_DIRTY if there are
15587 256bit modes used in function arguments. */
15588 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15589 arg = TREE_CHAIN (arg))
15591 rtx incoming = DECL_INCOMING_RTL (arg);
15593 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15594 return AVX_U128_DIRTY;
15597 return AVX_U128_CLEAN;
15600 /* Return a mode that ENTITY is assumed to be
15601 switched to at function entry. */
15603 int
15604 ix86_mode_entry (int entity)
15606 switch (entity)
15608 case AVX_U128:
15609 return ix86_avx_u128_mode_entry ();
15610 case I387_TRUNC:
15611 case I387_FLOOR:
15612 case I387_CEIL:
15613 case I387_MASK_PM:
15614 return I387_CW_ANY;
15615 default:
15616 gcc_unreachable ();
15620 static int
15621 ix86_avx_u128_mode_exit (void)
15623 rtx reg = crtl->return_rtx;
15625 /* Exit mode is set to AVX_U128_DIRTY if there are
15626 256bit modes used in the function return register. */
15627 if (reg && ix86_check_avx256_register (&reg, NULL))
15628 return AVX_U128_DIRTY;
15630 return AVX_U128_CLEAN;
15633 /* Return a mode that ENTITY is assumed to be
15634 switched to at function exit. */
15636 int
15637 ix86_mode_exit (int entity)
15639 switch (entity)
15641 case AVX_U128:
15642 return ix86_avx_u128_mode_exit ();
15643 case I387_TRUNC:
15644 case I387_FLOOR:
15645 case I387_CEIL:
15646 case I387_MASK_PM:
15647 return I387_CW_ANY;
15648 default:
15649 gcc_unreachable ();
15653 /* Output code to initialize control word copies used by trunc?f?i and
15654 rounding patterns. CURRENT_MODE is set to current control word,
15655 while NEW_MODE is set to new control word. */
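/* Reminder on the control word bits manipulated below: bits 10-11
   (mask 0x0c00) select the rounding mode (00 nearest, 01 down, 10 up,
   11 truncate) and bit 5 (mask 0x0020) masks the precision exception,
   which is all nearbyint() needs.  */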
15657 static void
15658 emit_i387_cw_initialization (int mode)
15660 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15661 rtx new_mode;
15663 enum ix86_stack_slot slot;
15665 rtx reg = gen_reg_rtx (HImode);
15667 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15668 emit_move_insn (reg, copy_rtx (stored_mode));
15670 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15671 || optimize_function_for_size_p (cfun))
15673 switch (mode)
15675 case I387_CW_TRUNC:
15676 /* round toward zero (truncate) */
15677 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15678 slot = SLOT_CW_TRUNC;
15679 break;
15681 case I387_CW_FLOOR:
15682 /* round down toward -oo */
15683 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15684 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15685 slot = SLOT_CW_FLOOR;
15686 break;
15688 case I387_CW_CEIL:
15689 /* round up toward +oo */
15690 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15691 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15692 slot = SLOT_CW_CEIL;
15693 break;
15695 case I387_CW_MASK_PM:
15696 /* mask precision exception for nearbyint() */
15697 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15698 slot = SLOT_CW_MASK_PM;
15699 break;
15701 default:
15702 gcc_unreachable ();
15705 else
15707 switch (mode)
15709 case I387_CW_TRUNC:
15710 /* round toward zero (truncate) */
15711 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15712 slot = SLOT_CW_TRUNC;
15713 break;
15715 case I387_CW_FLOOR:
15716 /* round down toward -oo */
15717 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15718 slot = SLOT_CW_FLOOR;
15719 break;
15721 case I387_CW_CEIL:
15722 /* round up toward +oo */
15723 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15724 slot = SLOT_CW_CEIL;
15725 break;
15727 case I387_CW_MASK_PM:
15728 /* mask precision exception for nearbyint() */
15729 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15730 slot = SLOT_CW_MASK_PM;
15731 break;
15733 default:
15734 gcc_unreachable ();
15738 gcc_assert (slot < MAX_386_STACK_LOCALS);
15740 new_mode = assign_386_stack_local (HImode, slot);
15741 emit_move_insn (new_mode, reg);
15744 /* Emit vzeroupper. */
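/* vzeroupper clears the upper 128 bits of all ymm registers, so it must
   not be emitted while a call-saved SSE register still holds a live
   256bit value; the checks below cancel the insertion in that case.  */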
15746 void
15747 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15749 int i;
15751 /* Cancel automatic vzeroupper insertion if there are
15752 live call-saved SSE registers at the insertion point. */
15754 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15755 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15756 return;
15758 if (TARGET_64BIT)
15759 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15760 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15761 return;
15763 emit_insn (gen_avx_vzeroupper ());
15766 /* Generate one or more insns to set ENTITY to MODE. */
15768 void
15769 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15771 switch (entity)
15773 case AVX_U128:
15774 if (mode == AVX_U128_CLEAN)
15775 ix86_avx_emit_vzeroupper (regs_live);
15776 break;
15777 case I387_TRUNC:
15778 case I387_FLOOR:
15779 case I387_CEIL:
15780 case I387_MASK_PM:
15781 if (mode != I387_CW_ANY
15782 && mode != I387_CW_UNINITIALIZED)
15783 emit_i387_cw_initialization (mode);
15784 break;
15785 default:
15786 gcc_unreachable ();
15790 /* Output code for INSN to convert a float to a signed int. OPERANDS
15791 are the insn operands. The output may be [HSD]Imode and the input
15792 operand may be [SDX]Fmode. */
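/* Typical output for the non-fisttp case is "fldcw %3; fistp %0;
   fldcw %2": the control word is switched to the required rounding mode
   around the store and then restored, since fist/fistp round according
   to the current rounding control.  */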
15794 const char *
15795 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15797 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15798 int dimode_p = GET_MODE (operands[0]) == DImode;
15799 int round_mode = get_attr_i387_cw (insn);
15801 /* Jump through a hoop or two for DImode, since the hardware has no
15802 non-popping instruction. We used to do this a different way, but
15803 that was somewhat fragile and broke with post-reload splitters. */
15804 if ((dimode_p || fisttp) && !stack_top_dies)
15805 output_asm_insn ("fld\t%y1", operands);
15807 gcc_assert (STACK_TOP_P (operands[1]));
15808 gcc_assert (MEM_P (operands[0]));
15809 gcc_assert (GET_MODE (operands[1]) != TFmode);
15811 if (fisttp)
15812 output_asm_insn ("fisttp%Z0\t%0", operands);
15813 else
15815 if (round_mode != I387_CW_ANY)
15816 output_asm_insn ("fldcw\t%3", operands);
15817 if (stack_top_dies || dimode_p)
15818 output_asm_insn ("fistp%Z0\t%0", operands);
15819 else
15820 output_asm_insn ("fist%Z0\t%0", operands);
15821 if (round_mode != I387_CW_ANY)
15822 output_asm_insn ("fldcw\t%2", operands);
15825 return "";
15828 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15829 have the values zero or one, indicates the ffreep insn's operand
15830 from the OPERANDS array. */
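/* When the assembler lacks the mnemonic, the fallback below emits the raw
   encoding instead: "ffreep %st(N)" is the byte pair 0xdf, 0xc0+N, which
   ASM_SHORT stores as a little-endian 16-bit word with 0xdf in the low
   byte, hence the "0xc%ddf" format.  */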
15832 static const char *
15833 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15835 if (TARGET_USE_FFREEP)
15836 #ifdef HAVE_AS_IX86_FFREEP
15837 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15838 #else
15840 static char retval[32];
15841 int regno = REGNO (operands[opno]);
15843 gcc_assert (STACK_REGNO_P (regno));
15845 regno -= FIRST_STACK_REG;
15847 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15848 return retval;
15850 #endif
15852 return opno ? "fstp\t%y1" : "fstp\t%y0";
15856 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15857 should be used. UNORDERED_P is true when fucom should be used. */
15859 const char *
15860 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15862 int stack_top_dies;
15863 rtx cmp_op0, cmp_op1;
15864 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15866 if (eflags_p)
15868 cmp_op0 = operands[0];
15869 cmp_op1 = operands[1];
15871 else
15873 cmp_op0 = operands[1];
15874 cmp_op1 = operands[2];
15877 if (is_sse)
15879 if (GET_MODE (operands[0]) == SFmode)
15880 if (unordered_p)
15881 return "%vucomiss\t{%1, %0|%0, %1}";
15882 else
15883 return "%vcomiss\t{%1, %0|%0, %1}";
15884 else
15885 if (unordered_p)
15886 return "%vucomisd\t{%1, %0|%0, %1}";
15887 else
15888 return "%vcomisd\t{%1, %0|%0, %1}";
15891 gcc_assert (STACK_TOP_P (cmp_op0));
15893 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15895 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15897 if (stack_top_dies)
15899 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15900 return output_387_ffreep (operands, 1);
15902 else
15903 return "ftst\n\tfnstsw\t%0";
15906 if (STACK_REG_P (cmp_op1)
15907 && stack_top_dies
15908 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15909 && REGNO (cmp_op1) != FIRST_STACK_REG)
15911 /* If the top of the 387 stack dies, and the other operand
15912 is also a stack register that dies, then this must be a
15913 `fcompp' float compare. */
15915 if (eflags_p)
15917 /* There is no double popping fcomi variant. Fortunately,
15918 eflags is immune from the fstp's cc clobbering. */
15919 if (unordered_p)
15920 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15921 else
15922 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15923 return output_387_ffreep (operands, 0);
15925 else
15927 if (unordered_p)
15928 return "fucompp\n\tfnstsw\t%0";
15929 else
15930 return "fcompp\n\tfnstsw\t%0";
15933 else
15935 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15937 static const char * const alt[16] =
15939 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15940 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15941 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15942 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15944 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15945 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15946 NULL,
15947 NULL,
15949 "fcomi\t{%y1, %0|%0, %y1}",
15950 "fcomip\t{%y1, %0|%0, %y1}",
15951 "fucomi\t{%y1, %0|%0, %y1}",
15952 "fucomip\t{%y1, %0|%0, %y1}",
15954 NULL,
15955 NULL,
15956 NULL,
15957 NULL
15960 int mask;
15961 const char *ret;
15963 mask = eflags_p << 3;
15964 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15965 mask |= unordered_p << 1;
15966 mask |= stack_top_dies;
15968 gcc_assert (mask < 16);
15969 ret = alt[mask];
15970 gcc_assert (ret);
15972 return ret;
15976 void
15977 ix86_output_addr_vec_elt (FILE *file, int value)
15979 const char *directive = ASM_LONG;
15981 #ifdef ASM_QUAD
15982 if (TARGET_LP64)
15983 directive = ASM_QUAD;
15984 #else
15985 gcc_assert (!TARGET_64BIT);
15986 #endif
15988 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15991 void
15992 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15994 const char *directive = ASM_LONG;
15996 #ifdef ASM_QUAD
15997 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15998 directive = ASM_QUAD;
15999 #else
16000 gcc_assert (!TARGET_64BIT);
16001 #endif
16002 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16003 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16004 fprintf (file, "%s%s%d-%s%d\n",
16005 directive, LPREFIX, value, LPREFIX, rel);
16006 else if (HAVE_AS_GOTOFF_IN_DATA)
16007 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16008 #if TARGET_MACHO
16009 else if (TARGET_MACHO)
16011 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16012 machopic_output_function_base_name (file);
16013 putc ('\n', file);
16015 #endif
16016 else
16017 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16018 GOT_SYMBOL_NAME, LPREFIX, value);
16021 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16022 for the target. */
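/* "xor reg, reg" is the shorter encoding (2 bytes for a 32-bit register
   versus 5 bytes for "mov $0, reg") but clobbers the flags, which is why
   the xor form below is wrapped in a PARALLEL with a FLAGS_REG clobber.  */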
16024 void
16025 ix86_expand_clear (rtx dest)
16027 rtx tmp;
16029 /* We play register width games, which are only valid after reload. */
16030 gcc_assert (reload_completed);
16032 /* Avoid HImode and its attendant prefix byte. */
16033 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16034 dest = gen_rtx_REG (SImode, REGNO (dest));
16035 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16037 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16038 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16040 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16041 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16044 emit_insn (tmp);
16047 /* X is an unchanging MEM. If it is a constant pool reference, return
16048 the constant pool rtx, else NULL. */
16051 maybe_get_pool_constant (rtx x)
16053 x = ix86_delegitimize_address (XEXP (x, 0));
16055 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16056 return get_pool_constant (x);
16058 return NULL_RTX;
16061 void
16062 ix86_expand_move (enum machine_mode mode, rtx operands[])
16064 rtx op0, op1;
16065 enum tls_model model;
16067 op0 = operands[0];
16068 op1 = operands[1];
16070 if (GET_CODE (op1) == SYMBOL_REF)
16072 rtx tmp;
16074 model = SYMBOL_REF_TLS_MODEL (op1);
16075 if (model)
16077 op1 = legitimize_tls_address (op1, model, true);
16078 op1 = force_operand (op1, op0);
16079 if (op1 == op0)
16080 return;
16081 op1 = convert_to_mode (mode, op1, 1);
16083 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16084 op1 = tmp;
16086 else if (GET_CODE (op1) == CONST
16087 && GET_CODE (XEXP (op1, 0)) == PLUS
16088 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16090 rtx addend = XEXP (XEXP (op1, 0), 1);
16091 rtx symbol = XEXP (XEXP (op1, 0), 0);
16092 rtx tmp;
16094 model = SYMBOL_REF_TLS_MODEL (symbol);
16095 if (model)
16096 tmp = legitimize_tls_address (symbol, model, true);
16097 else
16098 tmp = legitimize_pe_coff_symbol (symbol, true);
16100 if (tmp)
16102 tmp = force_operand (tmp, NULL);
16103 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16104 op0, 1, OPTAB_DIRECT);
16105 if (tmp == op0)
16106 return;
16107 op1 = convert_to_mode (mode, tmp, 1);
16111 if ((flag_pic || MACHOPIC_INDIRECT)
16112 && symbolic_operand (op1, mode))
16114 if (TARGET_MACHO && !TARGET_64BIT)
16116 #if TARGET_MACHO
16117 /* dynamic-no-pic */
16118 if (MACHOPIC_INDIRECT)
16120 rtx temp = ((reload_in_progress
16121 || ((op0 && REG_P (op0))
16122 && mode == Pmode))
16123 ? op0 : gen_reg_rtx (Pmode));
16124 op1 = machopic_indirect_data_reference (op1, temp);
16125 if (MACHOPIC_PURE)
16126 op1 = machopic_legitimize_pic_address (op1, mode,
16127 temp == op1 ? 0 : temp);
16129 if (op0 != op1 && GET_CODE (op0) != MEM)
16131 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16132 emit_insn (insn);
16133 return;
16135 if (GET_CODE (op0) == MEM)
16136 op1 = force_reg (Pmode, op1);
16137 else
16139 rtx temp = op0;
16140 if (GET_CODE (temp) != REG)
16141 temp = gen_reg_rtx (Pmode);
16142 temp = legitimize_pic_address (op1, temp);
16143 if (temp == op0)
16144 return;
16145 op1 = temp;
16147 /* dynamic-no-pic */
16148 #endif
16150 else
16152 if (MEM_P (op0))
16153 op1 = force_reg (mode, op1);
16154 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16156 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16157 op1 = legitimize_pic_address (op1, reg);
16158 if (op0 == op1)
16159 return;
16160 op1 = convert_to_mode (mode, op1, 1);
16164 else
16166 if (MEM_P (op0)
16167 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16168 || !push_operand (op0, mode))
16169 && MEM_P (op1))
16170 op1 = force_reg (mode, op1);
16172 if (push_operand (op0, mode)
16173 && ! general_no_elim_operand (op1, mode))
16174 op1 = copy_to_mode_reg (mode, op1);
16176 /* Force large constants in 64bit compilation into a register
16177 to get them CSEed. */
16178 if (can_create_pseudo_p ()
16179 && (mode == DImode) && TARGET_64BIT
16180 && immediate_operand (op1, mode)
16181 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16182 && !register_operand (op0, mode)
16183 && optimize)
16184 op1 = copy_to_mode_reg (mode, op1);
16186 if (can_create_pseudo_p ()
16187 && FLOAT_MODE_P (mode)
16188 && GET_CODE (op1) == CONST_DOUBLE)
16190 /* If we are loading a floating point constant to a register,
16191 force the value to memory now, since we'll get better code
16192 out of the back end. */
16194 op1 = validize_mem (force_const_mem (mode, op1));
16195 if (!register_operand (op0, mode))
16197 rtx temp = gen_reg_rtx (mode);
16198 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16199 emit_move_insn (op0, temp);
16200 return;
16205 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16208 void
16209 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16211 rtx op0 = operands[0], op1 = operands[1];
16212 unsigned int align = GET_MODE_ALIGNMENT (mode);
16214 /* Force constants other than zero into memory. We do not know how
16215 the instructions used to build constants modify the upper 64 bits
16216 of the register; once we have that information we may be able
16217 to handle some of them more efficiently. */
16218 if (can_create_pseudo_p ()
16219 && register_operand (op0, mode)
16220 && (CONSTANT_P (op1)
16221 || (GET_CODE (op1) == SUBREG
16222 && CONSTANT_P (SUBREG_REG (op1))))
16223 && !standard_sse_constant_p (op1))
16224 op1 = validize_mem (force_const_mem (mode, op1));
16226 /* We need to check memory alignment for SSE modes since an attribute
16227 can make operands unaligned. */
16228 if (can_create_pseudo_p ()
16229 && SSE_REG_MODE_P (mode)
16230 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16231 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16233 rtx tmp[2];
16235 /* ix86_expand_vector_move_misalign() does not like constants ... */
16236 if (CONSTANT_P (op1)
16237 || (GET_CODE (op1) == SUBREG
16238 && CONSTANT_P (SUBREG_REG (op1))))
16239 op1 = validize_mem (force_const_mem (mode, op1));
16241 /* ... nor both arguments in memory. */
16242 if (!register_operand (op0, mode)
16243 && !register_operand (op1, mode))
16244 op1 = force_reg (mode, op1);
16246 tmp[0] = op0; tmp[1] = op1;
16247 ix86_expand_vector_move_misalign (mode, tmp);
16248 return;
16251 /* Make operand1 a register if it isn't already. */
16252 if (can_create_pseudo_p ()
16253 && !register_operand (op0, mode)
16254 && !register_operand (op1, mode))
16256 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16257 return;
16260 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16263 /* Split 32-byte AVX unaligned load and store if needed. */
16265 static void
16266 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16268 rtx m;
16269 rtx (*extract) (rtx, rtx, rtx);
16270 rtx (*load_unaligned) (rtx, rtx);
16271 rtx (*store_unaligned) (rtx, rtx);
16272 enum machine_mode mode;
16274 switch (GET_MODE (op0))
16276 default:
16277 gcc_unreachable ();
16278 case V32QImode:
16279 extract = gen_avx_vextractf128v32qi;
16280 load_unaligned = gen_avx_loaddqu256;
16281 store_unaligned = gen_avx_storedqu256;
16282 mode = V16QImode;
16283 break;
16284 case V8SFmode:
16285 extract = gen_avx_vextractf128v8sf;
16286 load_unaligned = gen_avx_loadups256;
16287 store_unaligned = gen_avx_storeups256;
16288 mode = V4SFmode;
16289 break;
16290 case V4DFmode:
16291 extract = gen_avx_vextractf128v4df;
16292 load_unaligned = gen_avx_loadupd256;
16293 store_unaligned = gen_avx_storeupd256;
16294 mode = V2DFmode;
16295 break;
16298 if (MEM_P (op1))
16300 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16302 rtx r = gen_reg_rtx (mode);
16303 m = adjust_address (op1, mode, 0);
16304 emit_move_insn (r, m);
16305 m = adjust_address (op1, mode, 16);
16306 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16307 emit_move_insn (op0, r);
16309 else
16310 emit_insn (load_unaligned (op0, op1));
16312 else if (MEM_P (op0))
16314 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16316 m = adjust_address (op0, mode, 0);
16317 emit_insn (extract (m, op1, const0_rtx));
16318 m = adjust_address (op0, mode, 16);
16319 emit_insn (extract (m, op1, const1_rtx));
16321 else
16322 emit_insn (store_unaligned (op0, op1));
16324 else
16325 gcc_unreachable ();
16328 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16329 straight to ix86_expand_vector_move. */
16330 /* Code generation for scalar reg-reg moves of single and double precision data:
16331 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16332 movaps reg, reg
16333 else
16334 movss reg, reg
16335 if (x86_sse_partial_reg_dependency == true)
16336 movapd reg, reg
16337 else
16338 movsd reg, reg
16340 Code generation for scalar loads of double precision data:
16341 if (x86_sse_split_regs == true)
16342 movlpd mem, reg (gas syntax)
16343 else
16344 movsd mem, reg
16346 Code generation for unaligned packed loads of single precision data
16347 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16348 if (x86_sse_unaligned_move_optimal)
16349 movups mem, reg
16351 if (x86_sse_partial_reg_dependency == true)
16353 xorps reg, reg
16354 movlps mem, reg
16355 movhps mem+8, reg
16357 else
16359 movlps mem, reg
16360 movhps mem+8, reg
16363 Code generation for unaligned packed loads of double precision data
16364 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16365 if (x86_sse_unaligned_move_optimal)
16366 movupd mem, reg
16368 if (x86_sse_split_regs == true)
16370 movlpd mem, reg
16371 movhpd mem+8, reg
16373 else
16375 movsd mem, reg
16376 movhpd mem+8, reg
16380 void
16381 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16383 rtx op0, op1, m;
16385 op0 = operands[0];
16386 op1 = operands[1];
16388 if (TARGET_AVX
16389 && GET_MODE_SIZE (mode) == 32)
16391 switch (GET_MODE_CLASS (mode))
16393 case MODE_VECTOR_INT:
16394 case MODE_INT:
16395 op0 = gen_lowpart (V32QImode, op0);
16396 op1 = gen_lowpart (V32QImode, op1);
16397 /* FALLTHRU */
16399 case MODE_VECTOR_FLOAT:
16400 ix86_avx256_split_vector_move_misalign (op0, op1);
16401 break;
16403 default:
16404 gcc_unreachable ();
16407 return;
16410 if (MEM_P (op1))
16412 /* ??? If we have typed data, then it would appear that using
16413 movdqu is the only way to get unaligned data loaded with
16414 integer type. */
16415 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16417 op0 = gen_lowpart (V16QImode, op0);
16418 op1 = gen_lowpart (V16QImode, op1);
16419 /* We will eventually emit movups based on insn attributes. */
16420 emit_insn (gen_sse2_loaddqu (op0, op1));
16422 else if (TARGET_SSE2 && mode == V2DFmode)
16424 rtx zero;
16426 if (TARGET_AVX
16427 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16428 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16429 || optimize_function_for_size_p (cfun))
16431 /* We will eventually emit movups based on insn attributes. */
16432 emit_insn (gen_sse2_loadupd (op0, op1));
16433 return;
16436 /* When SSE registers are split into halves, we can avoid
16437 writing to the top half twice. */
16438 if (TARGET_SSE_SPLIT_REGS)
16440 emit_clobber (op0);
16441 zero = op0;
16443 else
16445 /* ??? Not sure about the best option for the Intel chips.
16446 The following would seem to satisfy; the register is
16447 entirely cleared, breaking the dependency chain. We
16448 then store to the upper half, with a dependency depth
16449 of one. A rumor has it that Intel recommends two movsd
16450 followed by an unpacklpd, but this is unconfirmed. And
16451 given that the dependency depth of the unpacklpd would
16452 still be one, I'm not sure why this would be better. */
16453 zero = CONST0_RTX (V2DFmode);
16456 m = adjust_address (op1, DFmode, 0);
16457 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16458 m = adjust_address (op1, DFmode, 8);
16459 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16461 else
16463 if (TARGET_AVX
16464 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16465 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16466 || optimize_function_for_size_p (cfun))
16468 op0 = gen_lowpart (V4SFmode, op0);
16469 op1 = gen_lowpart (V4SFmode, op1);
16470 emit_insn (gen_sse_loadups (op0, op1));
16471 return;
16474 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16475 emit_move_insn (op0, CONST0_RTX (mode));
16476 else
16477 emit_clobber (op0);
16479 if (mode != V4SFmode)
16480 op0 = gen_lowpart (V4SFmode, op0);
16482 m = adjust_address (op1, V2SFmode, 0);
16483 emit_insn (gen_sse_loadlps (op0, op0, m));
16484 m = adjust_address (op1, V2SFmode, 8);
16485 emit_insn (gen_sse_loadhps (op0, op0, m));
16488 else if (MEM_P (op0))
16490 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16492 op0 = gen_lowpart (V16QImode, op0);
16493 op1 = gen_lowpart (V16QImode, op1);
16494 /* We will eventually emit movups based on insn attributes. */
16495 emit_insn (gen_sse2_storedqu (op0, op1));
16497 else if (TARGET_SSE2 && mode == V2DFmode)
16499 if (TARGET_AVX
16500 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16501 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16502 || optimize_function_for_size_p (cfun))
16503 /* We will eventually emit movups based on insn attributes. */
16504 emit_insn (gen_sse2_storeupd (op0, op1));
16505 else
16507 m = adjust_address (op0, DFmode, 0);
16508 emit_insn (gen_sse2_storelpd (m, op1));
16509 m = adjust_address (op0, DFmode, 8);
16510 emit_insn (gen_sse2_storehpd (m, op1));
16513 else
16515 if (mode != V4SFmode)
16516 op1 = gen_lowpart (V4SFmode, op1);
16518 if (TARGET_AVX
16519 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16520 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16521 || optimize_function_for_size_p (cfun))
16523 op0 = gen_lowpart (V4SFmode, op0);
16524 emit_insn (gen_sse_storeups (op0, op1));
16526 else
16528 m = adjust_address (op0, V2SFmode, 0);
16529 emit_insn (gen_sse_storelps (m, op1));
16530 m = adjust_address (op0, V2SFmode, 8);
16531 emit_insn (gen_sse_storehps (m, op1));
16535 else
16536 gcc_unreachable ();
16539 /* Expand a push in MODE. This is some mode for which we do not support
16540 proper push instructions, at least from the registers that we expect
16541 the value to live in. */
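/* Roughly: the stack pointer is decremented by the size of MODE and the
   value is then stored to the new top-of-stack slot with an ordinary
   move, instead of using a real push instruction.  */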
16543 void
16544 ix86_expand_push (enum machine_mode mode, rtx x)
16546 rtx tmp;
16548 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16549 GEN_INT (-GET_MODE_SIZE (mode)),
16550 stack_pointer_rtx, 1, OPTAB_DIRECT);
16551 if (tmp != stack_pointer_rtx)
16552 emit_move_insn (stack_pointer_rtx, tmp);
16554 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16556 /* When we push an operand onto the stack, it has to be aligned at least
16557 at the function argument boundary. However, since we don't have
16558 the argument type, we can't determine the actual argument
16559 boundary. */
16560 emit_move_insn (tmp, x);
16563 /* Helper function of ix86_fixup_binary_operands to canonicalize
16564 operand order. Returns true if the operands should be swapped. */
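/* For instance, for a commutative PLUS where operands[0] and operands[2]
   name the same register, swapping the sources lets the two-address
   "dst = dst op src" form match without an extra copy; constants and
   memory operands are likewise pushed towards the second slot.  */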
16566 static bool
16567 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16568 rtx operands[])
16570 rtx dst = operands[0];
16571 rtx src1 = operands[1];
16572 rtx src2 = operands[2];
16574 /* If the operation is not commutative, we can't do anything. */
16575 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16576 return false;
16578 /* Highest priority is that src1 should match dst. */
16579 if (rtx_equal_p (dst, src1))
16580 return false;
16581 if (rtx_equal_p (dst, src2))
16582 return true;
16584 /* Next highest priority is that immediate constants come second. */
16585 if (immediate_operand (src2, mode))
16586 return false;
16587 if (immediate_operand (src1, mode))
16588 return true;
16590 /* Lowest priority is that memory references should come second. */
16591 if (MEM_P (src2))
16592 return false;
16593 if (MEM_P (src1))
16594 return true;
16596 return false;
16600 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16601 destination to use for the operation. If different from the true
16602 destination in operands[0], a copy operation will be required. */
16605 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16606 rtx operands[])
16608 rtx dst = operands[0];
16609 rtx src1 = operands[1];
16610 rtx src2 = operands[2];
16612 /* Canonicalize operand order. */
16613 if (ix86_swap_binary_operands_p (code, mode, operands))
16615 rtx temp;
16617 /* It is invalid to swap operands of different modes. */
16618 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16620 temp = src1;
16621 src1 = src2;
16622 src2 = temp;
16625 /* Both source operands cannot be in memory. */
16626 if (MEM_P (src1) && MEM_P (src2))
16628 /* Optimization: Only read from memory once. */
16629 if (rtx_equal_p (src1, src2))
16631 src2 = force_reg (mode, src2);
16632 src1 = src2;
16634 else
16635 src2 = force_reg (mode, src2);
16638 /* If the destination is memory, and we do not have matching source
16639 operands, do things in registers. */
16640 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16641 dst = gen_reg_rtx (mode);
16643 /* Source 1 cannot be a constant. */
16644 if (CONSTANT_P (src1))
16645 src1 = force_reg (mode, src1);
16647 /* Source 1 cannot be a non-matching memory. */
16648 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16649 src1 = force_reg (mode, src1);
16651 /* Improve address combine. */
16652 if (code == PLUS
16653 && GET_MODE_CLASS (mode) == MODE_INT
16654 && MEM_P (src2))
16655 src2 = force_reg (mode, src2);
16657 operands[1] = src1;
16658 operands[2] = src2;
16659 return dst;
16662 /* Similarly, but assume that the destination has already been
16663 set up properly. */
16665 void
16666 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16667 enum machine_mode mode, rtx operands[])
16669 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16670 gcc_assert (dst == operands[0]);
16673 /* Attempt to expand a binary operator. Make the expansion closer to the
16674 actual machine than just general_operand, which will allow 3 separate
16675 memory references (one output, two input) in a single insn. */
16677 void
16678 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16679 rtx operands[])
16681 rtx src1, src2, dst, op, clob;
16683 dst = ix86_fixup_binary_operands (code, mode, operands);
16684 src1 = operands[1];
16685 src2 = operands[2];
16687 /* Emit the instruction. */
16689 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16690 if (reload_in_progress)
16692 /* Reload doesn't know about the flags register, and doesn't know that
16693 it doesn't want to clobber it. We can only do this with PLUS. */
16694 gcc_assert (code == PLUS);
16695 emit_insn (op);
16697 else if (reload_completed
16698 && code == PLUS
16699 && !rtx_equal_p (dst, src1))
16701 /* This is going to be an LEA; avoid splitting it later. */
16702 emit_insn (op);
16704 else
16706 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16707 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16710 /* Fix up the destination if needed. */
16711 if (dst != operands[0])
16712 emit_move_insn (operands[0], dst);
16715 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16716 the given OPERANDS. */
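/* The SUBREG special case below keeps such a logical operation in the
   float domain, emitting andps/orps/xorps (or the pd forms) instead of
   pand/por/pxor, which can avoid a domain-crossing penalty on some
   targets.  */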
16718 void
16719 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16720 rtx operands[])
16722 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16723 if (GET_CODE (operands[1]) == SUBREG)
16725 op1 = operands[1];
16726 op2 = operands[2];
16728 else if (GET_CODE (operands[2]) == SUBREG)
16730 op1 = operands[2];
16731 op2 = operands[1];
16733 /* Optimize (__m128i) d | (__m128i) e and similar code
16734 when d and e are float vectors into a float vector logical
16735 insn. In C/C++ without using intrinsics there is no other way
16736 to express a vector logical operation on float vectors than
16737 to cast them temporarily to integer vectors. */
16738 if (op1
16739 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16740 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16741 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16742 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16743 && SUBREG_BYTE (op1) == 0
16744 && (GET_CODE (op2) == CONST_VECTOR
16745 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16746 && SUBREG_BYTE (op2) == 0))
16747 && can_create_pseudo_p ())
16749 rtx dst;
16750 switch (GET_MODE (SUBREG_REG (op1)))
16752 case V4SFmode:
16753 case V8SFmode:
16754 case V2DFmode:
16755 case V4DFmode:
16756 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16757 if (GET_CODE (op2) == CONST_VECTOR)
16759 op2 = gen_lowpart (GET_MODE (dst), op2);
16760 op2 = force_reg (GET_MODE (dst), op2);
16762 else
16764 op1 = operands[1];
16765 op2 = SUBREG_REG (operands[2]);
16766 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16767 op2 = force_reg (GET_MODE (dst), op2);
16769 op1 = SUBREG_REG (op1);
16770 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16771 op1 = force_reg (GET_MODE (dst), op1);
16772 emit_insn (gen_rtx_SET (VOIDmode, dst,
16773 gen_rtx_fmt_ee (code, GET_MODE (dst),
16774 op1, op2)));
16775 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16776 return;
16777 default:
16778 break;
16781 if (!nonimmediate_operand (operands[1], mode))
16782 operands[1] = force_reg (mode, operands[1]);
16783 if (!nonimmediate_operand (operands[2], mode))
16784 operands[2] = force_reg (mode, operands[2]);
16785 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16786 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16787 gen_rtx_fmt_ee (code, mode, operands[1],
16788 operands[2])));
16791 /* Return TRUE or FALSE depending on whether the binary operator meets the
16792 appropriate constraints. */
16794 bool
16795 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16796 rtx operands[3])
16798 rtx dst = operands[0];
16799 rtx src1 = operands[1];
16800 rtx src2 = operands[2];
16802 /* Both source operands cannot be in memory. */
16803 if (MEM_P (src1) && MEM_P (src2))
16804 return false;
16806 /* Canonicalize operand order for commutative operators. */
16807 if (ix86_swap_binary_operands_p (code, mode, operands))
16809 rtx temp = src1;
16810 src1 = src2;
16811 src2 = temp;
16814 /* If the destination is memory, we must have a matching source operand. */
16815 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16816 return false;
16818 /* Source 1 cannot be a constant. */
16819 if (CONSTANT_P (src1))
16820 return false;
16822 /* Source 1 cannot be a non-matching memory. */
16823 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16824 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16825 return (code == AND
16826 && (mode == HImode
16827 || mode == SImode
16828 || (TARGET_64BIT && mode == DImode))
16829 && satisfies_constraint_L (src2));
16831 return true;
16834 /* Attempt to expand a unary operator. Make the expansion closer to the
16835 actual machine than just general_operand, which will allow 2 separate
16836 memory references (one output, one input) in a single insn. */
16838 void
16839 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16840 rtx operands[])
16842 int matching_memory;
16843 rtx src, dst, op, clob;
16845 dst = operands[0];
16846 src = operands[1];
16848 /* If the destination is memory, and we do not have matching source
16849 operands, do things in registers. */
16850 matching_memory = 0;
16851 if (MEM_P (dst))
16853 if (rtx_equal_p (dst, src))
16854 matching_memory = 1;
16855 else
16856 dst = gen_reg_rtx (mode);
16859 /* When the source operand is memory, the destination must match. */
16860 if (MEM_P (src) && !matching_memory)
16861 src = force_reg (mode, src);
16863 /* Emit the instruction. */
16865 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16866 if (reload_in_progress || code == NOT)
16868 /* Reload doesn't know about the flags register, and doesn't know that
16869 it doesn't want to clobber it. */
16870 gcc_assert (code == NOT);
16871 emit_insn (op);
16873 else
16875 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16876 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16879 /* Fix up the destination if needed. */
16880 if (dst != operands[0])
16881 emit_move_insn (operands[0], dst);
16884 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16885 divisor are within the range [0-255]. */
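/* Sketch of the emitted sequence: OR the dividend and divisor into a
   scratch register, test it against ~0xff and branch to an 8bit "divb"
   path (quotient in AL, remainder in AH) when no high bits are set;
   otherwise fall through to the full-width division.  */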
16887 void
16888 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16889 bool signed_p)
16891 rtx end_label, qimode_label;
16892 rtx insn, div, mod;
16893 rtx scratch, tmp0, tmp1, tmp2;
16894 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16895 rtx (*gen_zero_extend) (rtx, rtx);
16896 rtx (*gen_test_ccno_1) (rtx, rtx);
16898 switch (mode)
16900 case SImode:
16901 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16902 gen_test_ccno_1 = gen_testsi_ccno_1;
16903 gen_zero_extend = gen_zero_extendqisi2;
16904 break;
16905 case DImode:
16906 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16907 gen_test_ccno_1 = gen_testdi_ccno_1;
16908 gen_zero_extend = gen_zero_extendqidi2;
16909 break;
16910 default:
16911 gcc_unreachable ();
16914 end_label = gen_label_rtx ();
16915 qimode_label = gen_label_rtx ();
16917 scratch = gen_reg_rtx (mode);
16919 /* Use 8bit unsigned divmod if dividend and divisor are within
16920 the range [0-255]. */
16921 emit_move_insn (scratch, operands[2]);
16922 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16923 scratch, 1, OPTAB_DIRECT);
16924 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16925 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16926 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16927 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16928 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16929 pc_rtx);
16930 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16931 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16932 JUMP_LABEL (insn) = qimode_label;
16934 /* Generate original signed/unsigned divmod. */
16935 div = gen_divmod4_1 (operands[0], operands[1],
16936 operands[2], operands[3]);
16937 emit_insn (div);
16939 /* Branch to the end. */
16940 emit_jump_insn (gen_jump (end_label));
16941 emit_barrier ();
16943 /* Generate 8bit unsigned divide. */
16944 emit_label (qimode_label);
16945 /* Don't use operands[0] for result of 8bit divide since not all
16946 registers support QImode ZERO_EXTRACT. */
16947 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16948 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16949 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16950 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16952 if (signed_p)
16954 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16955 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16957 else
16959 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16960 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16963 /* Extract remainder from AH. */
16964 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16965 if (REG_P (operands[1]))
16966 insn = emit_move_insn (operands[1], tmp1);
16967 else
16969 /* Need a new scratch register since the old one has result
16970 of 8bit divide. */
16971 scratch = gen_reg_rtx (mode);
16972 emit_move_insn (scratch, tmp1);
16973 insn = emit_move_insn (operands[1], scratch);
16975 set_unique_reg_note (insn, REG_EQUAL, mod);
16977 /* Zero extend quotient from AL. */
16978 tmp1 = gen_lowpart (QImode, tmp0);
16979 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16980 set_unique_reg_note (insn, REG_EQUAL, div);
16982 emit_label (end_label);
16985 #define LEA_MAX_STALL (3)
16986 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16988 /* Increase given DISTANCE in half-cycles according to
16989 dependencies between PREV and NEXT instructions.
16990 Add 1 half-cycle if there is no dependency and
16991 go to the next cycle if there is some dependency. */
16993 static unsigned int
16994 increase_distance (rtx prev, rtx next, unsigned int distance)
16996 df_ref *use_rec;
16997 df_ref *def_rec;
16999 if (!prev || !next)
17000 return distance + (distance & 1) + 2;
17002 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17003 return distance + 1;
17005 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17006 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17007 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17008 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17009 return distance + (distance & 1) + 2;
17011 return distance + 1;
17014 /* Return true if instruction INSN defines register number
17015 REGNO1 or REGNO2. */
17017 static bool
17018 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17019 rtx insn)
17021 df_ref *def_rec;
17023 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17024 if (DF_REF_REG_DEF_P (*def_rec)
17025 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17026 && (regno1 == DF_REF_REGNO (*def_rec)
17027 || regno2 == DF_REF_REGNO (*def_rec)))
17029 return true;
17032 return false;
17035 /* Return true if instruction INSN uses register number
17036 REGNO as a part of an address expression. */
17038 static bool
17039 insn_uses_reg_mem (unsigned int regno, rtx insn)
17041 df_ref *use_rec;
17043 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17044 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17045 return true;
17047 return false;
17050 /* Search backward for non-agu definition of register number REGNO1
17051 or register number REGNO2 in basic block starting from instruction
17052 START up to head of basic block or instruction INSN.
17054 Put true into *FOUND if a definition was found
17055 and false otherwise.
17057 The distance in half-cycles between START and the found instruction,
17058 or the head of the BB, is added to DISTANCE and returned. */
17060 static int
17061 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17062 rtx insn, int distance,
17063 rtx start, bool *found)
17065 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17066 rtx prev = start;
17067 rtx next = NULL;
17069 *found = false;
17071 while (prev
17072 && prev != insn
17073 && distance < LEA_SEARCH_THRESHOLD)
17075 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17077 distance = increase_distance (prev, next, distance);
17078 if (insn_defines_reg (regno1, regno2, prev))
17080 if (recog_memoized (prev) < 0
17081 || get_attr_type (prev) != TYPE_LEA)
17083 *found = true;
17084 return distance;
17088 next = prev;
17090 if (prev == BB_HEAD (bb))
17091 break;
17093 prev = PREV_INSN (prev);
17096 return distance;
17099 /* Search backward for non-agu definition of register number REGNO1
17100 or register number REGNO2 in INSN's basic block until
17101 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17102 2. Reach a neighbouring BB's boundary, or
17103 3. Reach agu definition.
17104 Returns the distance between the non-agu definition point and INSN.
17105 If no definition point, returns -1. */
17107 static int
17108 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17109 rtx insn)
17111 basic_block bb = BLOCK_FOR_INSN (insn);
17112 int distance = 0;
17113 bool found = false;
17115 if (insn != BB_HEAD (bb))
17116 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17117 distance, PREV_INSN (insn),
17118 &found);
17120 if (!found && distance < LEA_SEARCH_THRESHOLD)
17122 edge e;
17123 edge_iterator ei;
17124 bool simple_loop = false;
17126 FOR_EACH_EDGE (e, ei, bb->preds)
17127 if (e->src == bb)
17129 simple_loop = true;
17130 break;
17133 if (simple_loop)
17134 distance = distance_non_agu_define_in_bb (regno1, regno2,
17135 insn, distance,
17136 BB_END (bb), &found);
17137 else
17139 int shortest_dist = -1;
17140 bool found_in_bb = false;
17142 FOR_EACH_EDGE (e, ei, bb->preds)
17144 int bb_dist
17145 = distance_non_agu_define_in_bb (regno1, regno2,
17146 insn, distance,
17147 BB_END (e->src),
17148 &found_in_bb);
17149 if (found_in_bb)
17151 if (shortest_dist < 0)
17152 shortest_dist = bb_dist;
17153 else if (bb_dist > 0)
17154 shortest_dist = MIN (bb_dist, shortest_dist);
17156 found = true;
17160 distance = shortest_dist;
17164 /* get_attr_type may modify recog data. We want to make sure
17165 that recog data is valid for instruction INSN, on which
17166 distance_non_agu_define is called. INSN is unchanged here. */
17167 extract_insn_cached (insn);
17169 if (!found)
17170 return -1;
17172 return distance >> 1;
17175 /* Return the distance in half-cycles between INSN and the next
17176 insn that uses register number REGNO in a memory address, added
17177 to DISTANCE. Return -1 if REGNO is set.
17179 Put true into *FOUND if a register usage was found and
17180 false otherwise.
17181 Put true into *REDEFINED if a register redefinition was
17182 found and false otherwise. */
17184 static int
17185 distance_agu_use_in_bb (unsigned int regno,
17186 rtx insn, int distance, rtx start,
17187 bool *found, bool *redefined)
17189 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17190 rtx next = start;
17191 rtx prev = NULL;
17193 *found = false;
17194 *redefined = false;
17196 while (next
17197 && next != insn
17198 && distance < LEA_SEARCH_THRESHOLD)
17200 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17202 distance = increase_distance(prev, next, distance);
17203 if (insn_uses_reg_mem (regno, next))
17205 /* Return DISTANCE if OP0 is used in memory
17206 address in NEXT. */
17207 *found = true;
17208 return distance;
17211 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17213 /* Return -1 if OP0 is set in NEXT. */
17214 *redefined = true;
17215 return -1;
17218 prev = next;
17221 if (next == BB_END (bb))
17222 break;
17224 next = NEXT_INSN (next);
17227 return distance;
17230 /* Return the distance between INSN and the next insn that uses
17231 register number REGNO0 in a memory address. Return -1 if no such
17232 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17234 static int
17235 distance_agu_use (unsigned int regno0, rtx insn)
17237 basic_block bb = BLOCK_FOR_INSN (insn);
17238 int distance = 0;
17239 bool found = false;
17240 bool redefined = false;
17242 if (insn != BB_END (bb))
17243 distance = distance_agu_use_in_bb (regno0, insn, distance,
17244 NEXT_INSN (insn),
17245 &found, &redefined);
17247 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17249 edge e;
17250 edge_iterator ei;
17251 bool simple_loop = false;
17253 FOR_EACH_EDGE (e, ei, bb->succs)
17254 if (e->dest == bb)
17256 simple_loop = true;
17257 break;
17260 if (simple_loop)
17261 distance = distance_agu_use_in_bb (regno0, insn,
17262 distance, BB_HEAD (bb),
17263 &found, &redefined);
17264 else
17266 int shortest_dist = -1;
17267 bool found_in_bb = false;
17268 bool redefined_in_bb = false;
17270 FOR_EACH_EDGE (e, ei, bb->succs)
17272 int bb_dist
17273 = distance_agu_use_in_bb (regno0, insn,
17274 distance, BB_HEAD (e->dest),
17275 &found_in_bb, &redefined_in_bb);
17276 if (found_in_bb)
17278 if (shortest_dist < 0)
17279 shortest_dist = bb_dist;
17280 else if (bb_dist > 0)
17281 shortest_dist = MIN (bb_dist, shortest_dist);
17283 found = true;
17287 distance = shortest_dist;
17291 if (!found || redefined)
17292 return -1;
17294 return distance >> 1;
17297 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17298 there is a choice between LEA and ADD.
17299 Negative value: ADD is preferred over LEA
17300 Zero: Neutral
17301 Positive value: LEA is preferred over ADD. */
17302 #define IX86_LEA_PRIORITY 0
17304 /* Return true if use of lea INSN has a performance advantage over a
17305 sequence of instructions. The instruction sequence has SPLIT_COST
17306 cycles higher latency than the lea itself. */
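/* For example, if the address register was last written by a non-AGU
   instruction one cycle before the lea (dist_define == 1) and the lea
   result feeds a memory address three cycles later (dist_use == 3),
   then with SPLIT_COST 1 and IX86_LEA_PRIORITY 0 we compare 1 + 1
   against 3, so the split sequence wins and false is returned.  */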
17308 static bool
17309 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17310 unsigned int regno2, int split_cost)
17312 int dist_define, dist_use;
17314 dist_define = distance_non_agu_define (regno1, regno2, insn);
17315 dist_use = distance_agu_use (regno0, insn);
17317 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17319 /* If there is no non-AGU operand definition, no AGU
17320 operand usage and the split cost is 0, then both the lea
17321 and non-lea variants have the same priority. Currently
17322 we prefer lea for 64-bit code and non-lea for 32-bit
17323 code. */
17324 if (dist_use < 0 && split_cost == 0)
17325 return TARGET_64BIT || IX86_LEA_PRIORITY;
17326 else
17327 return true;
17330 /* With a longer definition distance, lea is preferable.
17331 Here we adjust the distance to take the splitting cost and
17332 lea priority into account. */
17333 dist_define += split_cost + IX86_LEA_PRIORITY;
17335 /* If there is no use in a memory address then we just check
17336 that the split cost exceeds the AGU stall. */
17337 if (dist_use < 0)
17338 return dist_define > LEA_MAX_STALL;
17340 /* If this insn has both a backward non-agu dependence and a forward
17341 agu dependence, the one with the shorter distance takes effect. */
17342 return dist_define >= dist_use;
17345 /* Return true if it is legal to clobber flags by INSN and
17346 false otherwise. */
17348 static bool
17349 ix86_ok_to_clobber_flags (rtx insn)
17351 basic_block bb = BLOCK_FOR_INSN (insn);
17352 df_ref *use;
17353 bitmap live;
17355 while (insn)
17357 if (NONDEBUG_INSN_P (insn))
17359 for (use = DF_INSN_USES (insn); *use; use++)
17360 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17361 return false;
17363 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17364 return true;
17367 if (insn == BB_END (bb))
17368 break;
17370 insn = NEXT_INSN (insn);
17373 live = df_get_live_out(bb);
17374 return !REGNO_REG_SET_P (live, FLAGS_REG);
17377 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17378 move and add to avoid AGU stalls. */
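/* For example (illustrative only), a three-operand add that would be
   emitted as
	lea	(%ebx,%ecx), %eax
   may instead be split into
	mov	%ebx, %eax
	add	%ecx, %eax
   when the lea would otherwise stall waiting for the AGU.  */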
17380 bool
17381 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17383 unsigned int regno0, regno1, regno2;
17385 /* Check if we need to optimize. */
17386 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17387 return false;
17389 /* Check it is correct to split here. */
17390 if (!ix86_ok_to_clobber_flags(insn))
17391 return false;
17393 regno0 = true_regnum (operands[0]);
17394 regno1 = true_regnum (operands[1]);
17395 regno2 = true_regnum (operands[2]);
17397 /* We need to split only adds with a non-destructive
17398 destination operand. */
17399 if (regno0 == regno1 || regno0 == regno2)
17400 return false;
17401 else
17402 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17405 /* Return true if we should emit lea instruction instead of mov
17406 instruction. */
17408 bool
17409 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17411 unsigned int regno0, regno1;
17413 /* Check if we need to optimize. */
17414 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17415 return false;
17417 /* Use lea for reg to reg moves only. */
17418 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17419 return false;
17421 regno0 = true_regnum (operands[0]);
17422 regno1 = true_regnum (operands[1]);
17424 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17427 /* Return true if we need to split lea into a sequence of
17428 instructions to avoid AGU stalls. */
17430 bool
17431 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17433 unsigned int regno0, regno1, regno2;
17434 int split_cost;
17435 struct ix86_address parts;
17436 int ok;
17438 /* Check if we need to optimize. */
17439 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17440 return false;
17442 /* Check it is correct to split here. */
17443 if (!ix86_ok_to_clobber_flags(insn))
17444 return false;
17446 ok = ix86_decompose_address (operands[1], &parts);
17447 gcc_assert (ok);
17449 /* There should be at least two components in the address. */
17450 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17451 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17452 return false;
17454 /* We should not split into add if a non-legitimate pic
17455 operand is used as the displacement. */
17456 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17457 return false;
17459 regno0 = true_regnum (operands[0]);
17460 regno1 = INVALID_REGNUM;
17461 regno2 = INVALID_REGNUM;
17463 if (parts.base)
17464 regno1 = true_regnum (parts.base);
17465 if (parts.index)
17466 regno2 = true_regnum (parts.index);
17468 split_cost = 0;
17470 /* Compute how many cycles we will add to the execution time
17471 if we split the lea into a sequence of instructions. */
17472 if (parts.base || parts.index)
17474 /* Have to use a mov instruction if the non-destructive
17475 destination form is used. */
17476 if (regno1 != regno0 && regno2 != regno0)
17477 split_cost += 1;
17479 /* Have to add index to base if both exist. */
17480 if (parts.base && parts.index)
17481 split_cost += 1;
17483 /* Have to use shift and adds if scale is 2 or greater. */
17484 if (parts.scale > 1)
17486 if (regno0 != regno1)
17487 split_cost += 1;
17488 else if (regno2 == regno0)
17489 split_cost += 4;
17490 else
17491 split_cost += parts.scale;
17494 /* Have to use add instruction with immediate if
17495 disp is non zero. */
17496 if (parts.disp && parts.disp != const0_rtx)
17497 split_cost += 1;
17499 /* Subtract the price of lea. */
17500 split_cost -= 1;
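  /* Worked example (illustrative only): for the address 4(%rbx,%rcx,2)
     with a destination distinct from base and index, the split needs a
     mov of the index (+1), an add of the base (+1), a shift for the
     scale (+1) and an add of the displacement (+1); subtracting the lea
     itself gives a split cost of 3.  */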
17503 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17506 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17507 matches the destination. The RTX includes a clobber of FLAGS_REG. */
17509 static void
17510 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17511 rtx dst, rtx src)
17513 rtx op, clob;
17515 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17516 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17518 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17521 /* Return true if the definition of REGNO1 is found nearer to INSN than that of REGNO2. */
17523 static bool
17524 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17526 rtx prev = insn;
17527 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17529 if (insn == start)
17530 return false;
17531 while (prev && prev != start)
17533 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17535 prev = PREV_INSN (prev);
17536 continue;
17538 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17539 return true;
17540 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17541 return false;
17542 prev = PREV_INSN (prev);
17545 /* None of the regs is defined in the bb. */
17546 return false;
17549 /* Split a lea instruction into a sequence of instructions
17550 which are executed on the ALU to avoid AGU stalls.
17551 It is assumed that the flags register may be clobbered
17552 at the lea position. */
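/* For example (illustrative only, register names are placeholders):
	lea	4(%rbx,%rcx,4), %rax
   is split into roughly
	mov	%rcx, %rax	; copy the index into the destination
	sal	$2, %rax	; apply the scale with a shift
	add	%rbx, %rax	; add the base
	add	$4, %rax	; add the displacement  */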
17554 void
17555 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17557 unsigned int regno0, regno1, regno2;
17558 struct ix86_address parts;
17559 rtx target, tmp;
17560 int ok, adds;
17562 ok = ix86_decompose_address (operands[1], &parts);
17563 gcc_assert (ok);
17565 target = gen_lowpart (mode, operands[0]);
17567 regno0 = true_regnum (target);
17568 regno1 = INVALID_REGNUM;
17569 regno2 = INVALID_REGNUM;
17571 if (parts.base)
17573 parts.base = gen_lowpart (mode, parts.base);
17574 regno1 = true_regnum (parts.base);
17577 if (parts.index)
17579 parts.index = gen_lowpart (mode, parts.index);
17580 regno2 = true_regnum (parts.index);
17583 if (parts.disp)
17584 parts.disp = gen_lowpart (mode, parts.disp);
17586 if (parts.scale > 1)
17588 /* Case r1 = r1 + ... */
17589 if (regno1 == regno0)
17591 /* If we have a case r1 = r1 + C * r1 then we
17592 would have to use multiplication, which is very
17593 expensive. Assume the cost model is wrong if we
17594 get such a case here. */
17595 gcc_assert (regno2 != regno0);
17597 for (adds = parts.scale; adds > 0; adds--)
17598 ix86_emit_binop (PLUS, mode, target, parts.index);
17600 else
17602 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17603 if (regno0 != regno2)
17604 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17606 /* Use shift for scaling. */
17607 ix86_emit_binop (ASHIFT, mode, target,
17608 GEN_INT (exact_log2 (parts.scale)));
17610 if (parts.base)
17611 ix86_emit_binop (PLUS, mode, target, parts.base);
17613 if (parts.disp && parts.disp != const0_rtx)
17614 ix86_emit_binop (PLUS, mode, target, parts.disp);
17617 else if (!parts.base && !parts.index)
17619 gcc_assert(parts.disp);
17620 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17622 else
17624 if (!parts.base)
17626 if (regno0 != regno2)
17627 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17629 else if (!parts.index)
17631 if (regno0 != regno1)
17632 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17634 else
17636 if (regno0 == regno1)
17637 tmp = parts.index;
17638 else if (regno0 == regno2)
17639 tmp = parts.base;
17640 else
17642 rtx tmp1;
17644 /* Find better operand for SET instruction, depending
17645 on which definition is farther from the insn. */
17646 if (find_nearest_reg_def (insn, regno1, regno2))
17647 tmp = parts.index, tmp1 = parts.base;
17648 else
17649 tmp = parts.base, tmp1 = parts.index;
17651 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17653 if (parts.disp && parts.disp != const0_rtx)
17654 ix86_emit_binop (PLUS, mode, target, parts.disp);
17656 ix86_emit_binop (PLUS, mode, target, tmp1);
17657 return;
17660 ix86_emit_binop (PLUS, mode, target, tmp);
17663 if (parts.disp && parts.disp != const0_rtx)
17664 ix86_emit_binop (PLUS, mode, target, parts.disp);
17668 /* Return true if it is ok to optimize an ADD operation to a LEA
17669 operation to avoid flag register consumption. For most processors,
17670 ADD is faster than LEA. For processors like Atom, if the
17671 destination register of the LEA holds an actual address which will be
17672 used soon, LEA is better and otherwise ADD is better. */
17674 bool
17675 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17677 unsigned int regno0 = true_regnum (operands[0]);
17678 unsigned int regno1 = true_regnum (operands[1]);
17679 unsigned int regno2 = true_regnum (operands[2]);
17681 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17682 if (regno0 != regno1 && regno0 != regno2)
17683 return true;
17685 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17686 return false;
17688 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17691 /* Return true if destination reg of SET_BODY is shift count of
17692 USE_BODY. */
17694 static bool
17695 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17697 rtx set_dest;
17698 rtx shift_rtx;
17699 int i;
17701 /* Retrieve destination of SET_BODY. */
17702 switch (GET_CODE (set_body))
17704 case SET:
17705 set_dest = SET_DEST (set_body);
17706 if (!set_dest || !REG_P (set_dest))
17707 return false;
17708 break;
17709 case PARALLEL:
17710 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17711 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17712 use_body))
17713 return true;
17714 default:
17715 return false;
17716 break;
17719 /* Retrieve shift count of USE_BODY. */
17720 switch (GET_CODE (use_body))
17722 case SET:
17723 shift_rtx = XEXP (use_body, 1);
17724 break;
17725 case PARALLEL:
17726 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17727 if (ix86_dep_by_shift_count_body (set_body,
17728 XVECEXP (use_body, 0, i)))
17729 return true;
17730 default:
17731 return false;
17732 break;
17735 if (shift_rtx
17736 && (GET_CODE (shift_rtx) == ASHIFT
17737 || GET_CODE (shift_rtx) == LSHIFTRT
17738 || GET_CODE (shift_rtx) == ASHIFTRT
17739 || GET_CODE (shift_rtx) == ROTATE
17740 || GET_CODE (shift_rtx) == ROTATERT))
17742 rtx shift_count = XEXP (shift_rtx, 1);
17744 /* Return true if shift count is dest of SET_BODY. */
17745 if (REG_P (shift_count))
17747 /* Add a check since this can be invoked before register
17748 allocation in the pre-reload scheduler. */
17749 if (reload_completed
17750 && true_regnum (set_dest) == true_regnum (shift_count))
17751 return true;
17752 else if (REGNO (set_dest) == REGNO (shift_count))
17753 return true;
17757 return false;
17760 /* Return true if destination reg of SET_INSN is shift count of
17761 USE_INSN. */
17763 bool
17764 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17766 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17767 PATTERN (use_insn));
17770 /* Return TRUE or FALSE depending on whether the unary operator meets the
17771 appropriate constraints. */
17773 bool
17774 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17775 enum machine_mode mode ATTRIBUTE_UNUSED,
17776 rtx operands[2] ATTRIBUTE_UNUSED)
17778 /* If one of operands is memory, source and destination must match. */
17779 if ((MEM_P (operands[0])
17780 || MEM_P (operands[1]))
17781 && ! rtx_equal_p (operands[0], operands[1]))
17782 return false;
17783 return true;
17786 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17787 are ok, keeping in mind the possible movddup alternative. */
17789 bool
17790 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17792 if (MEM_P (operands[0]))
17793 return rtx_equal_p (operands[0], operands[1 + high]);
17794 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17795 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17796 return true;
17799 /* Post-reload splitter for converting an SF or DFmode value in an
17800 SSE register into an unsigned SImode. */
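/* The net effect (illustrative numbers only): values below 2^31 are
   truncated with the ordinary signed cvtt instruction, while larger
   values first have 2^31 subtracted and get the sign bit xored back in
   afterwards. E.g. 3000000000.0 - 2147483648.0 = 852516352.0 truncates
   to 852516352, and xoring in 0x80000000 yields 3000000000.  */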
17802 void
17803 ix86_split_convert_uns_si_sse (rtx operands[])
17805 enum machine_mode vecmode;
17806 rtx value, large, zero_or_two31, input, two31, x;
17808 large = operands[1];
17809 zero_or_two31 = operands[2];
17810 input = operands[3];
17811 two31 = operands[4];
17812 vecmode = GET_MODE (large);
17813 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17815 /* Load up the value into the low element. We must ensure that the other
17816 elements are valid floats -- zero is the easiest such value. */
17817 if (MEM_P (input))
17819 if (vecmode == V4SFmode)
17820 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17821 else
17822 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17824 else
17826 input = gen_rtx_REG (vecmode, REGNO (input));
17827 emit_move_insn (value, CONST0_RTX (vecmode));
17828 if (vecmode == V4SFmode)
17829 emit_insn (gen_sse_movss (value, value, input));
17830 else
17831 emit_insn (gen_sse2_movsd (value, value, input));
17834 emit_move_insn (large, two31);
17835 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17837 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17838 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17840 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17841 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17843 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17844 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17846 large = gen_rtx_REG (V4SImode, REGNO (large));
17847 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17849 x = gen_rtx_REG (V4SImode, REGNO (value));
17850 if (vecmode == V4SFmode)
17851 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17852 else
17853 emit_insn (gen_sse2_cvttpd2dq (x, value));
17854 value = x;
17856 emit_insn (gen_xorv4si3 (value, value, large));
17859 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17860 Expects the 64-bit DImode to be supplied in a pair of integral
17861 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17862 -mfpmath=sse, !optimize_size only. */
17864 void
17865 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17867 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17868 rtx int_xmm, fp_xmm;
17869 rtx biases, exponents;
17870 rtx x;
17872 int_xmm = gen_reg_rtx (V4SImode);
17873 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
17874 emit_insn (gen_movdi_to_sse (int_xmm, input));
17875 else if (TARGET_SSE_SPLIT_REGS)
17877 emit_clobber (int_xmm);
17878 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17880 else
17882 x = gen_reg_rtx (V2DImode);
17883 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17884 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17887 x = gen_rtx_CONST_VECTOR (V4SImode,
17888 gen_rtvec (4, GEN_INT (0x43300000UL),
17889 GEN_INT (0x45300000UL),
17890 const0_rtx, const0_rtx));
17891 exponents = validize_mem (force_const_mem (V4SImode, x));
17893 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17894 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17896 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17897 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17898 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17899 (0x1.0p84 + double(fp_value_hi_xmm)).
17900 Note these exponents differ by 32. */
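/* Worked example (illustrative only): for the input 0x100000002 the low
   half is 2 and the high half is 1, so the two doubles read 2^52 + 2 and
   2^84 + 2^32; subtracting the 2^52 and 2^84 biases and adding the two
   halves gives 4294967298.0, i.e. 2^32 + 2.  */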
17902 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17904 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17905 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17906 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17907 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17908 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17909 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17910 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17911 biases = validize_mem (force_const_mem (V2DFmode, biases));
17912 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17914 /* Add the upper and lower DFmode values together. */
17915 if (TARGET_SSE3)
17916 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17917 else
17919 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17920 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17921 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17924 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17927 /* Not used, but eases macroization of patterns. */
17928 void
17929 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17930 rtx input ATTRIBUTE_UNUSED)
17932 gcc_unreachable ();
17935 /* Convert an unsigned SImode value into a DFmode. Only currently used
17936 for SSE, but applicable anywhere. */
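/* Worked example (illustrative only): the input is biased into the signed
   range by adding -2^31, converted with an ordinary signed SImode-to-DFmode
   conversion, and then 2^31.0 is added back. E.g. 4294967295 wraps to
   2147483647, converts exactly to 2147483647.0, and adding 2147483648.0
   restores 4294967295.0.  */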
17938 void
17939 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17941 REAL_VALUE_TYPE TWO31r;
17942 rtx x, fp;
17944 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17945 NULL, 1, OPTAB_DIRECT);
17947 fp = gen_reg_rtx (DFmode);
17948 emit_insn (gen_floatsidf2 (fp, x));
17950 real_ldexp (&TWO31r, &dconst1, 31);
17951 x = const_double_from_real_value (TWO31r, DFmode);
17953 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17954 if (x != target)
17955 emit_move_insn (target, x);
17958 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17959 32-bit mode; otherwise we have a direct convert instruction. */
17961 void
17962 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17964 REAL_VALUE_TYPE TWO32r;
17965 rtx fp_lo, fp_hi, x;
17967 fp_lo = gen_reg_rtx (DFmode);
17968 fp_hi = gen_reg_rtx (DFmode);
17970 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17972 real_ldexp (&TWO32r, &dconst1, 32);
17973 x = const_double_from_real_value (TWO32r, DFmode);
17974 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17976 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17978 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17979 0, OPTAB_DIRECT);
17980 if (x != target)
17981 emit_move_insn (target, x);
17984 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17985 For x86_32, -mfpmath=sse, !optimize_size only. */
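/* Worked example (illustrative only): the input is split into its low and
   high 16-bit halves, each half is converted exactly to SFmode, and the
   result is hi * 2^16 + lo. E.g. 0x12345678 gives hi = 0x1234 and
   lo = 0x5678, so 4660 * 65536.0 + 22136.0 = 305419896.0.  */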
17986 void
17987 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17989 REAL_VALUE_TYPE ONE16r;
17990 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17992 real_ldexp (&ONE16r, &dconst1, 16);
17993 x = const_double_from_real_value (ONE16r, SFmode);
17994 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17995 NULL, 0, OPTAB_DIRECT);
17996 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17997 NULL, 0, OPTAB_DIRECT);
17998 fp_hi = gen_reg_rtx (SFmode);
17999 fp_lo = gen_reg_rtx (SFmode);
18000 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18001 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18002 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18003 0, OPTAB_DIRECT);
18004 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18005 0, OPTAB_DIRECT);
18006 if (!rtx_equal_p (target, fp_hi))
18007 emit_move_insn (target, fp_hi);
18010 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18011 a vector of unsigned ints VAL to vector of floats TARGET. */
18013 void
18014 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18016 rtx tmp[8];
18017 REAL_VALUE_TYPE TWO16r;
18018 enum machine_mode intmode = GET_MODE (val);
18019 enum machine_mode fltmode = GET_MODE (target);
18020 rtx (*cvt) (rtx, rtx);
18022 if (intmode == V4SImode)
18023 cvt = gen_floatv4siv4sf2;
18024 else
18025 cvt = gen_floatv8siv8sf2;
18026 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18027 tmp[0] = force_reg (intmode, tmp[0]);
18028 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18029 OPTAB_DIRECT);
18030 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18031 NULL_RTX, 1, OPTAB_DIRECT);
18032 tmp[3] = gen_reg_rtx (fltmode);
18033 emit_insn (cvt (tmp[3], tmp[1]));
18034 tmp[4] = gen_reg_rtx (fltmode);
18035 emit_insn (cvt (tmp[4], tmp[2]));
18036 real_ldexp (&TWO16r, &dconst1, 16);
18037 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18038 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18039 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18040 OPTAB_DIRECT);
18041 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18042 OPTAB_DIRECT);
18043 if (tmp[7] != target)
18044 emit_move_insn (target, tmp[7]);
18047 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18048 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18049 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18050 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18053 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18055 REAL_VALUE_TYPE TWO31r;
18056 rtx two31r, tmp[4];
18057 enum machine_mode mode = GET_MODE (val);
18058 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18059 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18060 rtx (*cmp) (rtx, rtx, rtx, rtx);
18061 int i;
18063 for (i = 0; i < 3; i++)
18064 tmp[i] = gen_reg_rtx (mode);
18065 real_ldexp (&TWO31r, &dconst1, 31);
18066 two31r = const_double_from_real_value (TWO31r, scalarmode);
18067 two31r = ix86_build_const_vector (mode, 1, two31r);
18068 two31r = force_reg (mode, two31r);
18069 switch (mode)
18071 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18072 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18073 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18074 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18075 default: gcc_unreachable ();
18077 tmp[3] = gen_rtx_LE (mode, two31r, val);
18078 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18079 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18080 0, OPTAB_DIRECT);
18081 if (intmode == V4SImode || TARGET_AVX2)
18082 *xorp = expand_simple_binop (intmode, ASHIFT,
18083 gen_lowpart (intmode, tmp[0]),
18084 GEN_INT (31), NULL_RTX, 0,
18085 OPTAB_DIRECT);
18086 else
18088 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18089 two31 = ix86_build_const_vector (intmode, 1, two31);
18090 *xorp = expand_simple_binop (intmode, AND,
18091 gen_lowpart (intmode, tmp[0]),
18092 two31, NULL_RTX, 0,
18093 OPTAB_DIRECT);
18095 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18096 0, OPTAB_DIRECT);
18099 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18100 then replicate the value for all elements of the vector
18101 register. */
18104 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18106 int i, n_elt;
18107 rtvec v;
18108 enum machine_mode scalar_mode;
18110 switch (mode)
18112 case V32QImode:
18113 case V16QImode:
18114 case V16HImode:
18115 case V8HImode:
18116 case V8SImode:
18117 case V4SImode:
18118 case V4DImode:
18119 case V2DImode:
18120 gcc_assert (vect);
18121 case V8SFmode:
18122 case V4SFmode:
18123 case V4DFmode:
18124 case V2DFmode:
18125 n_elt = GET_MODE_NUNITS (mode);
18126 v = rtvec_alloc (n_elt);
18127 scalar_mode = GET_MODE_INNER (mode);
18129 RTVEC_ELT (v, 0) = value;
18131 for (i = 1; i < n_elt; ++i)
18132 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18134 return gen_rtx_CONST_VECTOR (mode, v);
18136 default:
18137 gcc_unreachable ();
18141 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18142 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18143 for an SSE register. If VECT is true, then replicate the mask for
18144 all elements of the vector register. If INVERT is true, then create
18145 a mask excluding the sign bit. */
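/* For example (with VECT true): for V4SFmode this yields a vector of four
   0x80000000 words, i.e. just the sign bits, or four 0x7fffffff words when
   INVERT is true; for V2DFmode the elements are 0x8000000000000000 and
   0x7fffffffffffffff respectively.  */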
18148 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18150 enum machine_mode vec_mode, imode;
18151 HOST_WIDE_INT hi, lo;
18152 int shift = 63;
18153 rtx v;
18154 rtx mask;
18156 /* Find the sign bit, sign extended to 2*HWI. */
18157 switch (mode)
18159 case V8SImode:
18160 case V4SImode:
18161 case V8SFmode:
18162 case V4SFmode:
18163 vec_mode = mode;
18164 mode = GET_MODE_INNER (mode);
18165 imode = SImode;
18166 lo = 0x80000000, hi = lo < 0;
18167 break;
18169 case V4DImode:
18170 case V2DImode:
18171 case V4DFmode:
18172 case V2DFmode:
18173 vec_mode = mode;
18174 mode = GET_MODE_INNER (mode);
18175 imode = DImode;
18176 if (HOST_BITS_PER_WIDE_INT >= 64)
18177 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18178 else
18179 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18180 break;
18182 case TImode:
18183 case TFmode:
18184 vec_mode = VOIDmode;
18185 if (HOST_BITS_PER_WIDE_INT >= 64)
18187 imode = TImode;
18188 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18190 else
18192 rtvec vec;
18194 imode = DImode;
18195 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18197 if (invert)
18199 lo = ~lo, hi = ~hi;
18200 v = constm1_rtx;
18202 else
18203 v = const0_rtx;
18205 mask = immed_double_const (lo, hi, imode);
18207 vec = gen_rtvec (2, v, mask);
18208 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18209 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18211 return v;
18213 break;
18215 default:
18216 gcc_unreachable ();
18219 if (invert)
18220 lo = ~lo, hi = ~hi;
18222 /* Force this value into the low part of a fp vector constant. */
18223 mask = immed_double_const (lo, hi, imode);
18224 mask = gen_lowpart (mode, mask);
18226 if (vec_mode == VOIDmode)
18227 return force_reg (mode, mask);
18229 v = ix86_build_const_vector (vec_mode, vect, mask);
18230 return force_reg (vec_mode, v);
18233 /* Generate code for floating point ABS or NEG. */
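/* For SSE this reduces to a bitwise operation with a mask built by
   ix86_build_signbit_mask (illustrative only): ABS of an SFmode value is
   effectively an andps with 0x7fffffff in every element, while NEG is an
   xorps with 0x80000000.  */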
18235 void
18236 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18237 rtx operands[])
18239 rtx mask, set, dst, src;
18240 bool use_sse = false;
18241 bool vector_mode = VECTOR_MODE_P (mode);
18242 enum machine_mode vmode = mode;
18244 if (vector_mode)
18245 use_sse = true;
18246 else if (mode == TFmode)
18247 use_sse = true;
18248 else if (TARGET_SSE_MATH)
18250 use_sse = SSE_FLOAT_MODE_P (mode);
18251 if (mode == SFmode)
18252 vmode = V4SFmode;
18253 else if (mode == DFmode)
18254 vmode = V2DFmode;
18257 /* NEG and ABS performed with SSE use bitwise mask operations.
18258 Create the appropriate mask now. */
18259 if (use_sse)
18260 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18261 else
18262 mask = NULL_RTX;
18264 dst = operands[0];
18265 src = operands[1];
18267 set = gen_rtx_fmt_e (code, mode, src);
18268 set = gen_rtx_SET (VOIDmode, dst, set);
18270 if (mask)
18272 rtx use, clob;
18273 rtvec par;
18275 use = gen_rtx_USE (VOIDmode, mask);
18276 if (vector_mode)
18277 par = gen_rtvec (2, set, use);
18278 else
18280 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18281 par = gen_rtvec (3, set, use, clob);
18283 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18285 else
18286 emit_insn (set);
18289 /* Expand a copysign operation. Special case operand 0 being a constant. */
18291 void
18292 ix86_expand_copysign (rtx operands[])
18294 enum machine_mode mode, vmode;
18295 rtx dest, op0, op1, mask, nmask;
18297 dest = operands[0];
18298 op0 = operands[1];
18299 op1 = operands[2];
18301 mode = GET_MODE (dest);
18303 if (mode == SFmode)
18304 vmode = V4SFmode;
18305 else if (mode == DFmode)
18306 vmode = V2DFmode;
18307 else
18308 vmode = mode;
18310 if (GET_CODE (op0) == CONST_DOUBLE)
18312 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18314 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18315 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18317 if (mode == SFmode || mode == DFmode)
18319 if (op0 == CONST0_RTX (mode))
18320 op0 = CONST0_RTX (vmode);
18321 else
18323 rtx v = ix86_build_const_vector (vmode, false, op0);
18325 op0 = force_reg (vmode, v);
18328 else if (op0 != CONST0_RTX (mode))
18329 op0 = force_reg (mode, op0);
18331 mask = ix86_build_signbit_mask (vmode, 0, 0);
18333 if (mode == SFmode)
18334 copysign_insn = gen_copysignsf3_const;
18335 else if (mode == DFmode)
18336 copysign_insn = gen_copysigndf3_const;
18337 else
18338 copysign_insn = gen_copysigntf3_const;
18340 emit_insn (copysign_insn (dest, op0, op1, mask));
18342 else
18344 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18346 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18347 mask = ix86_build_signbit_mask (vmode, 0, 0);
18349 if (mode == SFmode)
18350 copysign_insn = gen_copysignsf3_var;
18351 else if (mode == DFmode)
18352 copysign_insn = gen_copysigndf3_var;
18353 else
18354 copysign_insn = gen_copysigntf3_var;
18356 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18360 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18361 be a constant, and so has already been expanded into a vector constant. */
18363 void
18364 ix86_split_copysign_const (rtx operands[])
18366 enum machine_mode mode, vmode;
18367 rtx dest, op0, mask, x;
18369 dest = operands[0];
18370 op0 = operands[1];
18371 mask = operands[3];
18373 mode = GET_MODE (dest);
18374 vmode = GET_MODE (mask);
18376 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18377 x = gen_rtx_AND (vmode, dest, mask);
18378 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18380 if (op0 != CONST0_RTX (vmode))
18382 x = gen_rtx_IOR (vmode, dest, op0);
18383 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18387 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18388 so we have to do two masks. */
18390 void
18391 ix86_split_copysign_var (rtx operands[])
18393 enum machine_mode mode, vmode;
18394 rtx dest, scratch, op0, op1, mask, nmask, x;
18396 dest = operands[0];
18397 scratch = operands[1];
18398 op0 = operands[2];
18399 op1 = operands[3];
18400 nmask = operands[4];
18401 mask = operands[5];
18403 mode = GET_MODE (dest);
18404 vmode = GET_MODE (mask);
18406 if (rtx_equal_p (op0, op1))
18408 /* Shouldn't happen often (it's useless, obviously), but when it does
18409 we'd generate incorrect code if we continue below. */
18410 emit_move_insn (dest, op0);
18411 return;
18414 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18416 gcc_assert (REGNO (op1) == REGNO (scratch));
18418 x = gen_rtx_AND (vmode, scratch, mask);
18419 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18421 dest = mask;
18422 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18423 x = gen_rtx_NOT (vmode, dest);
18424 x = gen_rtx_AND (vmode, x, op0);
18425 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18427 else
18429 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18431 x = gen_rtx_AND (vmode, scratch, mask);
18433 else /* alternative 2,4 */
18435 gcc_assert (REGNO (mask) == REGNO (scratch));
18436 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18437 x = gen_rtx_AND (vmode, scratch, op1);
18439 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18441 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18443 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18444 x = gen_rtx_AND (vmode, dest, nmask);
18446 else /* alternative 3,4 */
18448 gcc_assert (REGNO (nmask) == REGNO (dest));
18449 dest = nmask;
18450 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18451 x = gen_rtx_AND (vmode, dest, op0);
18453 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18456 x = gen_rtx_IOR (vmode, dest, scratch);
18457 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18460 /* Return TRUE or FALSE depending on whether the first SET in INSN
18461 has a source and destination with matching CC modes, and whether the
18462 CC mode is at least as constrained as REQ_MODE. */
18464 bool
18465 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18467 rtx set;
18468 enum machine_mode set_mode;
18470 set = PATTERN (insn);
18471 if (GET_CODE (set) == PARALLEL)
18472 set = XVECEXP (set, 0, 0);
18473 gcc_assert (GET_CODE (set) == SET);
18474 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18476 set_mode = GET_MODE (SET_DEST (set));
18477 switch (set_mode)
18479 case CCNOmode:
18480 if (req_mode != CCNOmode
18481 && (req_mode != CCmode
18482 || XEXP (SET_SRC (set), 1) != const0_rtx))
18483 return false;
18484 break;
18485 case CCmode:
18486 if (req_mode == CCGCmode)
18487 return false;
18488 /* FALLTHRU */
18489 case CCGCmode:
18490 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18491 return false;
18492 /* FALLTHRU */
18493 case CCGOCmode:
18494 if (req_mode == CCZmode)
18495 return false;
18496 /* FALLTHRU */
18497 case CCZmode:
18498 break;
18500 case CCAmode:
18501 case CCCmode:
18502 case CCOmode:
18503 case CCSmode:
18504 if (set_mode != req_mode)
18505 return false;
18506 break;
18508 default:
18509 gcc_unreachable ();
18512 return GET_MODE (SET_SRC (set)) == set_mode;
18515 /* Generate insn patterns to do an integer compare of OPERANDS. */
18517 static rtx
18518 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18520 enum machine_mode cmpmode;
18521 rtx tmp, flags;
18523 cmpmode = SELECT_CC_MODE (code, op0, op1);
18524 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18526 /* This is very simple, but making the interface the same as in the
18527 FP case makes the rest of the code easier. */
18528 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18529 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18531 /* Return the test that should be put into the flags user, i.e.
18532 the bcc, scc, or cmov instruction. */
18533 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18536 /* Figure out whether to use ordered or unordered fp comparisons.
18537 Return the appropriate mode to use. */
18539 enum machine_mode
18540 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18542 /* ??? In order to make all comparisons reversible, we do all comparisons
18543 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18544 all forms of trapping and nontrapping comparisons, we can make inequality
18545 comparisons trapping again, since it results in better code when using
18546 FCOM based compares. */
18547 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18550 enum machine_mode
18551 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18553 enum machine_mode mode = GET_MODE (op0);
18555 if (SCALAR_FLOAT_MODE_P (mode))
18557 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18558 return ix86_fp_compare_mode (code);
18561 switch (code)
18563 /* Only zero flag is needed. */
18564 case EQ: /* ZF=0 */
18565 case NE: /* ZF!=0 */
18566 return CCZmode;
18567 /* Codes needing carry flag. */
18568 case GEU: /* CF=0 */
18569 case LTU: /* CF=1 */
18570 /* Detect overflow checks. They need just the carry flag. */
18571 if (GET_CODE (op0) == PLUS
18572 && rtx_equal_p (op1, XEXP (op0, 0)))
18573 return CCCmode;
18574 else
18575 return CCmode;
18576 case GTU: /* CF=0 & ZF=0 */
18577 case LEU: /* CF=1 | ZF=1 */
18578 /* Detect overflow checks. They need just the carry flag. */
18579 if (GET_CODE (op0) == MINUS
18580 && rtx_equal_p (op1, XEXP (op0, 0)))
18581 return CCCmode;
18582 else
18583 return CCmode;
18584 /* Codes possibly doable only with the sign flag when
18585 comparing against zero. */
18586 case GE: /* SF=OF or SF=0 */
18587 case LT: /* SF<>OF or SF=1 */
18588 if (op1 == const0_rtx)
18589 return CCGOCmode;
18590 else
18591 /* For other cases Carry flag is not required. */
18592 return CCGCmode;
18593 /* Codes doable only with the sign flag when comparing
18594 against zero, but we miss the jump instruction for it,
18595 so we need to use relational tests against overflow,
18596 which thus needs to be zero. */
18597 case GT: /* ZF=0 & SF=OF */
18598 case LE: /* ZF=1 | SF<>OF */
18599 if (op1 == const0_rtx)
18600 return CCNOmode;
18601 else
18602 return CCGCmode;
18603 /* The strcmp pattern does (use flags), and combine may ask us for a proper
18604 mode. */
18605 case USE:
18606 return CCmode;
18607 default:
18608 gcc_unreachable ();
18612 /* Return the fixed registers used for condition codes. */
18614 static bool
18615 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18617 *p1 = FLAGS_REG;
18618 *p2 = FPSR_REG;
18619 return true;
18622 /* If two condition code modes are compatible, return a condition code
18623 mode which is compatible with both. Otherwise, return
18624 VOIDmode. */
18626 static enum machine_mode
18627 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18629 if (m1 == m2)
18630 return m1;
18632 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18633 return VOIDmode;
18635 if ((m1 == CCGCmode && m2 == CCGOCmode)
18636 || (m1 == CCGOCmode && m2 == CCGCmode))
18637 return CCGCmode;
18639 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18640 return m2;
18641 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18642 return m1;
18644 switch (m1)
18646 default:
18647 gcc_unreachable ();
18649 case CCmode:
18650 case CCGCmode:
18651 case CCGOCmode:
18652 case CCNOmode:
18653 case CCAmode:
18654 case CCCmode:
18655 case CCOmode:
18656 case CCSmode:
18657 case CCZmode:
18658 switch (m2)
18660 default:
18661 return VOIDmode;
18663 case CCmode:
18664 case CCGCmode:
18665 case CCGOCmode:
18666 case CCNOmode:
18667 case CCAmode:
18668 case CCCmode:
18669 case CCOmode:
18670 case CCSmode:
18671 case CCZmode:
18672 return CCmode;
18675 case CCFPmode:
18676 case CCFPUmode:
18677 /* These are only compatible with themselves, which we already
18678 checked above. */
18679 return VOIDmode;
18684 /* Return a comparison we can do that is equivalent to
18685 swap_condition (code), apart possibly from orderedness.
18686 But never change orderedness if TARGET_IEEE_FP, returning
18687 UNKNOWN in that case if necessary. */
18689 static enum rtx_code
18690 ix86_fp_swap_condition (enum rtx_code code)
18692 switch (code)
18694 case GT: /* GTU - CF=0 & ZF=0 */
18695 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18696 case GE: /* GEU - CF=0 */
18697 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18698 case UNLT: /* LTU - CF=1 */
18699 return TARGET_IEEE_FP ? UNKNOWN : GT;
18700 case UNLE: /* LEU - CF=1 | ZF=1 */
18701 return TARGET_IEEE_FP ? UNKNOWN : GE;
18702 default:
18703 return swap_condition (code);
18707 /* Return the cost of comparison CODE using the best strategy for performance.
18708 All following functions use the number of instructions as a cost metric.
18709 In future this should be tweaked to compute bytes for optimize_size and
18710 take into account the performance of various instructions on various CPUs. */
18712 static int
18713 ix86_fp_comparison_cost (enum rtx_code code)
18715 int arith_cost;
18717 /* The cost of code using bit-twiddling on %ah. */
18718 switch (code)
18720 case UNLE:
18721 case UNLT:
18722 case LTGT:
18723 case GT:
18724 case GE:
18725 case UNORDERED:
18726 case ORDERED:
18727 case UNEQ:
18728 arith_cost = 4;
18729 break;
18730 case LT:
18731 case NE:
18732 case EQ:
18733 case UNGE:
18734 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18735 break;
18736 case LE:
18737 case UNGT:
18738 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18739 break;
18740 default:
18741 gcc_unreachable ();
18744 switch (ix86_fp_comparison_strategy (code))
18746 case IX86_FPCMP_COMI:
18747 return arith_cost > 4 ? 3 : 2;
18748 case IX86_FPCMP_SAHF:
18749 return arith_cost > 4 ? 4 : 3;
18750 default:
18751 return arith_cost;
18755 /* Return the strategy to use for floating-point. We assume that fcomi is always
18756 preferable where available, since that is also true when looking at size
18757 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18759 enum ix86_fpcmp_strategy
18760 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18762 /* Do fcomi/sahf based test when profitable. */
18764 if (TARGET_CMOVE)
18765 return IX86_FPCMP_COMI;
18767 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18768 return IX86_FPCMP_SAHF;
18770 return IX86_FPCMP_ARITH;
18773 /* Swap, force into registers, or otherwise massage the two operands
18774 to a fp comparison. The operands are updated in place; the new
18775 comparison code is returned. */
18777 static enum rtx_code
18778 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18780 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18781 rtx op0 = *pop0, op1 = *pop1;
18782 enum machine_mode op_mode = GET_MODE (op0);
18783 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18785 /* All of the unordered compare instructions only work on registers.
18786 The same is true of the fcomi compare instructions. The XFmode
18787 compare instructions require registers except when comparing
18788 against zero or when converting operand 1 from fixed point to
18789 floating point. */
18791 if (!is_sse
18792 && (fpcmp_mode == CCFPUmode
18793 || (op_mode == XFmode
18794 && ! (standard_80387_constant_p (op0) == 1
18795 || standard_80387_constant_p (op1) == 1)
18796 && GET_CODE (op1) != FLOAT)
18797 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18799 op0 = force_reg (op_mode, op0);
18800 op1 = force_reg (op_mode, op1);
18802 else
18804 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18805 things around if they appear profitable, otherwise force op0
18806 into a register. */
18808 if (standard_80387_constant_p (op0) == 0
18809 || (MEM_P (op0)
18810 && ! (standard_80387_constant_p (op1) == 0
18811 || MEM_P (op1))))
18813 enum rtx_code new_code = ix86_fp_swap_condition (code);
18814 if (new_code != UNKNOWN)
18816 rtx tmp;
18817 tmp = op0, op0 = op1, op1 = tmp;
18818 code = new_code;
18822 if (!REG_P (op0))
18823 op0 = force_reg (op_mode, op0);
18825 if (CONSTANT_P (op1))
18827 int tmp = standard_80387_constant_p (op1);
18828 if (tmp == 0)
18829 op1 = validize_mem (force_const_mem (op_mode, op1));
18830 else if (tmp == 1)
18832 if (TARGET_CMOVE)
18833 op1 = force_reg (op_mode, op1);
18835 else
18836 op1 = force_reg (op_mode, op1);
18840 /* Try to rearrange the comparison to make it cheaper. */
18841 if (ix86_fp_comparison_cost (code)
18842 > ix86_fp_comparison_cost (swap_condition (code))
18843 && (REG_P (op1) || can_create_pseudo_p ()))
18845 rtx tmp;
18846 tmp = op0, op0 = op1, op1 = tmp;
18847 code = swap_condition (code);
18848 if (!REG_P (op0))
18849 op0 = force_reg (op_mode, op0);
18852 *pop0 = op0;
18853 *pop1 = op1;
18854 return code;
18857 /* Convert the comparison codes we use to represent FP comparisons to the integer
18858 code that will result in a proper branch. Return UNKNOWN if no such code
18859 is available. */
18861 enum rtx_code
18862 ix86_fp_compare_code_to_integer (enum rtx_code code)
18864 switch (code)
18866 case GT:
18867 return GTU;
18868 case GE:
18869 return GEU;
18870 case ORDERED:
18871 case UNORDERED:
18872 return code;
18873 break;
18874 case UNEQ:
18875 return EQ;
18876 break;
18877 case UNLT:
18878 return LTU;
18879 break;
18880 case UNLE:
18881 return LEU;
18882 break;
18883 case LTGT:
18884 return NE;
18885 break;
18886 default:
18887 return UNKNOWN;
18891 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18893 static rtx
18894 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18896 enum machine_mode fpcmp_mode, intcmp_mode;
18897 rtx tmp, tmp2;
18899 fpcmp_mode = ix86_fp_compare_mode (code);
18900 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18902 /* Do fcomi/sahf based test when profitable. */
18903 switch (ix86_fp_comparison_strategy (code))
18905 case IX86_FPCMP_COMI:
18906 intcmp_mode = fpcmp_mode;
18907 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18908 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18909 tmp);
18910 emit_insn (tmp);
18911 break;
18913 case IX86_FPCMP_SAHF:
18914 intcmp_mode = fpcmp_mode;
18915 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18916 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18917 tmp);
18919 if (!scratch)
18920 scratch = gen_reg_rtx (HImode);
18921 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18922 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18923 break;
18925 case IX86_FPCMP_ARITH:
18926 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18927 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18928 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18929 if (!scratch)
18930 scratch = gen_reg_rtx (HImode);
18931 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18933 /* In the unordered case, we have to check C2 for NaN's, which
18934 doesn't happen to work out to anything nice combination-wise.
18935 So do some bit twiddling on the value we've got in AH to come
18936 up with an appropriate set of condition codes. */
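      /* For reference: after fnstsw the FPU condition bits land in the byte
	 tested below as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so e.g. 0x45
	 masks C0|C2|C3 and 0x04 masks C2 alone, the "unordered" bit that is
	 set when an operand is a NaN.  */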
18938 intcmp_mode = CCNOmode;
18939 switch (code)
18941 case GT:
18942 case UNGT:
18943 if (code == GT || !TARGET_IEEE_FP)
18945 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18946 code = EQ;
18948 else
18950 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18951 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18952 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18953 intcmp_mode = CCmode;
18954 code = GEU;
18956 break;
18957 case LT:
18958 case UNLT:
18959 if (code == LT && TARGET_IEEE_FP)
18961 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18962 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18963 intcmp_mode = CCmode;
18964 code = EQ;
18966 else
18968 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18969 code = NE;
18971 break;
18972 case GE:
18973 case UNGE:
18974 if (code == GE || !TARGET_IEEE_FP)
18976 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18977 code = EQ;
18979 else
18981 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18982 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18983 code = NE;
18985 break;
18986 case LE:
18987 case UNLE:
18988 if (code == LE && TARGET_IEEE_FP)
18990 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18991 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18992 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18993 intcmp_mode = CCmode;
18994 code = LTU;
18996 else
18998 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18999 code = NE;
19001 break;
19002 case EQ:
19003 case UNEQ:
19004 if (code == EQ && TARGET_IEEE_FP)
19006 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19007 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19008 intcmp_mode = CCmode;
19009 code = EQ;
19011 else
19013 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19014 code = NE;
19016 break;
19017 case NE:
19018 case LTGT:
19019 if (code == NE && TARGET_IEEE_FP)
19021 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19022 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19023 GEN_INT (0x40)));
19024 code = NE;
19026 else
19028 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19029 code = EQ;
19031 break;
19033 case UNORDERED:
19034 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19035 code = NE;
19036 break;
19037 case ORDERED:
19038 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19039 code = EQ;
19040 break;
19042 default:
19043 gcc_unreachable ();
19045 break;
19047 default:
19048 gcc_unreachable();
19051 /* Return the test that should be put into the flags user, i.e.
19052 the bcc, scc, or cmov instruction. */
19053 return gen_rtx_fmt_ee (code, VOIDmode,
19054 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19055 const0_rtx);
19058 static rtx
19059 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19061 rtx ret;
19063 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19064 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19066 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19068 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19069 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19071 else
19072 ret = ix86_expand_int_compare (code, op0, op1);
19074 return ret;
19077 void
19078 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19080 enum machine_mode mode = GET_MODE (op0);
19081 rtx tmp;
19083 switch (mode)
19085 case SFmode:
19086 case DFmode:
19087 case XFmode:
19088 case QImode:
19089 case HImode:
19090 case SImode:
19091 simple:
19092 tmp = ix86_expand_compare (code, op0, op1);
19093 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19094 gen_rtx_LABEL_REF (VOIDmode, label),
19095 pc_rtx);
19096 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19097 return;
19099 case DImode:
19100 if (TARGET_64BIT)
19101 goto simple;
19102 case TImode:
19103 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
19105 rtx lo[2], hi[2], label2;
19106 enum rtx_code code1, code2, code3;
19107 enum machine_mode submode;
19109 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19111 tmp = op0, op0 = op1, op1 = tmp;
19112 code = swap_condition (code);
19115 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19116 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19118 submode = mode == DImode ? SImode : DImode;
19120 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19121 avoid two branches. This costs one extra insn, so disable when
19122 optimizing for size. */
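/* In other words, a == b iff ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0,
   so a single compare of the ORed XORs against zero suffices; when one
   half of the constant operand is zero the corresponding XOR is skipped
   below.  */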
19124 if ((code == EQ || code == NE)
19125 && (!optimize_insn_for_size_p ()
19126 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19128 rtx xor0, xor1;
19130 xor1 = hi[0];
19131 if (hi[1] != const0_rtx)
19132 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19133 NULL_RTX, 0, OPTAB_WIDEN);
19135 xor0 = lo[0];
19136 if (lo[1] != const0_rtx)
19137 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19138 NULL_RTX, 0, OPTAB_WIDEN);
19140 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19141 NULL_RTX, 0, OPTAB_WIDEN);
19143 ix86_expand_branch (code, tmp, const0_rtx, label);
19144 return;
19147 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19148 op1 is a constant and the low word is zero, then we can just
19149 examine the high word. Similarly for low word -1 and
19150 less-or-equal-than or greater-than. */
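/* For example, with 32-bit words, an unsigned x < 0x500000000 (low word
   zero) reduces to hi(x) < 5, and x <= 0x4ffffffff (low word all ones)
   reduces to hi(x) <= 4.  */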
19152 if (CONST_INT_P (hi[1]))
19153 switch (code)
19155 case LT: case LTU: case GE: case GEU:
19156 if (lo[1] == const0_rtx)
19158 ix86_expand_branch (code, hi[0], hi[1], label);
19159 return;
19161 break;
19162 case LE: case LEU: case GT: case GTU:
19163 if (lo[1] == constm1_rtx)
19165 ix86_expand_branch (code, hi[0], hi[1], label);
19166 return;
19168 break;
19169 default:
19170 break;
19173 /* Otherwise, we need two or three jumps. */
19175 label2 = gen_label_rtx ();
19177 code1 = code;
19178 code2 = swap_condition (code);
19179 code3 = unsigned_condition (code);
19181 switch (code)
19183 case LT: case GT: case LTU: case GTU:
19184 break;
19186 case LE: code1 = LT; code2 = GT; break;
19187 case GE: code1 = GT; code2 = LT; break;
19188 case LEU: code1 = LTU; code2 = GTU; break;
19189 case GEU: code1 = GTU; code2 = LTU; break;
19191 case EQ: code1 = UNKNOWN; code2 = NE; break;
19192 case NE: code2 = UNKNOWN; break;
19194 default:
19195 gcc_unreachable ();
19199 * a < b =>
19200 * if (hi(a) < hi(b)) goto true;
19201 * if (hi(a) > hi(b)) goto false;
19202 * if (lo(a) < lo(b)) goto true;
19203 * false:
19206 if (code1 != UNKNOWN)
19207 ix86_expand_branch (code1, hi[0], hi[1], label);
19208 if (code2 != UNKNOWN)
19209 ix86_expand_branch (code2, hi[0], hi[1], label2);
19211 ix86_expand_branch (code3, lo[0], lo[1], label);
19213 if (code2 != UNKNOWN)
19214 emit_label (label2);
19215 return;
19218 default:
19219 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19220 goto simple;
19224 /* Split branch based on floating point condition. */
19225 void
19226 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19227 rtx target1, rtx target2, rtx tmp, rtx pushed)
19229 rtx condition;
19230 rtx i;
19232 if (target2 != pc_rtx)
19234 rtx tmp = target2;
19235 code = reverse_condition_maybe_unordered (code);
19236 target2 = target1;
19237 target1 = tmp;
19240 condition = ix86_expand_fp_compare (code, op1, op2,
19241 tmp);
19243 /* Remove pushed operand from stack. */
19244 if (pushed)
19245 ix86_free_from_memory (GET_MODE (pushed));
19247 i = emit_jump_insn (gen_rtx_SET
19248 (VOIDmode, pc_rtx,
19249 gen_rtx_IF_THEN_ELSE (VOIDmode,
19250 condition, target1, target2)));
19251 if (split_branch_probability >= 0)
19252 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19255 void
19256 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19258 rtx ret;
19260 gcc_assert (GET_MODE (dest) == QImode);
19262 ret = ix86_expand_compare (code, op0, op1);
19263 PUT_MODE (ret, QImode);
19264 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19267 /* Expand a comparison that sets or clears the carry flag. Return true when
19268 successful and set *POP to the comparison operation. */
19269 static bool
19270 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19272 enum machine_mode mode =
19273 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19275 /* Do not handle double-mode compares that go through a special path. */
19276 if (mode == (TARGET_64BIT ? TImode : DImode))
19277 return false;
19279 if (SCALAR_FLOAT_MODE_P (mode))
19281 rtx compare_op, compare_seq;
19283 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19285 /* Shortcut: the following common codes never translate
19286 into carry flag compares. */
19287 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19288 || code == ORDERED || code == UNORDERED)
19289 return false;
19291 /* These comparisons require the zero flag; swap the operands so they won't need it. */
19292 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19293 && !TARGET_IEEE_FP)
19295 rtx tmp = op0;
19296 op0 = op1;
19297 op1 = tmp;
19298 code = swap_condition (code);
19301 /* Try to expand the comparison and verify that we end up with
19302 a carry flag based comparison. This fails to be true only when
19303 we decide to expand the comparison using arithmetic, which is not
19304 a common scenario. */
19305 start_sequence ();
19306 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19307 compare_seq = get_insns ();
19308 end_sequence ();
19310 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19311 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19312 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19313 else
19314 code = GET_CODE (compare_op);
19316 if (code != LTU && code != GEU)
19317 return false;
19319 emit_insn (compare_seq);
19320 *pop = compare_op;
19321 return true;
19324 if (!INTEGRAL_MODE_P (mode))
19325 return false;
19327 switch (code)
19329 case LTU:
19330 case GEU:
19331 break;
19333 /* Convert a==0 into (unsigned)a<1. */
19334 case EQ:
19335 case NE:
19336 if (op1 != const0_rtx)
19337 return false;
19338 op1 = const1_rtx;
19339 code = (code == EQ ? LTU : GEU);
19340 break;
19342 /* Convert a>b into b<a or, when b is a constant, a>=b+1. */
19343 case GTU:
19344 case LEU:
19345 if (CONST_INT_P (op1))
19347 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19348 /* Bail out on overflow. We could still swap the operands, but that
19349 would force loading the constant into a register. */
19350 if (op1 == const0_rtx
19351 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19352 return false;
19353 code = (code == GTU ? GEU : LTU);
19355 else
19357 rtx tmp = op1;
19358 op1 = op0;
19359 op0 = tmp;
19360 code = (code == GTU ? LTU : GEU);
19362 break;
19364 /* Convert a>=0 into (unsigned)a<0x80000000. */
19365 case LT:
19366 case GE:
19367 if (mode == DImode || op1 != const0_rtx)
19368 return false;
19369 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19370 code = (code == LT ? GEU : LTU);
19371 break;
19372 case LE:
19373 case GT:
19374 if (mode == DImode || op1 != constm1_rtx)
19375 return false;
19376 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19377 code = (code == LE ? GEU : LTU);
19378 break;
19380 default:
19381 return false;
19383 /* Swapping operands may cause the constant to appear as the first operand. */
19384 if (!nonimmediate_operand (op0, VOIDmode))
19386 if (!can_create_pseudo_p ())
19387 return false;
19388 op0 = force_reg (mode, op0);
19390 *pop = ix86_expand_compare (code, op0, op1);
19391 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19392 return true;
19395 bool
19396 ix86_expand_int_movcc (rtx operands[])
19398 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19399 rtx compare_seq, compare_op;
19400 enum machine_mode mode = GET_MODE (operands[0]);
19401 bool sign_bit_compare_p = false;
19402 rtx op0 = XEXP (operands[1], 0);
19403 rtx op1 = XEXP (operands[1], 1);
19405 if (GET_MODE (op0) == TImode
19406 || (GET_MODE (op0) == DImode
19407 && !TARGET_64BIT))
19408 return false;
19410 start_sequence ();
19411 compare_op = ix86_expand_compare (code, op0, op1);
19412 compare_seq = get_insns ();
19413 end_sequence ();
19415 compare_code = GET_CODE (compare_op);
19417 if ((op1 == const0_rtx && (code == GE || code == LT))
19418 || (op1 == constm1_rtx && (code == GT || code == LE)))
19419 sign_bit_compare_p = true;
19421 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19422 HImode insns, we'd be swallowed in word prefix ops. */
19424 if ((mode != HImode || TARGET_FAST_PREFIX)
19425 && (mode != (TARGET_64BIT ? TImode : DImode))
19426 && CONST_INT_P (operands[2])
19427 && CONST_INT_P (operands[3]))
19429 rtx out = operands[0];
19430 HOST_WIDE_INT ct = INTVAL (operands[2]);
19431 HOST_WIDE_INT cf = INTVAL (operands[3]);
19432 HOST_WIDE_INT diff;
19434 diff = ct - cf;
19435 /* Sign bit compares are better done using shifts than by using
19436 sbb. */
19437 if (sign_bit_compare_p
19438 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19440 /* Detect overlap between destination and compare sources. */
19441 rtx tmp = out;
19443 if (!sign_bit_compare_p)
19445 rtx flags;
19446 bool fpcmp = false;
19448 compare_code = GET_CODE (compare_op);
19450 flags = XEXP (compare_op, 0);
19452 if (GET_MODE (flags) == CCFPmode
19453 || GET_MODE (flags) == CCFPUmode)
19455 fpcmp = true;
19456 compare_code
19457 = ix86_fp_compare_code_to_integer (compare_code);
19460 /* To simplify the rest of the code, restrict to the GEU case. */
19461 if (compare_code == LTU)
19463 HOST_WIDE_INT tmp = ct;
19464 ct = cf;
19465 cf = tmp;
19466 compare_code = reverse_condition (compare_code);
19467 code = reverse_condition (code);
19469 else
19471 if (fpcmp)
19472 PUT_CODE (compare_op,
19473 reverse_condition_maybe_unordered
19474 (GET_CODE (compare_op)));
19475 else
19476 PUT_CODE (compare_op,
19477 reverse_condition (GET_CODE (compare_op)));
19479 diff = ct - cf;
19481 if (reg_overlap_mentioned_p (out, op0)
19482 || reg_overlap_mentioned_p (out, op1))
19483 tmp = gen_reg_rtx (mode);
19485 if (mode == DImode)
19486 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19487 else
19488 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19489 flags, compare_op));
19491 else
19493 if (code == GT || code == GE)
19494 code = reverse_condition (code);
19495 else
19497 HOST_WIDE_INT tmp = ct;
19498 ct = cf;
19499 cf = tmp;
19500 diff = ct - cf;
19502 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19505 if (diff == 1)
19508 * cmpl op0,op1
19509 * sbbl dest,dest
19510 * [addl dest, ct]
19512 * Size 5 - 8.
19514 if (ct)
19515 tmp = expand_simple_binop (mode, PLUS,
19516 tmp, GEN_INT (ct),
19517 copy_rtx (tmp), 1, OPTAB_DIRECT);
19519 else if (cf == -1)
19522 * cmpl op0,op1
19523 * sbbl dest,dest
19524 * orl $ct, dest
19526 * Size 8.
19528 tmp = expand_simple_binop (mode, IOR,
19529 tmp, GEN_INT (ct),
19530 copy_rtx (tmp), 1, OPTAB_DIRECT);
19532 else if (diff == -1 && ct)
19535 * cmpl op0,op1
19536 * sbbl dest,dest
19537 * notl dest
19538 * [addl dest, cf]
19540 * Size 8 - 11.
19542 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19543 if (cf)
19544 tmp = expand_simple_binop (mode, PLUS,
19545 copy_rtx (tmp), GEN_INT (cf),
19546 copy_rtx (tmp), 1, OPTAB_DIRECT);
19548 else
19551 * cmpl op0,op1
19552 * sbbl dest,dest
19553 * [notl dest]
19554 * andl cf - ct, dest
19555 * [addl dest, ct]
19557 * Size 8 - 11.
19560 if (cf == 0)
19562 cf = ct;
19563 ct = 0;
19564 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19567 tmp = expand_simple_binop (mode, AND,
19568 copy_rtx (tmp),
19569 gen_int_mode (cf - ct, mode),
19570 copy_rtx (tmp), 1, OPTAB_DIRECT);
19571 if (ct)
19572 tmp = expand_simple_binop (mode, PLUS,
19573 copy_rtx (tmp), GEN_INT (ct),
19574 copy_rtx (tmp), 1, OPTAB_DIRECT);
19577 if (!rtx_equal_p (tmp, out))
19578 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19580 return true;
19583 if (diff < 0)
19585 enum machine_mode cmp_mode = GET_MODE (op0);
19587 HOST_WIDE_INT tmp;
19588 tmp = ct, ct = cf, cf = tmp;
19589 diff = -diff;
19591 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19593 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19595 /* We may be reversing an unordered compare into a normal compare;
19596 that is not valid in general (we may convert a non-trapping condition
19597 into a trapping one), but on i386 we currently emit all
19598 comparisons unordered. */
19599 compare_code = reverse_condition_maybe_unordered (compare_code);
19600 code = reverse_condition_maybe_unordered (code);
19602 else
19604 compare_code = reverse_condition (compare_code);
19605 code = reverse_condition (code);
19609 compare_code = UNKNOWN;
19610 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19611 && CONST_INT_P (op1))
19613 if (op1 == const0_rtx
19614 && (code == LT || code == GE))
19615 compare_code = code;
19616 else if (op1 == constm1_rtx)
19618 if (code == LE)
19619 compare_code = LT;
19620 else if (code == GT)
19621 compare_code = GE;
19625 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19626 if (compare_code != UNKNOWN
19627 && GET_MODE (op0) == GET_MODE (out)
19628 && (cf == -1 || ct == -1))
19630 /* If lea code below could be used, only optimize
19631 if it results in a 2 insn sequence. */
19633 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19634 || diff == 3 || diff == 5 || diff == 9)
19635 || (compare_code == LT && ct == -1)
19636 || (compare_code == GE && cf == -1))
19639 * notl op1 (if necessary)
19640 * sarl $31, op1
19641 * orl cf, op1
19643 if (ct != -1)
19645 cf = ct;
19646 ct = -1;
19647 code = reverse_condition (code);
19650 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19652 out = expand_simple_binop (mode, IOR,
19653 out, GEN_INT (cf),
19654 out, 1, OPTAB_DIRECT);
19655 if (out != operands[0])
19656 emit_move_insn (operands[0], out);
19658 return true;
19663 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19664 || diff == 3 || diff == 5 || diff == 9)
19665 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19666 && (mode != DImode
19667 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19670 * xorl dest,dest
19671 * cmpl op1,op2
19672 * setcc dest
19673 * lea cf(dest*(ct-cf)),dest
19675 * Size 14.
19677 * This also catches the degenerate setcc-only case.
19680 rtx tmp;
19681 int nops;
19683 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19685 nops = 0;
19686 /* On x86_64 the lea instruction operates on Pmode, so we need
19687 the arithmetic done in the proper mode to match. */
19688 if (diff == 1)
19689 tmp = copy_rtx (out);
19690 else
19692 rtx out1;
19693 out1 = copy_rtx (out);
19694 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19695 nops++;
19696 if (diff & 1)
19698 tmp = gen_rtx_PLUS (mode, tmp, out1);
19699 nops++;
19702 if (cf != 0)
19704 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19705 nops++;
19707 if (!rtx_equal_p (tmp, out))
19709 if (nops == 1)
19710 out = force_operand (tmp, copy_rtx (out));
19711 else
19712 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19714 if (!rtx_equal_p (out, operands[0]))
19715 emit_move_insn (operands[0], copy_rtx (out));
19717 return true;
19721 * General case: Jumpful:
19722 * xorl dest,dest cmpl op1, op2
19723 * cmpl op1, op2 movl ct, dest
19724 * setcc dest jcc 1f
19725 * decl dest movl cf, dest
19726 * andl (cf-ct),dest 1:
19727 * addl ct,dest
19729 * Size 20. Size 14.
19731 * This is reasonably steep, but branch mispredict costs are
19732 * high on modern cpus, so consider failing only if optimizing
19733 * for space.
19736 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19737 && BRANCH_COST (optimize_insn_for_speed_p (),
19738 false) >= 2)
19740 if (cf == 0)
19742 enum machine_mode cmp_mode = GET_MODE (op0);
19744 cf = ct;
19745 ct = 0;
19747 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19749 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19751 /* We may be reversing an unordered compare into a normal compare;
19752 that is not valid in general (we may convert a non-trapping
19753 condition into a trapping one), but on i386 we currently
19754 emit all comparisons unordered. */
19755 code = reverse_condition_maybe_unordered (code);
19757 else
19759 code = reverse_condition (code);
19760 if (compare_code != UNKNOWN)
19761 compare_code = reverse_condition (compare_code);
19765 if (compare_code != UNKNOWN)
19767 /* notl op1 (if needed)
19768 sarl $31, op1
19769 andl (cf-ct), op1
19770 addl ct, op1
19772 For x < 0 (resp. x <= -1) there will be no notl,
19773 so if possible swap the constants to get rid of the
19774 complement.
19775 True/false will be -1/0 while code below (store flag
19776 followed by decrement) is 0/-1, so the constants need
19777 to be exchanged once more. */
19779 if (compare_code == GE || !cf)
19781 code = reverse_condition (code);
19782 compare_code = LT;
19784 else
19786 HOST_WIDE_INT tmp = cf;
19787 cf = ct;
19788 ct = tmp;
19791 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19793 else
19795 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19797 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19798 constm1_rtx,
19799 copy_rtx (out), 1, OPTAB_DIRECT);
19802 out = expand_simple_binop (mode, AND, copy_rtx (out),
19803 gen_int_mode (cf - ct, mode),
19804 copy_rtx (out), 1, OPTAB_DIRECT);
19805 if (ct)
19806 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19807 copy_rtx (out), 1, OPTAB_DIRECT);
19808 if (!rtx_equal_p (out, operands[0]))
19809 emit_move_insn (operands[0], copy_rtx (out));
19811 return true;
19815 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19817 /* Try a few things more with specific constants and a variable. */
19819 optab op;
19820 rtx var, orig_out, out, tmp;
19822 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19823 return false;
19825 /* If one of the two operands is an interesting constant, load a
19826 constant with the above and mask it in with a logical operation. */
19828 if (CONST_INT_P (operands[2]))
19830 var = operands[3];
19831 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19832 operands[3] = constm1_rtx, op = and_optab;
19833 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19834 operands[3] = const0_rtx, op = ior_optab;
19835 else
19836 return false;
19838 else if (CONST_INT_P (operands[3]))
19840 var = operands[2];
19841 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19842 operands[2] = constm1_rtx, op = and_optab;
19843 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19844 operands[2] = const0_rtx, op = ior_optab;
19845 else
19846 return false;
19848 else
19849 return false;
19851 orig_out = operands[0];
19852 tmp = gen_reg_rtx (mode);
19853 operands[0] = tmp;
19855 /* Recurse to get the constant loaded. */
19856 if (ix86_expand_int_movcc (operands) == 0)
19857 return false;
19859 /* Mask in the interesting variable. */
19860 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19861 OPTAB_WIDEN);
19862 if (!rtx_equal_p (out, orig_out))
19863 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19865 return true;
19869 * For comparison with above,
19871 * movl cf,dest
19872 * movl ct,tmp
19873 * cmpl op1,op2
19874 * cmovcc tmp,dest
19876 * Size 15.
19879 if (! nonimmediate_operand (operands[2], mode))
19880 operands[2] = force_reg (mode, operands[2]);
19881 if (! nonimmediate_operand (operands[3], mode))
19882 operands[3] = force_reg (mode, operands[3]);
19884 if (! register_operand (operands[2], VOIDmode)
19885 && (mode == QImode
19886 || ! register_operand (operands[3], VOIDmode)))
19887 operands[2] = force_reg (mode, operands[2]);
19889 if (mode == QImode
19890 && ! register_operand (operands[3], VOIDmode))
19891 operands[3] = force_reg (mode, operands[3]);
19893 emit_insn (compare_seq);
19894 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19895 gen_rtx_IF_THEN_ELSE (mode,
19896 compare_op, operands[2],
19897 operands[3])));
19898 return true;
19901 /* Swap, force into registers, or otherwise massage the two operands
19902 to an sse comparison with a mask result. Thus we differ a bit from
19903 ix86_prepare_fp_compare_args which expects to produce a flags result.
19905 The DEST operand exists to help determine whether to commute commutative
19906 operators. The POP0/POP1 operands are updated in place. The new
19907 comparison code is returned, or UNKNOWN if not implementable. */
19909 static enum rtx_code
19910 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19911 rtx *pop0, rtx *pop1)
19913 rtx tmp;
19915 switch (code)
19917 case LTGT:
19918 case UNEQ:
19919 /* AVX supports all the needed comparisons. */
19920 if (TARGET_AVX)
19921 break;
19922 /* We have no LTGT as an operator. We could implement it with
19923 NE & ORDERED, but this requires an extra temporary. It's
19924 not clear that it's worth it. */
19925 return UNKNOWN;
19927 case LT:
19928 case LE:
19929 case UNGT:
19930 case UNGE:
19931 /* These are supported directly. */
19932 break;
19934 case EQ:
19935 case NE:
19936 case UNORDERED:
19937 case ORDERED:
19938 /* AVX has 3 operand comparisons, no need to swap anything. */
19939 if (TARGET_AVX)
19940 break;
19941 /* For commutative operators, try to canonicalize the destination
19942 operand to be first in the comparison - this helps reload to
19943 avoid extra moves. */
19944 if (!dest || !rtx_equal_p (dest, *pop1))
19945 break;
19946 /* FALLTHRU */
19948 case GE:
19949 case GT:
19950 case UNLE:
19951 case UNLT:
19952 /* These are not supported directly before AVX, and furthermore
19953 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19954 comparison operands to transform into something that is
19955 supported. */
19956 tmp = *pop0;
19957 *pop0 = *pop1;
19958 *pop1 = tmp;
19959 code = swap_condition (code);
19960 break;
19962 default:
19963 gcc_unreachable ();
19966 return code;
19969 /* Detect conditional moves that exactly match min/max operational
19970 semantics. Note that this is IEEE safe, as long as we don't
19971 interchange the operands.
19973 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19974 and TRUE if the operation is successful and instructions are emitted. */
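/* For example, "a < b ? a : b" is a min with the operands already in the
   right order; the SSE min/max instructions return the second operand
   when the inputs are unordered or both zero (e.g. -0.0 vs +0.0), which
   is why swapping IF_TRUE/IF_FALSE would not be IEEE safe.  */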
19976 static bool
19977 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19978 rtx cmp_op1, rtx if_true, rtx if_false)
19980 enum machine_mode mode;
19981 bool is_min;
19982 rtx tmp;
19984 if (code == LT)
19986 else if (code == UNGE)
19988 tmp = if_true;
19989 if_true = if_false;
19990 if_false = tmp;
19992 else
19993 return false;
19995 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19996 is_min = true;
19997 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19998 is_min = false;
19999 else
20000 return false;
20002 mode = GET_MODE (dest);
20004 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20005 but MODE may be a vector mode and thus not appropriate. */
20006 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20008 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20009 rtvec v;
20011 if_true = force_reg (mode, if_true);
20012 v = gen_rtvec (2, if_true, if_false);
20013 tmp = gen_rtx_UNSPEC (mode, v, u);
20015 else
20017 code = is_min ? SMIN : SMAX;
20018 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20021 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20022 return true;
20025 /* Expand an sse vector comparison. Return the register with the result. */
20027 static rtx
20028 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20029 rtx op_true, rtx op_false)
20031 enum machine_mode mode = GET_MODE (dest);
20032 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20033 rtx x;
20035 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20036 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20037 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20039 if (optimize
20040 || reg_overlap_mentioned_p (dest, op_true)
20041 || reg_overlap_mentioned_p (dest, op_false))
20042 dest = gen_reg_rtx (mode);
20044 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20045 if (cmp_mode != mode)
20047 x = force_reg (cmp_mode, x);
20048 convert_move (dest, x, false);
20050 else
20051 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20053 return dest;
20056 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20057 operations. This is used for both scalar and vector conditional moves. */
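/* When no blend instruction is available, the selection below is done as
   dest = (cmp & op_true) | (~cmp & op_false), relying on the comparison
   having produced an all-ones or all-zeros mask in each element.  */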
20059 static void
20060 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20062 enum machine_mode mode = GET_MODE (dest);
20063 rtx t2, t3, x;
20065 if (vector_all_ones_operand (op_true, mode)
20066 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20068 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20070 else if (op_false == CONST0_RTX (mode))
20072 op_true = force_reg (mode, op_true);
20073 x = gen_rtx_AND (mode, cmp, op_true);
20074 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20076 else if (op_true == CONST0_RTX (mode))
20078 op_false = force_reg (mode, op_false);
20079 x = gen_rtx_NOT (mode, cmp);
20080 x = gen_rtx_AND (mode, x, op_false);
20081 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20083 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20085 op_false = force_reg (mode, op_false);
20086 x = gen_rtx_IOR (mode, cmp, op_false);
20087 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20089 else if (TARGET_XOP)
20091 op_true = force_reg (mode, op_true);
20093 if (!nonimmediate_operand (op_false, mode))
20094 op_false = force_reg (mode, op_false);
20096 emit_insn (gen_rtx_SET (mode, dest,
20097 gen_rtx_IF_THEN_ELSE (mode, cmp,
20098 op_true,
20099 op_false)));
20101 else
20103 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20105 if (!nonimmediate_operand (op_true, mode))
20106 op_true = force_reg (mode, op_true);
20108 op_false = force_reg (mode, op_false);
20110 switch (mode)
20112 case V4SFmode:
20113 if (TARGET_SSE4_1)
20114 gen = gen_sse4_1_blendvps;
20115 break;
20116 case V2DFmode:
20117 if (TARGET_SSE4_1)
20118 gen = gen_sse4_1_blendvpd;
20119 break;
20120 case V16QImode:
20121 case V8HImode:
20122 case V4SImode:
20123 case V2DImode:
20124 if (TARGET_SSE4_1)
20126 gen = gen_sse4_1_pblendvb;
20127 dest = gen_lowpart (V16QImode, dest);
20128 op_false = gen_lowpart (V16QImode, op_false);
20129 op_true = gen_lowpart (V16QImode, op_true);
20130 cmp = gen_lowpart (V16QImode, cmp);
20132 break;
20133 case V8SFmode:
20134 if (TARGET_AVX)
20135 gen = gen_avx_blendvps256;
20136 break;
20137 case V4DFmode:
20138 if (TARGET_AVX)
20139 gen = gen_avx_blendvpd256;
20140 break;
20141 case V32QImode:
20142 case V16HImode:
20143 case V8SImode:
20144 case V4DImode:
20145 if (TARGET_AVX2)
20147 gen = gen_avx2_pblendvb;
20148 dest = gen_lowpart (V32QImode, dest);
20149 op_false = gen_lowpart (V32QImode, op_false);
20150 op_true = gen_lowpart (V32QImode, op_true);
20151 cmp = gen_lowpart (V32QImode, cmp);
20153 break;
20154 default:
20155 break;
20158 if (gen != NULL)
20159 emit_insn (gen (dest, op_false, op_true, cmp));
20160 else
20162 op_true = force_reg (mode, op_true);
20164 t2 = gen_reg_rtx (mode);
20165 if (optimize)
20166 t3 = gen_reg_rtx (mode);
20167 else
20168 t3 = dest;
20170 x = gen_rtx_AND (mode, op_true, cmp);
20171 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20173 x = gen_rtx_NOT (mode, cmp);
20174 x = gen_rtx_AND (mode, x, op_false);
20175 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20177 x = gen_rtx_IOR (mode, t3, t2);
20178 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20183 /* Expand a floating-point conditional move. Return true if successful. */
20185 bool
20186 ix86_expand_fp_movcc (rtx operands[])
20188 enum machine_mode mode = GET_MODE (operands[0]);
20189 enum rtx_code code = GET_CODE (operands[1]);
20190 rtx tmp, compare_op;
20191 rtx op0 = XEXP (operands[1], 0);
20192 rtx op1 = XEXP (operands[1], 1);
20194 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20196 enum machine_mode cmode;
20198 /* Since we've no cmove for sse registers, don't force bad register
20199 allocation just to gain access to it. Deny movcc when the
20200 comparison mode doesn't match the move mode. */
20201 cmode = GET_MODE (op0);
20202 if (cmode == VOIDmode)
20203 cmode = GET_MODE (op1);
20204 if (cmode != mode)
20205 return false;
20207 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20208 if (code == UNKNOWN)
20209 return false;
20211 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20212 operands[2], operands[3]))
20213 return true;
20215 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20216 operands[2], operands[3]);
20217 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20218 return true;
20221 if (GET_MODE (op0) == TImode
20222 || (GET_MODE (op0) == DImode
20223 && !TARGET_64BIT))
20224 return false;
20226 /* The floating point conditional move instructions don't directly
20227 support conditions resulting from a signed integer comparison. */
20229 compare_op = ix86_expand_compare (code, op0, op1);
20230 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20232 tmp = gen_reg_rtx (QImode);
20233 ix86_expand_setcc (tmp, code, op0, op1);
20235 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20238 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20239 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20240 operands[2], operands[3])));
20242 return true;
20245 /* Expand a floating-point vector conditional move; a vcond operation
20246 rather than a movcc operation. */
20248 bool
20249 ix86_expand_fp_vcond (rtx operands[])
20251 enum rtx_code code = GET_CODE (operands[3]);
20252 rtx cmp;
20254 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20255 &operands[4], &operands[5]);
20256 if (code == UNKNOWN)
20258 rtx temp;
20259 switch (GET_CODE (operands[3]))
20261 case LTGT:
20262 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20263 operands[5], operands[0], operands[0]);
20264 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20265 operands[5], operands[1], operands[2]);
20266 code = AND;
20267 break;
20268 case UNEQ:
20269 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20270 operands[5], operands[0], operands[0]);
20271 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20272 operands[5], operands[1], operands[2]);
20273 code = IOR;
20274 break;
20275 default:
20276 gcc_unreachable ();
20278 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20279 OPTAB_DIRECT);
20280 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20281 return true;
20284 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20285 operands[5], operands[1], operands[2]))
20286 return true;
20288 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20289 operands[1], operands[2]);
20290 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20291 return true;
20294 /* Expand a signed/unsigned integral vector conditional move. */
20296 bool
20297 ix86_expand_int_vcond (rtx operands[])
20299 enum machine_mode data_mode = GET_MODE (operands[0]);
20300 enum machine_mode mode = GET_MODE (operands[4]);
20301 enum rtx_code code = GET_CODE (operands[3]);
20302 bool negate = false;
20303 rtx x, cop0, cop1;
20305 cop0 = operands[4];
20306 cop1 = operands[5];
20308 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20309 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
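/* E.g. for V4SImode, x < 0 ? -1 : 0 becomes a single psrad $31 and
   x < 0 ? 1 : 0 a single psrld $31.  */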
20310 if ((code == LT || code == GE)
20311 && data_mode == mode
20312 && cop1 == CONST0_RTX (mode)
20313 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20314 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20315 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20316 && (GET_MODE_SIZE (data_mode) == 16
20317 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20319 rtx negop = operands[2 - (code == LT)];
20320 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20321 if (negop == CONST1_RTX (data_mode))
20323 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20324 operands[0], 1, OPTAB_DIRECT);
20325 if (res != operands[0])
20326 emit_move_insn (operands[0], res);
20327 return true;
20329 else if (GET_MODE_INNER (data_mode) != DImode
20330 && vector_all_ones_operand (negop, data_mode))
20332 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20333 operands[0], 0, OPTAB_DIRECT);
20334 if (res != operands[0])
20335 emit_move_insn (operands[0], res);
20336 return true;
20340 if (!nonimmediate_operand (cop1, mode))
20341 cop1 = force_reg (mode, cop1);
20342 if (!general_operand (operands[1], data_mode))
20343 operands[1] = force_reg (data_mode, operands[1]);
20344 if (!general_operand (operands[2], data_mode))
20345 operands[2] = force_reg (data_mode, operands[2]);
20347 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20348 if (TARGET_XOP
20349 && (mode == V16QImode || mode == V8HImode
20350 || mode == V4SImode || mode == V2DImode))
20352 else
20354 /* Canonicalize the comparison to EQ, GT, GTU. */
20355 switch (code)
20357 case EQ:
20358 case GT:
20359 case GTU:
20360 break;
20362 case NE:
20363 case LE:
20364 case LEU:
20365 code = reverse_condition (code);
20366 negate = true;
20367 break;
20369 case GE:
20370 case GEU:
20371 code = reverse_condition (code);
20372 negate = true;
20373 /* FALLTHRU */
20375 case LT:
20376 case LTU:
20377 code = swap_condition (code);
20378 x = cop0, cop0 = cop1, cop1 = x;
20379 break;
20381 default:
20382 gcc_unreachable ();
20385 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20386 if (mode == V2DImode)
20388 switch (code)
20390 case EQ:
20391 /* SSE4.1 supports EQ. */
20392 if (!TARGET_SSE4_1)
20393 return false;
20394 break;
20396 case GT:
20397 case GTU:
20398 /* SSE4.2 supports GT/GTU. */
20399 if (!TARGET_SSE4_2)
20400 return false;
20401 break;
20403 default:
20404 gcc_unreachable ();
20408 /* Unsigned parallel compare is not supported by the hardware.
20409 Play some tricks to turn this into a signed comparison
20410 against 0. */
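/* E.g. for 32-bit elements, a GTU b is equivalent to
   (a - 0x80000000) GT (b - 0x80000000) as signed values, since
   subtracting INT_MIN just flips the sign bit of both operands.  */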
20411 if (code == GTU)
20413 cop0 = force_reg (mode, cop0);
20415 switch (mode)
20417 case V8SImode:
20418 case V4DImode:
20419 case V4SImode:
20420 case V2DImode:
20422 rtx t1, t2, mask;
20423 rtx (*gen_sub3) (rtx, rtx, rtx);
20425 switch (mode)
20427 case V8SImode: gen_sub3 = gen_subv8si3; break;
20428 case V4DImode: gen_sub3 = gen_subv4di3; break;
20429 case V4SImode: gen_sub3 = gen_subv4si3; break;
20430 case V2DImode: gen_sub3 = gen_subv2di3; break;
20431 default:
20432 gcc_unreachable ();
20434 /* Subtract (-(INT MAX) - 1) from both operands to make
20435 them signed. */
20436 mask = ix86_build_signbit_mask (mode, true, false);
20437 t1 = gen_reg_rtx (mode);
20438 emit_insn (gen_sub3 (t1, cop0, mask));
20440 t2 = gen_reg_rtx (mode);
20441 emit_insn (gen_sub3 (t2, cop1, mask));
20443 cop0 = t1;
20444 cop1 = t2;
20445 code = GT;
20447 break;
20449 case V32QImode:
20450 case V16HImode:
20451 case V16QImode:
20452 case V8HImode:
20453 /* Perform a parallel unsigned saturating subtraction. */
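/* a GTU b is equivalent to (a -us b) != 0, where -us is the unsigned
   saturating subtraction; we test EQ against zero and flip NEGATE to
   obtain the != 0 condition.  */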
20454 x = gen_reg_rtx (mode);
20455 emit_insn (gen_rtx_SET (VOIDmode, x,
20456 gen_rtx_US_MINUS (mode, cop0, cop1)));
20458 cop0 = x;
20459 cop1 = CONST0_RTX (mode);
20460 code = EQ;
20461 negate = !negate;
20462 break;
20464 default:
20465 gcc_unreachable ();
20470 /* Allow the comparison to be done in one mode, but the movcc to
20471 happen in another mode. */
20472 if (data_mode == mode)
20474 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20475 operands[1+negate], operands[2-negate]);
20477 else
20479 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20480 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20481 code, cop0, cop1,
20482 operands[1+negate], operands[2-negate]);
20483 x = gen_lowpart (data_mode, x);
20486 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20487 operands[2-negate]);
20488 return true;
20491 /* Expand a variable vector permutation. */
20493 void
20494 ix86_expand_vec_perm (rtx operands[])
20496 rtx target = operands[0];
20497 rtx op0 = operands[1];
20498 rtx op1 = operands[2];
20499 rtx mask = operands[3];
20500 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20501 enum machine_mode mode = GET_MODE (op0);
20502 enum machine_mode maskmode = GET_MODE (mask);
20503 int w, e, i;
20504 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20506 /* Number of elements in the vector. */
20507 w = GET_MODE_NUNITS (mode);
20508 e = GET_MODE_UNIT_SIZE (mode);
20509 gcc_assert (w <= 32);
20511 if (TARGET_AVX2)
20513 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20515 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20516 a constant shuffle operand. With a tiny bit of effort we can
20517 use VPERMD instead. A re-interpretation stall for V4DFmode is
20518 unfortunate but there's no avoiding it.
20519 Similarly for V16HImode we don't have instructions for variable
20520 shuffling, while for V32QImode we can, after preparing suitable
20521 masks, use vpshufb; vpshufb; vpermq; vpor. */
20523 if (mode == V16HImode)
20525 maskmode = mode = V32QImode;
20526 w = 32;
20527 e = 1;
20529 else
20531 maskmode = mode = V8SImode;
20532 w = 8;
20533 e = 4;
20535 t1 = gen_reg_rtx (maskmode);
20537 /* Replicate the low bits of the V4DImode mask into V8SImode:
20538 mask = { A B C D }
20539 t1 = { A A B B C C D D }. */
20540 for (i = 0; i < w / 2; ++i)
20541 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20542 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20543 vt = force_reg (maskmode, vt);
20544 mask = gen_lowpart (maskmode, mask);
20545 if (maskmode == V8SImode)
20546 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20547 else
20548 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20550 /* Multiply the shuffle indices by two. */
20551 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20552 OPTAB_DIRECT);
20554 /* Add one to the odd shuffle indices:
20555 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20556 for (i = 0; i < w / 2; ++i)
20558 vec[i * 2] = const0_rtx;
20559 vec[i * 2 + 1] = const1_rtx;
20561 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20562 vt = validize_mem (force_const_mem (maskmode, vt));
20563 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20564 OPTAB_DIRECT);
20566 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20567 operands[3] = mask = t1;
20568 target = gen_lowpart (mode, target);
20569 op0 = gen_lowpart (mode, op0);
20570 op1 = gen_lowpart (mode, op1);
20573 switch (mode)
20575 case V8SImode:
20576 /* The VPERMD and VPERMPS instructions already properly ignore
20577 the high bits of the shuffle elements. No need for us to
20578 perform an AND ourselves. */
20579 if (one_operand_shuffle)
20580 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20581 else
20583 t1 = gen_reg_rtx (V8SImode);
20584 t2 = gen_reg_rtx (V8SImode);
20585 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20586 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20587 goto merge_two;
20589 return;
20591 case V8SFmode:
20592 mask = gen_lowpart (V8SFmode, mask);
20593 if (one_operand_shuffle)
20594 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20595 else
20597 t1 = gen_reg_rtx (V8SFmode);
20598 t2 = gen_reg_rtx (V8SFmode);
20599 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20600 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20601 goto merge_two;
20603 return;
20605 case V4SImode:
20606 /* By combining the two 128-bit input vectors into one 256-bit
20607 input vector, we can use VPERMD and VPERMPS for the full
20608 two-operand shuffle. */
20609 t1 = gen_reg_rtx (V8SImode);
20610 t2 = gen_reg_rtx (V8SImode);
20611 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20612 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20613 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20614 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20615 return;
20617 case V4SFmode:
20618 t1 = gen_reg_rtx (V8SFmode);
20619 t2 = gen_reg_rtx (V8SImode);
20620 mask = gen_lowpart (V4SImode, mask);
20621 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20622 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20623 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20624 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20625 return;
20627 case V32QImode:
20628 t1 = gen_reg_rtx (V32QImode);
20629 t2 = gen_reg_rtx (V32QImode);
20630 t3 = gen_reg_rtx (V32QImode);
20631 vt2 = GEN_INT (128);
20632 for (i = 0; i < 32; i++)
20633 vec[i] = vt2;
20634 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20635 vt = force_reg (V32QImode, vt);
20636 for (i = 0; i < 32; i++)
20637 vec[i] = i < 16 ? vt2 : const0_rtx;
20638 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20639 vt2 = force_reg (V32QImode, vt2);
20640 /* From mask create two adjusted masks, which contain the same
20641 bits as mask in the low 7 bits of each vector element.
20642 The first mask will have the most significant bit clear
20643 if it requests element from the same 128-bit lane
20644 and MSB set if it requests element from the other 128-bit lane.
20645 The second mask will have the opposite values of the MSB,
20646 and additionally will have its 128-bit lanes swapped.
20647 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20648 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20649 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20650 stands for other 12 bytes. */
20651 /* The bit telling whether an element is from the same lane or the other
20652 lane is bit 4, so shift it up by 3 to the MSB position. */
20653 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20654 gen_lowpart (V4DImode, mask),
20655 GEN_INT (3)));
20656 /* Clear MSB bits from the mask just in case it had them set. */
20657 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20658 /* After this t1 will have MSB set for elements from the other lane. */
20659 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20660 /* Clear bits other than MSB. */
20661 emit_insn (gen_andv32qi3 (t1, t1, vt));
20662 /* Or in the lower bits from mask into t3. */
20663 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20664 /* And invert MSB bits in t1, so MSB is set for elements from the same
20665 lane. */
20666 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20667 /* Swap 128-bit lanes in t3. */
20668 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20669 gen_lowpart (V4DImode, t3),
20670 const2_rtx, GEN_INT (3),
20671 const0_rtx, const1_rtx));
20672 /* And or in the lower bits from mask into t1. */
20673 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20674 if (one_operand_shuffle)
20676 /* Each of these shuffles will put 0s in places where an
20677 element from the other 128-bit lane is needed; otherwise
20678 it will shuffle in the requested value. */
20679 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20680 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20681 /* For t3 the 128-bit lanes are swapped again. */
20682 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20683 gen_lowpart (V4DImode, t3),
20684 const2_rtx, GEN_INT (3),
20685 const0_rtx, const1_rtx));
20686 /* And ORing both together leads to the result. */
20687 emit_insn (gen_iorv32qi3 (target, t1, t3));
20688 return;
20691 t4 = gen_reg_rtx (V32QImode);
20692 /* Similar to the one_operand_shuffle code above,
20693 just repeated twice, once for each operand. The merge_two:
20694 code will merge the two results together. */
20695 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20696 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20697 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20698 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20699 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20700 gen_lowpart (V4DImode, t4),
20701 const2_rtx, GEN_INT (3),
20702 const0_rtx, const1_rtx));
20703 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20704 gen_lowpart (V4DImode, t3),
20705 const2_rtx, GEN_INT (3),
20706 const0_rtx, const1_rtx));
20707 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20708 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20709 t1 = t4;
20710 t2 = t3;
20711 goto merge_two;
20713 default:
20714 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20715 break;
20719 if (TARGET_XOP)
20721 /* The XOP VPPERM insn supports three inputs. By ignoring the
20722 one_operand_shuffle special case, we avoid creating another
20723 set of constant vectors in memory. */
20724 one_operand_shuffle = false;
20726 /* mask = mask & {2*w-1, ...} */
20727 vt = GEN_INT (2*w - 1);
20729 else
20731 /* mask = mask & {w-1, ...} */
20732 vt = GEN_INT (w - 1);
20735 for (i = 0; i < w; i++)
20736 vec[i] = vt;
20737 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20738 mask = expand_simple_binop (maskmode, AND, mask, vt,
20739 NULL_RTX, 0, OPTAB_DIRECT);
20741 /* For non-QImode operations, convert the word permutation control
20742 into a byte permutation control. */
20743 if (mode != V16QImode)
20745 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20746 GEN_INT (exact_log2 (e)),
20747 NULL_RTX, 0, OPTAB_DIRECT);
20749 /* Convert mask to vector of chars. */
20750 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20752 /* Replicate each of the input bytes into byte positions:
20753 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20754 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20755 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20756 for (i = 0; i < 16; ++i)
20757 vec[i] = GEN_INT (i/e * e);
20758 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20759 vt = validize_mem (force_const_mem (V16QImode, vt));
20760 if (TARGET_XOP)
20761 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20762 else
20763 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20765 /* Convert it into the byte positions by doing
20766 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20767 for (i = 0; i < 16; ++i)
20768 vec[i] = GEN_INT (i % e);
20769 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20770 vt = validize_mem (force_const_mem (V16QImode, vt));
20771 emit_insn (gen_addv16qi3 (mask, mask, vt));
20774 /* The actual shuffle operations all operate on V16QImode. */
20775 op0 = gen_lowpart (V16QImode, op0);
20776 op1 = gen_lowpart (V16QImode, op1);
20777 target = gen_lowpart (V16QImode, target);
20779 if (TARGET_XOP)
20781 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20783 else if (one_operand_shuffle)
20785 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20787 else
20789 rtx xops[6];
20790 bool ok;
20792 /* Shuffle the two input vectors independently. */
20793 t1 = gen_reg_rtx (V16QImode);
20794 t2 = gen_reg_rtx (V16QImode);
20795 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20796 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20798 merge_two:
20799 /* Then merge them together. The key is whether any given control
20800 element contained a bit set that indicates the second word. */
20801 mask = operands[3];
20802 vt = GEN_INT (w);
20803 if (maskmode == V2DImode && !TARGET_SSE4_1)
20805 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20806 more shuffle to convert the V2DI input mask into a V4SI
20807 input mask, at which point the masking done by expand_int_vcond
20808 will work as desired. */
20809 rtx t3 = gen_reg_rtx (V4SImode);
20810 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20811 const0_rtx, const0_rtx,
20812 const2_rtx, const2_rtx));
20813 mask = t3;
20814 maskmode = V4SImode;
20815 e = w = 4;
20818 for (i = 0; i < w; i++)
20819 vec[i] = vt;
20820 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20821 vt = force_reg (maskmode, vt);
20822 mask = expand_simple_binop (maskmode, AND, mask, vt,
20823 NULL_RTX, 0, OPTAB_DIRECT);
20825 xops[0] = gen_lowpart (mode, operands[0]);
20826 xops[1] = gen_lowpart (mode, t2);
20827 xops[2] = gen_lowpart (mode, t1);
20828 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20829 xops[4] = mask;
20830 xops[5] = vt;
20831 ok = ix86_expand_int_vcond (xops);
20832 gcc_assert (ok);
20836 /* Unpack SRC into the next wider integer vector type held in DEST.
20837 UNSIGNED_P is true if we should do zero extension, else sign extension.
20838 HIGH_P is true if we want the N/2 high elements, else the low elements. */
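/* For instance, widening V8HImode to V4SImode uses pmovsxwd/pmovzxwd on
   SSE4.1 (after shifting the high half down when HIGH_P); without SSE4.1
   the source is interleaved with zeros (zero extension) or with a
   pcmpgt-generated sign mask (sign extension).  */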
20840 void
20841 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20843 enum machine_mode imode = GET_MODE (src);
20844 rtx tmp;
20846 if (TARGET_SSE4_1)
20848 rtx (*unpack)(rtx, rtx);
20849 rtx (*extract)(rtx, rtx) = NULL;
20850 enum machine_mode halfmode = BLKmode;
20852 switch (imode)
20854 case V32QImode:
20855 if (unsigned_p)
20856 unpack = gen_avx2_zero_extendv16qiv16hi2;
20857 else
20858 unpack = gen_avx2_sign_extendv16qiv16hi2;
20859 halfmode = V16QImode;
20860 extract
20861 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20862 break;
20863 case V16HImode:
20864 if (unsigned_p)
20865 unpack = gen_avx2_zero_extendv8hiv8si2;
20866 else
20867 unpack = gen_avx2_sign_extendv8hiv8si2;
20868 halfmode = V8HImode;
20869 extract
20870 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20871 break;
20872 case V8SImode:
20873 if (unsigned_p)
20874 unpack = gen_avx2_zero_extendv4siv4di2;
20875 else
20876 unpack = gen_avx2_sign_extendv4siv4di2;
20877 halfmode = V4SImode;
20878 extract
20879 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20880 break;
20881 case V16QImode:
20882 if (unsigned_p)
20883 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20884 else
20885 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20886 break;
20887 case V8HImode:
20888 if (unsigned_p)
20889 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20890 else
20891 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20892 break;
20893 case V4SImode:
20894 if (unsigned_p)
20895 unpack = gen_sse4_1_zero_extendv2siv2di2;
20896 else
20897 unpack = gen_sse4_1_sign_extendv2siv2di2;
20898 break;
20899 default:
20900 gcc_unreachable ();
20903 if (GET_MODE_SIZE (imode) == 32)
20905 tmp = gen_reg_rtx (halfmode);
20906 emit_insn (extract (tmp, src));
20908 else if (high_p)
20910 /* Shift higher 8 bytes to lower 8 bytes. */
20911 tmp = gen_reg_rtx (imode);
20912 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20913 gen_lowpart (V1TImode, src),
20914 GEN_INT (64)));
20916 else
20917 tmp = src;
20919 emit_insn (unpack (dest, tmp));
20921 else
20923 rtx (*unpack)(rtx, rtx, rtx);
20925 switch (imode)
20927 case V16QImode:
20928 if (high_p)
20929 unpack = gen_vec_interleave_highv16qi;
20930 else
20931 unpack = gen_vec_interleave_lowv16qi;
20932 break;
20933 case V8HImode:
20934 if (high_p)
20935 unpack = gen_vec_interleave_highv8hi;
20936 else
20937 unpack = gen_vec_interleave_lowv8hi;
20938 break;
20939 case V4SImode:
20940 if (high_p)
20941 unpack = gen_vec_interleave_highv4si;
20942 else
20943 unpack = gen_vec_interleave_lowv4si;
20944 break;
20945 default:
20946 gcc_unreachable ();
20949 if (unsigned_p)
20950 tmp = force_reg (imode, CONST0_RTX (imode));
20951 else
20952 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20953 src, pc_rtx, pc_rtx);
20955 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20959 /* Expand conditional increment or decrement using adc/sbb instructions.
20960 The default case using setcc followed by the conditional move can be
20961 done by generic code. */
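/* For instance, dest = x + (a <u b) becomes a compare followed by an
   adc with a zero immediate, and dest = x - (a <u b) becomes an sbb;
   the carry produced by the compare feeds straight into the add/sub.  */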
20962 bool
20963 ix86_expand_int_addcc (rtx operands[])
20965 enum rtx_code code = GET_CODE (operands[1]);
20966 rtx flags;
20967 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20968 rtx compare_op;
20969 rtx val = const0_rtx;
20970 bool fpcmp = false;
20971 enum machine_mode mode;
20972 rtx op0 = XEXP (operands[1], 0);
20973 rtx op1 = XEXP (operands[1], 1);
20975 if (operands[3] != const1_rtx
20976 && operands[3] != constm1_rtx)
20977 return false;
20978 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20979 return false;
20980 code = GET_CODE (compare_op);
20982 flags = XEXP (compare_op, 0);
20984 if (GET_MODE (flags) == CCFPmode
20985 || GET_MODE (flags) == CCFPUmode)
20987 fpcmp = true;
20988 code = ix86_fp_compare_code_to_integer (code);
20991 if (code != LTU)
20993 val = constm1_rtx;
20994 if (fpcmp)
20995 PUT_CODE (compare_op,
20996 reverse_condition_maybe_unordered
20997 (GET_CODE (compare_op)));
20998 else
20999 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21002 mode = GET_MODE (operands[0]);
21004 /* Construct either adc or sbb insn. */
21005 if ((code == LTU) == (operands[3] == constm1_rtx))
21007 switch (mode)
21009 case QImode:
21010 insn = gen_subqi3_carry;
21011 break;
21012 case HImode:
21013 insn = gen_subhi3_carry;
21014 break;
21015 case SImode:
21016 insn = gen_subsi3_carry;
21017 break;
21018 case DImode:
21019 insn = gen_subdi3_carry;
21020 break;
21021 default:
21022 gcc_unreachable ();
21025 else
21027 switch (mode)
21029 case QImode:
21030 insn = gen_addqi3_carry;
21031 break;
21032 case HImode:
21033 insn = gen_addhi3_carry;
21034 break;
21035 case SImode:
21036 insn = gen_addsi3_carry;
21037 break;
21038 case DImode:
21039 insn = gen_adddi3_carry;
21040 break;
21041 default:
21042 gcc_unreachable ();
21045 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21047 return true;
21051 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21052 but works for floating point parameters and non-offsettable memories.
21053 For pushes, it returns just stack offsets; the values will be saved
21054 in the right order. Maximally four parts are generated. */
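/* E.g. on !TARGET_64BIT a DFmode operand yields two SImode parts and an
   XFmode operand three, while on TARGET_64BIT an XFmode operand yields a
   DImode part plus an SImode part.  */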
21056 static int
21057 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21059 int size;
21061 if (!TARGET_64BIT)
21062 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21063 else
21064 size = (GET_MODE_SIZE (mode) + 4) / 8;
21066 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21067 gcc_assert (size >= 2 && size <= 4);
21069 /* Optimize constant pool references to immediates. This is used by fp
21070 moves, which force all constants to memory to allow combining. */
21071 if (MEM_P (operand) && MEM_READONLY_P (operand))
21073 rtx tmp = maybe_get_pool_constant (operand);
21074 if (tmp)
21075 operand = tmp;
21078 if (MEM_P (operand) && !offsettable_memref_p (operand))
21080 /* The only non-offsettable memories we handle are pushes. */
21081 int ok = push_operand (operand, VOIDmode);
21083 gcc_assert (ok);
21085 operand = copy_rtx (operand);
21086 PUT_MODE (operand, word_mode);
21087 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21088 return size;
21091 if (GET_CODE (operand) == CONST_VECTOR)
21093 enum machine_mode imode = int_mode_for_mode (mode);
21094 /* Caution: if we looked through a constant pool memory above,
21095 the operand may actually have a different mode now. That's
21096 ok, since we want to pun this all the way back to an integer. */
21097 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21098 gcc_assert (operand != NULL);
21099 mode = imode;
21102 if (!TARGET_64BIT)
21104 if (mode == DImode)
21105 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21106 else
21108 int i;
21110 if (REG_P (operand))
21112 gcc_assert (reload_completed);
21113 for (i = 0; i < size; i++)
21114 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21116 else if (offsettable_memref_p (operand))
21118 operand = adjust_address (operand, SImode, 0);
21119 parts[0] = operand;
21120 for (i = 1; i < size; i++)
21121 parts[i] = adjust_address (operand, SImode, 4 * i);
21123 else if (GET_CODE (operand) == CONST_DOUBLE)
21125 REAL_VALUE_TYPE r;
21126 long l[4];
21128 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21129 switch (mode)
21131 case TFmode:
21132 real_to_target (l, &r, mode);
21133 parts[3] = gen_int_mode (l[3], SImode);
21134 parts[2] = gen_int_mode (l[2], SImode);
21135 break;
21136 case XFmode:
21137 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21138 long double may not be 80-bit. */
21139 real_to_target (l, &r, mode);
21140 parts[2] = gen_int_mode (l[2], SImode);
21141 break;
21142 case DFmode:
21143 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21144 break;
21145 default:
21146 gcc_unreachable ();
21148 parts[1] = gen_int_mode (l[1], SImode);
21149 parts[0] = gen_int_mode (l[0], SImode);
21151 else
21152 gcc_unreachable ();
21155 else
21157 if (mode == TImode)
21158 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21159 if (mode == XFmode || mode == TFmode)
21161 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
21162 if (REG_P (operand))
21164 gcc_assert (reload_completed);
21165 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21166 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21168 else if (offsettable_memref_p (operand))
21170 operand = adjust_address (operand, DImode, 0);
21171 parts[0] = operand;
21172 parts[1] = adjust_address (operand, upper_mode, 8);
21174 else if (GET_CODE (operand) == CONST_DOUBLE)
21176 REAL_VALUE_TYPE r;
21177 long l[4];
21179 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21180 real_to_target (l, &r, mode);
21182 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21183 if (HOST_BITS_PER_WIDE_INT >= 64)
21184 parts[0]
21185 = gen_int_mode
21186 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21187 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21188 DImode);
21189 else
21190 parts[0] = immed_double_const (l[0], l[1], DImode);
21192 if (upper_mode == SImode)
21193 parts[1] = gen_int_mode (l[2], SImode);
21194 else if (HOST_BITS_PER_WIDE_INT >= 64)
21195 parts[1]
21196 = gen_int_mode
21197 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21198 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21199 DImode);
21200 else
21201 parts[1] = immed_double_const (l[2], l[3], DImode);
21203 else
21204 gcc_unreachable ();
21208 return size;
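/* Illustrative sketch, not part of i386.c: what splitting a DImode value
   into SImode parts means on a 32-bit target -- part 0 is the low word and
   part 1 is the high word, matching the little-endian word order the code
   above relies on.  The helper name is hypothetical.  */
#include <stdint.h>

static void
sketch_split_di (uint64_t value, uint32_t parts[2])
{
  parts[0] = (uint32_t) value;          /* low SImode part  */
  parts[1] = (uint32_t) (value >> 32);  /* high SImode part */
}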
21211 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21212 All required insns are emitted by this function. Operands 2 onwards
21213 are filled with the destination parts in
21214 the correct order; operands 6 onwards with the source parts. */
21216 void
21217 ix86_split_long_move (rtx operands[])
21219 rtx part[2][4];
21220 int nparts, i, j;
21221 int push = 0;
21222 int collisions = 0;
21223 enum machine_mode mode = GET_MODE (operands[0]);
21224 bool collisionparts[4];
21226 /* The DFmode expanders may ask us to move a double.
21227 For a 64-bit target this is a single move. By hiding the fact
21228 here we simplify i386.md splitters. */
21229 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21231 /* Optimize constant pool reference to immediates. This is used by
21232 fp moves, that force all constants to memory to allow combining. */
21234 if (MEM_P (operands[1])
21235 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21236 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21237 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21238 if (push_operand (operands[0], VOIDmode))
21240 operands[0] = copy_rtx (operands[0]);
21241 PUT_MODE (operands[0], word_mode);
21243 else
21244 operands[0] = gen_lowpart (DImode, operands[0]);
21245 operands[1] = gen_lowpart (DImode, operands[1]);
21246 emit_move_insn (operands[0], operands[1]);
21247 return;
21250 /* The only non-offsettable memory we handle is push. */
21251 if (push_operand (operands[0], VOIDmode))
21252 push = 1;
21253 else
21254 gcc_assert (!MEM_P (operands[0])
21255 || offsettable_memref_p (operands[0]));
21257 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21258 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21260 /* When emitting a push, take care of source operands on the stack. */
21261 if (push && MEM_P (operands[1])
21262 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21264 rtx src_base = XEXP (part[1][nparts - 1], 0);
21266 /* Compensate for the stack decrement by 4. */
21267 if (!TARGET_64BIT && nparts == 3
21268 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21269 src_base = plus_constant (Pmode, src_base, 4);
21271 /* src_base refers to the stack pointer and is
21272 automatically decreased by emitted push. */
21273 for (i = 0; i < nparts; i++)
21274 part[1][i] = change_address (part[1][i],
21275 GET_MODE (part[1][i]), src_base);
21278 /* We need to do copy in the right order in case an address register
21279 of the source overlaps the destination. */
21280 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21282 rtx tmp;
21284 for (i = 0; i < nparts; i++)
21286 collisionparts[i]
21287 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21288 if (collisionparts[i])
21289 collisions++;
21292 /* Collision in the middle part can be handled by reordering. */
21293 if (collisions == 1 && nparts == 3 && collisionparts [1])
21295 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21296 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21298 else if (collisions == 1
21299 && nparts == 4
21300 && (collisionparts [1] || collisionparts [2]))
21302 if (collisionparts [1])
21304 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21305 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21307 else
21309 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21310 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21314 /* If there are more collisions, we can't handle it by reordering.
21315 Do an lea to the last part and use only one colliding move. */
21316 else if (collisions > 1)
21318 rtx base;
21320 collisions = 1;
21322 base = part[0][nparts - 1];
21324 /* Handle the case when the last part isn't valid for lea.
21325 Happens in 64-bit mode storing the 12-byte XFmode. */
21326 if (GET_MODE (base) != Pmode)
21327 base = gen_rtx_REG (Pmode, REGNO (base));
21329 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21330 part[1][0] = replace_equiv_address (part[1][0], base);
21331 for (i = 1; i < nparts; i++)
21333 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21334 part[1][i] = replace_equiv_address (part[1][i], tmp);
21339 if (push)
21341 if (!TARGET_64BIT)
21343 if (nparts == 3)
21345 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21346 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21347 stack_pointer_rtx, GEN_INT (-4)));
21348 emit_move_insn (part[0][2], part[1][2]);
21350 else if (nparts == 4)
21352 emit_move_insn (part[0][3], part[1][3]);
21353 emit_move_insn (part[0][2], part[1][2]);
21356 else
21358 /* In 64-bit mode we don't have a 32-bit push available. If this is a
21359 register, it is OK - we will just use the larger counterpart. We also
21360 retype the memory - this comes from an attempt to avoid a REX prefix on
21361 moving the second half of a TFmode value. */
21362 if (GET_MODE (part[1][1]) == SImode)
21364 switch (GET_CODE (part[1][1]))
21366 case MEM:
21367 part[1][1] = adjust_address (part[1][1], DImode, 0);
21368 break;
21370 case REG:
21371 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21372 break;
21374 default:
21375 gcc_unreachable ();
21378 if (GET_MODE (part[1][0]) == SImode)
21379 part[1][0] = part[1][1];
21382 emit_move_insn (part[0][1], part[1][1]);
21383 emit_move_insn (part[0][0], part[1][0]);
21384 return;
21387 /* Choose correct order to not overwrite the source before it is copied. */
21388 if ((REG_P (part[0][0])
21389 && REG_P (part[1][1])
21390 && (REGNO (part[0][0]) == REGNO (part[1][1])
21391 || (nparts == 3
21392 && REGNO (part[0][0]) == REGNO (part[1][2]))
21393 || (nparts == 4
21394 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21395 || (collisions > 0
21396 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21398 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21400 operands[2 + i] = part[0][j];
21401 operands[6 + i] = part[1][j];
21404 else
21406 for (i = 0; i < nparts; i++)
21408 operands[2 + i] = part[0][i];
21409 operands[6 + i] = part[1][i];
21413 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21414 if (optimize_insn_for_size_p ())
21416 for (j = 0; j < nparts - 1; j++)
21417 if (CONST_INT_P (operands[6 + j])
21418 && operands[6 + j] != const0_rtx
21419 && REG_P (operands[2 + j]))
21420 for (i = j; i < nparts - 1; i++)
21421 if (CONST_INT_P (operands[7 + i])
21422 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21423 operands[7 + i] = operands[2 + j];
21426 for (i = 0; i < nparts; i++)
21427 emit_move_insn (operands[2 + i], operands[6 + i]);
21429 return;
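/* Illustrative sketch, not part of i386.c: why the copy order chosen above
   matters.  If an early destination part clobbers a register that still
   addresses later source parts, copying back to front avoids the overlap,
   which is what the (i, j = nparts - 1) loop does.  Hypothetical helper.  */
static void
sketch_copy_parts (unsigned int *dst, const unsigned int *src,
                   int nparts, int dst_overlaps_src_address)
{
  int i;
  if (dst_overlaps_src_address)
    for (i = nparts - 1; i >= 0; i--)   /* high part first */
      dst[i] = src[i];
  else
    for (i = 0; i < nparts; i++)        /* natural order */
      dst[i] = src[i];
}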
21432 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21433 left shift by a constant, either using a single shift or
21434 a sequence of add instructions. */
21436 static void
21437 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21439 rtx (*insn)(rtx, rtx, rtx);
21441 if (count == 1
21442 || (count * ix86_cost->add <= ix86_cost->shift_const
21443 && !optimize_insn_for_size_p ()))
21445 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21446 while (count-- > 0)
21447 emit_insn (insn (operand, operand, operand));
21449 else
21451 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21452 emit_insn (insn (operand, operand, GEN_INT (count)));
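/* Illustrative sketch, not part of i386.c: the add-versus-shift choice made
   above.  Since x << 1 == x + x, a very small constant shift can be done as
   repeated additions when ix86_cost says adds are cheaper than a constant
   shift.  Hypothetical helper, scalar only.  */
static unsigned int
sketch_shift_left_by_adds (unsigned int x, int count)
{
  while (count-- > 0)
    x = x + x;          /* same as x << 1 per iteration */
  return x;
}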
21456 void
21457 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21459 rtx (*gen_ashl3)(rtx, rtx, rtx);
21460 rtx (*gen_shld)(rtx, rtx, rtx);
21461 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21463 rtx low[2], high[2];
21464 int count;
21466 if (CONST_INT_P (operands[2]))
21468 split_double_mode (mode, operands, 2, low, high);
21469 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21471 if (count >= half_width)
21473 emit_move_insn (high[0], low[1]);
21474 emit_move_insn (low[0], const0_rtx);
21476 if (count > half_width)
21477 ix86_expand_ashl_const (high[0], count - half_width, mode);
21479 else
21481 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21483 if (!rtx_equal_p (operands[0], operands[1]))
21484 emit_move_insn (operands[0], operands[1]);
21486 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21487 ix86_expand_ashl_const (low[0], count, mode);
21489 return;
21492 split_double_mode (mode, operands, 1, low, high);
21494 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21496 if (operands[1] == const1_rtx)
21498 /* Assuming we've chosen QImode-capable registers, 1 << N
21499 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21500 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21502 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21504 ix86_expand_clear (low[0]);
21505 ix86_expand_clear (high[0]);
21506 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21508 d = gen_lowpart (QImode, low[0]);
21509 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21510 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21511 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21513 d = gen_lowpart (QImode, high[0]);
21514 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21515 s = gen_rtx_NE (QImode, flags, const0_rtx);
21516 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21519 /* Otherwise, we can get the same results by manually performing
21520 a bit extract operation on bit 5/6, and then performing the two
21521 shifts. The two methods of getting 0/1 into low/high are exactly
21522 the same size. Avoiding the shift in the bit extract case helps
21523 pentium4 a bit; no one else seems to care much either way. */
21524 else
21526 enum machine_mode half_mode;
21527 rtx (*gen_lshr3)(rtx, rtx, rtx);
21528 rtx (*gen_and3)(rtx, rtx, rtx);
21529 rtx (*gen_xor3)(rtx, rtx, rtx);
21530 HOST_WIDE_INT bits;
21531 rtx x;
21533 if (mode == DImode)
21535 half_mode = SImode;
21536 gen_lshr3 = gen_lshrsi3;
21537 gen_and3 = gen_andsi3;
21538 gen_xor3 = gen_xorsi3;
21539 bits = 5;
21541 else
21543 half_mode = DImode;
21544 gen_lshr3 = gen_lshrdi3;
21545 gen_and3 = gen_anddi3;
21546 gen_xor3 = gen_xordi3;
21547 bits = 6;
21550 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21551 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21552 else
21553 x = gen_lowpart (half_mode, operands[2]);
21554 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21556 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21557 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21558 emit_move_insn (low[0], high[0]);
21559 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21562 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21563 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21564 return;
21567 if (operands[1] == constm1_rtx)
21569 /* For -1 << N, we can avoid the shld instruction, because we
21570 know that we're shifting 0...31/63 ones into a -1. */
21571 emit_move_insn (low[0], constm1_rtx);
21572 if (optimize_insn_for_size_p ())
21573 emit_move_insn (high[0], low[0]);
21574 else
21575 emit_move_insn (high[0], constm1_rtx);
21577 else
21579 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21581 if (!rtx_equal_p (operands[0], operands[1]))
21582 emit_move_insn (operands[0], operands[1]);
21584 split_double_mode (mode, operands, 1, low, high);
21585 emit_insn (gen_shld (high[0], low[0], operands[2]));
21588 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21590 if (TARGET_CMOVE && scratch)
21592 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21593 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21595 ix86_expand_clear (scratch);
21596 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21598 else
21600 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21601 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21603 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
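/* Illustrative sketch, not part of i386.c: a double-word left shift built
   from single-word operations, mirroring the shld + shift sequence and the
   constant-count special cases above.  Assumes 0 <= count < 64; the helper
   name is hypothetical.  */
#include <stdint.h>

static void
sketch_ashl_double (uint32_t *lo, uint32_t *hi, unsigned int count)
{
  if (count >= 32)
    {
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));  /* what shld computes */
      *lo <<= count;
    }
}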
21607 void
21608 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21610 rtx (*gen_ashr3)(rtx, rtx, rtx)
21611 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21612 rtx (*gen_shrd)(rtx, rtx, rtx);
21613 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21615 rtx low[2], high[2];
21616 int count;
21618 if (CONST_INT_P (operands[2]))
21620 split_double_mode (mode, operands, 2, low, high);
21621 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21623 if (count == GET_MODE_BITSIZE (mode) - 1)
21625 emit_move_insn (high[0], high[1]);
21626 emit_insn (gen_ashr3 (high[0], high[0],
21627 GEN_INT (half_width - 1)));
21628 emit_move_insn (low[0], high[0]);
21631 else if (count >= half_width)
21633 emit_move_insn (low[0], high[1]);
21634 emit_move_insn (high[0], low[0]);
21635 emit_insn (gen_ashr3 (high[0], high[0],
21636 GEN_INT (half_width - 1)));
21638 if (count > half_width)
21639 emit_insn (gen_ashr3 (low[0], low[0],
21640 GEN_INT (count - half_width)));
21642 else
21644 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21646 if (!rtx_equal_p (operands[0], operands[1]))
21647 emit_move_insn (operands[0], operands[1]);
21649 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21650 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21653 else
21655 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21657 if (!rtx_equal_p (operands[0], operands[1]))
21658 emit_move_insn (operands[0], operands[1]);
21660 split_double_mode (mode, operands, 1, low, high);
21662 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21663 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21665 if (TARGET_CMOVE && scratch)
21667 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21668 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21670 emit_move_insn (scratch, high[0]);
21671 emit_insn (gen_ashr3 (scratch, scratch,
21672 GEN_INT (half_width - 1)));
21673 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21674 scratch));
21676 else
21678 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21679 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21681 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
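/* Illustrative sketch, not part of i386.c: the arithmetic double-word right
   shift above in scalar form.  For large counts the high half is sign-filled
   by shifting it right by half_width - 1, matching both the count == 63
   special case and the scratch-register path.  Assumes 0 <= count < 64 and
   that >> on a negative int32_t is an arithmetic shift (true for GCC,
   implementation-defined in ISO C).  The helper name is hypothetical.  */
#include <stdint.h>

static void
sketch_ashr_double (uint32_t *lo, int32_t *hi, unsigned int count)
{
  if (count >= 32)
    {
      *lo = (uint32_t) (*hi >> (count - 32));
      *hi >>= 31;                                        /* sign fill */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | ((uint32_t) *hi << (32 - count));  /* shrd */
      *hi >>= count;
    }
}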
21686 void
21687 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21689 rtx (*gen_lshr3)(rtx, rtx, rtx)
21690 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21691 rtx (*gen_shrd)(rtx, rtx, rtx);
21692 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21694 rtx low[2], high[2];
21695 int count;
21697 if (CONST_INT_P (operands[2]))
21699 split_double_mode (mode, operands, 2, low, high);
21700 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21702 if (count >= half_width)
21704 emit_move_insn (low[0], high[1]);
21705 ix86_expand_clear (high[0]);
21707 if (count > half_width)
21708 emit_insn (gen_lshr3 (low[0], low[0],
21709 GEN_INT (count - half_width)));
21711 else
21713 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21715 if (!rtx_equal_p (operands[0], operands[1]))
21716 emit_move_insn (operands[0], operands[1]);
21718 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21719 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21722 else
21724 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21726 if (!rtx_equal_p (operands[0], operands[1]))
21727 emit_move_insn (operands[0], operands[1]);
21729 split_double_mode (mode, operands, 1, low, high);
21731 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21732 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21734 if (TARGET_CMOVE && scratch)
21736 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21737 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21739 ix86_expand_clear (scratch);
21740 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21741 scratch));
21743 else
21745 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21746 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21748 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21753 /* Predict just emitted jump instruction to be taken with probability PROB. */
21754 static void
21755 predict_jump (int prob)
21757 rtx insn = get_last_insn ();
21758 gcc_assert (JUMP_P (insn));
21759 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21762 /* Helper function for the string operations below. Test VARIABLE for whether
21763 it is aligned to VALUE bytes. If true, jump to the returned label. */
21764 static rtx
21765 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21767 rtx label = gen_label_rtx ();
21768 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21769 if (GET_MODE (variable) == DImode)
21770 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21771 else
21772 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21773 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21774 1, label);
21775 if (epilogue)
21776 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21777 else
21778 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21779 return label;
21782 /* Adjust COUNTER by the VALUE. */
21783 static void
21784 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21786 rtx (*gen_add)(rtx, rtx, rtx)
21787 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21789 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21792 /* Zero extend possibly SImode EXP to Pmode register. */
21793 rtx
21794 ix86_zero_extend_to_Pmode (rtx exp)
21796 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21799 /* Divide COUNTREG by SCALE. */
21800 static rtx
21801 scale_counter (rtx countreg, int scale)
21803 rtx sc;
21805 if (scale == 1)
21806 return countreg;
21807 if (CONST_INT_P (countreg))
21808 return GEN_INT (INTVAL (countreg) / scale);
21809 gcc_assert (REG_P (countreg));
21811 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21812 GEN_INT (exact_log2 (scale)),
21813 NULL, 1, OPTAB_DIRECT);
21814 return sc;
21817 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21818 DImode for constant loop counts. */
21820 static enum machine_mode
21821 counter_mode (rtx count_exp)
21823 if (GET_MODE (count_exp) != VOIDmode)
21824 return GET_MODE (count_exp);
21825 if (!CONST_INT_P (count_exp))
21826 return Pmode;
21827 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21828 return DImode;
21829 return SImode;
21832 /* When SRCPTR is non-NULL, output a simple loop to move memory from
21833 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
21834 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
21835 equivalent loop to set memory to VALUE (expected to be in MODE).
21837 The size is rounded down to a whole number of chunks moved at once.
21838 SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
21841 static void
21842 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21843 rtx destptr, rtx srcptr, rtx value,
21844 rtx count, enum machine_mode mode, int unroll,
21845 int expected_size)
21847 rtx out_label, top_label, iter, tmp;
21848 enum machine_mode iter_mode = counter_mode (count);
21849 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21850 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21851 rtx size;
21852 rtx x_addr;
21853 rtx y_addr;
21854 int i;
21856 top_label = gen_label_rtx ();
21857 out_label = gen_label_rtx ();
21858 iter = gen_reg_rtx (iter_mode);
21860 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21861 NULL, 1, OPTAB_DIRECT);
21862 /* Those two should combine. */
21863 if (piece_size == const1_rtx)
21865 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21866 true, out_label);
21867 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21869 emit_move_insn (iter, const0_rtx);
21871 emit_label (top_label);
21873 tmp = convert_modes (Pmode, iter_mode, iter, true);
21874 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21875 destmem = change_address (destmem, mode, x_addr);
21877 if (srcmem)
21879 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21880 srcmem = change_address (srcmem, mode, y_addr);
21882 /* When unrolling for chips that reorder memory reads and writes,
21883 we can save registers by using a single temporary.
21884 Also, using 4 temporaries is overkill in 32-bit mode. */
21885 if (!TARGET_64BIT && 0)
21887 for (i = 0; i < unroll; i++)
21889 if (i)
21891 destmem =
21892 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21893 srcmem =
21894 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21896 emit_move_insn (destmem, srcmem);
21899 else
21901 rtx tmpreg[4];
21902 gcc_assert (unroll <= 4);
21903 for (i = 0; i < unroll; i++)
21905 tmpreg[i] = gen_reg_rtx (mode);
21906 if (i)
21908 srcmem =
21909 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21911 emit_move_insn (tmpreg[i], srcmem);
21913 for (i = 0; i < unroll; i++)
21915 if (i)
21917 destmem =
21918 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21920 emit_move_insn (destmem, tmpreg[i]);
21924 else
21925 for (i = 0; i < unroll; i++)
21927 if (i)
21928 destmem =
21929 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21930 emit_move_insn (destmem, value);
21933 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21934 true, OPTAB_LIB_WIDEN);
21935 if (tmp != iter)
21936 emit_move_insn (iter, tmp);
21938 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21939 true, top_label);
21940 if (expected_size != -1)
21942 expected_size /= GET_MODE_SIZE (mode) * unroll;
21943 if (expected_size == 0)
21944 predict_jump (0);
21945 else if (expected_size > REG_BR_PROB_BASE)
21946 predict_jump (REG_BR_PROB_BASE - 1);
21947 else
21948 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21950 else
21951 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21952 iter = ix86_zero_extend_to_Pmode (iter);
21953 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21954 true, OPTAB_LIB_WIDEN);
21955 if (tmp != destptr)
21956 emit_move_insn (destptr, tmp);
21957 if (srcptr)
21959 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21960 true, OPTAB_LIB_WIDEN);
21961 if (tmp != srcptr)
21962 emit_move_insn (srcptr, tmp);
21964 emit_label (out_label);
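/* Illustrative sketch, not part of i386.c: the overall shape of the loop the
   expander above emits for a copy.  The size is first rounded down to a
   whole number of unrolled chunks; whatever is left is handled by the
   epilogue code elsewhere.  CHUNK * UNROLL is assumed to be a power of two;
   the helper name is hypothetical.  */
#include <string.h>

static void
sketch_copy_via_loop (unsigned char *dst, const unsigned char *src,
                      unsigned long count, unsigned long chunk, int unroll)
{
  unsigned long piece = chunk * (unsigned long) unroll;
  unsigned long size = count & ~(piece - 1);    /* rounded-down main size */
  unsigned long iter;

  for (iter = 0; iter < size; iter += piece)
    {
      int i;
      for (i = 0; i < unroll; i++)              /* unrolled body */
        memcpy (dst + iter + (unsigned long) i * chunk,
                src + iter + (unsigned long) i * chunk, chunk);
    }
  /* The destination and source pointers are then advanced by SIZE,
     as the code above does once the loop exits.  */
}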
21967 /* Output "rep; mov" instruction.
21968 Arguments have the same meaning as for the previous function. */
21969 static void
21970 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21971 rtx destptr, rtx srcptr,
21972 rtx count,
21973 enum machine_mode mode)
21975 rtx destexp;
21976 rtx srcexp;
21977 rtx countreg;
21978 HOST_WIDE_INT rounded_count;
21980 /* If the size is known, it is shorter to use rep movs. */
21981 if (mode == QImode && CONST_INT_P (count)
21982 && !(INTVAL (count) & 3))
21983 mode = SImode;
21985 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21986 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21987 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21988 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21989 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21990 if (mode != QImode)
21992 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21993 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21994 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21995 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21996 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21997 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21999 else
22001 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22002 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22004 if (CONST_INT_P (count))
22006 rounded_count = (INTVAL (count)
22007 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22008 destmem = shallow_copy_rtx (destmem);
22009 srcmem = shallow_copy_rtx (srcmem);
22010 set_mem_size (destmem, rounded_count);
22011 set_mem_size (srcmem, rounded_count);
22013 else
22015 if (MEM_SIZE_KNOWN_P (destmem))
22016 clear_mem_size (destmem);
22017 if (MEM_SIZE_KNOWN_P (srcmem))
22018 clear_mem_size (srcmem);
22020 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22021 destexp, srcexp));
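/* Illustrative sketch, not part of i386.c: the semantics of the rep-mov
   sequence set up above.  The byte count is first scaled down to a chunk
   count (scale_counter); any remainder smaller than one chunk is left for
   the epilogue.  The helper name is hypothetical.  */
static void
sketch_rep_movs (unsigned char *dst, const unsigned char *src,
                 unsigned long count_bytes, unsigned long chunk)
{
  unsigned long n = count_bytes / chunk;        /* scaled count */
  unsigned long i;

  while (n--)                                   /* one "movs" per chunk */
    for (i = 0; i < chunk; i++)
      *dst++ = *src++;
}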
22024 /* Output "rep; stos" instruction.
22025 Arguments have the same meaning as for the previous function. */
22026 static void
22027 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22028 rtx count, enum machine_mode mode,
22029 rtx orig_value)
22031 rtx destexp;
22032 rtx countreg;
22033 HOST_WIDE_INT rounded_count;
22035 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22036 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22037 value = force_reg (mode, gen_lowpart (mode, value));
22038 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22039 if (mode != QImode)
22041 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22042 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22043 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22045 else
22046 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22047 if (orig_value == const0_rtx && CONST_INT_P (count))
22049 rounded_count = (INTVAL (count)
22050 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22051 destmem = shallow_copy_rtx (destmem);
22052 set_mem_size (destmem, rounded_count);
22054 else if (MEM_SIZE_KNOWN_P (destmem))
22055 clear_mem_size (destmem);
22056 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22059 static void
22060 emit_strmov (rtx destmem, rtx srcmem,
22061 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
22063 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
22064 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
22065 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22068 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22069 static void
22070 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22071 rtx destptr, rtx srcptr, rtx count, int max_size)
22073 rtx src, dest;
22074 if (CONST_INT_P (count))
22076 HOST_WIDE_INT countval = INTVAL (count);
22077 int offset = 0;
22079 if ((countval & 0x10) && max_size > 16)
22081 if (TARGET_64BIT)
22083 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
22084 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
22086 else
22087 gcc_unreachable ();
22088 offset += 16;
22090 if ((countval & 0x08) && max_size > 8)
22092 if (TARGET_64BIT)
22093 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
22094 else
22096 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
22097 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
22099 offset += 8;
22101 if ((countval & 0x04) && max_size > 4)
22103 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
22104 offset += 4;
22106 if ((countval & 0x02) && max_size > 2)
22108 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
22109 offset += 2;
22111 if ((countval & 0x01) && max_size > 1)
22113 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
22114 offset += 1;
22116 return;
22118 if (max_size > 8)
22120 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22121 count, 1, OPTAB_DIRECT);
22122 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22123 count, QImode, 1, 4);
22124 return;
22127 /* When string operations are available, we can cheaply increase the dest
22128 and src pointers. Otherwise we save code size by maintaining an offset
22129 (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
22131 if (TARGET_SINGLE_STRINGOP)
22133 if (max_size > 4)
22135 rtx label = ix86_expand_aligntest (count, 4, true);
22136 src = change_address (srcmem, SImode, srcptr);
22137 dest = change_address (destmem, SImode, destptr);
22138 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22139 emit_label (label);
22140 LABEL_NUSES (label) = 1;
22142 if (max_size > 2)
22144 rtx label = ix86_expand_aligntest (count, 2, true);
22145 src = change_address (srcmem, HImode, srcptr);
22146 dest = change_address (destmem, HImode, destptr);
22147 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22148 emit_label (label);
22149 LABEL_NUSES (label) = 1;
22151 if (max_size > 1)
22153 rtx label = ix86_expand_aligntest (count, 1, true);
22154 src = change_address (srcmem, QImode, srcptr);
22155 dest = change_address (destmem, QImode, destptr);
22156 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22157 emit_label (label);
22158 LABEL_NUSES (label) = 1;
22161 else
22163 rtx offset = force_reg (Pmode, const0_rtx);
22164 rtx tmp;
22166 if (max_size > 4)
22168 rtx label = ix86_expand_aligntest (count, 4, true);
22169 src = change_address (srcmem, SImode, srcptr);
22170 dest = change_address (destmem, SImode, destptr);
22171 emit_move_insn (dest, src);
22172 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22173 true, OPTAB_LIB_WIDEN);
22174 if (tmp != offset)
22175 emit_move_insn (offset, tmp);
22176 emit_label (label);
22177 LABEL_NUSES (label) = 1;
22179 if (max_size > 2)
22181 rtx label = ix86_expand_aligntest (count, 2, true);
22182 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22183 src = change_address (srcmem, HImode, tmp);
22184 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22185 dest = change_address (destmem, HImode, tmp);
22186 emit_move_insn (dest, src);
22187 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22188 true, OPTAB_LIB_WIDEN);
22189 if (tmp != offset)
22190 emit_move_insn (offset, tmp);
22191 emit_label (label);
22192 LABEL_NUSES (label) = 1;
22194 if (max_size > 1)
22196 rtx label = ix86_expand_aligntest (count, 1, true);
22197 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22198 src = change_address (srcmem, QImode, tmp);
22199 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22200 dest = change_address (destmem, QImode, tmp);
22201 emit_move_insn (dest, src);
22202 emit_label (label);
22203 LABEL_NUSES (label) = 1;
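/* Illustrative sketch, not part of i386.c: the constant-count epilogue above
   in plain C.  Each set bit of the residual count selects at most one move
   of the corresponding power-of-two size (the 16-byte case is the 64-bit
   pair of DImode moves).  The helper name is hypothetical.  */
#include <string.h>

static void
sketch_copy_tail (unsigned char *dst, const unsigned char *src,
                  unsigned long countval)
{
  unsigned long off = 0;
  if (countval & 8) { memcpy (dst + off, src + off, 8); off += 8; }
  if (countval & 4) { memcpy (dst + off, src + off, 4); off += 4; }
  if (countval & 2) { memcpy (dst + off, src + off, 2); off += 2; }
  if (countval & 1) { memcpy (dst + off, src + off, 1); off += 1; }
}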
22208 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22209 static void
22210 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22211 rtx count, int max_size)
22213 count =
22214 expand_simple_binop (counter_mode (count), AND, count,
22215 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22216 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22217 gen_lowpart (QImode, value), count, QImode,
22218 1, max_size / 2);
22221 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22222 static void
22223 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22225 rtx dest;
22227 if (CONST_INT_P (count))
22229 HOST_WIDE_INT countval = INTVAL (count);
22230 int offset = 0;
22232 if ((countval & 0x10) && max_size > 16)
22234 if (TARGET_64BIT)
22236 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22237 emit_insn (gen_strset (destptr, dest, value));
22238 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22239 emit_insn (gen_strset (destptr, dest, value));
22241 else
22242 gcc_unreachable ();
22243 offset += 16;
22245 if ((countval & 0x08) && max_size > 8)
22247 if (TARGET_64BIT)
22249 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22250 emit_insn (gen_strset (destptr, dest, value));
22252 else
22254 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22255 emit_insn (gen_strset (destptr, dest, value));
22256 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22257 emit_insn (gen_strset (destptr, dest, value));
22259 offset += 8;
22261 if ((countval & 0x04) && max_size > 4)
22263 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22264 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22265 offset += 4;
22267 if ((countval & 0x02) && max_size > 2)
22269 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22270 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22271 offset += 2;
22273 if ((countval & 0x01) && max_size > 1)
22275 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22276 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22277 offset += 1;
22279 return;
22281 if (max_size > 32)
22283 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22284 return;
22286 if (max_size > 16)
22288 rtx label = ix86_expand_aligntest (count, 16, true);
22289 if (TARGET_64BIT)
22291 dest = change_address (destmem, DImode, destptr);
22292 emit_insn (gen_strset (destptr, dest, value));
22293 emit_insn (gen_strset (destptr, dest, value));
22295 else
22297 dest = change_address (destmem, SImode, destptr);
22298 emit_insn (gen_strset (destptr, dest, value));
22299 emit_insn (gen_strset (destptr, dest, value));
22300 emit_insn (gen_strset (destptr, dest, value));
22301 emit_insn (gen_strset (destptr, dest, value));
22303 emit_label (label);
22304 LABEL_NUSES (label) = 1;
22306 if (max_size > 8)
22308 rtx label = ix86_expand_aligntest (count, 8, true);
22309 if (TARGET_64BIT)
22311 dest = change_address (destmem, DImode, destptr);
22312 emit_insn (gen_strset (destptr, dest, value));
22314 else
22316 dest = change_address (destmem, SImode, destptr);
22317 emit_insn (gen_strset (destptr, dest, value));
22318 emit_insn (gen_strset (destptr, dest, value));
22320 emit_label (label);
22321 LABEL_NUSES (label) = 1;
22323 if (max_size > 4)
22325 rtx label = ix86_expand_aligntest (count, 4, true);
22326 dest = change_address (destmem, SImode, destptr);
22327 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22328 emit_label (label);
22329 LABEL_NUSES (label) = 1;
22331 if (max_size > 2)
22333 rtx label = ix86_expand_aligntest (count, 2, true);
22334 dest = change_address (destmem, HImode, destptr);
22335 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22336 emit_label (label);
22337 LABEL_NUSES (label) = 1;
22339 if (max_size > 1)
22341 rtx label = ix86_expand_aligntest (count, 1, true);
22342 dest = change_address (destmem, QImode, destptr);
22343 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22344 emit_label (label);
22345 LABEL_NUSES (label) = 1;
22349 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22350 to DESIRED_ALIGNMENT. */
22351 static void
22352 expand_movmem_prologue (rtx destmem, rtx srcmem,
22353 rtx destptr, rtx srcptr, rtx count,
22354 int align, int desired_alignment)
22356 if (align <= 1 && desired_alignment > 1)
22358 rtx label = ix86_expand_aligntest (destptr, 1, false);
22359 srcmem = change_address (srcmem, QImode, srcptr);
22360 destmem = change_address (destmem, QImode, destptr);
22361 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22362 ix86_adjust_counter (count, 1);
22363 emit_label (label);
22364 LABEL_NUSES (label) = 1;
22366 if (align <= 2 && desired_alignment > 2)
22368 rtx label = ix86_expand_aligntest (destptr, 2, false);
22369 srcmem = change_address (srcmem, HImode, srcptr);
22370 destmem = change_address (destmem, HImode, destptr);
22371 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22372 ix86_adjust_counter (count, 2);
22373 emit_label (label);
22374 LABEL_NUSES (label) = 1;
22376 if (align <= 4 && desired_alignment > 4)
22378 rtx label = ix86_expand_aligntest (destptr, 4, false);
22379 srcmem = change_address (srcmem, SImode, srcptr);
22380 destmem = change_address (destmem, SImode, destptr);
22381 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22382 ix86_adjust_counter (count, 4);
22383 emit_label (label);
22384 LABEL_NUSES (label) = 1;
22386 gcc_assert (desired_alignment <= 8);
22389 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22390 ALIGN_BYTES is how many bytes need to be copied. */
22391 static rtx
22392 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22393 int desired_align, int align_bytes)
22395 rtx src = *srcp;
22396 rtx orig_dst = dst;
22397 rtx orig_src = src;
22398 int off = 0;
22399 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22400 if (src_align_bytes >= 0)
22401 src_align_bytes = desired_align - src_align_bytes;
22402 if (align_bytes & 1)
22404 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22405 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22406 off = 1;
22407 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22409 if (align_bytes & 2)
22411 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22412 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22413 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22414 set_mem_align (dst, 2 * BITS_PER_UNIT);
22415 if (src_align_bytes >= 0
22416 && (src_align_bytes & 1) == (align_bytes & 1)
22417 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22418 set_mem_align (src, 2 * BITS_PER_UNIT);
22419 off = 2;
22420 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22422 if (align_bytes & 4)
22424 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22425 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22426 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22427 set_mem_align (dst, 4 * BITS_PER_UNIT);
22428 if (src_align_bytes >= 0)
22430 unsigned int src_align = 0;
22431 if ((src_align_bytes & 3) == (align_bytes & 3))
22432 src_align = 4;
22433 else if ((src_align_bytes & 1) == (align_bytes & 1))
22434 src_align = 2;
22435 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22436 set_mem_align (src, src_align * BITS_PER_UNIT);
22438 off = 4;
22439 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22441 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22442 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22443 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22444 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22445 if (src_align_bytes >= 0)
22447 unsigned int src_align = 0;
22448 if ((src_align_bytes & 7) == (align_bytes & 7))
22449 src_align = 8;
22450 else if ((src_align_bytes & 3) == (align_bytes & 3))
22451 src_align = 4;
22452 else if ((src_align_bytes & 1) == (align_bytes & 1))
22453 src_align = 2;
22454 if (src_align > (unsigned int) desired_align)
22455 src_align = desired_align;
22456 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22457 set_mem_align (src, src_align * BITS_PER_UNIT);
22459 if (MEM_SIZE_KNOWN_P (orig_dst))
22460 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22461 if (MEM_SIZE_KNOWN_P (orig_src))
22462 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22463 *srcp = src;
22464 return dst;
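/* Illustrative sketch, not part of i386.c: the constant alignment prologue
   above.  ALIGN_BYTES is how many leading bytes must be copied so that DST
   reaches DESIRED_ALIGN; its low bits pick at most one move of each small
   size, in increasing order.  The helper name is hypothetical.  */
#include <string.h>

static unsigned long
sketch_align_prologue_copy (unsigned char *dst, const unsigned char *src,
                            unsigned long align_bytes)
{
  unsigned long off = 0;
  if (align_bytes & 1) { memcpy (dst, src, 1); off = 1; }
  if (align_bytes & 2) { memcpy (dst + off, src + off, 2); off += 2; }
  if (align_bytes & 4) { memcpy (dst + off, src + off, 4); off += 4; }
  return off;   /* bytes consumed before the aligned main loop */
}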
22467 /* Store enough into DEST to align it, known to be aligned by ALIGN,
22468 to DESIRED_ALIGNMENT. */
22469 static void
22470 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22471 int align, int desired_alignment)
22473 if (align <= 1 && desired_alignment > 1)
22475 rtx label = ix86_expand_aligntest (destptr, 1, false);
22476 destmem = change_address (destmem, QImode, destptr);
22477 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22478 ix86_adjust_counter (count, 1);
22479 emit_label (label);
22480 LABEL_NUSES (label) = 1;
22482 if (align <= 2 && desired_alignment > 2)
22484 rtx label = ix86_expand_aligntest (destptr, 2, false);
22485 destmem = change_address (destmem, HImode, destptr);
22486 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22487 ix86_adjust_counter (count, 2);
22488 emit_label (label);
22489 LABEL_NUSES (label) = 1;
22491 if (align <= 4 && desired_alignment > 4)
22493 rtx label = ix86_expand_aligntest (destptr, 4, false);
22494 destmem = change_address (destmem, SImode, destptr);
22495 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22496 ix86_adjust_counter (count, 4);
22497 emit_label (label);
22498 LABEL_NUSES (label) = 1;
22500 gcc_assert (desired_alignment <= 8);
22503 /* Store enough into DST to align it to DESIRED_ALIGN.
22504 ALIGN_BYTES is how many bytes need to be stored. */
22505 static rtx
22506 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22507 int desired_align, int align_bytes)
22509 int off = 0;
22510 rtx orig_dst = dst;
22511 if (align_bytes & 1)
22513 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22514 off = 1;
22515 emit_insn (gen_strset (destreg, dst,
22516 gen_lowpart (QImode, value)));
22518 if (align_bytes & 2)
22520 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22521 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22522 set_mem_align (dst, 2 * BITS_PER_UNIT);
22523 off = 2;
22524 emit_insn (gen_strset (destreg, dst,
22525 gen_lowpart (HImode, value)));
22527 if (align_bytes & 4)
22529 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22530 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22531 set_mem_align (dst, 4 * BITS_PER_UNIT);
22532 off = 4;
22533 emit_insn (gen_strset (destreg, dst,
22534 gen_lowpart (SImode, value)));
22536 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22537 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22538 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22539 if (MEM_SIZE_KNOWN_P (orig_dst))
22540 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22541 return dst;
22544 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22545 static enum stringop_alg
22546 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22547 int *dynamic_check, bool *noalign)
22549 const struct stringop_algs * algs;
22550 bool optimize_for_speed;
22551 /* Algorithms using the rep prefix want at least edi and ecx;
22552 additionally, memset wants eax and memcpy wants esi. Don't
22553 consider such algorithms if the user has appropriated those
22554 registers for their own purposes. */
22555 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22556 || (memset
22557 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22558 *noalign = false;
22560 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22561 || (alg != rep_prefix_1_byte \
22562 && alg != rep_prefix_4_byte \
22563 && alg != rep_prefix_8_byte))
22564 const struct processor_costs *cost;
22566 /* Even if the string operation call is cold, we still might spend a lot
22567 of time processing large blocks. */
22568 if (optimize_function_for_size_p (cfun)
22569 || (optimize_insn_for_size_p ()
22570 && expected_size != -1 && expected_size < 256))
22571 optimize_for_speed = false;
22572 else
22573 optimize_for_speed = true;
22575 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22577 *dynamic_check = -1;
22578 if (memset)
22579 algs = &cost->memset[TARGET_64BIT != 0];
22580 else
22581 algs = &cost->memcpy[TARGET_64BIT != 0];
22582 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22583 return ix86_stringop_alg;
22584 /* rep; movq or rep; movl is the smallest variant. */
22585 else if (!optimize_for_speed)
22587 if (!count || (count & 3))
22588 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22589 else
22590 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22592 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22594 else if (expected_size != -1 && expected_size < 4)
22595 return loop_1_byte;
22596 else if (expected_size != -1)
22598 unsigned int i;
22599 enum stringop_alg alg = libcall;
22600 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22602 /* We get here if the algorithms that were not libcall-based
22603 were rep-prefix based and we are unable to use rep prefixes
22604 based on global register usage. Break out of the loop and
22605 use the heuristic below. */
22606 if (algs->size[i].max == 0)
22607 break;
22608 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22610 enum stringop_alg candidate = algs->size[i].alg;
22612 if (candidate != libcall && ALG_USABLE_P (candidate))
22613 alg = candidate;
22614 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22615 last non-libcall inline algorithm. */
22616 if (TARGET_INLINE_ALL_STRINGOPS)
22618 /* When the current size is best copied by a libcall but we are
22619 still forced to inline, run the heuristic below, which will
22620 pick code for medium-sized blocks. */
22621 if (alg != libcall)
22622 return alg;
22623 break;
22625 else if (ALG_USABLE_P (candidate))
22627 *noalign = algs->size[i].noalign;
22628 return candidate;
22632 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22634 /* When asked to inline the call anyway, try to pick a meaningful choice.
22635 We look for the maximal size of block that is faster to copy by hand and
22636 take blocks of at most that size, guessing that the average size will
22637 be roughly half of the block.
22639 If this turns out to be bad, we might simply specify the preferred
22640 choice in ix86_costs. */
22641 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22642 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22644 int max = -1;
22645 enum stringop_alg alg;
22646 int i;
22647 bool any_alg_usable_p = true;
22649 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22651 enum stringop_alg candidate = algs->size[i].alg;
22652 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22654 if (candidate != libcall && candidate
22655 && ALG_USABLE_P (candidate))
22656 max = algs->size[i].max;
22658 /* If there aren't any usable algorithms, then recursing on
22659 smaller sizes isn't going to find anything. Just return the
22660 simple byte-at-a-time copy loop. */
22661 if (!any_alg_usable_p)
22663 /* Pick something reasonable. */
22664 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22665 *dynamic_check = 128;
22666 return loop_1_byte;
22668 if (max == -1)
22669 max = 4096;
22670 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22671 gcc_assert (*dynamic_check == -1);
22672 gcc_assert (alg != libcall);
22673 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22674 *dynamic_check = max;
22675 return alg;
22677 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22678 #undef ALG_USABLE_P
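/* Illustrative sketch, not part of i386.c: how the per-size algorithm table
   consulted above is read.  Entries are ordered by increasing MAX; the first
   entry whose MAX covers the expected size (or is -1, meaning no limit)
   supplies the algorithm.  The struct and values below are stand-ins, not
   the real stringop_algs layout.  */
struct sketch_alg_entry { long max; int alg; };

static int
sketch_pick_alg (const struct sketch_alg_entry *table, int n,
                 long expected_size)
{
  int i;
  for (i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;
  return 0;     /* fall back, e.g. to a library call */
}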
22681 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22682 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22683 static int
22684 decide_alignment (int align,
22685 enum stringop_alg alg,
22686 int expected_size)
22688 int desired_align = 0;
22689 switch (alg)
22691 case no_stringop:
22692 gcc_unreachable ();
22693 case loop:
22694 case unrolled_loop:
22695 desired_align = GET_MODE_SIZE (Pmode);
22696 break;
22697 case rep_prefix_8_byte:
22698 desired_align = 8;
22699 break;
22700 case rep_prefix_4_byte:
22701 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22702 copying a whole cache line at once. */
22703 if (TARGET_PENTIUMPRO)
22704 desired_align = 8;
22705 else
22706 desired_align = 4;
22707 break;
22708 case rep_prefix_1_byte:
22709 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22710 copying a whole cache line at once. */
22711 if (TARGET_PENTIUMPRO)
22712 desired_align = 8;
22713 else
22714 desired_align = 1;
22715 break;
22716 case loop_1_byte:
22717 desired_align = 1;
22718 break;
22719 case libcall:
22720 return 0;
22723 if (optimize_size)
22724 desired_align = 1;
22725 if (desired_align < align)
22726 desired_align = align;
22727 if (expected_size != -1 && expected_size < 4)
22728 desired_align = align;
22729 return desired_align;
22732 /* Return the smallest power of 2 greater than VAL. */
22733 static int
22734 smallest_pow2_greater_than (int val)
22736 int ret = 1;
22737 while (ret <= val)
22738 ret <<= 1;
22739 return ret;
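/* Illustrative check, not part of i386.c: expected behaviour of the helper
   above -- the result is strictly greater than VAL, so an exact power of two
   is rounded up to the next one, which is how EPILOGUE_SIZE_NEEDED gets
   rounded below.  The check function itself is hypothetical.  */
#include <assert.h>

static void
sketch_pow2_check (void)
{
  assert (smallest_pow2_greater_than (0) == 1);
  assert (smallest_pow2_greater_than (7) == 8);
  assert (smallest_pow2_greater_than (8) == 16);
}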
22742 /* Expand string move (memcpy) operation. Use i386 string operations
22743 when profitable. expand_setmem contains similar code. The code
22744 depends upon architecture, block size and alignment, but always has
22745 the same overall structure:
22747 1) Prologue guard: Conditional that jumps up to epilogues for small
22748 blocks that can be handled by the epilogue alone. This is faster
22749 but also needed for correctness, since the prologue assumes the block
22750 is larger than the desired alignment.
22752 Optional dynamic check for size and libcall for large
22753 blocks is emitted here too, with -minline-stringops-dynamically.
22755 2) Prologue: copy first few bytes in order to get destination
22756 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22757 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22758 copied. We emit either a jump tree on power of two sized
22759 blocks, or a byte loop.
22761 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22762 with specified algorithm.
22764 4) Epilogue: code copying tail of the block that is too small to be
22765 handled by main body (or up to size guarded by prologue guard). */
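/* Illustrative sketch, not part of i386.c: the four-step structure described
   in the comment above, written as plain C over bytes before the real
   expander follows.  DESIRED_ALIGN and SIZE_NEEDED stand for the step 0
   choices and are assumed to be powers of two; the helper is hypothetical.  */
#include <string.h>
#include <stdint.h>

static void
sketch_movmem_shape (unsigned char *dst, const unsigned char *src, size_t n,
                     size_t desired_align, size_t size_needed)
{
  size_t head, main_part;

  /* 1) Prologue guard: small blocks go straight to the epilogue.  */
  if (n < size_needed + desired_align)
    goto epilogue;

  /* 2) Prologue: copy a few bytes so DST becomes DESIRED_ALIGN aligned.  */
  head = (desired_align - ((uintptr_t) dst & (desired_align - 1)))
         & (desired_align - 1);
  memcpy (dst, src, head);
  dst += head; src += head; n -= head;

  /* 3) Main body: whole SIZE_NEEDED chunks only.  */
  main_part = n & ~(size_needed - 1);
  memcpy (dst, src, main_part);
  dst += main_part; src += main_part; n -= main_part;

 epilogue:
  /* 4) Epilogue: the remaining tail (or the whole small block).  */
  memcpy (dst, src, n);
}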
22767 bool
22768 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22769 rtx expected_align_exp, rtx expected_size_exp)
22771 rtx destreg;
22772 rtx srcreg;
22773 rtx label = NULL;
22774 rtx tmp;
22775 rtx jump_around_label = NULL;
22776 HOST_WIDE_INT align = 1;
22777 unsigned HOST_WIDE_INT count = 0;
22778 HOST_WIDE_INT expected_size = -1;
22779 int size_needed = 0, epilogue_size_needed;
22780 int desired_align = 0, align_bytes = 0;
22781 enum stringop_alg alg;
22782 int dynamic_check;
22783 bool need_zero_guard = false;
22784 bool noalign;
22786 if (CONST_INT_P (align_exp))
22787 align = INTVAL (align_exp);
22788 /* i386 can do misaligned access at a reasonably increased cost. */
22789 if (CONST_INT_P (expected_align_exp)
22790 && INTVAL (expected_align_exp) > align)
22791 align = INTVAL (expected_align_exp);
22792 /* ALIGN is the minimum of destination and source alignment, but we care here
22793 just about destination alignment. */
22794 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22795 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22797 if (CONST_INT_P (count_exp))
22798 count = expected_size = INTVAL (count_exp);
22799 if (CONST_INT_P (expected_size_exp) && count == 0)
22800 expected_size = INTVAL (expected_size_exp);
22802 /* Make sure we don't need to care about overflow later on. */
22803 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22804 return false;
22806 /* Step 0: Decide on preferred algorithm, desired alignment and
22807 size of chunks to be copied by main loop. */
22809 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22810 desired_align = decide_alignment (align, alg, expected_size);
22812 if (!TARGET_ALIGN_STRINGOPS || noalign)
22813 align = desired_align;
22815 if (alg == libcall)
22816 return false;
22817 gcc_assert (alg != no_stringop);
22818 if (!count)
22819 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22820 destreg = copy_addr_to_reg (XEXP (dst, 0));
22821 srcreg = copy_addr_to_reg (XEXP (src, 0));
22822 switch (alg)
22824 case libcall:
22825 case no_stringop:
22826 gcc_unreachable ();
22827 case loop:
22828 need_zero_guard = true;
22829 size_needed = GET_MODE_SIZE (word_mode);
22830 break;
22831 case unrolled_loop:
22832 need_zero_guard = true;
22833 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22834 break;
22835 case rep_prefix_8_byte:
22836 size_needed = 8;
22837 break;
22838 case rep_prefix_4_byte:
22839 size_needed = 4;
22840 break;
22841 case rep_prefix_1_byte:
22842 size_needed = 1;
22843 break;
22844 case loop_1_byte:
22845 need_zero_guard = true;
22846 size_needed = 1;
22847 break;
22850 epilogue_size_needed = size_needed;
22852 /* Step 1: Prologue guard. */
22854 /* Alignment code needs count to be in register. */
22855 if (CONST_INT_P (count_exp) && desired_align > align)
22857 if (INTVAL (count_exp) > desired_align
22858 && INTVAL (count_exp) > size_needed)
22860 align_bytes
22861 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22862 if (align_bytes <= 0)
22863 align_bytes = 0;
22864 else
22865 align_bytes = desired_align - align_bytes;
22867 if (align_bytes == 0)
22868 count_exp = force_reg (counter_mode (count_exp), count_exp);
22870 gcc_assert (desired_align >= 1 && align >= 1);
22872 /* Ensure that alignment prologue won't copy past end of block. */
22873 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22875 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22876 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22877 Make sure it is power of 2. */
22878 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22880 if (count)
22882 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22884 /* If main algorithm works on QImode, no epilogue is needed.
22885 For small sizes just don't align anything. */
22886 if (size_needed == 1)
22887 desired_align = align;
22888 else
22889 goto epilogue;
22892 else
22894 label = gen_label_rtx ();
22895 emit_cmp_and_jump_insns (count_exp,
22896 GEN_INT (epilogue_size_needed),
22897 LTU, 0, counter_mode (count_exp), 1, label);
22898 if (expected_size == -1 || expected_size < epilogue_size_needed)
22899 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22900 else
22901 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22905 /* Emit code to decide at runtime whether a library call or inline code
22906 should be used. */
22907 if (dynamic_check != -1)
22909 if (CONST_INT_P (count_exp))
22911 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22913 emit_block_move_via_libcall (dst, src, count_exp, false);
22914 count_exp = const0_rtx;
22915 goto epilogue;
22918 else
22920 rtx hot_label = gen_label_rtx ();
22921 jump_around_label = gen_label_rtx ();
22922 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22923 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22924 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22925 emit_block_move_via_libcall (dst, src, count_exp, false);
22926 emit_jump (jump_around_label);
22927 emit_label (hot_label);
22931 /* Step 2: Alignment prologue. */
22933 if (desired_align > align)
22935 if (align_bytes == 0)
22937 /* Except for the first move in the epilogue, we no longer know
22938 the constant offset in the aliasing info.  It does not seem worth
22939 the pain to maintain it for the first move, so throw away
22940 the info early. */
22941 src = change_address (src, BLKmode, srcreg);
22942 dst = change_address (dst, BLKmode, destreg);
22943 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22944 desired_align);
22946 else
22948 /* If we know how many bytes need to be stored before dst is
22949 sufficiently aligned, maintain aliasing info accurately. */
22950 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22951 desired_align, align_bytes);
22952 count_exp = plus_constant (counter_mode (count_exp),
22953 count_exp, -align_bytes);
22954 count -= align_bytes;
22956 if (need_zero_guard
22957 && (count < (unsigned HOST_WIDE_INT) size_needed
22958 || (align_bytes == 0
22959 && count < ((unsigned HOST_WIDE_INT) size_needed
22960 + desired_align - align))))
22962 /* It is possible that we copied enough so the main loop will not
22963 execute. */
22964 gcc_assert (size_needed > 1);
22965 if (label == NULL_RTX)
22966 label = gen_label_rtx ();
22967 emit_cmp_and_jump_insns (count_exp,
22968 GEN_INT (size_needed),
22969 LTU, 0, counter_mode (count_exp), 1, label);
22970 if (expected_size == -1
22971 || expected_size < (desired_align - align) / 2 + size_needed)
22972 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22973 else
22974 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22977 if (label && size_needed == 1)
22979 emit_label (label);
22980 LABEL_NUSES (label) = 1;
22981 label = NULL;
22982 epilogue_size_needed = 1;
22984 else if (label == NULL_RTX)
22985 epilogue_size_needed = size_needed;
22987 /* Step 3: Main loop. */
22989 switch (alg)
22991 case libcall:
22992 case no_stringop:
22993 gcc_unreachable ();
22994 case loop_1_byte:
22995 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22996 count_exp, QImode, 1, expected_size);
22997 break;
22998 case loop:
22999 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23000 count_exp, word_mode, 1, expected_size);
23001 break;
23002 case unrolled_loop:
23003 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
23004 registers for 4 temporaries anyway. */
23005 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23006 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
23007 expected_size);
23008 break;
23009 case rep_prefix_8_byte:
23010 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23011 DImode);
23012 break;
23013 case rep_prefix_4_byte:
23014 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23015 SImode);
23016 break;
23017 case rep_prefix_1_byte:
23018 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23019 QImode);
23020 break;
23022 /* Properly adjust the offsets of src and dest memory for aliasing. */
23023 if (CONST_INT_P (count_exp))
23025 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23026 (count / size_needed) * size_needed);
23027 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23028 (count / size_needed) * size_needed);
23030 else
23032 src = change_address (src, BLKmode, srcreg);
23033 dst = change_address (dst, BLKmode, destreg);
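   /* Illustrative example: with a constant count of 100 and size_needed == 8,
      the main loop above copies (100 / 8) * 8 == 96 bytes, so the epilogue
      accesses are known to start at offset 96 and adjust_automodify_address_nv
      records exactly that offset for alias analysis; in the non-constant case
      change_address drops the offset information entirely.  */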
23036 /* Step 4: Epilogue to copy the remaining bytes. */
23037 epilogue:
23038 if (label)
23040 /* When the main loop is done, COUNT_EXP might hold the original count,
23041 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23042 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23043 bytes. Compensate if needed. */
23045 if (size_needed < epilogue_size_needed)
23047 tmp =
23048 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23049 GEN_INT (size_needed - 1), count_exp, 1,
23050 OPTAB_DIRECT);
23051 if (tmp != count_exp)
23052 emit_move_insn (count_exp, tmp);
23054 emit_label (label);
23055 LABEL_NUSES (label) = 1;
23058 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23059 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23060 epilogue_size_needed);
23061 if (jump_around_label)
23062 emit_label (jump_around_label);
23063 return true;
23066 /* Helper function for memset.  For the QImode value 0xXY produce
23067 0xXYXYXYXY of the width specified by MODE.  This is essentially
23068 a * 0x01010101, but we can do slightly better than
23069 synth_mult by unwinding the sequence by hand on CPUs with
23070 slow multiply. */
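/* For example (illustrative only): promoting the byte value 0xAB to SImode
   yields 0xABABABAB.  The constant path below folds this directly
   (v |= v << 8; v |= v << 16), while the register path builds the same value
   either with insv/shift-or steps or by multiplying by the promoted constant
   1 (0x01010101) when the multiply is cheaper according to ix86_cost.  */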
23071 static rtx
23072 promote_duplicated_reg (enum machine_mode mode, rtx val)
23074 enum machine_mode valmode = GET_MODE (val);
23075 rtx tmp;
23076 int nops = mode == DImode ? 3 : 2;
23078 gcc_assert (mode == SImode || mode == DImode);
23079 if (val == const0_rtx)
23080 return copy_to_mode_reg (mode, const0_rtx);
23081 if (CONST_INT_P (val))
23083 HOST_WIDE_INT v = INTVAL (val) & 255;
23085 v |= v << 8;
23086 v |= v << 16;
23087 if (mode == DImode)
23088 v |= (v << 16) << 16;
23089 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23092 if (valmode == VOIDmode)
23093 valmode = QImode;
23094 if (valmode != QImode)
23095 val = gen_lowpart (QImode, val);
23096 if (mode == QImode)
23097 return val;
23098 if (!TARGET_PARTIAL_REG_STALL)
23099 nops--;
23100 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23101 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23102 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23103 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23105 rtx reg = convert_modes (mode, QImode, val, true);
23106 tmp = promote_duplicated_reg (mode, const1_rtx);
23107 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23108 OPTAB_DIRECT);
23110 else
23112 rtx reg = convert_modes (mode, QImode, val, true);
23114 if (!TARGET_PARTIAL_REG_STALL)
23115 if (mode == SImode)
23116 emit_insn (gen_movsi_insv_1 (reg, reg));
23117 else
23118 emit_insn (gen_movdi_insv_1 (reg, reg));
23119 else
23121 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23122 NULL, 1, OPTAB_DIRECT);
23123 reg =
23124 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23126 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23127 NULL, 1, OPTAB_DIRECT);
23128 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23129 if (mode == SImode)
23130 return reg;
23131 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23132 NULL, 1, OPTAB_DIRECT);
23133 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23134 return reg;
23138 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23139 will be needed by the main loop copying SIZE_NEEDED chunks and by the
23140 prologue raising alignment from ALIGN to DESIRED_ALIGN. */
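/* Illustrative example: on 64-bit targets a main loop storing 8-byte chunks
   (size_needed == 8) gets a DImode-promoted value, while a 4-byte rep-stos
   loop only needs SImode promotion; a prologue that must raise alignment
   above 4 can force the wider promotion even when SIZE_NEEDED is small.  */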
23141 static rtx
23142 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23144 rtx promoted_val;
23146 if (TARGET_64BIT
23147 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23148 promoted_val = promote_duplicated_reg (DImode, val);
23149 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23150 promoted_val = promote_duplicated_reg (SImode, val);
23151 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23152 promoted_val = promote_duplicated_reg (HImode, val);
23153 else
23154 promoted_val = val;
23156 return promoted_val;
23159 /* Expand a string set operation (memset/bzero).  Use i386 string operations
23160 when profitable.  See the expand_movmem comment for an explanation of the
23161 individual steps performed. */
23162 bool
23163 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23164 rtx expected_align_exp, rtx expected_size_exp)
23166 rtx destreg;
23167 rtx label = NULL;
23168 rtx tmp;
23169 rtx jump_around_label = NULL;
23170 HOST_WIDE_INT align = 1;
23171 unsigned HOST_WIDE_INT count = 0;
23172 HOST_WIDE_INT expected_size = -1;
23173 int size_needed = 0, epilogue_size_needed;
23174 int desired_align = 0, align_bytes = 0;
23175 enum stringop_alg alg;
23176 rtx promoted_val = NULL;
23177 bool force_loopy_epilogue = false;
23178 int dynamic_check;
23179 bool need_zero_guard = false;
23180 bool noalign;
23182 if (CONST_INT_P (align_exp))
23183 align = INTVAL (align_exp);
23184 /* i386 can do misaligned access at a reasonably increased cost. */
23185 if (CONST_INT_P (expected_align_exp)
23186 && INTVAL (expected_align_exp) > align)
23187 align = INTVAL (expected_align_exp);
23188 if (CONST_INT_P (count_exp))
23189 count = expected_size = INTVAL (count_exp);
23190 if (CONST_INT_P (expected_size_exp) && count == 0)
23191 expected_size = INTVAL (expected_size_exp);
23193 /* Make sure we don't need to care about overflow later on. */
23194 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23195 return false;
23197 /* Step 0: Decide on preferred algorithm, desired alignment and
23198 size of chunks to be copied by main loop. */
23200 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23201 desired_align = decide_alignment (align, alg, expected_size);
23203 if (!TARGET_ALIGN_STRINGOPS || noalign)
23204 align = desired_align;
23206 if (alg == libcall)
23207 return false;
23208 gcc_assert (alg != no_stringop);
23209 if (!count)
23210 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23211 destreg = copy_addr_to_reg (XEXP (dst, 0));
23212 switch (alg)
23214 case libcall:
23215 case no_stringop:
23216 gcc_unreachable ();
23217 case loop:
23218 need_zero_guard = true;
23219 size_needed = GET_MODE_SIZE (word_mode);
23220 break;
23221 case unrolled_loop:
23222 need_zero_guard = true;
23223 size_needed = GET_MODE_SIZE (word_mode) * 4;
23224 break;
23225 case rep_prefix_8_byte:
23226 size_needed = 8;
23227 break;
23228 case rep_prefix_4_byte:
23229 size_needed = 4;
23230 break;
23231 case rep_prefix_1_byte:
23232 size_needed = 1;
23233 break;
23234 case loop_1_byte:
23235 need_zero_guard = true;
23236 size_needed = 1;
23237 break;
23239 epilogue_size_needed = size_needed;
23241 /* Step 1: Prologue guard. */
23243 /* Alignment code needs count to be in a register. */
23244 if (CONST_INT_P (count_exp) && desired_align > align)
23246 if (INTVAL (count_exp) > desired_align
23247 && INTVAL (count_exp) > size_needed)
23249 align_bytes
23250 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23251 if (align_bytes <= 0)
23252 align_bytes = 0;
23253 else
23254 align_bytes = desired_align - align_bytes;
23256 if (align_bytes == 0)
23258 enum machine_mode mode = SImode;
23259 if (TARGET_64BIT && (count & ~0xffffffff))
23260 mode = DImode;
23261 count_exp = force_reg (mode, count_exp);
23264 /* Do the cheap promotion to allow better CSE across the
23265 main loop and epilogue (i.e., one load of the big constant in
23266 front of all the code). */
23267 if (CONST_INT_P (val_exp))
23268 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23269 desired_align, align);
23270 /* Ensure that alignment prologue won't copy past end of block. */
23271 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23273 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23274 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23275 Make sure it is a power of 2. */
23276 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23278 /* To improve performance of small blocks, we jump around the VAL
23279 promoting code.  This means that if the promoted VAL is not constant,
23280 we might not use it in the epilogue and have to use the byte
23281 loop variant. */
23282 if (epilogue_size_needed > 2 && !promoted_val)
23283 force_loopy_epilogue = true;
23284 if (count)
23286 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23288 /* If main algorithm works on QImode, no epilogue is needed.
23289 For small sizes just don't align anything. */
23290 if (size_needed == 1)
23291 desired_align = align;
23292 else
23293 goto epilogue;
23296 else
23298 label = gen_label_rtx ();
23299 emit_cmp_and_jump_insns (count_exp,
23300 GEN_INT (epilogue_size_needed),
23301 LTU, 0, counter_mode (count_exp), 1, label);
23302 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23303 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23304 else
23305 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23308 if (dynamic_check != -1)
23310 rtx hot_label = gen_label_rtx ();
23311 jump_around_label = gen_label_rtx ();
23312 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23313 LEU, 0, counter_mode (count_exp), 1, hot_label);
23314 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23315 set_storage_via_libcall (dst, count_exp, val_exp, false);
23316 emit_jump (jump_around_label);
23317 emit_label (hot_label);
23320 /* Step 2: Alignment prologue. */
23322 /* Do the expensive promotion once we have branched off the small blocks. */
23323 if (!promoted_val)
23324 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23325 desired_align, align);
23326 gcc_assert (desired_align >= 1 && align >= 1);
23328 if (desired_align > align)
23330 if (align_bytes == 0)
23332 /* Except for the first move in the epilogue, we no longer know
23333 the constant offset in the aliasing info.  It does not seem worth
23334 the pain to maintain it for the first move, so throw away
23335 the info early. */
23336 dst = change_address (dst, BLKmode, destreg);
23337 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23338 desired_align);
23340 else
23342 /* If we know how many bytes need to be stored before dst is
23343 sufficiently aligned, maintain aliasing info accurately. */
23344 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23345 desired_align, align_bytes);
23346 count_exp = plus_constant (counter_mode (count_exp),
23347 count_exp, -align_bytes);
23348 count -= align_bytes;
23350 if (need_zero_guard
23351 && (count < (unsigned HOST_WIDE_INT) size_needed
23352 || (align_bytes == 0
23353 && count < ((unsigned HOST_WIDE_INT) size_needed
23354 + desired_align - align))))
23356 /* It is possible that we copied enough so the main loop will not
23357 execute. */
23358 gcc_assert (size_needed > 1);
23359 if (label == NULL_RTX)
23360 label = gen_label_rtx ();
23361 emit_cmp_and_jump_insns (count_exp,
23362 GEN_INT (size_needed),
23363 LTU, 0, counter_mode (count_exp), 1, label);
23364 if (expected_size == -1
23365 || expected_size < (desired_align - align) / 2 + size_needed)
23366 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23367 else
23368 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23371 if (label && size_needed == 1)
23373 emit_label (label);
23374 LABEL_NUSES (label) = 1;
23375 label = NULL;
23376 promoted_val = val_exp;
23377 epilogue_size_needed = 1;
23379 else if (label == NULL_RTX)
23380 epilogue_size_needed = size_needed;
23382 /* Step 3: Main loop. */
23384 switch (alg)
23386 case libcall:
23387 case no_stringop:
23388 gcc_unreachable ();
23389 case loop_1_byte:
23390 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23391 count_exp, QImode, 1, expected_size);
23392 break;
23393 case loop:
23394 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23395 count_exp, word_mode, 1, expected_size);
23396 break;
23397 case unrolled_loop:
23398 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23399 count_exp, word_mode, 4, expected_size);
23400 break;
23401 case rep_prefix_8_byte:
23402 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23403 DImode, val_exp);
23404 break;
23405 case rep_prefix_4_byte:
23406 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23407 SImode, val_exp);
23408 break;
23409 case rep_prefix_1_byte:
23410 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23411 QImode, val_exp);
23412 break;
23414 /* Properly adjust the offsets of src and dest memory for aliasing. */
23415 if (CONST_INT_P (count_exp))
23416 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23417 (count / size_needed) * size_needed);
23418 else
23419 dst = change_address (dst, BLKmode, destreg);
23421 /* Step 4: Epilogue to copy the remaining bytes. */
23423 if (label)
23425 /* When the main loop is done, COUNT_EXP might hold the original count,
23426 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23427 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23428 bytes. Compensate if needed. */
23430 if (size_needed < epilogue_size_needed)
23432 tmp =
23433 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23434 GEN_INT (size_needed - 1), count_exp, 1,
23435 OPTAB_DIRECT);
23436 if (tmp != count_exp)
23437 emit_move_insn (count_exp, tmp);
23439 emit_label (label);
23440 LABEL_NUSES (label) = 1;
23442 epilogue:
23443 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23445 if (force_loopy_epilogue)
23446 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23447 epilogue_size_needed);
23448 else
23449 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23450 epilogue_size_needed);
23452 if (jump_around_label)
23453 emit_label (jump_around_label);
23454 return true;
23457 /* Expand the appropriate insns for doing strlen if not just doing
23458 repnz; scasb
23460 out = result, initialized with the start address
23461 align_rtx = alignment of the address.
23462 scratch = scratch register, initialized with the start address when
23463 not aligned, otherwise undefined
23465 This is just the body. It needs the initializations mentioned above and
23466 some address computing at the end. These things are done in i386.md. */
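/* For illustration: with an incoming alignment below 4, the code below first
   tests up to three leading bytes one at a time until OUT reaches a 4-byte
   boundary, then scans a word per iteration using the zero-byte formula
   documented further down, and finally adjusts OUT to the exact terminator.  */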
23468 static void
23469 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23471 int align;
23472 rtx tmp;
23473 rtx align_2_label = NULL_RTX;
23474 rtx align_3_label = NULL_RTX;
23475 rtx align_4_label = gen_label_rtx ();
23476 rtx end_0_label = gen_label_rtx ();
23477 rtx mem;
23478 rtx tmpreg = gen_reg_rtx (SImode);
23479 rtx scratch = gen_reg_rtx (SImode);
23480 rtx cmp;
23482 align = 0;
23483 if (CONST_INT_P (align_rtx))
23484 align = INTVAL (align_rtx);
23486 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23488 /* Is there a known alignment and is it less than 4? */
23489 if (align < 4)
23491 rtx scratch1 = gen_reg_rtx (Pmode);
23492 emit_move_insn (scratch1, out);
23493 /* Is there a known alignment and is it not 2? */
23494 if (align != 2)
23496 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23497 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23499 /* Leave just the 3 lower bits. */
23500 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23501 NULL_RTX, 0, OPTAB_WIDEN);
23503 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23504 Pmode, 1, align_4_label);
23505 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23506 Pmode, 1, align_2_label);
23507 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23508 Pmode, 1, align_3_label);
23510 else
23512 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23513 check whether it is aligned to a 4-byte boundary. */
23515 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23516 NULL_RTX, 0, OPTAB_WIDEN);
23518 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23519 Pmode, 1, align_4_label);
23522 mem = change_address (src, QImode, out);
23524 /* Now compare the bytes. */
23526 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23527 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23528 QImode, 1, end_0_label);
23530 /* Increment the address. */
23531 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23533 /* Not needed with an alignment of 2 */
23534 if (align != 2)
23536 emit_label (align_2_label);
23538 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23539 end_0_label);
23541 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23543 emit_label (align_3_label);
23546 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23547 end_0_label);
23549 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23552 /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
23553 align this loop: it only makes programs bigger and does not help
23554 speed. */
23555 emit_label (align_4_label);
23557 mem = change_address (src, SImode, out);
23558 emit_move_insn (scratch, mem);
23559 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23561 /* This formula yields a nonzero result iff one of the bytes is zero.
23562 This saves three branches inside the loop and many cycles. */
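      /* Worked example (arbitrary values): for scratch == 0x11223300 the
         sequence computes (0x11223300 - 0x01010101) & ~0x11223300 & 0x80808080
         == 0x102131FF & 0xEEDDCCFF & 0x80808080 == 0x00000080, nonzero because
         the low byte is zero; for scratch == 0x41424344 (no zero byte) the
         result is 0.  */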
23564 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23565 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23566 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23567 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23568 gen_int_mode (0x80808080, SImode)));
23569 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23570 align_4_label);
23572 if (TARGET_CMOVE)
23574 rtx reg = gen_reg_rtx (SImode);
23575 rtx reg2 = gen_reg_rtx (Pmode);
23576 emit_move_insn (reg, tmpreg);
23577 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23579 /* If zero is not in the first two bytes, move two bytes forward. */
23580 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23581 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23582 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23583 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23584 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23585 reg,
23586 tmpreg)));
23587 /* Emit lea manually to avoid clobbering of flags. */
23588 emit_insn (gen_rtx_SET (SImode, reg2,
23589 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23591 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23592 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23593 emit_insn (gen_rtx_SET (VOIDmode, out,
23594 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23595 reg2,
23596 out)));
23598 else
23600 rtx end_2_label = gen_label_rtx ();
23601 /* Is zero in the first two bytes? */
23603 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23604 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23605 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23606 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23607 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23608 pc_rtx);
23609 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23610 JUMP_LABEL (tmp) = end_2_label;
23612 /* Not in the first two. Move two bytes forward. */
23613 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23614 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23616 emit_label (end_2_label);
23620 /* Avoid branch in fixing the byte. */
23621 tmpreg = gen_lowpart (QImode, tmpreg);
23622 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23623 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23624 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23625 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23627 emit_label (end_0_label);
23630 /* Expand strlen. */
23632 bool
23633 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23635 rtx addr, scratch1, scratch2, scratch3, scratch4;
23637 /* The generic case of the strlen expander is long.  Avoid its
23638 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
23640 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23641 && !TARGET_INLINE_ALL_STRINGOPS
23642 && !optimize_insn_for_size_p ()
23643 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23644 return false;
23646 addr = force_reg (Pmode, XEXP (src, 0));
23647 scratch1 = gen_reg_rtx (Pmode);
23649 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23650 && !optimize_insn_for_size_p ())
23652 /* Well it seems that some optimizer does not combine a call like
23653 foo(strlen(bar), strlen(bar));
23654 when the move and the subtraction are done here.  It does calculate
23655 the length just once when these instructions are done inside of
23656 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23657 often used and I use one fewer register for the lifetime of
23658 output_strlen_unroll() this is better. */
23660 emit_move_insn (out, addr);
23662 ix86_expand_strlensi_unroll_1 (out, src, align);
23664 /* strlensi_unroll_1 returns the address of the zero at the end of
23665 the string, like memchr(), so compute the length by subtracting
23666 the start address. */
23667 emit_insn (ix86_gen_sub3 (out, out, addr));
23669 else
23671 rtx unspec;
23673 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23674 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23675 return false;
23677 scratch2 = gen_reg_rtx (Pmode);
23678 scratch3 = gen_reg_rtx (Pmode);
23679 scratch4 = force_reg (Pmode, constm1_rtx);
23681 emit_move_insn (scratch3, addr);
23682 eoschar = force_reg (QImode, eoschar);
23684 src = replace_equiv_address_nv (src, scratch3);
23686 /* If .md starts supporting :P, this can be done in .md. */
23687 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23688 scratch4), UNSPEC_SCAS);
23689 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23690 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23691 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23693 return true;
23696 /* For a given symbol (function), construct code to compute the address of its
23697 PLT entry in the large x86-64 PIC model. */
23698 static rtx
23699 construct_plt_address (rtx symbol)
23701 rtx tmp, unspec;
23703 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23704 gcc_assert (ix86_cmodel == CM_LARGE_PIC && DEFAULT_ABI != MS_ABI);
23705 gcc_assert (Pmode == DImode);
23707 tmp = gen_reg_rtx (Pmode);
23708 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23710 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23711 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23712 return tmp;
23716 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23717 rtx callarg2,
23718 rtx pop, bool sibcall)
23720 unsigned int const cregs_size
23721 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
23722 rtx vec[3 + cregs_size];
23723 rtx use = NULL, call;
23724 unsigned int vec_len = 0;
23726 if (pop == const0_rtx)
23727 pop = NULL;
23728 gcc_assert (!TARGET_64BIT || !pop);
23730 if (TARGET_MACHO && !TARGET_64BIT)
23732 #if TARGET_MACHO
23733 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23734 fnaddr = machopic_indirect_call_target (fnaddr);
23735 #endif
23737 else
23739 /* Static functions and indirect calls don't need the pic register. */
23740 if (flag_pic
23741 && (!TARGET_64BIT
23742 || (ix86_cmodel == CM_LARGE_PIC
23743 && DEFAULT_ABI != MS_ABI))
23744 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23745 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23746 use_reg (&use, pic_offset_table_rtx);
23749 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23751 rtx al = gen_rtx_REG (QImode, AX_REG);
23752 emit_move_insn (al, callarg2);
23753 use_reg (&use, al);
23756 if (ix86_cmodel == CM_LARGE_PIC
23757 && DEFAULT_ABI != MS_ABI
23758 && MEM_P (fnaddr)
23759 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23760 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23761 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23762 else if (sibcall
23763 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23764 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23766 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23767 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23770 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23771 if (retval)
23772 call = gen_rtx_SET (VOIDmode, retval, call);
23773 vec[vec_len++] = call;
23775 if (pop)
23777 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23778 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23779 vec[vec_len++] = pop;
23782 if (TARGET_64BIT_MS_ABI
23783 && (!callarg2 || INTVAL (callarg2) != -2))
23785 unsigned i;
23787 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23788 UNSPEC_MS_TO_SYSV_CALL);
23790 for (i = 0; i < cregs_size; i++)
23792 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
23793 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
23795 vec[vec_len++]
23796 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
23800 if (vec_len > 1)
23801 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23802 call = emit_call_insn (call);
23803 if (use)
23804 CALL_INSN_FUNCTION_USAGE (call) = use;
23806 return call;
23809 /* Output the assembly for a call instruction. */
23811 const char *
23812 ix86_output_call_insn (rtx insn, rtx call_op)
23814 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23815 bool seh_nop_p = false;
23816 const char *xasm;
23818 if (SIBLING_CALL_P (insn))
23820 if (direct_p)
23821 xasm = "jmp\t%P0";
23822 /* SEH epilogue detection requires the indirect branch case
23823 to include REX.W. */
23824 else if (TARGET_SEH)
23825 xasm = "rex.W jmp %A0";
23826 else
23827 xasm = "jmp\t%A0";
23829 output_asm_insn (xasm, &call_op);
23830 return "";
23833 /* SEH unwinding can require an extra nop to be emitted in several
23834 circumstances. Determine if we have one of those. */
23835 if (TARGET_SEH)
23837 rtx i;
23839 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23841 /* If we get to another real insn, we don't need the nop. */
23842 if (INSN_P (i))
23843 break;
23845 /* If we get to the epilogue note, prevent a catch region from
23846 being adjacent to the standard epilogue sequence. If non-
23847 call-exceptions, we'll have done this during epilogue emission. */
23848 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23849 && !flag_non_call_exceptions
23850 && !can_throw_internal (insn))
23852 seh_nop_p = true;
23853 break;
23857 /* If we didn't find a real insn following the call, prevent the
23858 unwinder from looking into the next function. */
23859 if (i == NULL)
23860 seh_nop_p = true;
23863 if (direct_p)
23864 xasm = "call\t%P0";
23865 else
23866 xasm = "call\t%A0";
23868 output_asm_insn (xasm, &call_op);
23870 if (seh_nop_p)
23871 return "nop";
23873 return "";
23876 /* Clear stack slot assignments remembered from previous functions.
23877 This is called from INIT_EXPANDERS once before RTL is emitted for each
23878 function. */
23880 static struct machine_function *
23881 ix86_init_machine_status (void)
23883 struct machine_function *f;
23885 f = ggc_alloc_cleared_machine_function ();
23886 f->use_fast_prologue_epilogue_nregs = -1;
23887 f->call_abi = ix86_abi;
23889 return f;
23892 /* Return a MEM corresponding to a stack slot with mode MODE.
23893 Allocate a new slot if necessary.
23895 The RTL for a function can have several slots available: N is
23896 which slot to use. */
23899 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23901 struct stack_local_entry *s;
23903 gcc_assert (n < MAX_386_STACK_LOCALS);
23905 for (s = ix86_stack_locals; s; s = s->next)
23906 if (s->mode == mode && s->n == n)
23907 return validize_mem (copy_rtx (s->rtl));
23909 s = ggc_alloc_stack_local_entry ();
23910 s->n = n;
23911 s->mode = mode;
23912 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23914 s->next = ix86_stack_locals;
23915 ix86_stack_locals = s;
23916 return validize_mem (s->rtl);
23919 static void
23920 ix86_instantiate_decls (void)
23922 struct stack_local_entry *s;
23924 for (s = ix86_stack_locals; s; s = s->next)
23925 if (s->rtl != NULL_RTX)
23926 instantiate_decl_rtl (s->rtl);
23929 /* Calculate the length of the memory address in the instruction encoding.
23930 Includes the addr32 prefix; does not include the one-byte modrm, opcode,
23931 or other prefixes.  We never generate an addr32 prefix for the LEA insn. */
23934 memory_address_length (rtx addr, bool lea)
23936 struct ix86_address parts;
23937 rtx base, index, disp;
23938 int len;
23939 int ok;
23941 if (GET_CODE (addr) == PRE_DEC
23942 || GET_CODE (addr) == POST_INC
23943 || GET_CODE (addr) == PRE_MODIFY
23944 || GET_CODE (addr) == POST_MODIFY)
23945 return 0;
23947 ok = ix86_decompose_address (addr, &parts);
23948 gcc_assert (ok);
23950 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23952 /* If this is not LEA instruction, add the length of addr32 prefix. */
23953 if (TARGET_64BIT && !lea
23954 && (SImode_address_operand (addr, VOIDmode)
23955 || (parts.base && GET_MODE (parts.base) == SImode)
23956 || (parts.index && GET_MODE (parts.index) == SImode)))
23957 len++;
23959 base = parts.base;
23960 index = parts.index;
23961 disp = parts.disp;
23963 if (base && GET_CODE (base) == SUBREG)
23964 base = SUBREG_REG (base);
23965 if (index && GET_CODE (index) == SUBREG)
23966 index = SUBREG_REG (index);
23968 gcc_assert (base == NULL_RTX || REG_P (base));
23969 gcc_assert (index == NULL_RTX || REG_P (index));
23971 /* Rule of thumb:
23972 - esp as the base always wants an index,
23973 - ebp as the base always wants a displacement,
23974 - r12 as the base always wants an index,
23975 - r13 as the base always wants a displacement. */
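   /* For illustration: in the modrm byte, r/m == 100 selects a SIB byte, so a
      plain (%esp) or (%r12) base cannot be encoded without one, and mod == 00
      with r/m == 101 means disp32 (or RIP-relative in 64-bit mode), so a plain
      (%ebp) or (%r13) base must be emitted as 0x0(%ebp) with a disp8.  That is
      what the extra length bytes counted below pay for.  */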
23977 /* Register Indirect. */
23978 if (base && !index && !disp)
23980 /* esp (for its index) and ebp (for its displacement) need
23981 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23982 code. */
23983 if (base == arg_pointer_rtx
23984 || base == frame_pointer_rtx
23985 || REGNO (base) == SP_REG
23986 || REGNO (base) == BP_REG
23987 || REGNO (base) == R12_REG
23988 || REGNO (base) == R13_REG)
23989 len++;
23992 /* Direct Addressing.  In 64-bit mode, mod 00 r/m 5
23993 is not disp32 but disp32(%rip), so for absolute disp32
23994 a SIB byte is needed, unless print_operand_address
23995 optimizes it into disp32(%rip) or (%rip) is implied
23996 by an UNSPEC. */
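      /* For illustration: "mov foo(%rip), %eax" uses mod 00 / r/m 101 with just
         a 4-byte displacement, while an absolute "mov foo, %eax" in 64-bit mode
         must be encoded with a SIB byte (base 101, index 100), which is the
         extra byte counted below when the displacement cannot use %rip.  */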
23997 else if (disp && !base && !index)
23999 len += 4;
24000 if (TARGET_64BIT)
24002 rtx symbol = disp;
24004 if (GET_CODE (disp) == CONST)
24005 symbol = XEXP (disp, 0);
24006 if (GET_CODE (symbol) == PLUS
24007 && CONST_INT_P (XEXP (symbol, 1)))
24008 symbol = XEXP (symbol, 0);
24010 if (GET_CODE (symbol) != LABEL_REF
24011 && (GET_CODE (symbol) != SYMBOL_REF
24012 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24013 && (GET_CODE (symbol) != UNSPEC
24014 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24015 && XINT (symbol, 1) != UNSPEC_PCREL
24016 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24017 len++;
24020 else
24022 /* Find the length of the displacement constant. */
24023 if (disp)
24025 if (base && satisfies_constraint_K (disp))
24026 len += 1;
24027 else
24028 len += 4;
24030 /* ebp always wants a displacement. Similarly r13. */
24031 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24032 len++;
24034 /* An index requires the two-byte modrm form.... */
24035 if (index
24036 /* ...like esp (or r12), which always wants an index. */
24037 || base == arg_pointer_rtx
24038 || base == frame_pointer_rtx
24039 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24040 len++;
24043 return len;
24046 /* Compute default value for "length_immediate" attribute.  When SHORTFORM
24047 is set, expect that the insn has an 8-bit immediate alternative. */
24049 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24051 int len = 0;
24052 int i;
24053 extract_insn_cached (insn);
24054 for (i = recog_data.n_operands - 1; i >= 0; --i)
24055 if (CONSTANT_P (recog_data.operand[i]))
24057 enum attr_mode mode = get_attr_mode (insn);
24059 gcc_assert (!len);
24060 if (shortform && CONST_INT_P (recog_data.operand[i]))
24062 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24063 switch (mode)
24065 case MODE_QI:
24066 len = 1;
24067 continue;
24068 case MODE_HI:
24069 ival = trunc_int_for_mode (ival, HImode);
24070 break;
24071 case MODE_SI:
24072 ival = trunc_int_for_mode (ival, SImode);
24073 break;
24074 default:
24075 break;
24077 if (IN_RANGE (ival, -128, 127))
24079 len = 1;
24080 continue;
24083 switch (mode)
24085 case MODE_QI:
24086 len = 1;
24087 break;
24088 case MODE_HI:
24089 len = 2;
24090 break;
24091 case MODE_SI:
24092 len = 4;
24093 break;
24094 /* Immediates for DImode instructions are encoded
24095 as 32-bit sign-extended values. */
24096 case MODE_DI:
24097 len = 4;
24098 break;
24099 default:
24100 fatal_insn ("unknown insn mode", insn);
24103 return len;
24106 /* Compute default value for "length_address" attribute. */
24108 ix86_attr_length_address_default (rtx insn)
24110 int i;
24112 if (get_attr_type (insn) == TYPE_LEA)
24114 rtx set = PATTERN (insn), addr;
24116 if (GET_CODE (set) == PARALLEL)
24117 set = XVECEXP (set, 0, 0);
24119 gcc_assert (GET_CODE (set) == SET);
24121 addr = SET_SRC (set);
24123 return memory_address_length (addr, true);
24126 extract_insn_cached (insn);
24127 for (i = recog_data.n_operands - 1; i >= 0; --i)
24128 if (MEM_P (recog_data.operand[i]))
24130 constrain_operands_cached (reload_completed);
24131 if (which_alternative != -1)
24133 const char *constraints = recog_data.constraints[i];
24134 int alt = which_alternative;
24136 while (*constraints == '=' || *constraints == '+')
24137 constraints++;
24138 while (alt-- > 0)
24139 while (*constraints++ != ',')
24141 /* Skip ignored operands. */
24142 if (*constraints == 'X')
24143 continue;
24145 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24147 return 0;
24150 /* Compute default value for "length_vex" attribute.  It includes
24151 the 2- or 3-byte VEX prefix and 1 opcode byte. */
24154 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24156 int i;
24158 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX.W bit requires
24159 the 3-byte VEX prefix. */
24160 if (!has_0f_opcode || has_vex_w)
24161 return 3 + 1;
24163 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24164 if (!TARGET_64BIT)
24165 return 2 + 1;
24167 extract_insn_cached (insn);
24169 for (i = recog_data.n_operands - 1; i >= 0; --i)
24170 if (REG_P (recog_data.operand[i]))
24172 /* REX.W bit uses 3 byte VEX prefix. */
24173 if (GET_MODE (recog_data.operand[i]) == DImode
24174 && GENERAL_REG_P (recog_data.operand[i]))
24175 return 3 + 1;
24177 else
24179 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24180 if (MEM_P (recog_data.operand[i])
24181 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24182 return 3 + 1;
24185 return 2 + 1;
24188 /* Return the maximum number of instructions a cpu can issue. */
24190 static int
24191 ix86_issue_rate (void)
24193 switch (ix86_tune)
24195 case PROCESSOR_PENTIUM:
24196 case PROCESSOR_ATOM:
24197 case PROCESSOR_K6:
24198 case PROCESSOR_BTVER2:
24199 return 2;
24201 case PROCESSOR_PENTIUMPRO:
24202 case PROCESSOR_PENTIUM4:
24203 case PROCESSOR_CORE2:
24204 case PROCESSOR_COREI7:
24205 case PROCESSOR_HASWELL:
24206 case PROCESSOR_ATHLON:
24207 case PROCESSOR_K8:
24208 case PROCESSOR_AMDFAM10:
24209 case PROCESSOR_NOCONA:
24210 case PROCESSOR_GENERIC32:
24211 case PROCESSOR_GENERIC64:
24212 case PROCESSOR_BDVER1:
24213 case PROCESSOR_BDVER2:
24214 case PROCESSOR_BDVER3:
24215 case PROCESSOR_BTVER1:
24216 return 3;
24218 default:
24219 return 1;
24223 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24224 set by DEP_INSN and nothing else set by DEP_INSN. */
24226 static bool
24227 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24229 rtx set, set2;
24231 /* Simplify the test for uninteresting insns. */
24232 if (insn_type != TYPE_SETCC
24233 && insn_type != TYPE_ICMOV
24234 && insn_type != TYPE_FCMOV
24235 && insn_type != TYPE_IBR)
24236 return false;
24238 if ((set = single_set (dep_insn)) != 0)
24240 set = SET_DEST (set);
24241 set2 = NULL_RTX;
24243 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24244 && XVECLEN (PATTERN (dep_insn), 0) == 2
24245 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24246 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24248 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24249 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24251 else
24252 return false;
24254 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24255 return false;
24257 /* This test is true if the dependent insn reads the flags but
24258 not any other potentially set register. */
24259 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24260 return false;
24262 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24263 return false;
24265 return true;
24268 /* Return true iff USE_INSN has a memory address with operands set by
24269 SET_INSN. */
24271 bool
24272 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24274 int i;
24275 extract_insn_cached (use_insn);
24276 for (i = recog_data.n_operands - 1; i >= 0; --i)
24277 if (MEM_P (recog_data.operand[i]))
24279 rtx addr = XEXP (recog_data.operand[i], 0);
24280 return modified_in_p (addr, set_insn) != 0;
24282 return false;
24285 static int
24286 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24288 enum attr_type insn_type, dep_insn_type;
24289 enum attr_memory memory;
24290 rtx set, set2;
24291 int dep_insn_code_number;
24293 /* Anti and output dependencies have zero cost on all CPUs. */
24294 if (REG_NOTE_KIND (link) != 0)
24295 return 0;
24297 dep_insn_code_number = recog_memoized (dep_insn);
24299 /* If we can't recognize the insns, we can't really do anything. */
24300 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24301 return cost;
24303 insn_type = get_attr_type (insn);
24304 dep_insn_type = get_attr_type (dep_insn);
24306 switch (ix86_tune)
24308 case PROCESSOR_PENTIUM:
24309 /* Address Generation Interlock adds a cycle of latency. */
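      /* For example: an insn that writes %eax immediately followed by a lea or
         load whose address uses %eax pays the extra cycle; the checks below
         detect that the dependency feeds the address calculation.  */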
24310 if (insn_type == TYPE_LEA)
24312 rtx addr = PATTERN (insn);
24314 if (GET_CODE (addr) == PARALLEL)
24315 addr = XVECEXP (addr, 0, 0);
24317 gcc_assert (GET_CODE (addr) == SET);
24319 addr = SET_SRC (addr);
24320 if (modified_in_p (addr, dep_insn))
24321 cost += 1;
24323 else if (ix86_agi_dependent (dep_insn, insn))
24324 cost += 1;
24326 /* ??? Compares pair with jump/setcc. */
24327 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24328 cost = 0;
24330 /* Floating point stores require value to be ready one cycle earlier. */
24331 if (insn_type == TYPE_FMOV
24332 && get_attr_memory (insn) == MEMORY_STORE
24333 && !ix86_agi_dependent (dep_insn, insn))
24334 cost += 1;
24335 break;
24337 case PROCESSOR_PENTIUMPRO:
24338 memory = get_attr_memory (insn);
24340 /* INT->FP conversion is expensive. */
24341 if (get_attr_fp_int_src (dep_insn))
24342 cost += 5;
24344 /* There is one cycle extra latency between an FP op and a store. */
24345 if (insn_type == TYPE_FMOV
24346 && (set = single_set (dep_insn)) != NULL_RTX
24347 && (set2 = single_set (insn)) != NULL_RTX
24348 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24349 && MEM_P (SET_DEST (set2)))
24350 cost += 1;
24352 /* Show the ability of the reorder buffer to hide the latency of a load by
24353 executing it in parallel with the previous instruction, in case the
24354 previous instruction is not needed to compute the address. */
24355 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24356 && !ix86_agi_dependent (dep_insn, insn))
24358 /* Claim moves to take one cycle, as the core can issue one load
24359 at a time and the next load can start a cycle later. */
24360 if (dep_insn_type == TYPE_IMOV
24361 || dep_insn_type == TYPE_FMOV)
24362 cost = 1;
24363 else if (cost > 1)
24364 cost--;
24366 break;
24368 case PROCESSOR_K6:
24369 memory = get_attr_memory (insn);
24371 /* The esp dependency is resolved before the instruction is really
24372 finished. */
24373 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24374 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24375 return 1;
24377 /* INT->FP conversion is expensive. */
24378 if (get_attr_fp_int_src (dep_insn))
24379 cost += 5;
24381 /* Show the ability of the reorder buffer to hide the latency of a load by
24382 executing it in parallel with the previous instruction, in case the
24383 previous instruction is not needed to compute the address. */
24384 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24385 && !ix86_agi_dependent (dep_insn, insn))
24388 /* Claim moves to take one cycle, as the core can issue one load
24389 at a time and the next load can start a cycle later. */
24389 if (dep_insn_type == TYPE_IMOV
24390 || dep_insn_type == TYPE_FMOV)
24391 cost = 1;
24392 else if (cost > 2)
24393 cost -= 2;
24394 else
24395 cost = 1;
24397 break;
24399 case PROCESSOR_ATHLON:
24400 case PROCESSOR_K8:
24401 case PROCESSOR_AMDFAM10:
24402 case PROCESSOR_BDVER1:
24403 case PROCESSOR_BDVER2:
24404 case PROCESSOR_BDVER3:
24405 case PROCESSOR_BTVER1:
24406 case PROCESSOR_BTVER2:
24407 case PROCESSOR_ATOM:
24408 case PROCESSOR_GENERIC32:
24409 case PROCESSOR_GENERIC64:
24410 memory = get_attr_memory (insn);
24412 /* Show the ability of the reorder buffer to hide the latency of a load by
24413 executing it in parallel with the previous instruction, in case the
24414 previous instruction is not needed to compute the address. */
24415 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24416 && !ix86_agi_dependent (dep_insn, insn))
24418 enum attr_unit unit = get_attr_unit (insn);
24419 int loadcost = 3;
24421 /* Because of the difference between the length of integer and
24422 floating unit pipeline preparation stages, the memory operands
24423 for floating point are cheaper.
24425 ??? For Athlon the difference is most probably 2. */
24426 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24427 loadcost = 3;
24428 else
24429 loadcost = TARGET_ATHLON ? 2 : 0;
24431 if (cost >= loadcost)
24432 cost -= loadcost;
24433 else
24434 cost = 0;
24437 default:
24438 break;
24441 return cost;
24444 /* How many alternative schedules to try. This should be as wide as the
24445 scheduling freedom in the DFA, but no wider. Making this value too
24446 large results in extra work for the scheduler. */
24448 static int
24449 ia32_multipass_dfa_lookahead (void)
24451 switch (ix86_tune)
24453 case PROCESSOR_PENTIUM:
24454 return 2;
24456 case PROCESSOR_PENTIUMPRO:
24457 case PROCESSOR_K6:
24458 return 1;
24460 case PROCESSOR_CORE2:
24461 case PROCESSOR_COREI7:
24462 case PROCESSOR_HASWELL:
24463 case PROCESSOR_ATOM:
24464 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24465 as the number of instructions that can be executed in a cycle, i.e.,
24466 issue_rate.  I wonder why tuning for many CPUs does not do this. */
24467 if (reload_completed)
24468 return ix86_issue_rate ();
24469 /* Don't use lookahead for pre-reload schedule to save compile time. */
24470 return 0;
24472 default:
24473 return 0;
24477 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24478 execution.  It is applied if
24479 (1) an IMUL instruction is at the top of the list and
24480 (2) there is exactly one producer of an independent IMUL instruction
24481 in the ready list;
24482 (3) in that case the producer found is put at the top of the ready list.
24483 Returns the issue rate. */
24485 static int
24486 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24487 int clock_var ATTRIBUTE_UNUSED)
24489 static int issue_rate = -1;
24490 int n_ready = *pn_ready;
24491 rtx insn, insn1, insn2;
24492 int i;
24493 sd_iterator_def sd_it;
24494 dep_t dep;
24495 int index = -1;
24497 /* Set up issue rate. */
24498 issue_rate = ix86_issue_rate();
24500 /* Do reordering for Atom only. */
24501 if (ix86_tune != PROCESSOR_ATOM)
24502 return issue_rate;
24503 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24504 if (!reload_completed)
24505 return issue_rate;
24506 /* Nothing to do if ready list contains only 1 instruction. */
24507 if (n_ready <= 1)
24508 return issue_rate;
24510 /* Check that IMUL instruction is on the top of ready list. */
24511 insn = ready[n_ready - 1];
24512 if (!NONDEBUG_INSN_P (insn))
24513 return issue_rate;
24514 insn = PATTERN (insn);
24515 if (GET_CODE (insn) == PARALLEL)
24516 insn = XVECEXP (insn, 0, 0);
24517 if (GET_CODE (insn) != SET)
24518 return issue_rate;
24519 if (!(GET_CODE (SET_SRC (insn)) == MULT
24520 && GET_MODE (SET_SRC (insn)) == SImode))
24521 return issue_rate;
24523 /* Search for producer of independent IMUL instruction. */
24524 for (i = n_ready - 2; i>= 0; i--)
24526 insn = ready[i];
24527 if (!NONDEBUG_INSN_P (insn))
24528 continue;
24529 /* Skip IMUL instruction. */
24530 insn2 = PATTERN (insn);
24531 if (GET_CODE (insn2) == PARALLEL)
24532 insn2 = XVECEXP (insn2, 0, 0);
24533 if (GET_CODE (insn2) == SET
24534 && GET_CODE (SET_SRC (insn2)) == MULT
24535 && GET_MODE (SET_SRC (insn2)) == SImode)
24536 continue;
24538 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24540 rtx con;
24541 con = DEP_CON (dep);
24542 if (!NONDEBUG_INSN_P (con))
24543 continue;
24544 insn1 = PATTERN (con);
24545 if (GET_CODE (insn1) == PARALLEL)
24546 insn1 = XVECEXP (insn1, 0, 0);
24548 if (GET_CODE (insn1) == SET
24549 && GET_CODE (SET_SRC (insn1)) == MULT
24550 && GET_MODE (SET_SRC (insn1)) == SImode)
24552 sd_iterator_def sd_it1;
24553 dep_t dep1;
24554 /* Check if there is no other dependee for IMUL. */
24555 index = i;
24556 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24558 rtx pro;
24559 pro = DEP_PRO (dep1);
24560 if (!NONDEBUG_INSN_P (pro))
24561 continue;
24562 if (pro != insn)
24563 index = -1;
24565 if (index >= 0)
24566 break;
24569 if (index >= 0)
24570 break;
24572 if (index < 0)
24573 return issue_rate; /* Didn't find IMUL producer. */
24575 if (sched_verbose > 1)
24576 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24577 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24579 /* Put IMUL producer (ready[index]) at the top of ready list. */
24580 insn1= ready[index];
24581 for (i = index; i < n_ready - 1; i++)
24582 ready[i] = ready[i + 1];
24583 ready[n_ready - 1] = insn1;
24585 return issue_rate;
24588 static bool
24589 ix86_class_likely_spilled_p (reg_class_t);
24591 /* Return true if the lhs of INSN is a HW function argument register; set
24592 IS_SPILLED to true if it is a likely-spilled HW register. */
24593 static bool
24594 insn_is_function_arg (rtx insn, bool* is_spilled)
24596 rtx dst;
24598 if (!NONDEBUG_INSN_P (insn))
24599 return false;
24600 /* Call instructions are not movable; ignore them. */
24601 if (CALL_P (insn))
24602 return false;
24603 insn = PATTERN (insn);
24604 if (GET_CODE (insn) == PARALLEL)
24605 insn = XVECEXP (insn, 0, 0);
24606 if (GET_CODE (insn) != SET)
24607 return false;
24608 dst = SET_DEST (insn);
24609 if (REG_P (dst) && HARD_REGISTER_P (dst)
24610 && ix86_function_arg_regno_p (REGNO (dst)))
24612 /* Is it likely spilled HW register? */
24613 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24614 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24615 *is_spilled = true;
24616 return true;
24618 return false;
24621 /* Add output dependencies for a chain of adjacent function arguments, but
24622 only if there is a move to a likely-spilled HW register.  Return the first
24623 argument if at least one dependence was added, or NULL otherwise. */
24624 static rtx
24625 add_parameter_dependencies (rtx call, rtx head)
24627 rtx insn;
24628 rtx last = call;
24629 rtx first_arg = NULL;
24630 bool is_spilled = false;
24632 head = PREV_INSN (head);
24634 /* Find the argument-passing instruction nearest to the call. */
24635 while (true)
24637 last = PREV_INSN (last);
24638 if (last == head)
24639 return NULL;
24640 if (!NONDEBUG_INSN_P (last))
24641 continue;
24642 if (insn_is_function_arg (last, &is_spilled))
24643 break;
24644 return NULL;
24647 first_arg = last;
24648 while (true)
24650 insn = PREV_INSN (last);
24651 if (!INSN_P (insn))
24652 break;
24653 if (insn == head)
24654 break;
24655 if (!NONDEBUG_INSN_P (insn))
24657 last = insn;
24658 continue;
24660 if (insn_is_function_arg (insn, &is_spilled))
24662 /* Add an output dependence between two function arguments if the chain
24663 of output arguments contains likely-spilled HW registers. */
24664 if (is_spilled)
24665 add_dependence (last, insn, REG_DEP_OUTPUT);
24666 first_arg = last = insn;
24668 else
24669 break;
24671 if (!is_spilled)
24672 return NULL;
24673 return first_arg;
24676 /* Add output or anti dependency from insn to first_arg to restrict its code
24677 motion. */
24678 static void
24679 avoid_func_arg_motion (rtx first_arg, rtx insn)
24681 rtx set;
24682 rtx tmp;
24684 set = single_set (insn);
24685 if (!set)
24686 return;
24687 tmp = SET_DEST (set);
24688 if (REG_P (tmp))
24690 /* Add output dependency to the first function argument. */
24691 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24692 return;
24694 /* Add anti dependency. */
24695 add_dependence (first_arg, insn, REG_DEP_ANTI);
24698 /* Avoid cross-block motion of a function argument by adding a dependency
24699 from the first non-jump instruction in bb. */
24700 static void
24701 add_dependee_for_func_arg (rtx arg, basic_block bb)
24703 rtx insn = BB_END (bb);
24705 while (insn)
24707 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24709 rtx set = single_set (insn);
24710 if (set)
24712 avoid_func_arg_motion (arg, insn);
24713 return;
24716 if (insn == BB_HEAD (bb))
24717 return;
24718 insn = PREV_INSN (insn);
24722 /* Hook for pre-reload schedule - avoid motion of function arguments
24723 passed in likely spilled HW registers. */
24724 static void
24725 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24727 rtx insn;
24728 rtx first_arg = NULL;
24729 if (reload_completed)
24730 return;
24731 while (head != tail && DEBUG_INSN_P (head))
24732 head = NEXT_INSN (head);
24733 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24734 if (INSN_P (insn) && CALL_P (insn))
24736 first_arg = add_parameter_dependencies (insn, head);
24737 if (first_arg)
24739 /* Add a dependee for the first argument to predecessors, but only if the
24740 region contains more than one block. */
24741 basic_block bb = BLOCK_FOR_INSN (insn);
24742 int rgn = CONTAINING_RGN (bb->index);
24743 int nr_blks = RGN_NR_BLOCKS (rgn);
24744 /* Skip trivial regions and region head blocks that can have
24745 predecessors outside of region. */
24746 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24748 edge e;
24749 edge_iterator ei;
24750 /* Assume that region is SCC, i.e. all immediate predecessors
24751 of non-head block are in the same region. */
24752 FOR_EACH_EDGE (e, ei, bb->preds)
24754 /* Avoid creating loop-carried dependencies by using the
24755 topological ordering of the region. */
24756 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24757 add_dependee_for_func_arg (first_arg, e->src);
24760 insn = first_arg;
24761 if (insn == head)
24762 break;
24765 else if (first_arg)
24766 avoid_func_arg_motion (first_arg, insn);
24769 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24770 HW registers to maximum, to schedule them as soon as possible.  These are
24771 moves from function argument registers at the top of the function entry
24772 and moves from function return value registers after call. */
24773 static int
24774 ix86_adjust_priority (rtx insn, int priority)
24776 rtx set;
24778 if (reload_completed)
24779 return priority;
24781 if (!NONDEBUG_INSN_P (insn))
24782 return priority;
24784 set = single_set (insn);
24785 if (set)
24787 rtx tmp = SET_SRC (set);
24788 if (REG_P (tmp)
24789 && HARD_REGISTER_P (tmp)
24790 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24791 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24792 return current_sched_info->sched_max_insns_priority;
24795 return priority;
24798 /* Model the decoder of Core 2/i7.
24799 The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
24800 track the instruction fetch block boundaries and make sure that long
24801 (9+ byte) instructions are assigned to D0. */
24803 /* Maximum length of an insn that can be handled by
24804 a secondary decoder unit. '8' for Core 2/i7. */
24805 static int core2i7_secondary_decoder_max_insn_size;
24807 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24808 '16' for Core 2/i7. */
24809 static int core2i7_ifetch_block_size;
24811 /* Maximum number of instructions decoder can handle per cycle.
24812 '6' for Core 2/i7. */
24813 static int core2i7_ifetch_block_max_insns;
24815 typedef struct ix86_first_cycle_multipass_data_ *
24816 ix86_first_cycle_multipass_data_t;
24817 typedef const struct ix86_first_cycle_multipass_data_ *
24818 const_ix86_first_cycle_multipass_data_t;
24820 /* A variable to store target state across calls to max_issue within
24821 one cycle. */
24822 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24823 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24825 /* Initialize DATA. */
24826 static void
24827 core2i7_first_cycle_multipass_init (void *_data)
24829 ix86_first_cycle_multipass_data_t data
24830 = (ix86_first_cycle_multipass_data_t) _data;
24832 data->ifetch_block_len = 0;
24833 data->ifetch_block_n_insns = 0;
24834 data->ready_try_change = NULL;
24835 data->ready_try_change_size = 0;
24838 /* Advancing the cycle; reset ifetch block counts. */
24839 static void
24840 core2i7_dfa_post_advance_cycle (void)
24842 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24844 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24846 data->ifetch_block_len = 0;
24847 data->ifetch_block_n_insns = 0;
24850 static int min_insn_size (rtx);
24852 /* Filter out insns from ready_try that the core will not be able to issue
24853 on the current cycle due to decoder restrictions. */
24854 static void
24855 core2i7_first_cycle_multipass_filter_ready_try
24856 (const_ix86_first_cycle_multipass_data_t data,
24857 char *ready_try, int n_ready, bool first_cycle_insn_p)
24859 while (n_ready--)
24861 rtx insn;
24862 int insn_size;
24864 if (ready_try[n_ready])
24865 continue;
24867 insn = get_ready_element (n_ready);
24868 insn_size = min_insn_size (insn);
24870 if (/* If this insn is too long for a secondary decoder ... */
24871 (!first_cycle_insn_p
24872 && insn_size > core2i7_secondary_decoder_max_insn_size)
24873 /* ... or it would not fit into the ifetch block ... */
24874 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24875 /* ... or the decoder is full already ... */
24876 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24877 /* ... mask the insn out. */
24879 ready_try[n_ready] = 1;
24881 if (data->ready_try_change)
24882 bitmap_set_bit (data->ready_try_change, n_ready);
24887 /* Prepare for a new round of multipass lookahead scheduling. */
24888 static void
24889 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24890 bool first_cycle_insn_p)
24892 ix86_first_cycle_multipass_data_t data
24893 = (ix86_first_cycle_multipass_data_t) _data;
24894 const_ix86_first_cycle_multipass_data_t prev_data
24895 = ix86_first_cycle_multipass_data;
24897 /* Restore the state from the end of the previous round. */
24898 data->ifetch_block_len = prev_data->ifetch_block_len;
24899 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24901 /* Filter instructions that cannot be issued on current cycle due to
24902 decoder restrictions. */
24903 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24904 first_cycle_insn_p);
24907 /* INSN is being issued in current solution. Account for its impact on
24908 the decoder model. */
24909 static void
24910 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24911 rtx insn, const void *_prev_data)
24913 ix86_first_cycle_multipass_data_t data
24914 = (ix86_first_cycle_multipass_data_t) _data;
24915 const_ix86_first_cycle_multipass_data_t prev_data
24916 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24918 int insn_size = min_insn_size (insn);
24920 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24921 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24922 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24923 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24925 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24926 if (!data->ready_try_change)
24928 data->ready_try_change = sbitmap_alloc (n_ready);
24929 data->ready_try_change_size = n_ready;
24931 else if (data->ready_try_change_size < n_ready)
24933 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24934 n_ready, 0);
24935 data->ready_try_change_size = n_ready;
24937 bitmap_clear (data->ready_try_change);
24939 /* Filter out insns from ready_try that the core will not be able to issue
24940 on the current cycle due to decoder restrictions. */
24941 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24942 false);
24945 /* Revert the effect on ready_try. */
24946 static void
24947 core2i7_first_cycle_multipass_backtrack (const void *_data,
24948 char *ready_try,
24949 int n_ready ATTRIBUTE_UNUSED)
24951 const_ix86_first_cycle_multipass_data_t data
24952 = (const_ix86_first_cycle_multipass_data_t) _data;
24953 unsigned int i = 0;
24954 sbitmap_iterator sbi;
24956 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24957 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24959 ready_try[i] = 0;
24963 /* Save the result of multipass lookahead scheduling for the next round. */
24964 static void
24965 core2i7_first_cycle_multipass_end (const void *_data)
24967 const_ix86_first_cycle_multipass_data_t data
24968 = (const_ix86_first_cycle_multipass_data_t) _data;
24969 ix86_first_cycle_multipass_data_t next_data
24970 = ix86_first_cycle_multipass_data;
24972 if (data != NULL)
24974 next_data->ifetch_block_len = data->ifetch_block_len;
24975 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24979 /* Deallocate target data. */
24980 static void
24981 core2i7_first_cycle_multipass_fini (void *_data)
24983 ix86_first_cycle_multipass_data_t data
24984 = (ix86_first_cycle_multipass_data_t) _data;
24986 if (data->ready_try_change)
24988 sbitmap_free (data->ready_try_change);
24989 data->ready_try_change = NULL;
24990 data->ready_try_change_size = 0;
24994 /* Prepare for scheduling pass. */
24995 static void
24996 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24997 int verbose ATTRIBUTE_UNUSED,
24998 int max_uid ATTRIBUTE_UNUSED)
25000 /* Install scheduling hooks for current CPU. Some of these hooks are used
25001 in time-critical parts of the scheduler, so we only set them up when
25002 they are actually used. */
25003 switch (ix86_tune)
25005 case PROCESSOR_CORE2:
25006 case PROCESSOR_COREI7:
25007 case PROCESSOR_HASWELL:
25008 /* Do not perform multipass scheduling for pre-reload schedule
25009 to save compile time. */
25010 if (reload_completed)
25012 targetm.sched.dfa_post_advance_cycle
25013 = core2i7_dfa_post_advance_cycle;
25014 targetm.sched.first_cycle_multipass_init
25015 = core2i7_first_cycle_multipass_init;
25016 targetm.sched.first_cycle_multipass_begin
25017 = core2i7_first_cycle_multipass_begin;
25018 targetm.sched.first_cycle_multipass_issue
25019 = core2i7_first_cycle_multipass_issue;
25020 targetm.sched.first_cycle_multipass_backtrack
25021 = core2i7_first_cycle_multipass_backtrack;
25022 targetm.sched.first_cycle_multipass_end
25023 = core2i7_first_cycle_multipass_end;
25024 targetm.sched.first_cycle_multipass_fini
25025 = core2i7_first_cycle_multipass_fini;
25027 /* Set decoder parameters. */
25028 core2i7_secondary_decoder_max_insn_size = 8;
25029 core2i7_ifetch_block_size = 16;
25030 core2i7_ifetch_block_max_insns = 6;
25031 break;
25033 /* ... Fall through ... */
25034 default:
25035 targetm.sched.dfa_post_advance_cycle = NULL;
25036 targetm.sched.first_cycle_multipass_init = NULL;
25037 targetm.sched.first_cycle_multipass_begin = NULL;
25038 targetm.sched.first_cycle_multipass_issue = NULL;
25039 targetm.sched.first_cycle_multipass_backtrack = NULL;
25040 targetm.sched.first_cycle_multipass_end = NULL;
25041 targetm.sched.first_cycle_multipass_fini = NULL;
25042 break;
25047 /* Compute the alignment given to a constant that is being placed in memory.
25048 EXP is the constant and ALIGN is the alignment that the object would
25049 ordinarily have.
25050 The value of this function is used instead of that alignment to align
25051 the object. */
25054 ix86_constant_alignment (tree exp, int align)
25056 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25057 || TREE_CODE (exp) == INTEGER_CST)
25059 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25060 return 64;
25061 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25062 return 128;
25064 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25065 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25066 return BITS_PER_WORD;
25068 return align;
25071 /* Compute the alignment for a static variable.
25072 TYPE is the data type, and ALIGN is the alignment that
25073 the object would ordinarily have. The value of this function is used
25074 instead of that alignment to align the object. */
25077 ix86_data_alignment (tree type, int align)
25079 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25081 if (AGGREGATE_TYPE_P (type)
25082 && TYPE_SIZE (type)
25083 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25084 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25085 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25086 && align < max_align)
25087 align = max_align;
25089 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
25090 to a 16-byte boundary. */
25091 if (TARGET_64BIT)
25093 if (AGGREGATE_TYPE_P (type)
25094 && TYPE_SIZE (type)
25095 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25096 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25097 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25098 return 128;
25101 if (TREE_CODE (type) == ARRAY_TYPE)
25103 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25104 return 64;
25105 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25106 return 128;
25108 else if (TREE_CODE (type) == COMPLEX_TYPE)
25111 if (TYPE_MODE (type) == DCmode && align < 64)
25112 return 64;
25113 if ((TYPE_MODE (type) == XCmode
25114 || TYPE_MODE (type) == TCmode) && align < 128)
25115 return 128;
25117 else if ((TREE_CODE (type) == RECORD_TYPE
25118 || TREE_CODE (type) == UNION_TYPE
25119 || TREE_CODE (type) == QUAL_UNION_TYPE)
25120 && TYPE_FIELDS (type))
25122 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25123 return 64;
25124 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25125 return 128;
25127 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25128 || TREE_CODE (type) == INTEGER_TYPE)
25130 if (TYPE_MODE (type) == DFmode && align < 64)
25131 return 64;
25132 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25133 return 128;
25136 return align;
25139 /* Compute the alignment for a local variable or a stack slot. EXP is
25140 the data type or decl itself, MODE is the widest mode available and
25141 ALIGN is the alignment that the object would ordinarily have. The
25142 value of this macro is used instead of that alignment to align the
25143 object. */
25145 unsigned int
25146 ix86_local_alignment (tree exp, enum machine_mode mode,
25147 unsigned int align)
25149 tree type, decl;
25151 if (exp && DECL_P (exp))
25153 type = TREE_TYPE (exp);
25154 decl = exp;
25156 else
25158 type = exp;
25159 decl = NULL;
25162 /* Don't do dynamic stack realignment for long long objects with
25163 -mpreferred-stack-boundary=2. */
25164 if (!TARGET_64BIT
25165 && align == 64
25166 && ix86_preferred_stack_boundary < 64
25167 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25168 && (!type || !TYPE_USER_ALIGN (type))
25169 && (!decl || !DECL_USER_ALIGN (decl)))
25170 align = 32;
25172 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25173 register in MODE. We will return the largest alignment of XF
25174 and DF. */
25175 if (!type)
25177 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25178 align = GET_MODE_ALIGNMENT (DFmode);
25179 return align;
25182 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
25183 to a 16-byte boundary. The exact wording is:
25185 An array uses the same alignment as its elements, except that a local or
25186 global array variable of length at least 16 bytes or
25187 a C99 variable-length array variable always has alignment of at least 16 bytes.
25189 This was added to allow use of aligned SSE instructions on arrays. The
25190 rule is meant for static storage (where the compiler cannot do the analysis
25191 by itself). We follow it for automatic variables only when convenient.
25192 We fully control everything in the function being compiled, and functions
25193 from other units cannot rely on the alignment.
25195 Exclude the va_list type. It is the common case of a local array where
25196 we cannot benefit from the alignment; see the worked sketch after this function. */
25197 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25198 && TARGET_SSE)
25200 if (AGGREGATE_TYPE_P (type)
25201 && (va_list_type_node == NULL_TREE
25202 || (TYPE_MAIN_VARIANT (type)
25203 != TYPE_MAIN_VARIANT (va_list_type_node)))
25204 && TYPE_SIZE (type)
25205 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25206 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25207 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25208 return 128;
25210 if (TREE_CODE (type) == ARRAY_TYPE)
25212 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25213 return 64;
25214 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25215 return 128;
25217 else if (TREE_CODE (type) == COMPLEX_TYPE)
25219 if (TYPE_MODE (type) == DCmode && align < 64)
25220 return 64;
25221 if ((TYPE_MODE (type) == XCmode
25222 || TYPE_MODE (type) == TCmode) && align < 128)
25223 return 128;
25225 else if ((TREE_CODE (type) == RECORD_TYPE
25226 || TREE_CODE (type) == UNION_TYPE
25227 || TREE_CODE (type) == QUAL_UNION_TYPE)
25228 && TYPE_FIELDS (type))
25230 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25231 return 64;
25232 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25233 return 128;
25235 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25236 || TREE_CODE (type) == INTEGER_TYPE)
25239 if (TYPE_MODE (type) == DFmode && align < 64)
25240 return 64;
25241 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25242 return 128;
25244 return align;
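/* A worked sketch of the rules above (hypothetical declarations, not part of
   GCC): with optimization and SSE enabled on x86-64, a local `double buf[4]'
   (32 bytes) falls under the 16-byte array rule and is given 128-bit
   alignment, a scalar `double' only needs the 64-bit DFmode alignment, and a
   local va_list is deliberately excluded from the bump.  */
#if 0
void
example_locals (void)
{
  double buf[4];   /* >= 16 bytes: aligned to 16 by the ABI rule above */
  double x;        /* DFmode scalar: 8-byte alignment is enough        */
  /* __builtin_va_list ap;  excluded from the 16-byte rule on purpose  */
}
#endif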
25247 /* Compute the minimum required alignment for dynamic stack realignment
25248 purposes for a local variable, parameter or a stack slot. EXP is
25249 the data type or decl itself, MODE is its mode and ALIGN is the
25250 alignment that the object would ordinarily have. */
25252 unsigned int
25253 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25254 unsigned int align)
25256 tree type, decl;
25258 if (exp && DECL_P (exp))
25260 type = TREE_TYPE (exp);
25261 decl = exp;
25263 else
25265 type = exp;
25266 decl = NULL;
25269 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25270 return align;
25272 /* Don't do dynamic stack realignment for long long objects with
25273 -mpreferred-stack-boundary=2. */
25274 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25275 && (!type || !TYPE_USER_ALIGN (type))
25276 && (!decl || !DECL_USER_ALIGN (decl)))
25277 return 32;
25279 return align;
25282 /* Find a location for the static chain incoming to a nested function.
25283 This is a register, unless all free registers are used by arguments. */
25285 static rtx
25286 ix86_static_chain (const_tree fndecl, bool incoming_p)
25288 unsigned regno;
25290 if (!DECL_STATIC_CHAIN (fndecl))
25291 return NULL;
25293 if (TARGET_64BIT)
25295 /* We always use R10 in 64-bit mode. */
25296 regno = R10_REG;
25298 else
25300 tree fntype;
25301 unsigned int ccvt;
25303 /* By default in 32-bit mode we use ECX to pass the static chain. */
25304 regno = CX_REG;
25306 fntype = TREE_TYPE (fndecl);
25307 ccvt = ix86_get_callcvt (fntype);
25308 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25310 /* Fastcall functions use ecx/edx for arguments, which leaves
25311 us with EAX for the static chain.
25312 Thiscall functions use ecx for arguments, which also
25313 leaves us with EAX for the static chain. */
25314 regno = AX_REG;
25316 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25318 /* Thiscall functions use ecx for arguments, which leaves
25319 us with EAX and EDX for the static chain.
25320 For ABI compatibility we use EAX. */
25321 regno = AX_REG;
25323 else if (ix86_function_regparm (fntype, fndecl) == 3)
25325 /* For regparm 3, we have no free call-clobbered registers in
25326 which to store the static chain. In order to implement this,
25327 we have the trampoline push the static chain to the stack.
25328 However, we can't push a value below the return address when
25329 we call the nested function directly, so we have to use an
25330 alternate entry point. For this we use ESI, and have the
25331 alternate entry point push ESI, so that things appear the
25332 same once we're executing the nested function. */
25333 if (incoming_p)
25335 if (fndecl == current_function_decl)
25336 ix86_static_chain_on_stack = true;
25337 return gen_frame_mem (SImode,
25338 plus_constant (Pmode,
25339 arg_pointer_rtx, -8));
25341 regno = SI_REG;
25345 return gen_rtx_REG (Pmode, regno);
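/* A usage sketch (hypothetical code, not taken from GCC itself): a GNU C
   nested function whose address escapes is reached through a trampoline,
   and the register chosen above (r10 in 64-bit mode, normally ecx in
   32-bit mode) carries the enclosing frame into it.  */
#if 0
static int
example_outer (int x)
{
  int inner (int y) { return x + y; }   /* needs the static chain for X */
  int (*fp) (int) = inner;              /* taking the address forces a trampoline */
  return fp (1);
}
#endif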
25348 /* Emit RTL insns to initialize the variable parts of a trampoline.
25349 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25350 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25351 to be passed to the target function. */
25353 static void
25354 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25356 rtx mem, fnaddr;
25357 int opcode;
25358 int offset = 0;
25360 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25362 if (TARGET_64BIT)
25364 int size;
25366 /* Load the function address into r11. Try to load the address using
25367 the shorter movl instead of movabs. We may want to support
25368 movq for kernel mode, but the kernel does not use trampolines at
25369 the moment. FNADDR is a 32-bit address and may not be in
25370 DImode when ptr_mode == SImode. Always use movl in this
25371 case. */
25372 if (ptr_mode == SImode
25373 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25375 fnaddr = copy_addr_to_reg (fnaddr);
25377 mem = adjust_address (m_tramp, HImode, offset);
25378 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25380 mem = adjust_address (m_tramp, SImode, offset + 2);
25381 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25382 offset += 6;
25384 else
25386 mem = adjust_address (m_tramp, HImode, offset);
25387 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25389 mem = adjust_address (m_tramp, DImode, offset + 2);
25390 emit_move_insn (mem, fnaddr);
25391 offset += 10;
25394 /* Load static chain using movabs to r10. Use the shorter movl
25395 instead of movabs when ptr_mode == SImode. */
25396 if (ptr_mode == SImode)
25398 opcode = 0xba41;
25399 size = 6;
25401 else
25403 opcode = 0xba49;
25404 size = 10;
25407 mem = adjust_address (m_tramp, HImode, offset);
25408 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25410 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25411 emit_move_insn (mem, chain_value);
25412 offset += size;
25414 /* Jump to r11; the last (unused) byte is a nop, only there to
25415 pad the write out to a single 32-bit store. */
25416 mem = adjust_address (m_tramp, SImode, offset);
25417 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25418 offset += 4;
25420 else
25422 rtx disp, chain;
25424 /* Depending on the static chain location, either load a register
25425 with a constant, or push the constant to the stack. All of the
25426 instructions are the same size. */
25427 chain = ix86_static_chain (fndecl, true);
25428 if (REG_P (chain))
25430 switch (REGNO (chain))
25432 case AX_REG:
25433 opcode = 0xb8; break;
25434 case CX_REG:
25435 opcode = 0xb9; break;
25436 default:
25437 gcc_unreachable ();
25440 else
25441 opcode = 0x68;
25443 mem = adjust_address (m_tramp, QImode, offset);
25444 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25446 mem = adjust_address (m_tramp, SImode, offset + 1);
25447 emit_move_insn (mem, chain_value);
25448 offset += 5;
25450 mem = adjust_address (m_tramp, QImode, offset);
25451 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25453 mem = adjust_address (m_tramp, SImode, offset + 1);
25455 /* Compute offset from the end of the jmp to the target function.
25456 In the case in which the trampoline stores the static chain on
25457 the stack, we need to skip the first insn which pushes the
25458 (call-saved) register static chain; this push is 1 byte. */
25459 offset += 5;
25460 disp = expand_binop (SImode, sub_optab, fnaddr,
25461 plus_constant (Pmode, XEXP (m_tramp, 0),
25462 offset - (MEM_P (chain) ? 1 : 0)),
25463 NULL_RTX, 1, OPTAB_DIRECT);
25464 emit_move_insn (mem, disp);
25467 gcc_assert (offset <= TRAMPOLINE_SIZE);
25469 #ifdef HAVE_ENABLE_EXECUTE_STACK
25470 #ifdef CHECK_EXECUTE_STACK_ENABLED
25471 if (CHECK_EXECUTE_STACK_ENABLED)
25472 #endif
25473 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25474 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25475 #endif
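/* An illustrative byte layout (hypothetical operand values, not emitted by
   this file as data) of the 16-byte 64-bit trampoline built above when both
   the target address and the static chain fit in 32 bits; with full 64-bit
   values the two moves become the 10-byte movabs forms (opcodes 49 bb and
   49 ba) instead.  */
#if 0
static const unsigned char example_tramp[] = {
  0x41, 0xbb, 0x00, 0x10, 0x40, 0x00,   /* movl $0x00401000, %r11d  (fnaddr) */
  0x41, 0xba, 0x00, 0x20, 0x40, 0x00,   /* movl $0x00402000, %r10d  (chain)  */
  0x49, 0xff, 0xe3,                     /* jmp  *%r11                        */
  0x90                                  /* nop, pads the final 32-bit store  */
};
#endif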
25478 /* The following file contains several enumerations and data structures
25479 built from the definitions in i386-builtin-types.def. */
25481 #include "i386-builtin-types.inc"
25483 /* Table for the ix86 builtin non-function types. */
25484 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25486 /* Retrieve an element from the above table, building some of
25487 the types lazily. */
25489 static tree
25490 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25492 unsigned int index;
25493 tree type, itype;
25495 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25497 type = ix86_builtin_type_tab[(int) tcode];
25498 if (type != NULL)
25499 return type;
25501 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25502 if (tcode <= IX86_BT_LAST_VECT)
25504 enum machine_mode mode;
25506 index = tcode - IX86_BT_LAST_PRIM - 1;
25507 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25508 mode = ix86_builtin_type_vect_mode[index];
25510 type = build_vector_type_for_mode (itype, mode);
25512 else
25514 int quals;
25516 index = tcode - IX86_BT_LAST_VECT - 1;
25517 if (tcode <= IX86_BT_LAST_PTR)
25518 quals = TYPE_UNQUALIFIED;
25519 else
25520 quals = TYPE_QUAL_CONST;
25522 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25523 if (quals != TYPE_UNQUALIFIED)
25524 itype = build_qualified_type (itype, quals);
25526 type = build_pointer_type (itype);
25529 ix86_builtin_type_tab[(int) tcode] = type;
25530 return type;
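/* Usage sketch (hypothetical caller): the first request for a given type code
   builds the tree from the tables generated out of i386-builtin-types.def and
   caches it, so e.g. a call like `ix86_get_builtin_type (IX86_BT_V4SF)' would
   construct the V4SF vector type from its FLOAT element type once and reuse
   it afterwards.  */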
25533 /* Table for the ix86 builtin function types. */
25534 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25536 /* Retrieve an element from the above table, building some of
25537 the types lazily. */
25539 static tree
25540 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25542 tree type;
25544 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25546 type = ix86_builtin_func_type_tab[(int) tcode];
25547 if (type != NULL)
25548 return type;
25550 if (tcode <= IX86_BT_LAST_FUNC)
25552 unsigned start = ix86_builtin_func_start[(int) tcode];
25553 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25554 tree rtype, atype, args = void_list_node;
25555 unsigned i;
25557 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25558 for (i = after - 1; i > start; --i)
25560 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25561 args = tree_cons (NULL, atype, args);
25564 type = build_function_type (rtype, args);
25566 else
25568 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25569 enum ix86_builtin_func_type icode;
25571 icode = ix86_builtin_func_alias_base[index];
25572 type = ix86_get_builtin_func_type (icode);
25575 ix86_builtin_func_type_tab[(int) tcode] = type;
25576 return type;
25580 /* Codes for all the SSE/MMX builtins. */
25581 enum ix86_builtins
25583 IX86_BUILTIN_ADDPS,
25584 IX86_BUILTIN_ADDSS,
25585 IX86_BUILTIN_DIVPS,
25586 IX86_BUILTIN_DIVSS,
25587 IX86_BUILTIN_MULPS,
25588 IX86_BUILTIN_MULSS,
25589 IX86_BUILTIN_SUBPS,
25590 IX86_BUILTIN_SUBSS,
25592 IX86_BUILTIN_CMPEQPS,
25593 IX86_BUILTIN_CMPLTPS,
25594 IX86_BUILTIN_CMPLEPS,
25595 IX86_BUILTIN_CMPGTPS,
25596 IX86_BUILTIN_CMPGEPS,
25597 IX86_BUILTIN_CMPNEQPS,
25598 IX86_BUILTIN_CMPNLTPS,
25599 IX86_BUILTIN_CMPNLEPS,
25600 IX86_BUILTIN_CMPNGTPS,
25601 IX86_BUILTIN_CMPNGEPS,
25602 IX86_BUILTIN_CMPORDPS,
25603 IX86_BUILTIN_CMPUNORDPS,
25604 IX86_BUILTIN_CMPEQSS,
25605 IX86_BUILTIN_CMPLTSS,
25606 IX86_BUILTIN_CMPLESS,
25607 IX86_BUILTIN_CMPNEQSS,
25608 IX86_BUILTIN_CMPNLTSS,
25609 IX86_BUILTIN_CMPNLESS,
25610 IX86_BUILTIN_CMPNGTSS,
25611 IX86_BUILTIN_CMPNGESS,
25612 IX86_BUILTIN_CMPORDSS,
25613 IX86_BUILTIN_CMPUNORDSS,
25615 IX86_BUILTIN_COMIEQSS,
25616 IX86_BUILTIN_COMILTSS,
25617 IX86_BUILTIN_COMILESS,
25618 IX86_BUILTIN_COMIGTSS,
25619 IX86_BUILTIN_COMIGESS,
25620 IX86_BUILTIN_COMINEQSS,
25621 IX86_BUILTIN_UCOMIEQSS,
25622 IX86_BUILTIN_UCOMILTSS,
25623 IX86_BUILTIN_UCOMILESS,
25624 IX86_BUILTIN_UCOMIGTSS,
25625 IX86_BUILTIN_UCOMIGESS,
25626 IX86_BUILTIN_UCOMINEQSS,
25628 IX86_BUILTIN_CVTPI2PS,
25629 IX86_BUILTIN_CVTPS2PI,
25630 IX86_BUILTIN_CVTSI2SS,
25631 IX86_BUILTIN_CVTSI642SS,
25632 IX86_BUILTIN_CVTSS2SI,
25633 IX86_BUILTIN_CVTSS2SI64,
25634 IX86_BUILTIN_CVTTPS2PI,
25635 IX86_BUILTIN_CVTTSS2SI,
25636 IX86_BUILTIN_CVTTSS2SI64,
25638 IX86_BUILTIN_MAXPS,
25639 IX86_BUILTIN_MAXSS,
25640 IX86_BUILTIN_MINPS,
25641 IX86_BUILTIN_MINSS,
25643 IX86_BUILTIN_LOADUPS,
25644 IX86_BUILTIN_STOREUPS,
25645 IX86_BUILTIN_MOVSS,
25647 IX86_BUILTIN_MOVHLPS,
25648 IX86_BUILTIN_MOVLHPS,
25649 IX86_BUILTIN_LOADHPS,
25650 IX86_BUILTIN_LOADLPS,
25651 IX86_BUILTIN_STOREHPS,
25652 IX86_BUILTIN_STORELPS,
25654 IX86_BUILTIN_MASKMOVQ,
25655 IX86_BUILTIN_MOVMSKPS,
25656 IX86_BUILTIN_PMOVMSKB,
25658 IX86_BUILTIN_MOVNTPS,
25659 IX86_BUILTIN_MOVNTQ,
25661 IX86_BUILTIN_LOADDQU,
25662 IX86_BUILTIN_STOREDQU,
25664 IX86_BUILTIN_PACKSSWB,
25665 IX86_BUILTIN_PACKSSDW,
25666 IX86_BUILTIN_PACKUSWB,
25668 IX86_BUILTIN_PADDB,
25669 IX86_BUILTIN_PADDW,
25670 IX86_BUILTIN_PADDD,
25671 IX86_BUILTIN_PADDQ,
25672 IX86_BUILTIN_PADDSB,
25673 IX86_BUILTIN_PADDSW,
25674 IX86_BUILTIN_PADDUSB,
25675 IX86_BUILTIN_PADDUSW,
25676 IX86_BUILTIN_PSUBB,
25677 IX86_BUILTIN_PSUBW,
25678 IX86_BUILTIN_PSUBD,
25679 IX86_BUILTIN_PSUBQ,
25680 IX86_BUILTIN_PSUBSB,
25681 IX86_BUILTIN_PSUBSW,
25682 IX86_BUILTIN_PSUBUSB,
25683 IX86_BUILTIN_PSUBUSW,
25685 IX86_BUILTIN_PAND,
25686 IX86_BUILTIN_PANDN,
25687 IX86_BUILTIN_POR,
25688 IX86_BUILTIN_PXOR,
25690 IX86_BUILTIN_PAVGB,
25691 IX86_BUILTIN_PAVGW,
25693 IX86_BUILTIN_PCMPEQB,
25694 IX86_BUILTIN_PCMPEQW,
25695 IX86_BUILTIN_PCMPEQD,
25696 IX86_BUILTIN_PCMPGTB,
25697 IX86_BUILTIN_PCMPGTW,
25698 IX86_BUILTIN_PCMPGTD,
25700 IX86_BUILTIN_PMADDWD,
25702 IX86_BUILTIN_PMAXSW,
25703 IX86_BUILTIN_PMAXUB,
25704 IX86_BUILTIN_PMINSW,
25705 IX86_BUILTIN_PMINUB,
25707 IX86_BUILTIN_PMULHUW,
25708 IX86_BUILTIN_PMULHW,
25709 IX86_BUILTIN_PMULLW,
25711 IX86_BUILTIN_PSADBW,
25712 IX86_BUILTIN_PSHUFW,
25714 IX86_BUILTIN_PSLLW,
25715 IX86_BUILTIN_PSLLD,
25716 IX86_BUILTIN_PSLLQ,
25717 IX86_BUILTIN_PSRAW,
25718 IX86_BUILTIN_PSRAD,
25719 IX86_BUILTIN_PSRLW,
25720 IX86_BUILTIN_PSRLD,
25721 IX86_BUILTIN_PSRLQ,
25722 IX86_BUILTIN_PSLLWI,
25723 IX86_BUILTIN_PSLLDI,
25724 IX86_BUILTIN_PSLLQI,
25725 IX86_BUILTIN_PSRAWI,
25726 IX86_BUILTIN_PSRADI,
25727 IX86_BUILTIN_PSRLWI,
25728 IX86_BUILTIN_PSRLDI,
25729 IX86_BUILTIN_PSRLQI,
25731 IX86_BUILTIN_PUNPCKHBW,
25732 IX86_BUILTIN_PUNPCKHWD,
25733 IX86_BUILTIN_PUNPCKHDQ,
25734 IX86_BUILTIN_PUNPCKLBW,
25735 IX86_BUILTIN_PUNPCKLWD,
25736 IX86_BUILTIN_PUNPCKLDQ,
25738 IX86_BUILTIN_SHUFPS,
25740 IX86_BUILTIN_RCPPS,
25741 IX86_BUILTIN_RCPSS,
25742 IX86_BUILTIN_RSQRTPS,
25743 IX86_BUILTIN_RSQRTPS_NR,
25744 IX86_BUILTIN_RSQRTSS,
25745 IX86_BUILTIN_RSQRTF,
25746 IX86_BUILTIN_SQRTPS,
25747 IX86_BUILTIN_SQRTPS_NR,
25748 IX86_BUILTIN_SQRTSS,
25750 IX86_BUILTIN_UNPCKHPS,
25751 IX86_BUILTIN_UNPCKLPS,
25753 IX86_BUILTIN_ANDPS,
25754 IX86_BUILTIN_ANDNPS,
25755 IX86_BUILTIN_ORPS,
25756 IX86_BUILTIN_XORPS,
25758 IX86_BUILTIN_EMMS,
25759 IX86_BUILTIN_LDMXCSR,
25760 IX86_BUILTIN_STMXCSR,
25761 IX86_BUILTIN_SFENCE,
25763 IX86_BUILTIN_FXSAVE,
25764 IX86_BUILTIN_FXRSTOR,
25765 IX86_BUILTIN_FXSAVE64,
25766 IX86_BUILTIN_FXRSTOR64,
25768 IX86_BUILTIN_XSAVE,
25769 IX86_BUILTIN_XRSTOR,
25770 IX86_BUILTIN_XSAVE64,
25771 IX86_BUILTIN_XRSTOR64,
25773 IX86_BUILTIN_XSAVEOPT,
25774 IX86_BUILTIN_XSAVEOPT64,
25776 /* 3DNow! Original */
25777 IX86_BUILTIN_FEMMS,
25778 IX86_BUILTIN_PAVGUSB,
25779 IX86_BUILTIN_PF2ID,
25780 IX86_BUILTIN_PFACC,
25781 IX86_BUILTIN_PFADD,
25782 IX86_BUILTIN_PFCMPEQ,
25783 IX86_BUILTIN_PFCMPGE,
25784 IX86_BUILTIN_PFCMPGT,
25785 IX86_BUILTIN_PFMAX,
25786 IX86_BUILTIN_PFMIN,
25787 IX86_BUILTIN_PFMUL,
25788 IX86_BUILTIN_PFRCP,
25789 IX86_BUILTIN_PFRCPIT1,
25790 IX86_BUILTIN_PFRCPIT2,
25791 IX86_BUILTIN_PFRSQIT1,
25792 IX86_BUILTIN_PFRSQRT,
25793 IX86_BUILTIN_PFSUB,
25794 IX86_BUILTIN_PFSUBR,
25795 IX86_BUILTIN_PI2FD,
25796 IX86_BUILTIN_PMULHRW,
25798 /* 3DNow! Athlon Extensions */
25799 IX86_BUILTIN_PF2IW,
25800 IX86_BUILTIN_PFNACC,
25801 IX86_BUILTIN_PFPNACC,
25802 IX86_BUILTIN_PI2FW,
25803 IX86_BUILTIN_PSWAPDSI,
25804 IX86_BUILTIN_PSWAPDSF,
25806 /* SSE2 */
25807 IX86_BUILTIN_ADDPD,
25808 IX86_BUILTIN_ADDSD,
25809 IX86_BUILTIN_DIVPD,
25810 IX86_BUILTIN_DIVSD,
25811 IX86_BUILTIN_MULPD,
25812 IX86_BUILTIN_MULSD,
25813 IX86_BUILTIN_SUBPD,
25814 IX86_BUILTIN_SUBSD,
25816 IX86_BUILTIN_CMPEQPD,
25817 IX86_BUILTIN_CMPLTPD,
25818 IX86_BUILTIN_CMPLEPD,
25819 IX86_BUILTIN_CMPGTPD,
25820 IX86_BUILTIN_CMPGEPD,
25821 IX86_BUILTIN_CMPNEQPD,
25822 IX86_BUILTIN_CMPNLTPD,
25823 IX86_BUILTIN_CMPNLEPD,
25824 IX86_BUILTIN_CMPNGTPD,
25825 IX86_BUILTIN_CMPNGEPD,
25826 IX86_BUILTIN_CMPORDPD,
25827 IX86_BUILTIN_CMPUNORDPD,
25828 IX86_BUILTIN_CMPEQSD,
25829 IX86_BUILTIN_CMPLTSD,
25830 IX86_BUILTIN_CMPLESD,
25831 IX86_BUILTIN_CMPNEQSD,
25832 IX86_BUILTIN_CMPNLTSD,
25833 IX86_BUILTIN_CMPNLESD,
25834 IX86_BUILTIN_CMPORDSD,
25835 IX86_BUILTIN_CMPUNORDSD,
25837 IX86_BUILTIN_COMIEQSD,
25838 IX86_BUILTIN_COMILTSD,
25839 IX86_BUILTIN_COMILESD,
25840 IX86_BUILTIN_COMIGTSD,
25841 IX86_BUILTIN_COMIGESD,
25842 IX86_BUILTIN_COMINEQSD,
25843 IX86_BUILTIN_UCOMIEQSD,
25844 IX86_BUILTIN_UCOMILTSD,
25845 IX86_BUILTIN_UCOMILESD,
25846 IX86_BUILTIN_UCOMIGTSD,
25847 IX86_BUILTIN_UCOMIGESD,
25848 IX86_BUILTIN_UCOMINEQSD,
25850 IX86_BUILTIN_MAXPD,
25851 IX86_BUILTIN_MAXSD,
25852 IX86_BUILTIN_MINPD,
25853 IX86_BUILTIN_MINSD,
25855 IX86_BUILTIN_ANDPD,
25856 IX86_BUILTIN_ANDNPD,
25857 IX86_BUILTIN_ORPD,
25858 IX86_BUILTIN_XORPD,
25860 IX86_BUILTIN_SQRTPD,
25861 IX86_BUILTIN_SQRTSD,
25863 IX86_BUILTIN_UNPCKHPD,
25864 IX86_BUILTIN_UNPCKLPD,
25866 IX86_BUILTIN_SHUFPD,
25868 IX86_BUILTIN_LOADUPD,
25869 IX86_BUILTIN_STOREUPD,
25870 IX86_BUILTIN_MOVSD,
25872 IX86_BUILTIN_LOADHPD,
25873 IX86_BUILTIN_LOADLPD,
25875 IX86_BUILTIN_CVTDQ2PD,
25876 IX86_BUILTIN_CVTDQ2PS,
25878 IX86_BUILTIN_CVTPD2DQ,
25879 IX86_BUILTIN_CVTPD2PI,
25880 IX86_BUILTIN_CVTPD2PS,
25881 IX86_BUILTIN_CVTTPD2DQ,
25882 IX86_BUILTIN_CVTTPD2PI,
25884 IX86_BUILTIN_CVTPI2PD,
25885 IX86_BUILTIN_CVTSI2SD,
25886 IX86_BUILTIN_CVTSI642SD,
25888 IX86_BUILTIN_CVTSD2SI,
25889 IX86_BUILTIN_CVTSD2SI64,
25890 IX86_BUILTIN_CVTSD2SS,
25891 IX86_BUILTIN_CVTSS2SD,
25892 IX86_BUILTIN_CVTTSD2SI,
25893 IX86_BUILTIN_CVTTSD2SI64,
25895 IX86_BUILTIN_CVTPS2DQ,
25896 IX86_BUILTIN_CVTPS2PD,
25897 IX86_BUILTIN_CVTTPS2DQ,
25899 IX86_BUILTIN_MOVNTI,
25900 IX86_BUILTIN_MOVNTI64,
25901 IX86_BUILTIN_MOVNTPD,
25902 IX86_BUILTIN_MOVNTDQ,
25904 IX86_BUILTIN_MOVQ128,
25906 /* SSE2 MMX */
25907 IX86_BUILTIN_MASKMOVDQU,
25908 IX86_BUILTIN_MOVMSKPD,
25909 IX86_BUILTIN_PMOVMSKB128,
25911 IX86_BUILTIN_PACKSSWB128,
25912 IX86_BUILTIN_PACKSSDW128,
25913 IX86_BUILTIN_PACKUSWB128,
25915 IX86_BUILTIN_PADDB128,
25916 IX86_BUILTIN_PADDW128,
25917 IX86_BUILTIN_PADDD128,
25918 IX86_BUILTIN_PADDQ128,
25919 IX86_BUILTIN_PADDSB128,
25920 IX86_BUILTIN_PADDSW128,
25921 IX86_BUILTIN_PADDUSB128,
25922 IX86_BUILTIN_PADDUSW128,
25923 IX86_BUILTIN_PSUBB128,
25924 IX86_BUILTIN_PSUBW128,
25925 IX86_BUILTIN_PSUBD128,
25926 IX86_BUILTIN_PSUBQ128,
25927 IX86_BUILTIN_PSUBSB128,
25928 IX86_BUILTIN_PSUBSW128,
25929 IX86_BUILTIN_PSUBUSB128,
25930 IX86_BUILTIN_PSUBUSW128,
25932 IX86_BUILTIN_PAND128,
25933 IX86_BUILTIN_PANDN128,
25934 IX86_BUILTIN_POR128,
25935 IX86_BUILTIN_PXOR128,
25937 IX86_BUILTIN_PAVGB128,
25938 IX86_BUILTIN_PAVGW128,
25940 IX86_BUILTIN_PCMPEQB128,
25941 IX86_BUILTIN_PCMPEQW128,
25942 IX86_BUILTIN_PCMPEQD128,
25943 IX86_BUILTIN_PCMPGTB128,
25944 IX86_BUILTIN_PCMPGTW128,
25945 IX86_BUILTIN_PCMPGTD128,
25947 IX86_BUILTIN_PMADDWD128,
25949 IX86_BUILTIN_PMAXSW128,
25950 IX86_BUILTIN_PMAXUB128,
25951 IX86_BUILTIN_PMINSW128,
25952 IX86_BUILTIN_PMINUB128,
25954 IX86_BUILTIN_PMULUDQ,
25955 IX86_BUILTIN_PMULUDQ128,
25956 IX86_BUILTIN_PMULHUW128,
25957 IX86_BUILTIN_PMULHW128,
25958 IX86_BUILTIN_PMULLW128,
25960 IX86_BUILTIN_PSADBW128,
25961 IX86_BUILTIN_PSHUFHW,
25962 IX86_BUILTIN_PSHUFLW,
25963 IX86_BUILTIN_PSHUFD,
25965 IX86_BUILTIN_PSLLDQI128,
25966 IX86_BUILTIN_PSLLWI128,
25967 IX86_BUILTIN_PSLLDI128,
25968 IX86_BUILTIN_PSLLQI128,
25969 IX86_BUILTIN_PSRAWI128,
25970 IX86_BUILTIN_PSRADI128,
25971 IX86_BUILTIN_PSRLDQI128,
25972 IX86_BUILTIN_PSRLWI128,
25973 IX86_BUILTIN_PSRLDI128,
25974 IX86_BUILTIN_PSRLQI128,
25976 IX86_BUILTIN_PSLLDQ128,
25977 IX86_BUILTIN_PSLLW128,
25978 IX86_BUILTIN_PSLLD128,
25979 IX86_BUILTIN_PSLLQ128,
25980 IX86_BUILTIN_PSRAW128,
25981 IX86_BUILTIN_PSRAD128,
25982 IX86_BUILTIN_PSRLW128,
25983 IX86_BUILTIN_PSRLD128,
25984 IX86_BUILTIN_PSRLQ128,
25986 IX86_BUILTIN_PUNPCKHBW128,
25987 IX86_BUILTIN_PUNPCKHWD128,
25988 IX86_BUILTIN_PUNPCKHDQ128,
25989 IX86_BUILTIN_PUNPCKHQDQ128,
25990 IX86_BUILTIN_PUNPCKLBW128,
25991 IX86_BUILTIN_PUNPCKLWD128,
25992 IX86_BUILTIN_PUNPCKLDQ128,
25993 IX86_BUILTIN_PUNPCKLQDQ128,
25995 IX86_BUILTIN_CLFLUSH,
25996 IX86_BUILTIN_MFENCE,
25997 IX86_BUILTIN_LFENCE,
25998 IX86_BUILTIN_PAUSE,
26000 IX86_BUILTIN_BSRSI,
26001 IX86_BUILTIN_BSRDI,
26002 IX86_BUILTIN_RDPMC,
26003 IX86_BUILTIN_RDTSC,
26004 IX86_BUILTIN_RDTSCP,
26005 IX86_BUILTIN_ROLQI,
26006 IX86_BUILTIN_ROLHI,
26007 IX86_BUILTIN_RORQI,
26008 IX86_BUILTIN_RORHI,
26010 /* SSE3. */
26011 IX86_BUILTIN_ADDSUBPS,
26012 IX86_BUILTIN_HADDPS,
26013 IX86_BUILTIN_HSUBPS,
26014 IX86_BUILTIN_MOVSHDUP,
26015 IX86_BUILTIN_MOVSLDUP,
26016 IX86_BUILTIN_ADDSUBPD,
26017 IX86_BUILTIN_HADDPD,
26018 IX86_BUILTIN_HSUBPD,
26019 IX86_BUILTIN_LDDQU,
26021 IX86_BUILTIN_MONITOR,
26022 IX86_BUILTIN_MWAIT,
26024 /* SSSE3. */
26025 IX86_BUILTIN_PHADDW,
26026 IX86_BUILTIN_PHADDD,
26027 IX86_BUILTIN_PHADDSW,
26028 IX86_BUILTIN_PHSUBW,
26029 IX86_BUILTIN_PHSUBD,
26030 IX86_BUILTIN_PHSUBSW,
26031 IX86_BUILTIN_PMADDUBSW,
26032 IX86_BUILTIN_PMULHRSW,
26033 IX86_BUILTIN_PSHUFB,
26034 IX86_BUILTIN_PSIGNB,
26035 IX86_BUILTIN_PSIGNW,
26036 IX86_BUILTIN_PSIGND,
26037 IX86_BUILTIN_PALIGNR,
26038 IX86_BUILTIN_PABSB,
26039 IX86_BUILTIN_PABSW,
26040 IX86_BUILTIN_PABSD,
26042 IX86_BUILTIN_PHADDW128,
26043 IX86_BUILTIN_PHADDD128,
26044 IX86_BUILTIN_PHADDSW128,
26045 IX86_BUILTIN_PHSUBW128,
26046 IX86_BUILTIN_PHSUBD128,
26047 IX86_BUILTIN_PHSUBSW128,
26048 IX86_BUILTIN_PMADDUBSW128,
26049 IX86_BUILTIN_PMULHRSW128,
26050 IX86_BUILTIN_PSHUFB128,
26051 IX86_BUILTIN_PSIGNB128,
26052 IX86_BUILTIN_PSIGNW128,
26053 IX86_BUILTIN_PSIGND128,
26054 IX86_BUILTIN_PALIGNR128,
26055 IX86_BUILTIN_PABSB128,
26056 IX86_BUILTIN_PABSW128,
26057 IX86_BUILTIN_PABSD128,
26059 /* AMDFAM10 - SSE4A New Instructions. */
26060 IX86_BUILTIN_MOVNTSD,
26061 IX86_BUILTIN_MOVNTSS,
26062 IX86_BUILTIN_EXTRQI,
26063 IX86_BUILTIN_EXTRQ,
26064 IX86_BUILTIN_INSERTQI,
26065 IX86_BUILTIN_INSERTQ,
26067 /* SSE4.1. */
26068 IX86_BUILTIN_BLENDPD,
26069 IX86_BUILTIN_BLENDPS,
26070 IX86_BUILTIN_BLENDVPD,
26071 IX86_BUILTIN_BLENDVPS,
26072 IX86_BUILTIN_PBLENDVB128,
26073 IX86_BUILTIN_PBLENDW128,
26075 IX86_BUILTIN_DPPD,
26076 IX86_BUILTIN_DPPS,
26078 IX86_BUILTIN_INSERTPS128,
26080 IX86_BUILTIN_MOVNTDQA,
26081 IX86_BUILTIN_MPSADBW128,
26082 IX86_BUILTIN_PACKUSDW128,
26083 IX86_BUILTIN_PCMPEQQ,
26084 IX86_BUILTIN_PHMINPOSUW128,
26086 IX86_BUILTIN_PMAXSB128,
26087 IX86_BUILTIN_PMAXSD128,
26088 IX86_BUILTIN_PMAXUD128,
26089 IX86_BUILTIN_PMAXUW128,
26091 IX86_BUILTIN_PMINSB128,
26092 IX86_BUILTIN_PMINSD128,
26093 IX86_BUILTIN_PMINUD128,
26094 IX86_BUILTIN_PMINUW128,
26096 IX86_BUILTIN_PMOVSXBW128,
26097 IX86_BUILTIN_PMOVSXBD128,
26098 IX86_BUILTIN_PMOVSXBQ128,
26099 IX86_BUILTIN_PMOVSXWD128,
26100 IX86_BUILTIN_PMOVSXWQ128,
26101 IX86_BUILTIN_PMOVSXDQ128,
26103 IX86_BUILTIN_PMOVZXBW128,
26104 IX86_BUILTIN_PMOVZXBD128,
26105 IX86_BUILTIN_PMOVZXBQ128,
26106 IX86_BUILTIN_PMOVZXWD128,
26107 IX86_BUILTIN_PMOVZXWQ128,
26108 IX86_BUILTIN_PMOVZXDQ128,
26110 IX86_BUILTIN_PMULDQ128,
26111 IX86_BUILTIN_PMULLD128,
26113 IX86_BUILTIN_ROUNDSD,
26114 IX86_BUILTIN_ROUNDSS,
26116 IX86_BUILTIN_ROUNDPD,
26117 IX86_BUILTIN_ROUNDPS,
26119 IX86_BUILTIN_FLOORPD,
26120 IX86_BUILTIN_CEILPD,
26121 IX86_BUILTIN_TRUNCPD,
26122 IX86_BUILTIN_RINTPD,
26123 IX86_BUILTIN_ROUNDPD_AZ,
26125 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26126 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26127 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26129 IX86_BUILTIN_FLOORPS,
26130 IX86_BUILTIN_CEILPS,
26131 IX86_BUILTIN_TRUNCPS,
26132 IX86_BUILTIN_RINTPS,
26133 IX86_BUILTIN_ROUNDPS_AZ,
26135 IX86_BUILTIN_FLOORPS_SFIX,
26136 IX86_BUILTIN_CEILPS_SFIX,
26137 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26139 IX86_BUILTIN_PTESTZ,
26140 IX86_BUILTIN_PTESTC,
26141 IX86_BUILTIN_PTESTNZC,
26143 IX86_BUILTIN_VEC_INIT_V2SI,
26144 IX86_BUILTIN_VEC_INIT_V4HI,
26145 IX86_BUILTIN_VEC_INIT_V8QI,
26146 IX86_BUILTIN_VEC_EXT_V2DF,
26147 IX86_BUILTIN_VEC_EXT_V2DI,
26148 IX86_BUILTIN_VEC_EXT_V4SF,
26149 IX86_BUILTIN_VEC_EXT_V4SI,
26150 IX86_BUILTIN_VEC_EXT_V8HI,
26151 IX86_BUILTIN_VEC_EXT_V2SI,
26152 IX86_BUILTIN_VEC_EXT_V4HI,
26153 IX86_BUILTIN_VEC_EXT_V16QI,
26154 IX86_BUILTIN_VEC_SET_V2DI,
26155 IX86_BUILTIN_VEC_SET_V4SF,
26156 IX86_BUILTIN_VEC_SET_V4SI,
26157 IX86_BUILTIN_VEC_SET_V8HI,
26158 IX86_BUILTIN_VEC_SET_V4HI,
26159 IX86_BUILTIN_VEC_SET_V16QI,
26161 IX86_BUILTIN_VEC_PACK_SFIX,
26162 IX86_BUILTIN_VEC_PACK_SFIX256,
26164 /* SSE4.2. */
26165 IX86_BUILTIN_CRC32QI,
26166 IX86_BUILTIN_CRC32HI,
26167 IX86_BUILTIN_CRC32SI,
26168 IX86_BUILTIN_CRC32DI,
26170 IX86_BUILTIN_PCMPESTRI128,
26171 IX86_BUILTIN_PCMPESTRM128,
26172 IX86_BUILTIN_PCMPESTRA128,
26173 IX86_BUILTIN_PCMPESTRC128,
26174 IX86_BUILTIN_PCMPESTRO128,
26175 IX86_BUILTIN_PCMPESTRS128,
26176 IX86_BUILTIN_PCMPESTRZ128,
26177 IX86_BUILTIN_PCMPISTRI128,
26178 IX86_BUILTIN_PCMPISTRM128,
26179 IX86_BUILTIN_PCMPISTRA128,
26180 IX86_BUILTIN_PCMPISTRC128,
26181 IX86_BUILTIN_PCMPISTRO128,
26182 IX86_BUILTIN_PCMPISTRS128,
26183 IX86_BUILTIN_PCMPISTRZ128,
26185 IX86_BUILTIN_PCMPGTQ,
26187 /* AES instructions */
26188 IX86_BUILTIN_AESENC128,
26189 IX86_BUILTIN_AESENCLAST128,
26190 IX86_BUILTIN_AESDEC128,
26191 IX86_BUILTIN_AESDECLAST128,
26192 IX86_BUILTIN_AESIMC128,
26193 IX86_BUILTIN_AESKEYGENASSIST128,
26195 /* PCLMUL instruction */
26196 IX86_BUILTIN_PCLMULQDQ128,
26198 /* AVX */
26199 IX86_BUILTIN_ADDPD256,
26200 IX86_BUILTIN_ADDPS256,
26201 IX86_BUILTIN_ADDSUBPD256,
26202 IX86_BUILTIN_ADDSUBPS256,
26203 IX86_BUILTIN_ANDPD256,
26204 IX86_BUILTIN_ANDPS256,
26205 IX86_BUILTIN_ANDNPD256,
26206 IX86_BUILTIN_ANDNPS256,
26207 IX86_BUILTIN_BLENDPD256,
26208 IX86_BUILTIN_BLENDPS256,
26209 IX86_BUILTIN_BLENDVPD256,
26210 IX86_BUILTIN_BLENDVPS256,
26211 IX86_BUILTIN_DIVPD256,
26212 IX86_BUILTIN_DIVPS256,
26213 IX86_BUILTIN_DPPS256,
26214 IX86_BUILTIN_HADDPD256,
26215 IX86_BUILTIN_HADDPS256,
26216 IX86_BUILTIN_HSUBPD256,
26217 IX86_BUILTIN_HSUBPS256,
26218 IX86_BUILTIN_MAXPD256,
26219 IX86_BUILTIN_MAXPS256,
26220 IX86_BUILTIN_MINPD256,
26221 IX86_BUILTIN_MINPS256,
26222 IX86_BUILTIN_MULPD256,
26223 IX86_BUILTIN_MULPS256,
26224 IX86_BUILTIN_ORPD256,
26225 IX86_BUILTIN_ORPS256,
26226 IX86_BUILTIN_SHUFPD256,
26227 IX86_BUILTIN_SHUFPS256,
26228 IX86_BUILTIN_SUBPD256,
26229 IX86_BUILTIN_SUBPS256,
26230 IX86_BUILTIN_XORPD256,
26231 IX86_BUILTIN_XORPS256,
26232 IX86_BUILTIN_CMPSD,
26233 IX86_BUILTIN_CMPSS,
26234 IX86_BUILTIN_CMPPD,
26235 IX86_BUILTIN_CMPPS,
26236 IX86_BUILTIN_CMPPD256,
26237 IX86_BUILTIN_CMPPS256,
26238 IX86_BUILTIN_CVTDQ2PD256,
26239 IX86_BUILTIN_CVTDQ2PS256,
26240 IX86_BUILTIN_CVTPD2PS256,
26241 IX86_BUILTIN_CVTPS2DQ256,
26242 IX86_BUILTIN_CVTPS2PD256,
26243 IX86_BUILTIN_CVTTPD2DQ256,
26244 IX86_BUILTIN_CVTPD2DQ256,
26245 IX86_BUILTIN_CVTTPS2DQ256,
26246 IX86_BUILTIN_EXTRACTF128PD256,
26247 IX86_BUILTIN_EXTRACTF128PS256,
26248 IX86_BUILTIN_EXTRACTF128SI256,
26249 IX86_BUILTIN_VZEROALL,
26250 IX86_BUILTIN_VZEROUPPER,
26251 IX86_BUILTIN_VPERMILVARPD,
26252 IX86_BUILTIN_VPERMILVARPS,
26253 IX86_BUILTIN_VPERMILVARPD256,
26254 IX86_BUILTIN_VPERMILVARPS256,
26255 IX86_BUILTIN_VPERMILPD,
26256 IX86_BUILTIN_VPERMILPS,
26257 IX86_BUILTIN_VPERMILPD256,
26258 IX86_BUILTIN_VPERMILPS256,
26259 IX86_BUILTIN_VPERMIL2PD,
26260 IX86_BUILTIN_VPERMIL2PS,
26261 IX86_BUILTIN_VPERMIL2PD256,
26262 IX86_BUILTIN_VPERMIL2PS256,
26263 IX86_BUILTIN_VPERM2F128PD256,
26264 IX86_BUILTIN_VPERM2F128PS256,
26265 IX86_BUILTIN_VPERM2F128SI256,
26266 IX86_BUILTIN_VBROADCASTSS,
26267 IX86_BUILTIN_VBROADCASTSD256,
26268 IX86_BUILTIN_VBROADCASTSS256,
26269 IX86_BUILTIN_VBROADCASTPD256,
26270 IX86_BUILTIN_VBROADCASTPS256,
26271 IX86_BUILTIN_VINSERTF128PD256,
26272 IX86_BUILTIN_VINSERTF128PS256,
26273 IX86_BUILTIN_VINSERTF128SI256,
26274 IX86_BUILTIN_LOADUPD256,
26275 IX86_BUILTIN_LOADUPS256,
26276 IX86_BUILTIN_STOREUPD256,
26277 IX86_BUILTIN_STOREUPS256,
26278 IX86_BUILTIN_LDDQU256,
26279 IX86_BUILTIN_MOVNTDQ256,
26280 IX86_BUILTIN_MOVNTPD256,
26281 IX86_BUILTIN_MOVNTPS256,
26282 IX86_BUILTIN_LOADDQU256,
26283 IX86_BUILTIN_STOREDQU256,
26284 IX86_BUILTIN_MASKLOADPD,
26285 IX86_BUILTIN_MASKLOADPS,
26286 IX86_BUILTIN_MASKSTOREPD,
26287 IX86_BUILTIN_MASKSTOREPS,
26288 IX86_BUILTIN_MASKLOADPD256,
26289 IX86_BUILTIN_MASKLOADPS256,
26290 IX86_BUILTIN_MASKSTOREPD256,
26291 IX86_BUILTIN_MASKSTOREPS256,
26292 IX86_BUILTIN_MOVSHDUP256,
26293 IX86_BUILTIN_MOVSLDUP256,
26294 IX86_BUILTIN_MOVDDUP256,
26296 IX86_BUILTIN_SQRTPD256,
26297 IX86_BUILTIN_SQRTPS256,
26298 IX86_BUILTIN_SQRTPS_NR256,
26299 IX86_BUILTIN_RSQRTPS256,
26300 IX86_BUILTIN_RSQRTPS_NR256,
26302 IX86_BUILTIN_RCPPS256,
26304 IX86_BUILTIN_ROUNDPD256,
26305 IX86_BUILTIN_ROUNDPS256,
26307 IX86_BUILTIN_FLOORPD256,
26308 IX86_BUILTIN_CEILPD256,
26309 IX86_BUILTIN_TRUNCPD256,
26310 IX86_BUILTIN_RINTPD256,
26311 IX86_BUILTIN_ROUNDPD_AZ256,
26313 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26314 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26315 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26317 IX86_BUILTIN_FLOORPS256,
26318 IX86_BUILTIN_CEILPS256,
26319 IX86_BUILTIN_TRUNCPS256,
26320 IX86_BUILTIN_RINTPS256,
26321 IX86_BUILTIN_ROUNDPS_AZ256,
26323 IX86_BUILTIN_FLOORPS_SFIX256,
26324 IX86_BUILTIN_CEILPS_SFIX256,
26325 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26327 IX86_BUILTIN_UNPCKHPD256,
26328 IX86_BUILTIN_UNPCKLPD256,
26329 IX86_BUILTIN_UNPCKHPS256,
26330 IX86_BUILTIN_UNPCKLPS256,
26332 IX86_BUILTIN_SI256_SI,
26333 IX86_BUILTIN_PS256_PS,
26334 IX86_BUILTIN_PD256_PD,
26335 IX86_BUILTIN_SI_SI256,
26336 IX86_BUILTIN_PS_PS256,
26337 IX86_BUILTIN_PD_PD256,
26339 IX86_BUILTIN_VTESTZPD,
26340 IX86_BUILTIN_VTESTCPD,
26341 IX86_BUILTIN_VTESTNZCPD,
26342 IX86_BUILTIN_VTESTZPS,
26343 IX86_BUILTIN_VTESTCPS,
26344 IX86_BUILTIN_VTESTNZCPS,
26345 IX86_BUILTIN_VTESTZPD256,
26346 IX86_BUILTIN_VTESTCPD256,
26347 IX86_BUILTIN_VTESTNZCPD256,
26348 IX86_BUILTIN_VTESTZPS256,
26349 IX86_BUILTIN_VTESTCPS256,
26350 IX86_BUILTIN_VTESTNZCPS256,
26351 IX86_BUILTIN_PTESTZ256,
26352 IX86_BUILTIN_PTESTC256,
26353 IX86_BUILTIN_PTESTNZC256,
26355 IX86_BUILTIN_MOVMSKPD256,
26356 IX86_BUILTIN_MOVMSKPS256,
26358 /* AVX2 */
26359 IX86_BUILTIN_MPSADBW256,
26360 IX86_BUILTIN_PABSB256,
26361 IX86_BUILTIN_PABSW256,
26362 IX86_BUILTIN_PABSD256,
26363 IX86_BUILTIN_PACKSSDW256,
26364 IX86_BUILTIN_PACKSSWB256,
26365 IX86_BUILTIN_PACKUSDW256,
26366 IX86_BUILTIN_PACKUSWB256,
26367 IX86_BUILTIN_PADDB256,
26368 IX86_BUILTIN_PADDW256,
26369 IX86_BUILTIN_PADDD256,
26370 IX86_BUILTIN_PADDQ256,
26371 IX86_BUILTIN_PADDSB256,
26372 IX86_BUILTIN_PADDSW256,
26373 IX86_BUILTIN_PADDUSB256,
26374 IX86_BUILTIN_PADDUSW256,
26375 IX86_BUILTIN_PALIGNR256,
26376 IX86_BUILTIN_AND256I,
26377 IX86_BUILTIN_ANDNOT256I,
26378 IX86_BUILTIN_PAVGB256,
26379 IX86_BUILTIN_PAVGW256,
26380 IX86_BUILTIN_PBLENDVB256,
26381 IX86_BUILTIN_PBLENDVW256,
26382 IX86_BUILTIN_PCMPEQB256,
26383 IX86_BUILTIN_PCMPEQW256,
26384 IX86_BUILTIN_PCMPEQD256,
26385 IX86_BUILTIN_PCMPEQQ256,
26386 IX86_BUILTIN_PCMPGTB256,
26387 IX86_BUILTIN_PCMPGTW256,
26388 IX86_BUILTIN_PCMPGTD256,
26389 IX86_BUILTIN_PCMPGTQ256,
26390 IX86_BUILTIN_PHADDW256,
26391 IX86_BUILTIN_PHADDD256,
26392 IX86_BUILTIN_PHADDSW256,
26393 IX86_BUILTIN_PHSUBW256,
26394 IX86_BUILTIN_PHSUBD256,
26395 IX86_BUILTIN_PHSUBSW256,
26396 IX86_BUILTIN_PMADDUBSW256,
26397 IX86_BUILTIN_PMADDWD256,
26398 IX86_BUILTIN_PMAXSB256,
26399 IX86_BUILTIN_PMAXSW256,
26400 IX86_BUILTIN_PMAXSD256,
26401 IX86_BUILTIN_PMAXUB256,
26402 IX86_BUILTIN_PMAXUW256,
26403 IX86_BUILTIN_PMAXUD256,
26404 IX86_BUILTIN_PMINSB256,
26405 IX86_BUILTIN_PMINSW256,
26406 IX86_BUILTIN_PMINSD256,
26407 IX86_BUILTIN_PMINUB256,
26408 IX86_BUILTIN_PMINUW256,
26409 IX86_BUILTIN_PMINUD256,
26410 IX86_BUILTIN_PMOVMSKB256,
26411 IX86_BUILTIN_PMOVSXBW256,
26412 IX86_BUILTIN_PMOVSXBD256,
26413 IX86_BUILTIN_PMOVSXBQ256,
26414 IX86_BUILTIN_PMOVSXWD256,
26415 IX86_BUILTIN_PMOVSXWQ256,
26416 IX86_BUILTIN_PMOVSXDQ256,
26417 IX86_BUILTIN_PMOVZXBW256,
26418 IX86_BUILTIN_PMOVZXBD256,
26419 IX86_BUILTIN_PMOVZXBQ256,
26420 IX86_BUILTIN_PMOVZXWD256,
26421 IX86_BUILTIN_PMOVZXWQ256,
26422 IX86_BUILTIN_PMOVZXDQ256,
26423 IX86_BUILTIN_PMULDQ256,
26424 IX86_BUILTIN_PMULHRSW256,
26425 IX86_BUILTIN_PMULHUW256,
26426 IX86_BUILTIN_PMULHW256,
26427 IX86_BUILTIN_PMULLW256,
26428 IX86_BUILTIN_PMULLD256,
26429 IX86_BUILTIN_PMULUDQ256,
26430 IX86_BUILTIN_POR256,
26431 IX86_BUILTIN_PSADBW256,
26432 IX86_BUILTIN_PSHUFB256,
26433 IX86_BUILTIN_PSHUFD256,
26434 IX86_BUILTIN_PSHUFHW256,
26435 IX86_BUILTIN_PSHUFLW256,
26436 IX86_BUILTIN_PSIGNB256,
26437 IX86_BUILTIN_PSIGNW256,
26438 IX86_BUILTIN_PSIGND256,
26439 IX86_BUILTIN_PSLLDQI256,
26440 IX86_BUILTIN_PSLLWI256,
26441 IX86_BUILTIN_PSLLW256,
26442 IX86_BUILTIN_PSLLDI256,
26443 IX86_BUILTIN_PSLLD256,
26444 IX86_BUILTIN_PSLLQI256,
26445 IX86_BUILTIN_PSLLQ256,
26446 IX86_BUILTIN_PSRAWI256,
26447 IX86_BUILTIN_PSRAW256,
26448 IX86_BUILTIN_PSRADI256,
26449 IX86_BUILTIN_PSRAD256,
26450 IX86_BUILTIN_PSRLDQI256,
26451 IX86_BUILTIN_PSRLWI256,
26452 IX86_BUILTIN_PSRLW256,
26453 IX86_BUILTIN_PSRLDI256,
26454 IX86_BUILTIN_PSRLD256,
26455 IX86_BUILTIN_PSRLQI256,
26456 IX86_BUILTIN_PSRLQ256,
26457 IX86_BUILTIN_PSUBB256,
26458 IX86_BUILTIN_PSUBW256,
26459 IX86_BUILTIN_PSUBD256,
26460 IX86_BUILTIN_PSUBQ256,
26461 IX86_BUILTIN_PSUBSB256,
26462 IX86_BUILTIN_PSUBSW256,
26463 IX86_BUILTIN_PSUBUSB256,
26464 IX86_BUILTIN_PSUBUSW256,
26465 IX86_BUILTIN_PUNPCKHBW256,
26466 IX86_BUILTIN_PUNPCKHWD256,
26467 IX86_BUILTIN_PUNPCKHDQ256,
26468 IX86_BUILTIN_PUNPCKHQDQ256,
26469 IX86_BUILTIN_PUNPCKLBW256,
26470 IX86_BUILTIN_PUNPCKLWD256,
26471 IX86_BUILTIN_PUNPCKLDQ256,
26472 IX86_BUILTIN_PUNPCKLQDQ256,
26473 IX86_BUILTIN_PXOR256,
26474 IX86_BUILTIN_MOVNTDQA256,
26475 IX86_BUILTIN_VBROADCASTSS_PS,
26476 IX86_BUILTIN_VBROADCASTSS_PS256,
26477 IX86_BUILTIN_VBROADCASTSD_PD256,
26478 IX86_BUILTIN_VBROADCASTSI256,
26479 IX86_BUILTIN_PBLENDD256,
26480 IX86_BUILTIN_PBLENDD128,
26481 IX86_BUILTIN_PBROADCASTB256,
26482 IX86_BUILTIN_PBROADCASTW256,
26483 IX86_BUILTIN_PBROADCASTD256,
26484 IX86_BUILTIN_PBROADCASTQ256,
26485 IX86_BUILTIN_PBROADCASTB128,
26486 IX86_BUILTIN_PBROADCASTW128,
26487 IX86_BUILTIN_PBROADCASTD128,
26488 IX86_BUILTIN_PBROADCASTQ128,
26489 IX86_BUILTIN_VPERMVARSI256,
26490 IX86_BUILTIN_VPERMDF256,
26491 IX86_BUILTIN_VPERMVARSF256,
26492 IX86_BUILTIN_VPERMDI256,
26493 IX86_BUILTIN_VPERMTI256,
26494 IX86_BUILTIN_VEXTRACT128I256,
26495 IX86_BUILTIN_VINSERT128I256,
26496 IX86_BUILTIN_MASKLOADD,
26497 IX86_BUILTIN_MASKLOADQ,
26498 IX86_BUILTIN_MASKLOADD256,
26499 IX86_BUILTIN_MASKLOADQ256,
26500 IX86_BUILTIN_MASKSTORED,
26501 IX86_BUILTIN_MASKSTOREQ,
26502 IX86_BUILTIN_MASKSTORED256,
26503 IX86_BUILTIN_MASKSTOREQ256,
26504 IX86_BUILTIN_PSLLVV4DI,
26505 IX86_BUILTIN_PSLLVV2DI,
26506 IX86_BUILTIN_PSLLVV8SI,
26507 IX86_BUILTIN_PSLLVV4SI,
26508 IX86_BUILTIN_PSRAVV8SI,
26509 IX86_BUILTIN_PSRAVV4SI,
26510 IX86_BUILTIN_PSRLVV4DI,
26511 IX86_BUILTIN_PSRLVV2DI,
26512 IX86_BUILTIN_PSRLVV8SI,
26513 IX86_BUILTIN_PSRLVV4SI,
26515 IX86_BUILTIN_GATHERSIV2DF,
26516 IX86_BUILTIN_GATHERSIV4DF,
26517 IX86_BUILTIN_GATHERDIV2DF,
26518 IX86_BUILTIN_GATHERDIV4DF,
26519 IX86_BUILTIN_GATHERSIV4SF,
26520 IX86_BUILTIN_GATHERSIV8SF,
26521 IX86_BUILTIN_GATHERDIV4SF,
26522 IX86_BUILTIN_GATHERDIV8SF,
26523 IX86_BUILTIN_GATHERSIV2DI,
26524 IX86_BUILTIN_GATHERSIV4DI,
26525 IX86_BUILTIN_GATHERDIV2DI,
26526 IX86_BUILTIN_GATHERDIV4DI,
26527 IX86_BUILTIN_GATHERSIV4SI,
26528 IX86_BUILTIN_GATHERSIV8SI,
26529 IX86_BUILTIN_GATHERDIV4SI,
26530 IX86_BUILTIN_GATHERDIV8SI,
26532 /* Alternate 4 element gather for the vectorizer where
26533 all operands are 32-byte wide. */
26534 IX86_BUILTIN_GATHERALTSIV4DF,
26535 IX86_BUILTIN_GATHERALTDIV8SF,
26536 IX86_BUILTIN_GATHERALTSIV4DI,
26537 IX86_BUILTIN_GATHERALTDIV8SI,
26539 /* TFmode support builtins. */
26540 IX86_BUILTIN_INFQ,
26541 IX86_BUILTIN_HUGE_VALQ,
26542 IX86_BUILTIN_FABSQ,
26543 IX86_BUILTIN_COPYSIGNQ,
26545 /* Vectorizer support builtins. */
26546 IX86_BUILTIN_CPYSGNPS,
26547 IX86_BUILTIN_CPYSGNPD,
26548 IX86_BUILTIN_CPYSGNPS256,
26549 IX86_BUILTIN_CPYSGNPD256,
26551 /* FMA4 instructions. */
26552 IX86_BUILTIN_VFMADDSS,
26553 IX86_BUILTIN_VFMADDSD,
26554 IX86_BUILTIN_VFMADDPS,
26555 IX86_BUILTIN_VFMADDPD,
26556 IX86_BUILTIN_VFMADDPS256,
26557 IX86_BUILTIN_VFMADDPD256,
26558 IX86_BUILTIN_VFMADDSUBPS,
26559 IX86_BUILTIN_VFMADDSUBPD,
26560 IX86_BUILTIN_VFMADDSUBPS256,
26561 IX86_BUILTIN_VFMADDSUBPD256,
26563 /* FMA3 instructions. */
26564 IX86_BUILTIN_VFMADDSS3,
26565 IX86_BUILTIN_VFMADDSD3,
26567 /* XOP instructions. */
26568 IX86_BUILTIN_VPCMOV,
26569 IX86_BUILTIN_VPCMOV_V2DI,
26570 IX86_BUILTIN_VPCMOV_V4SI,
26571 IX86_BUILTIN_VPCMOV_V8HI,
26572 IX86_BUILTIN_VPCMOV_V16QI,
26573 IX86_BUILTIN_VPCMOV_V4SF,
26574 IX86_BUILTIN_VPCMOV_V2DF,
26575 IX86_BUILTIN_VPCMOV256,
26576 IX86_BUILTIN_VPCMOV_V4DI256,
26577 IX86_BUILTIN_VPCMOV_V8SI256,
26578 IX86_BUILTIN_VPCMOV_V16HI256,
26579 IX86_BUILTIN_VPCMOV_V32QI256,
26580 IX86_BUILTIN_VPCMOV_V8SF256,
26581 IX86_BUILTIN_VPCMOV_V4DF256,
26583 IX86_BUILTIN_VPPERM,
26585 IX86_BUILTIN_VPMACSSWW,
26586 IX86_BUILTIN_VPMACSWW,
26587 IX86_BUILTIN_VPMACSSWD,
26588 IX86_BUILTIN_VPMACSWD,
26589 IX86_BUILTIN_VPMACSSDD,
26590 IX86_BUILTIN_VPMACSDD,
26591 IX86_BUILTIN_VPMACSSDQL,
26592 IX86_BUILTIN_VPMACSSDQH,
26593 IX86_BUILTIN_VPMACSDQL,
26594 IX86_BUILTIN_VPMACSDQH,
26595 IX86_BUILTIN_VPMADCSSWD,
26596 IX86_BUILTIN_VPMADCSWD,
26598 IX86_BUILTIN_VPHADDBW,
26599 IX86_BUILTIN_VPHADDBD,
26600 IX86_BUILTIN_VPHADDBQ,
26601 IX86_BUILTIN_VPHADDWD,
26602 IX86_BUILTIN_VPHADDWQ,
26603 IX86_BUILTIN_VPHADDDQ,
26604 IX86_BUILTIN_VPHADDUBW,
26605 IX86_BUILTIN_VPHADDUBD,
26606 IX86_BUILTIN_VPHADDUBQ,
26607 IX86_BUILTIN_VPHADDUWD,
26608 IX86_BUILTIN_VPHADDUWQ,
26609 IX86_BUILTIN_VPHADDUDQ,
26610 IX86_BUILTIN_VPHSUBBW,
26611 IX86_BUILTIN_VPHSUBWD,
26612 IX86_BUILTIN_VPHSUBDQ,
26614 IX86_BUILTIN_VPROTB,
26615 IX86_BUILTIN_VPROTW,
26616 IX86_BUILTIN_VPROTD,
26617 IX86_BUILTIN_VPROTQ,
26618 IX86_BUILTIN_VPROTB_IMM,
26619 IX86_BUILTIN_VPROTW_IMM,
26620 IX86_BUILTIN_VPROTD_IMM,
26621 IX86_BUILTIN_VPROTQ_IMM,
26623 IX86_BUILTIN_VPSHLB,
26624 IX86_BUILTIN_VPSHLW,
26625 IX86_BUILTIN_VPSHLD,
26626 IX86_BUILTIN_VPSHLQ,
26627 IX86_BUILTIN_VPSHAB,
26628 IX86_BUILTIN_VPSHAW,
26629 IX86_BUILTIN_VPSHAD,
26630 IX86_BUILTIN_VPSHAQ,
26632 IX86_BUILTIN_VFRCZSS,
26633 IX86_BUILTIN_VFRCZSD,
26634 IX86_BUILTIN_VFRCZPS,
26635 IX86_BUILTIN_VFRCZPD,
26636 IX86_BUILTIN_VFRCZPS256,
26637 IX86_BUILTIN_VFRCZPD256,
26639 IX86_BUILTIN_VPCOMEQUB,
26640 IX86_BUILTIN_VPCOMNEUB,
26641 IX86_BUILTIN_VPCOMLTUB,
26642 IX86_BUILTIN_VPCOMLEUB,
26643 IX86_BUILTIN_VPCOMGTUB,
26644 IX86_BUILTIN_VPCOMGEUB,
26645 IX86_BUILTIN_VPCOMFALSEUB,
26646 IX86_BUILTIN_VPCOMTRUEUB,
26648 IX86_BUILTIN_VPCOMEQUW,
26649 IX86_BUILTIN_VPCOMNEUW,
26650 IX86_BUILTIN_VPCOMLTUW,
26651 IX86_BUILTIN_VPCOMLEUW,
26652 IX86_BUILTIN_VPCOMGTUW,
26653 IX86_BUILTIN_VPCOMGEUW,
26654 IX86_BUILTIN_VPCOMFALSEUW,
26655 IX86_BUILTIN_VPCOMTRUEUW,
26657 IX86_BUILTIN_VPCOMEQUD,
26658 IX86_BUILTIN_VPCOMNEUD,
26659 IX86_BUILTIN_VPCOMLTUD,
26660 IX86_BUILTIN_VPCOMLEUD,
26661 IX86_BUILTIN_VPCOMGTUD,
26662 IX86_BUILTIN_VPCOMGEUD,
26663 IX86_BUILTIN_VPCOMFALSEUD,
26664 IX86_BUILTIN_VPCOMTRUEUD,
26666 IX86_BUILTIN_VPCOMEQUQ,
26667 IX86_BUILTIN_VPCOMNEUQ,
26668 IX86_BUILTIN_VPCOMLTUQ,
26669 IX86_BUILTIN_VPCOMLEUQ,
26670 IX86_BUILTIN_VPCOMGTUQ,
26671 IX86_BUILTIN_VPCOMGEUQ,
26672 IX86_BUILTIN_VPCOMFALSEUQ,
26673 IX86_BUILTIN_VPCOMTRUEUQ,
26675 IX86_BUILTIN_VPCOMEQB,
26676 IX86_BUILTIN_VPCOMNEB,
26677 IX86_BUILTIN_VPCOMLTB,
26678 IX86_BUILTIN_VPCOMLEB,
26679 IX86_BUILTIN_VPCOMGTB,
26680 IX86_BUILTIN_VPCOMGEB,
26681 IX86_BUILTIN_VPCOMFALSEB,
26682 IX86_BUILTIN_VPCOMTRUEB,
26684 IX86_BUILTIN_VPCOMEQW,
26685 IX86_BUILTIN_VPCOMNEW,
26686 IX86_BUILTIN_VPCOMLTW,
26687 IX86_BUILTIN_VPCOMLEW,
26688 IX86_BUILTIN_VPCOMGTW,
26689 IX86_BUILTIN_VPCOMGEW,
26690 IX86_BUILTIN_VPCOMFALSEW,
26691 IX86_BUILTIN_VPCOMTRUEW,
26693 IX86_BUILTIN_VPCOMEQD,
26694 IX86_BUILTIN_VPCOMNED,
26695 IX86_BUILTIN_VPCOMLTD,
26696 IX86_BUILTIN_VPCOMLED,
26697 IX86_BUILTIN_VPCOMGTD,
26698 IX86_BUILTIN_VPCOMGED,
26699 IX86_BUILTIN_VPCOMFALSED,
26700 IX86_BUILTIN_VPCOMTRUED,
26702 IX86_BUILTIN_VPCOMEQQ,
26703 IX86_BUILTIN_VPCOMNEQ,
26704 IX86_BUILTIN_VPCOMLTQ,
26705 IX86_BUILTIN_VPCOMLEQ,
26706 IX86_BUILTIN_VPCOMGTQ,
26707 IX86_BUILTIN_VPCOMGEQ,
26708 IX86_BUILTIN_VPCOMFALSEQ,
26709 IX86_BUILTIN_VPCOMTRUEQ,
26711 /* LWP instructions. */
26712 IX86_BUILTIN_LLWPCB,
26713 IX86_BUILTIN_SLWPCB,
26714 IX86_BUILTIN_LWPVAL32,
26715 IX86_BUILTIN_LWPVAL64,
26716 IX86_BUILTIN_LWPINS32,
26717 IX86_BUILTIN_LWPINS64,
26719 IX86_BUILTIN_CLZS,
26721 /* RTM */
26722 IX86_BUILTIN_XBEGIN,
26723 IX86_BUILTIN_XEND,
26724 IX86_BUILTIN_XABORT,
26725 IX86_BUILTIN_XTEST,
26727 /* BMI instructions. */
26728 IX86_BUILTIN_BEXTR32,
26729 IX86_BUILTIN_BEXTR64,
26730 IX86_BUILTIN_CTZS,
26732 /* TBM instructions. */
26733 IX86_BUILTIN_BEXTRI32,
26734 IX86_BUILTIN_BEXTRI64,
26736 /* BMI2 instructions. */
26737 IX86_BUILTIN_BZHI32,
26738 IX86_BUILTIN_BZHI64,
26739 IX86_BUILTIN_PDEP32,
26740 IX86_BUILTIN_PDEP64,
26741 IX86_BUILTIN_PEXT32,
26742 IX86_BUILTIN_PEXT64,
26744 /* ADX instructions. */
26745 IX86_BUILTIN_ADDCARRYX32,
26746 IX86_BUILTIN_ADDCARRYX64,
26748 /* FSGSBASE instructions. */
26749 IX86_BUILTIN_RDFSBASE32,
26750 IX86_BUILTIN_RDFSBASE64,
26751 IX86_BUILTIN_RDGSBASE32,
26752 IX86_BUILTIN_RDGSBASE64,
26753 IX86_BUILTIN_WRFSBASE32,
26754 IX86_BUILTIN_WRFSBASE64,
26755 IX86_BUILTIN_WRGSBASE32,
26756 IX86_BUILTIN_WRGSBASE64,
26758 /* RDRND instructions. */
26759 IX86_BUILTIN_RDRAND16_STEP,
26760 IX86_BUILTIN_RDRAND32_STEP,
26761 IX86_BUILTIN_RDRAND64_STEP,
26763 /* RDSEED instructions. */
26764 IX86_BUILTIN_RDSEED16_STEP,
26765 IX86_BUILTIN_RDSEED32_STEP,
26766 IX86_BUILTIN_RDSEED64_STEP,
26768 /* F16C instructions. */
26769 IX86_BUILTIN_CVTPH2PS,
26770 IX86_BUILTIN_CVTPH2PS256,
26771 IX86_BUILTIN_CVTPS2PH,
26772 IX86_BUILTIN_CVTPS2PH256,
26774 /* CFString built-in for darwin */
26775 IX86_BUILTIN_CFSTRING,
26777 /* Builtins to get CPU type and supported features. */
26778 IX86_BUILTIN_CPU_INIT,
26779 IX86_BUILTIN_CPU_IS,
26780 IX86_BUILTIN_CPU_SUPPORTS,
26782 IX86_BUILTIN_MAX
26785 /* Table for the ix86 builtin decls. */
26786 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26788 /* Table of all the builtin functions that are possible with different ISAs
26789 but are waiting to be built until a function is declared to use that
26790 ISA. */
26791 struct builtin_isa {
26792 const char *name; /* function name */
26793 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26794 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26795 bool const_p; /* true if the declaration is constant */
26796 bool set_and_not_built_p;
26799 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26802 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26803 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26804 function decl in the ix86_builtins array. Returns the function decl or
26805 NULL_TREE, if the builtin was not added.
26807 If the front end has a special hook for builtin functions, delay adding
26808 builtin functions that aren't in the current ISA until the ISA is changed
26809 with function specific optimization. Doing so, can save about 300K for the
26810 default compiler. When the builtin is expanded, check at that time whether
26811 it is valid.
26813 If the front end doesn't have a special hook, record all builtins, even if
26814 they aren't part of the current ISA, in case the user uses
26815 function specific options for a different ISA, so that we don't get scope
26816 errors if a builtin is added in the middle of a function scope. */
26818 static inline tree
26819 def_builtin (HOST_WIDE_INT mask, const char *name,
26820 enum ix86_builtin_func_type tcode,
26821 enum ix86_builtins code)
26822 {
26823 tree decl = NULL_TREE;
26825 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26826 {
26827 ix86_builtins_isa[(int) code].isa = mask;
26829 mask &= ~OPTION_MASK_ISA_64BIT;
26830 if (mask == 0
26831 || (mask & ix86_isa_flags) != 0
26832 || (lang_hooks.builtin_function
26833 == lang_hooks.builtin_function_ext_scope))
26835 {
26836 tree type = ix86_get_builtin_func_type (tcode);
26837 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26838 NULL, NULL_TREE);
26839 ix86_builtins[(int) code] = decl;
26840 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26841 }
26842 else
26843 {
26844 ix86_builtins[(int) code] = NULL_TREE;
26845 ix86_builtins_isa[(int) code].tcode = tcode;
26846 ix86_builtins_isa[(int) code].name = name;
26847 ix86_builtins_isa[(int) code].const_p = false;
26848 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26849 }
26850 }
26852 return decl;
26853 }
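/* For example (arguments taken from the RTM entry in bdesc_special_args
   below), def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xend",
   VOID_FTYPE_VOID, IX86_BUILTIN_XEND) builds the decl immediately when RTM
   is enabled in ix86_isa_flags (or when the front end registers builtins in
   the external scope), and otherwise just records the request in
   ix86_builtins_isa and returns NULL_TREE.  */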
26855 /* Like def_builtin, but also marks the function decl "const". */
26857 static inline tree
26858 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26859 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26860 {
26861 tree decl = def_builtin (mask, name, tcode, code);
26862 if (decl)
26863 TREE_READONLY (decl) = 1;
26864 else
26865 ix86_builtins_isa[(int) code].const_p = true;
26867 return decl;
26868 }
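/* For example, the SSE addps entry in bdesc_args below is effectively
   registered as def_builtin_const (OPTION_MASK_ISA_SSE,
   "__builtin_ia32_addps", V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS): the
   decl is marked TREE_READONLY so calls to it can be CSEd, and for a
   deferred builtin only const_p is noted for ix86_add_new_builtins to
   apply later.  */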
26870 /* Add any new builtin functions for a given ISA that may not have been
26871 declared. This saves a bit of space compared to adding all of the
26872 declarations to the tree, even if we didn't use them. */
26874 static void
26875 ix86_add_new_builtins (HOST_WIDE_INT isa)
26876 {
26877 int i;
26879 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26880 {
26881 if ((ix86_builtins_isa[i].isa & isa) != 0
26882 && ix86_builtins_isa[i].set_and_not_built_p)
26883 {
26884 tree decl, type;
26886 /* Don't define the builtin again. */
26887 ix86_builtins_isa[i].set_and_not_built_p = false;
26889 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26890 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26891 type, i, BUILT_IN_MD, NULL,
26892 NULL_TREE);
26894 ix86_builtins[i] = decl;
26895 if (ix86_builtins_isa[i].const_p)
26896 TREE_READONLY (decl) = 1;
26897 }
26898 }
26899 }
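/* This is typically invoked after option processing when the enabled ISA
   set grows (for instance for a function carrying a target("...") attribute),
   so that builtins deferred by def_builtin become visible at that point.  */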
26901 /* Bits for builtin_description.flag. */
26903 /* Set when we don't support the comparison natively, and should
26904 swap the comparison operands in order to support it.
26905 #define BUILTIN_DESC_SWAP_OPERANDS 1
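/* Swapping operands lets one machine comparison serve two builtins: a
   greater-than compare, for instance, can be emitted as the matching
   less-than compare with its operands exchanged.  The ..._SWAP function
   types used further down (e.g. the cmpgtps vs. cmpltps entries) rely on
   the same trick.  */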
26907 struct builtin_description
26908 {
26909 const HOST_WIDE_INT mask;
26910 const enum insn_code icode;
26911 const char *const name;
26912 const enum ix86_builtins code;
26913 const enum rtx_code comparison;
26914 const int flag;
26915 };
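/* How to read the tables that follow: each row lists the ISA mask that must
   be enabled, the insn pattern used to expand the builtin, its user-visible
   name, its IX86_BUILTIN_* code, an optional RTX comparison code, and a flag
   whose meaning depends on the table (zero, a CC mode, a function-type code,
   or BUILTIN_DESC_SWAP_OPERANDS).  The first bdesc_comi row, for instance,
   maps __builtin_ia32_comieq onto the CODE_FOR_sse_comi pattern with an UNEQ
   comparison, guarded by OPTION_MASK_ISA_SSE.  */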
26917 static const struct builtin_description bdesc_comi[] =
26918 {
26919 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26920 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26921 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26922 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26923 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26924 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26925 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26926 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26927 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26928 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26929 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26930 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26931 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26932 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26933 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26934 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26935 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26936 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26937 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26938 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26939 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26940 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26941 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26942 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
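/* In the two string-compare tables below the flag field carries a CC mode
   (CCAmode, CCCmode, ...) naming the condition-code bit whose value the
   builtin returns, while the ...i128/...m128 variants that return the index
   or mask result leave it at 0.  */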
26945 static const struct builtin_description bdesc_pcmpestr[] =
26946 {
26947 /* SSE4.2 */
26948 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26949 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26950 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26951 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26952 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26953 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26954 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26957 static const struct builtin_description bdesc_pcmpistr[] =
26958 {
26959 /* SSE4.2 */
26960 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26961 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26962 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26963 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26964 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26965 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26966 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
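/* From here on the flag field holds an ix86_builtin_func_type describing
   the prototype; the encoding is positional, so VOID_FTYPE_PFLOAT_V4SF, for
   example, is a builtin returning void that takes a float * and a V4SF
   argument, as used by __builtin_ia32_storeups below.  */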
26969 /* Special builtins with variable number of arguments. */
26970 static const struct builtin_description bdesc_special_args[] =
26971 {
26972 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26973 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26974 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26976 /* MMX */
26977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26979 /* 3DNow! */
26980 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26982 /* FXSR, XSAVE and XSAVEOPT */
26983 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26984 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26985 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26986 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26987 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26989 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26990 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26991 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26992 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26993 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26995 /* SSE */
26996 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26997 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26998 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27000 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27001 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27002 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27003 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27005 /* SSE or 3DNow!A */
27006 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27007 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27009 /* SSE2 */
27010 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27011 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27012 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27013 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27014 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27015 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27016 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27017 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27018 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27024 /* SSE3 */
27025 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27027 /* SSE4.1 */
27028 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27030 /* SSE4A */
27031 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27032 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27034 /* AVX */
27035 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27036 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27038 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27039 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27040 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27041 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27042 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27044 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27045 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27046 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27047 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27054 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27058 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27065 /* AVX2 */
27066 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27067 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27068 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27069 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27070 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27071 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27072 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27073 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27074 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27076 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27077 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27078 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27079 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27080 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27081 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27083 /* FSGSBASE */
27084 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27085 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27086 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27087 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27088 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27089 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27090 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27091 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27093 /* RTM */
27094 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27095 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27096 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
27097 };
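/* A rough distinction between the two big tables (based on how the rest of
   this file registers them): bdesc_special_args collects builtins that read
   or write memory or otherwise have side effects, while bdesc_args below
   holds pure value computations, which is why only the latter are suitable
   for def_builtin_const.  */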
27099 /* Builtins with variable number of arguments. */
27100 static const struct builtin_description bdesc_args[] =
27101 {
27102 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27103 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27104 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27105 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27106 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27107 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27108 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27110 /* MMX */
27111 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27112 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27113 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27114 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27115 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27119 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27124 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27131 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27132 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27133 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27135 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27136 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27137 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27138 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27139 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27140 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27142 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27143 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27144 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27145 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27146 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27147 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27149 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27150 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27151 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27153 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27155 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27156 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27157 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27158 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27159 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27160 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27162 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27163 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27164 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27165 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27166 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27167 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27169 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27170 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27171 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27172 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27174 /* 3DNow! */
27175 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27176 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27177 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27178 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27180 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27181 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27182 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27183 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27184 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27185 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27186 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27187 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27188 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27189 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27190 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27191 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27192 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27193 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27194 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27196 /* 3DNow!A */
27197 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27198 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27199 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27200 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27201 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27202 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27204 /* SSE */
27205 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27206 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27207 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27208 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27209 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27210 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27211 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27212 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27213 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27214 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27215 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27216 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27218 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27220 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27221 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27222 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27223 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27224 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27225 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27226 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27227 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27229 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27230 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27231 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27232 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27233 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27234 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27235 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27237 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27238 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27239 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27240 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27241 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27242 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27243 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27244 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27245 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27246 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27247 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27248 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27249 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27250 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27252 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27253 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27254 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27255 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27257 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27258 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27259 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27260 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27262 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27264 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27265 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27266 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27267 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27268 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27270 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27271 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27272 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27274 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27276 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27277 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27278 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27280 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27281 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27283 /* SSE or 3DNow!A */
27284 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27285 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27286 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27288 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27289 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27290 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27291 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27293 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27294 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27296 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27298 /* SSE2 */
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27310 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27317 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27318 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27329 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27333 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27344 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27346 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27348 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27355 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27359 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27361 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27362 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27364 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27366 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27367 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27368 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27370 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27372 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27373 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27374 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27375 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27376 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27377 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27378 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27379 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27381 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27382 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27383 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27384 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27386 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27390 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27391 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27393 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27394 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27395 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27396 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27401 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27403 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27405 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27408 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27409 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27410 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27411 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27413 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27414 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27415 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27416 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27417 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27418 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27419 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27420 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27426 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27430 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27435 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27440 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27441 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27442 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27443 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27444 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27445 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27448 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27449 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27450 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27452 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27453 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27455 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27456 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27457 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27458 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27460 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27462 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27464 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27466 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27468 /* SSE2 MMX */
27469 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27470 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
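   /* Editorial sketch (not part of the upstream table): each row above is a
      builtin_description record.  As far as this file's init code goes, the
      table is walked by a loop of roughly this shape, with rows whose name
      field is 0 (e.g. the AES/PCLMUL rows further down) skipped here and
      registered elsewhere:

	for (i = 0, d = bdesc_args; i < ARRAY_SIZE (bdesc_args); i++, d++)
	  {
	    if (d->name == 0)
	      continue;
	    def_builtin_const (d->mask, d->name,
			       (enum ix86_builtin_func_type) d->flag, d->code);
	  }

      The OPTION_MASK_ISA_* mask gates availability, the string is the
      __builtin_ia32_* name seen by the front end, and the *_FTYPE_* value
      selects the prototype used both when declaring and when expanding the
      builtin.  */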
27472 /* SSE3 */
27473 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27474 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27476 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27477 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27478 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27479 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27480 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27481 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27483 /* SSSE3 */
27484 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27485 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27486 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27487 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27488 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27489 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27491 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27492 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27493 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27494 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27495 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27496 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27497 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27498 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27499 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27500 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27501 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27502 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27503 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27504 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27505 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27506 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27507 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27508 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27509 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27510 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27511 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27512 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27513 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27514 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27516 /* SSSE3. */
27517 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27518 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
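   /* Editorial note (a reading of the type names above, not upstream text):
      the *_COUNT suffixes mark the last operand as a shift count, which lets
      the same ashl/lshr/ashr patterns back both the immediate-count
      (psllwi128 style) and register-count (psllw128 style) builtins, with
      the expander massaging that operand into whatever form the pattern
      accepts.  The *_INT_CONVERT suffixes mark rows such as pslldqi128 and
      palignr128, whose insn pattern works in a wider mode (V1TI here) than
      the V2DI prototype, so operands are converted between the two modes at
      expansion time.  */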
27520 /* SSE4.1 */
27521 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27522 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27523 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27524 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27525 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27526 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27527 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27528 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27529 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27530 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27532 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27533 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27534 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27535 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27536 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27537 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27538 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27539 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27540 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27541 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27542 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27543 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27544 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27546 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27547 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27548 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27549 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27550 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27551 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27552 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27553 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27554 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27555 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27556 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27557 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27559 /* SSE4.1 */
27560 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27561 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27562 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27563 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27565 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27566 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27567 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27568 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27570 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27571 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27573 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27574 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27576 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27577 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27578 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27579 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27581 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27582 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27584 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27585 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27587 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27588 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27589 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
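   /* Editorial note (a reading of the rows above, not upstream text): the
      fifth field is nominally an rtx comparison code.  Most rows leave it
      UNKNOWN; the floor/ceil/trunc/rint rows reuse it to carry a ROUND_*
      immediate to the rounding expander, and the ptest rows reuse it
      (EQ/LTU/GTU) to record which flag of the PTEST result the builtin
      tests, which is why those entries cast the value to (enum rtx_code).  */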
27591 /* SSE4.2 */
27592 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27593 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27594 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27595 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27596 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27598 /* SSE4A */
27599 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27600 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27601 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27602 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27604 /* AES */
27605 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27606 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27608 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27609 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27610 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27611 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27613 /* PCLMUL */
27614 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27616 /* AVX */
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27632 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27633 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27634 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27635 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27636 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27637 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27638 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27639 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27640 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27641 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27642 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27645 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27646 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27647 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27649 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27650 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27651 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27653 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27654 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27655 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27656 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27662 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27663 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27664 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27665 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27666 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27667 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27670 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27672 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27675 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27677 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27678 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27688 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27689 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27690 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27692 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27700 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27704 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27705 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27710 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27711 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27713 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27716 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27718 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27719 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27721 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27722 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27723 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27724 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27726 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27727 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27728 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27729 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27730 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27731 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27737 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27738 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27739 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27742 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27752 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27753 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27755 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
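   /* Usage sketch (illustrative, not part of the upstream table): the names
      above are what the x86 intrinsic headers expand to.  For example,
      avxintrin.h implements _mm256_add_pd in roughly this form, so the
      V4DF_FTYPE_V4DF_V4DF prototype recorded for __builtin_ia32_addpd256
      has to match this call:

	extern __inline __m256d
	__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
	_mm256_add_pd (__m256d __A, __m256d __B)
	{
	  return (__m256d) __builtin_ia32_addpd256 ((__v4df) __A, (__v4df) __B);
	}
   */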
27757 /* AVX2 */
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27797 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27798 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27799 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27800 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27801 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27802 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27803 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27804 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27805 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27806 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27807 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27808 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27809 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27810 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27811 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27812 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27813 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27814 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27815 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27816 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27817 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27822 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27824 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27825 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27826 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27827 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27828 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27829 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27830 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27831 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27839 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27840 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27841 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27842 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27843 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27844 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27845 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27846 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27847 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27848 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27849 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27850 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27851 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27852 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27853 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27854 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27855 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27856 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27857 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27858 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27859 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27863 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27864 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27866 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27872 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27874 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27876 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27877 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27886 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27892 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27905 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27907 /* BMI */
27908 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27909 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27910 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27912 /* TBM */
27913 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27914 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27916 /* F16C */
27917 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27918 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27919 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27920 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27922 /* BMI2 */
27923 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27924 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27925 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27926 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27927 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27928 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27931 /* FMA4 and XOP. */
27932 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27933 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27934 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27935 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27936 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27937 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27938 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27939 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27940 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27941 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27942 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27943 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27944 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27945 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27946 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27947 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27948 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27949 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27950 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27951 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27952 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27953 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27954 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27955 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27956 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27957 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27958 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27959 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27960 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27961 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27962 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27963 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27964 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27965 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27966 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27967 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27968 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27969 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27970 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27971 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27972 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27973 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27974 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27975 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27976 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27977 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27978 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27979 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27980 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27981 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27982 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27983 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
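/* Illustrative note (not part of the table machinery): each MULTI_ARG_* macro
   is simply a shorthand for an ix86_builtin_func_type, so an entry tagged
   MULTI_ARG_3_SF (V4SF_FTYPE_V4SF_V4SF_V4SF) surfaces at the user level as
   roughly

       __m128 __builtin_ia32_vfmaddss (__m128, __m128, __m128);

   and a hedged usage sketch, assuming -mfma4, would be

       __m128 r = __builtin_ia32_vfmaddss (a, b, c);

   computing a*b + c in the low element.  The documented wrappers (for
   example _mm_macc_ss) live in fma4intrin.h.  */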
27985 static const struct builtin_description bdesc_multi_arg[] =
27987 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27988 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27989 UNKNOWN, (int)MULTI_ARG_3_SF },
27990 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27991 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27992 UNKNOWN, (int)MULTI_ARG_3_DF },
27994 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27995 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27996 UNKNOWN, (int)MULTI_ARG_3_SF },
27997 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27998 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27999 UNKNOWN, (int)MULTI_ARG_3_DF },
28001 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28002 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28003 UNKNOWN, (int)MULTI_ARG_3_SF },
28004 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28005 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28006 UNKNOWN, (int)MULTI_ARG_3_DF },
28007 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28008 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28009 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28010 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28011 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28012 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28014 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28015 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28016 UNKNOWN, (int)MULTI_ARG_3_SF },
28017 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28018 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28019 UNKNOWN, (int)MULTI_ARG_3_DF },
28020 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28021 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28022 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28023 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28024 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28025 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28128 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
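/* Note on the VPCOM entries above (illustrative): one insn pattern such as
   CODE_FOR_xop_maskcmpv16qi3 serves every signed byte comparison, and the
   rtx_code field (EQ, NE, LT, ...) is what the expander plugs into that
   pattern; this is also why "vpcomneb" and "vpcomneqb" can share the single
   builtin code IX86_BUILTIN_VPCOMNEB.  A hedged user-level sketch, assuming
   -mxop:

       __v16qi m = __builtin_ia32_vpcomltb (a, b);

   which sets each lane of m to all-ones where the corresponding signed byte
   of a is less than that of b, and to zero otherwise.  */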
28187 /* TM vector builtins. */
28189 /* Reuse the existing x86-specific `struct builtin_description' because
28190 we're lazy.  Add casts to make them fit.  */
28191 static const struct builtin_description bdesc_tm[] =
28193 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28194 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28195 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28196 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28197 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28198 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28199 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28201 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28202 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28203 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28204 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28205 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28206 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28207 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28209 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28210 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28211 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28212 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28213 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28214 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28215 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28217 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28218 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28219 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
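/* Rough idea of how these are used (hedged sketch): with -fgnu-tm, a vector
   access inside a transaction, for example

       __transaction_atomic { *p = v; }

   where v is an __m128 value, can be instrumented by the TM lowering passes
   into a call to the barrier registered here as "__builtin__ITM_WM128"
   (the _ITM_WM128 libitm entry point), instead of being split into several
   word-sized scalar barriers.  */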
28222 /* TM callbacks. */
28224 /* Return the builtin decl needed to load a vector of TYPE. */
28226 static tree
28227 ix86_builtin_tm_load (tree type)
28229 if (TREE_CODE (type) == VECTOR_TYPE)
28231 switch (tree_low_cst (TYPE_SIZE (type), 1))
28233 case 64:
28234 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28235 case 128:
28236 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28237 case 256:
28238 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28241 return NULL_TREE;
28244 /* Return the builtin decl needed to store a vector of TYPE. */
28246 static tree
28247 ix86_builtin_tm_store (tree type)
28249 if (TREE_CODE (type) == VECTOR_TYPE)
28251 switch (tree_low_cst (TYPE_SIZE (type), 1))
28253 case 64:
28254 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28255 case 128:
28256 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28257 case 256:
28258 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28261 return NULL_TREE;
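/* Worked example for the two hooks above: both key purely on TYPE_SIZE, so
   for a 256-bit vector type such as

       typedef float v8sf __attribute__ ((vector_size (32)));

   tree_low_cst (TYPE_SIZE (type), 1) is 256 and the M256 barrier decls
   registered in bdesc_tm are returned.  A non-vector type, or a vector of an
   unlisted size, yields NULL_TREE, and the caller presumably falls back to
   its generic handling.  */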
28264 /* Initialize the transactional memory vector load/store builtins. */
28266 static void
28267 ix86_init_tm_builtins (void)
28269 enum ix86_builtin_func_type ftype;
28270 const struct builtin_description *d;
28271 size_t i;
28272 tree decl;
28273 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28274 tree attrs_log, attrs_type_log;
28276 if (!flag_tm)
28277 return;
28279 /* If there are no builtins defined, we must be compiling in a
28280 language without trans-mem support. */
28281 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28282 return;
28284 /* Use whatever attributes a normal TM load has. */
28285 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28286 attrs_load = DECL_ATTRIBUTES (decl);
28287 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28288 /* Use whatever attributes a normal TM store has. */
28289 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28290 attrs_store = DECL_ATTRIBUTES (decl);
28291 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28292 /* Use whatever attributes a normal TM log has. */
28293 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28294 attrs_log = DECL_ATTRIBUTES (decl);
28295 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28297 for (i = 0, d = bdesc_tm;
28298 i < ARRAY_SIZE (bdesc_tm);
28299 i++, d++)
28301 if ((d->mask & ix86_isa_flags) != 0
28302 || (lang_hooks.builtin_function
28303 == lang_hooks.builtin_function_ext_scope))
28305 tree type, attrs, attrs_type;
28306 enum built_in_function code = (enum built_in_function) d->code;
28308 ftype = (enum ix86_builtin_func_type) d->flag;
28309 type = ix86_get_builtin_func_type (ftype);
28311 if (BUILTIN_TM_LOAD_P (code))
28313 attrs = attrs_load;
28314 attrs_type = attrs_type_load;
28316 else if (BUILTIN_TM_STORE_P (code))
28318 attrs = attrs_store;
28319 attrs_type = attrs_type_store;
28321 else
28323 attrs = attrs_log;
28324 attrs_type = attrs_type_log;
28326 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28327 /* The builtin without the prefix for
28328 calling it directly. */
28329 d->name + strlen ("__builtin_"),
28330 attrs);
28331 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28332 set the TYPE_ATTRIBUTES. */
28333 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28335 set_builtin_decl (code, decl, false);
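/* Effect of the loop above (illustrative): each entry is registered under its
   d->name, e.g. "__builtin__ITM_WM128", while the assembler name passed to
   add_builtin_function is d->name + strlen ("__builtin_"), i.e. "_ITM_WM128",
   so the barrier is also reachable under its libitm ABI name.  Hedged sketch
   of the resulting prototype for the VOID_FTYPE_PV4SF_V4SF entry:

       void _ITM_WM128 (__m128 *, __m128);
*/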
28340 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28341 not in the current target ISA, so that the user can compile particular
28342 modules with target-specific options that differ from the command-line
28343 options.  */
28344 static void
28345 ix86_init_mmx_sse_builtins (void)
28347 const struct builtin_description * d;
28348 enum ix86_builtin_func_type ftype;
28349 size_t i;
28351 /* Add all special builtins with variable number of operands. */
28352 for (i = 0, d = bdesc_special_args;
28353 i < ARRAY_SIZE (bdesc_special_args);
28354 i++, d++)
28356 if (d->name == 0)
28357 continue;
28359 ftype = (enum ix86_builtin_func_type) d->flag;
28360 def_builtin (d->mask, d->name, ftype, d->code);
28363 /* Add all builtins with variable number of operands. */
28364 for (i = 0, d = bdesc_args;
28365 i < ARRAY_SIZE (bdesc_args);
28366 i++, d++)
28368 if (d->name == 0)
28369 continue;
28371 ftype = (enum ix86_builtin_func_type) d->flag;
28372 def_builtin_const (d->mask, d->name, ftype, d->code);
28375 /* pcmpestr[im] insns. */
28376 for (i = 0, d = bdesc_pcmpestr;
28377 i < ARRAY_SIZE (bdesc_pcmpestr);
28378 i++, d++)
28380 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28381 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28382 else
28383 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28384 def_builtin_const (d->mask, d->name, ftype, d->code);
28387 /* pcmpistr[im] insns. */
28388 for (i = 0, d = bdesc_pcmpistr;
28389 i < ARRAY_SIZE (bdesc_pcmpistr);
28390 i++, d++)
28392 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28393 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28394 else
28395 ftype = INT_FTYPE_V16QI_V16QI_INT;
28396 def_builtin_const (d->mask, d->name, ftype, d->code);
28399 /* comi/ucomi insns. */
28400 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28402 if (d->mask == OPTION_MASK_ISA_SSE2)
28403 ftype = INT_FTYPE_V2DF_V2DF;
28404 else
28405 ftype = INT_FTYPE_V4SF_V4SF;
28406 def_builtin_const (d->mask, d->name, ftype, d->code);
28409 /* SSE */
28410 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28411 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28412 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28413 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28415 /* SSE or 3DNow!A */
28416 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28417 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28418 IX86_BUILTIN_MASKMOVQ);
28420 /* SSE2 */
28421 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28422 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28424 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28425 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28426 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28427 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28429 /* SSE3. */
28430 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28431 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28432 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28433 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28435 /* AES */
28436 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28437 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28438 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28439 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28440 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28441 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28442 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28443 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28444 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28445 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28446 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28447 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28449 /* PCLMUL */
28450 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28451 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28453 /* RDRND */
28454 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28455 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28456 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28457 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28458 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28459 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28460 IX86_BUILTIN_RDRAND64_STEP);
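/* Usage sketch for the RDRND step builtins above (the documented wrappers are
   _rdrand16_step, _rdrand32_step and _rdrand64_step in immintrin.h).  The
   INT_FTYPE_PUNSIGNED signature reads as "store a random value through the
   pointer, return nonzero iff the hardware delivered one", so callers are
   expected to loop, since RDRAND can transiently fail:

       unsigned int r;
       while (!__builtin_ia32_rdrand32_step (&r))
         ;
       (use r here; the loop body and the consumer are illustrative only)  */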
28462 /* AVX2 */
28463 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28464 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28465 IX86_BUILTIN_GATHERSIV2DF);
28467 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28468 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28469 IX86_BUILTIN_GATHERSIV4DF);
28471 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28472 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28473 IX86_BUILTIN_GATHERDIV2DF);
28475 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28476 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28477 IX86_BUILTIN_GATHERDIV4DF);
28479 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28480 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28481 IX86_BUILTIN_GATHERSIV4SF);
28483 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28484 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28485 IX86_BUILTIN_GATHERSIV8SF);
28487 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28488 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28489 IX86_BUILTIN_GATHERDIV4SF);
28491 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28492 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28493 IX86_BUILTIN_GATHERDIV8SF);
28495 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28496 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28497 IX86_BUILTIN_GATHERSIV2DI);
28499 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28500 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28501 IX86_BUILTIN_GATHERSIV4DI);
28503 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28504 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28505 IX86_BUILTIN_GATHERDIV2DI);
28507 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28508 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28509 IX86_BUILTIN_GATHERDIV4DI);
28511 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28512 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28513 IX86_BUILTIN_GATHERSIV4SI);
28515 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28516 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28517 IX86_BUILTIN_GATHERSIV8SI);
28519 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28520 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28521 IX86_BUILTIN_GATHERDIV4SI);
28523 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28524 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28525 IX86_BUILTIN_GATHERDIV8SI);
28527 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28528 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28529 IX86_BUILTIN_GATHERALTSIV4DF);
28531 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28532 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28533 IX86_BUILTIN_GATHERALTDIV8SF);
28535 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28536 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28537 IX86_BUILTIN_GATHERALTSIV4DI);
28539 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28540 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28541 IX86_BUILTIN_GATHERALTDIV8SI);
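/* Shape of the gather builtins above (illustrative): a type such as
   V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT encodes the operands
   (pass-through source, base pointer, index vector, per-element mask, scale),
   mirroring VGATHERDPD.  A hedged user-level sketch, assuming -mavx2:

       __v4df r = __builtin_ia32_gathersiv4df (src, base, idx, mask, 8);

   The documented entry points are the _mm256_i32gather_pd and
   _mm256_mask_i32gather_pd family in avx2intrin.h, which wrap these
   builtins.  */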
28543 /* RTM. */
28544 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28545 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28547 /* MMX access to the vec_init patterns. */
28548 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28549 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28551 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28552 V4HI_FTYPE_HI_HI_HI_HI,
28553 IX86_BUILTIN_VEC_INIT_V4HI);
28555 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28556 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28557 IX86_BUILTIN_VEC_INIT_V8QI);
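/* The vec_init builtins above back the MMX "set" intrinsics; mmintrin.h, for
   instance, builds _mm_set_pi32 on top of the V2SI initializer (sketch, low
   element first):

       __m64 m = (__m64) __builtin_ia32_vec_init_v2si (lo, hi);
*/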
28559 /* Access to the vec_extract patterns. */
28560 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28561 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28562 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28563 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28564 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28565 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28566 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28567 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28568 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28569 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28571 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28572 "__builtin_ia32_vec_ext_v4hi",
28573 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28575 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28576 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28578 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28579 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28581 /* Access to the vec_set patterns. */
28582 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28583 "__builtin_ia32_vec_set_v2di",
28584 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28586 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28587 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28589 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28590 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28592 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28593 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28595 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28596 "__builtin_ia32_vec_set_v4hi",
28597 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28599 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28600 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28602 /* RDSEED */
28603 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28604 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28605 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28606 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28607 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28608 "__builtin_ia32_rdseed_di_step",
28609 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28611 /* ADCX */
28612 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28613 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28614 def_builtin (OPTION_MASK_ISA_64BIT,
28615 "__builtin_ia32_addcarryx_u64",
28616 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28617 IX86_BUILTIN_ADDCARRYX64);
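/* Usage sketch for the add-with-carry builtins above (the documented wrapper
   is _addcarryx_u32 in adxintrin.h).  UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
   reads as (carry-in, a, b, &sum) returning the carry-out; the zero ISA mask
   means the builtin is registered unconditionally, presumably so it can
   expand to a plain add-with-carry when ADX is unavailable:

       unsigned int sum;
       unsigned char c = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);

   where c is the carry out of a + b and sum holds the low 32 bits.  */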
28619 /* Add the FMA4/FMA/XOP multi-arg builtin instructions.  */
28620 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28622 if (d->name == 0)
28623 continue;
28625 ftype = (enum ix86_builtin_func_type) d->flag;
28626 def_builtin_const (d->mask, d->name, ftype, d->code);
28630 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28631 to return a pointer to VERSION_DECL if the outcome of the expression
28632 formed by PREDICATE_CHAIN is true. This function will be called during
28633 version dispatch to decide which function version to execute. It returns
28634 the basic block at the end, to which more conditions can be added. */
28636 static basic_block
28637 add_condition_to_bb (tree function_decl, tree version_decl,
28638 tree predicate_chain, basic_block new_bb)
28640 gimple return_stmt;
28641 tree convert_expr, result_var;
28642 gimple convert_stmt;
28643 gimple call_cond_stmt;
28644 gimple if_else_stmt;
28646 basic_block bb1, bb2, bb3;
28647 edge e12, e23;
28649 tree cond_var, and_expr_var = NULL_TREE;
28650 gimple_seq gseq;
28652 tree predicate_decl, predicate_arg;
28654 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28656 gcc_assert (new_bb != NULL);
28657 gseq = bb_seq (new_bb);
28660 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28661 build_fold_addr_expr (version_decl));
28662 result_var = create_tmp_var (ptr_type_node, NULL);
28663 convert_stmt = gimple_build_assign (result_var, convert_expr);
28664 return_stmt = gimple_build_return (result_var);
28666 if (predicate_chain == NULL_TREE)
28668 gimple_seq_add_stmt (&gseq, convert_stmt);
28669 gimple_seq_add_stmt (&gseq, return_stmt);
28670 set_bb_seq (new_bb, gseq);
28671 gimple_set_bb (convert_stmt, new_bb);
28672 gimple_set_bb (return_stmt, new_bb);
28673 pop_cfun ();
28674 return new_bb;
28677 while (predicate_chain != NULL)
28679 cond_var = create_tmp_var (integer_type_node, NULL);
28680 predicate_decl = TREE_PURPOSE (predicate_chain);
28681 predicate_arg = TREE_VALUE (predicate_chain);
28682 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28683 gimple_call_set_lhs (call_cond_stmt, cond_var);
28685 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28686 gimple_set_bb (call_cond_stmt, new_bb);
28687 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28689 predicate_chain = TREE_CHAIN (predicate_chain);
28691 if (and_expr_var == NULL)
28692 and_expr_var = cond_var;
28693 else
28695 gimple assign_stmt;
28696 /* Use MIN_EXPR to check whether any integer is zero:
28697    and_expr_var = min_expr <cond_var, and_expr_var>.  */
28698 assign_stmt = gimple_build_assign (and_expr_var,
28699 build2 (MIN_EXPR, integer_type_node,
28700 cond_var, and_expr_var));
28702 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28703 gimple_set_bb (assign_stmt, new_bb);
28704 gimple_seq_add_stmt (&gseq, assign_stmt);
28708 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28709 integer_zero_node,
28710 NULL_TREE, NULL_TREE);
28711 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28712 gimple_set_bb (if_else_stmt, new_bb);
28713 gimple_seq_add_stmt (&gseq, if_else_stmt);
28715 gimple_seq_add_stmt (&gseq, convert_stmt);
28716 gimple_seq_add_stmt (&gseq, return_stmt);
28717 set_bb_seq (new_bb, gseq);
28719 bb1 = new_bb;
28720 e12 = split_block (bb1, if_else_stmt);
28721 bb2 = e12->dest;
28722 e12->flags &= ~EDGE_FALLTHRU;
28723 e12->flags |= EDGE_TRUE_VALUE;
28725 e23 = split_block (bb2, return_stmt);
28727 gimple_set_bb (convert_stmt, bb2);
28728 gimple_set_bb (return_stmt, bb2);
28730 bb3 = e23->dest;
28731 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28733 remove_edge (e23);
28734 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28736 pop_cfun ();
28738 return bb3;
28741 /* This parses the attribute arguments to target in DECL and determines
28742 the right builtin to use to match the platform specification.
28743 It returns the priority value for this version decl. If PREDICATE_LIST
28744 is not NULL, it stores the list of cpu features that need to be checked
28745 before dispatching this function. */
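/* Illustrative example (not part of the original source) of the attribute
   strings this parses, with the priorities assigned by the code below:

     __attribute__ ((target ("default")))      ->  priority 0
     __attribute__ ((target ("ssse3")))        ->  P_SSSE3
     __attribute__ ((target ("arch=corei7")))  ->  P_PROC_SSE4_2

   With a non-NULL PREDICATE_LIST, the matching __builtin_cpu_is and
   __builtin_cpu_supports checks are also recorded for the dispatcher.  */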
28747 static unsigned int
28748 get_builtin_code_for_version (tree decl, tree *predicate_list)
28750 tree attrs;
28751 struct cl_target_option cur_target;
28752 tree target_node;
28753 struct cl_target_option *new_target;
28754 const char *arg_str = NULL;
28755 const char *attrs_str = NULL;
28756 char *tok_str = NULL;
28757 char *token;
28759 /* Priority of i386 features, greater value is higher priority. This is
28760 used to decide the order in which function dispatch must happen. For
28761 instance, a version specialized for SSE4.2 should be checked for dispatch
28762 before a version for SSE3, as SSE4.2 implies SSE3. */
28763 enum feature_priority
28765 P_ZERO = 0,
28766 P_MMX,
28767 P_SSE,
28768 P_SSE2,
28769 P_SSE3,
28770 P_SSSE3,
28771 P_PROC_SSSE3,
28772 P_SSE4_a,
28773 P_PROC_SSE4_a,
28774 P_SSE4_1,
28775 P_SSE4_2,
28776 P_PROC_SSE4_2,
28777 P_POPCNT,
28778 P_AVX,
28779 P_AVX2,
28780 P_FMA,
28781 P_PROC_FMA
28784 enum feature_priority priority = P_ZERO;
28786 /* These are the target attribute strings for which a dispatcher is
28787 available, from fold_builtin_cpu. */
28789 static struct _feature_list
28791 const char *const name;
28792 const enum feature_priority priority;
28794 const feature_list[] =
28796 {"mmx", P_MMX},
28797 {"sse", P_SSE},
28798 {"sse2", P_SSE2},
28799 {"sse3", P_SSE3},
28800 {"ssse3", P_SSSE3},
28801 {"sse4.1", P_SSE4_1},
28802 {"sse4.2", P_SSE4_2},
28803 {"popcnt", P_POPCNT},
28804 {"avx", P_AVX},
28805 {"avx2", P_AVX2}
28809 static unsigned int NUM_FEATURES
28810 = sizeof (feature_list) / sizeof (struct _feature_list);
28812 unsigned int i;
28814 tree predicate_chain = NULL_TREE;
28815 tree predicate_decl, predicate_arg;
28817 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28818 gcc_assert (attrs != NULL);
28820 attrs = TREE_VALUE (TREE_VALUE (attrs));
28822 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28823 attrs_str = TREE_STRING_POINTER (attrs);
28825 /* Return priority zero for default function. */
28826 if (strcmp (attrs_str, "default") == 0)
28827 return 0;
28829 /* Handle arch= if specified. For priority, set it to be 1 more than
28830 the best instruction set the processor can handle. For instance, if
28831 there is a version for atom and a version for ssse3 (the highest ISA
28832 priority for atom), the atom version must be checked for dispatch
28833 before the ssse3 version. */
28834 if (strstr (attrs_str, "arch=") != NULL)
28836 cl_target_option_save (&cur_target, &global_options);
28837 target_node = ix86_valid_target_attribute_tree (attrs);
28839 gcc_assert (target_node);
28840 new_target = TREE_TARGET_OPTION (target_node);
28841 gcc_assert (new_target);
28843 if (new_target->arch_specified && new_target->arch > 0)
28845 switch (new_target->arch)
28847 case PROCESSOR_CORE2:
28848 arg_str = "core2";
28849 priority = P_PROC_SSSE3;
28850 break;
28851 case PROCESSOR_COREI7:
28852 arg_str = "corei7";
28853 priority = P_PROC_SSE4_2;
28854 break;
28855 case PROCESSOR_ATOM:
28856 arg_str = "atom";
28857 priority = P_PROC_SSSE3;
28858 break;
28859 case PROCESSOR_AMDFAM10:
28860 arg_str = "amdfam10h";
28861 priority = P_PROC_SSE4_a;
28862 break;
28863 case PROCESSOR_BDVER1:
28864 arg_str = "bdver1";
28865 priority = P_PROC_FMA;
28866 break;
28867 case PROCESSOR_BDVER2:
28868 arg_str = "bdver2";
28869 priority = P_PROC_FMA;
28870 break;
28874 cl_target_option_restore (&global_options, &cur_target);
28876 if (predicate_list && arg_str == NULL)
28878 error_at (DECL_SOURCE_LOCATION (decl),
28879 "No dispatcher found for the versioning attributes");
28880 return 0;
28883 if (predicate_list)
28885 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28886 /* For a C string literal the length includes the trailing NULL. */
28887 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28888 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28889 predicate_chain);
28893 /* Process feature name. */
28894 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28895 strcpy (tok_str, attrs_str);
28896 token = strtok (tok_str, ",");
28897 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28899 while (token != NULL)
28901 /* Do not process "arch=" */
28902 if (strncmp (token, "arch=", 5) == 0)
28904 token = strtok (NULL, ",");
28905 continue;
28907 for (i = 0; i < NUM_FEATURES; ++i)
28909 if (strcmp (token, feature_list[i].name) == 0)
28911 if (predicate_list)
28913 predicate_arg = build_string_literal (
28914 strlen (feature_list[i].name) + 1,
28915 feature_list[i].name);
28916 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28917 predicate_chain);
28919 /* Find the maximum priority feature. */
28920 if (feature_list[i].priority > priority)
28921 priority = feature_list[i].priority;
28923 break;
28926 if (predicate_list && i == NUM_FEATURES)
28928 error_at (DECL_SOURCE_LOCATION (decl),
28929 "No dispatcher found for %s", token);
28930 return 0;
28932 token = strtok (NULL, ",");
28934 free (tok_str);
28936 if (predicate_list && predicate_chain == NULL_TREE)
28938 error_at (DECL_SOURCE_LOCATION (decl),
28939 "No dispatcher found for the versioning attributes : %s",
28940 attrs_str);
28941 return 0;
28943 else if (predicate_list)
28945 predicate_chain = nreverse (predicate_chain);
28946 *predicate_list = predicate_chain;
28949 return priority;
28952 /* This compares the priority of target features in function DECL1
28953 and DECL2. It returns positive value if DECL1 is higher priority,
28954 negative value if DECL2 is higher priority and 0 if they are the
28955 same. */
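/* For illustration (not part of the original source): comparing a version
   declared with target("arch=corei7") (P_PROC_SSE4_2) against one declared
   with target("sse3") (P_SSE3) yields a positive value, so the corei7
   version is treated as higher priority.  */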
28957 static int
28958 ix86_compare_version_priority (tree decl1, tree decl2)
28960 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
28961 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
28963 return (int)priority1 - (int)priority2;
28966 /* V1 and V2 point to function versions with different priorities
28967 based on the target ISA. This function compares their priorities. */
28969 static int
28970 feature_compare (const void *v1, const void *v2)
28972 typedef struct _function_version_info
28974 tree version_decl;
28975 tree predicate_chain;
28976 unsigned int dispatch_priority;
28977 } function_version_info;
28979 const function_version_info c1 = *(const function_version_info *)v1;
28980 const function_version_info c2 = *(const function_version_info *)v2;
28981 return (c2.dispatch_priority - c1.dispatch_priority);
28984 /* This function generates the dispatch function for
28985 multi-versioned functions. DISPATCH_DECL is the function which will
28986    contain the dispatch logic.  FNDECLS holds the function choices for
28987    dispatch and is passed as a vector.  EMPTY_BB is the basic block pointer
28988 in DISPATCH_DECL in which the dispatch code is generated. */
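/* Rough shape of the resolver body this produces (illustrative only; the
   version names are hypothetical):

     __builtin_cpu_init ();
     if (<highest-priority version's checks hold>)
       return &foo.arch_corei7;
     if (<next version's checks hold>)
       return &foo.ssse3;
     ...
     return &foo;   // the default version is dispatched last

   Versions are sorted by descending dispatch priority before the
   condition blocks are emitted.  */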
28990 static int
28991 dispatch_function_versions (tree dispatch_decl,
28992 void *fndecls_p,
28993 basic_block *empty_bb)
28995 tree default_decl;
28996 gimple ifunc_cpu_init_stmt;
28997 gimple_seq gseq;
28998 int ix;
28999 tree ele;
29000 vec<tree> *fndecls;
29001 unsigned int num_versions = 0;
29002 unsigned int actual_versions = 0;
29003 unsigned int i;
29005 struct _function_version_info
29007 tree version_decl;
29008 tree predicate_chain;
29009 unsigned int dispatch_priority;
29010 }*function_version_info;
29012 gcc_assert (dispatch_decl != NULL
29013 && fndecls_p != NULL
29014 && empty_bb != NULL);
29016   /* fndecls_p is actually a vector.  */
29017 fndecls = static_cast<vec<tree> *> (fndecls_p);
29019 /* At least one more version other than the default. */
29020 num_versions = fndecls->length ();
29021 gcc_assert (num_versions >= 2);
29023 function_version_info = (struct _function_version_info *)
29024 XNEWVEC (struct _function_version_info, (num_versions - 1));
29026 /* The first version in the vector is the default decl. */
29027 default_decl = (*fndecls)[0];
29029 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29031 gseq = bb_seq (*empty_bb);
29032 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29033      constructors, so explicitly call __builtin_cpu_init here. */
29034 ifunc_cpu_init_stmt = gimple_build_call_vec (
29035 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29036 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29037 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29038 set_bb_seq (*empty_bb, gseq);
29040 pop_cfun ();
29043 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29045 tree version_decl = ele;
29046 tree predicate_chain = NULL_TREE;
29047 unsigned int priority;
29048 /* Get attribute string, parse it and find the right predicate decl.
29049 The predicate function could be a lengthy combination of many
29050 features, like arch-type and various isa-variants. */
29051 priority = get_builtin_code_for_version (version_decl,
29052 &predicate_chain);
29054 if (predicate_chain == NULL_TREE)
29055 continue;
29057 actual_versions++;
29058 function_version_info [ix - 1].version_decl = version_decl;
29059 function_version_info [ix - 1].predicate_chain = predicate_chain;
29060 function_version_info [ix - 1].dispatch_priority = priority;
29063 /* Sort the versions according to descending order of dispatch priority. The
29064 priority is based on the ISA. This is not a perfect solution. There
29065 could still be ambiguity. If more than one function version is suitable
29066 to execute, which one should be dispatched? In future, allow the user
29067 to specify a dispatch priority next to the version. */
29068 qsort (function_version_info, actual_versions,
29069 sizeof (struct _function_version_info), feature_compare);
29071 for (i = 0; i < actual_versions; ++i)
29072 *empty_bb = add_condition_to_bb (dispatch_decl,
29073 function_version_info[i].version_decl,
29074 function_version_info[i].predicate_chain,
29075 *empty_bb);
29077   /* Dispatch the default version at the end.  */
29078 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29079 NULL, *empty_bb);
29081 free (function_version_info);
29082 return 0;
29085 /* Comparator function to be used in qsort routine to sort attribute
29086 specification strings to "target". */
29088 static int
29089 attr_strcmp (const void *v1, const void *v2)
29091 const char *c1 = *(char *const*)v1;
29092 const char *c2 = *(char *const*)v2;
29093 return strcmp (c1, c2);
29096 /* ARGLIST is the argument to target attribute. This function tokenizes
29097 the comma separated arguments, sorts them and returns a string which
29098 is a unique identifier for the comma separated arguments. It also
29099 replaces non-identifier characters "=,-" with "_". */
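/* For illustration (not part of the original source): the argument list
   for target("arch=core2,popcnt") is tokenized into "arch_core2" and
   "popcnt", sorted, and joined to give "arch_core2_popcnt".  */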
29101 static char *
29102 sorted_attr_string (tree arglist)
29104 tree arg;
29105 size_t str_len_sum = 0;
29106 char **args = NULL;
29107 char *attr_str, *ret_str;
29108 char *attr = NULL;
29109 unsigned int argnum = 1;
29110 unsigned int i;
29112 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29114 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29115 size_t len = strlen (str);
29116 str_len_sum += len + 1;
29117 if (arg != arglist)
29118 argnum++;
29119 for (i = 0; i < strlen (str); i++)
29120 if (str[i] == ',')
29121 argnum++;
29124 attr_str = XNEWVEC (char, str_len_sum);
29125 str_len_sum = 0;
29126 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29128 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29129 size_t len = strlen (str);
29130 memcpy (attr_str + str_len_sum, str, len);
29131 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29132 str_len_sum += len + 1;
29135 /* Replace "=,-" with "_". */
29136 for (i = 0; i < strlen (attr_str); i++)
29137 if (attr_str[i] == '=' || attr_str[i]== '-')
29138 attr_str[i] = '_';
29140 if (argnum == 1)
29141 return attr_str;
29143 args = XNEWVEC (char *, argnum);
29145 i = 0;
29146 attr = strtok (attr_str, ",");
29147 while (attr != NULL)
29149 args[i] = attr;
29150 i++;
29151 attr = strtok (NULL, ",");
29154 qsort (args, argnum, sizeof (char *), attr_strcmp);
29156 ret_str = XNEWVEC (char, str_len_sum);
29157 str_len_sum = 0;
29158 for (i = 0; i < argnum; i++)
29160 size_t len = strlen (args[i]);
29161 memcpy (ret_str + str_len_sum, args[i], len);
29162 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29163 str_len_sum += len + 1;
29166 XDELETEVEC (args);
29167 XDELETEVEC (attr_str);
29168 return ret_str;
29171 /* This function changes the assembler name for functions that are
29172 versions. If DECL is a function version and has a "target"
29173 attribute, it appends the attribute string to its assembler name. */
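/* For example (illustrative, not part of the original source): a version
   of foo declared with target("sse4.2") gets the assembler name
   "foo.sse4.2", one declared with target("arch=atom") becomes
   "foo.arch_atom", and the "default" version keeps its original name.  */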
29175 static tree
29176 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29178 tree version_attr;
29179 const char *orig_name, *version_string;
29180 char *attr_str, *assembler_name;
29182 if (DECL_DECLARED_INLINE_P (decl)
29183 && lookup_attribute ("gnu_inline",
29184 DECL_ATTRIBUTES (decl)))
29185 error_at (DECL_SOURCE_LOCATION (decl),
29186 "Function versions cannot be marked as gnu_inline,"
29187 " bodies have to be generated");
29189 if (DECL_VIRTUAL_P (decl)
29190 || DECL_VINDEX (decl))
29191 sorry ("Virtual function multiversioning not supported");
29193 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29195   /* The target attribute string cannot be NULL.  */
29196 gcc_assert (version_attr != NULL_TREE);
29198 orig_name = IDENTIFIER_POINTER (id);
29199 version_string
29200 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29202 if (strcmp (version_string, "default") == 0)
29203 return id;
29205 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29206 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29208 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29210 /* Allow assembler name to be modified if already set. */
29211 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29212 SET_DECL_RTL (decl, NULL);
29214 tree ret = get_identifier (assembler_name);
29215 XDELETEVEC (attr_str);
29216 XDELETEVEC (assembler_name);
29217 return ret;
29220 /* This function returns true if FN1 and FN2 are versions of the same function,
29221 that is, the target strings of the function decls are different. This assumes
29222 that FN1 and FN2 have the same signature. */
29224 static bool
29225 ix86_function_versions (tree fn1, tree fn2)
29227 tree attr1, attr2;
29228 char *target1, *target2;
29229 bool result;
29231 if (TREE_CODE (fn1) != FUNCTION_DECL
29232 || TREE_CODE (fn2) != FUNCTION_DECL)
29233 return false;
29235 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29236 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29238 /* At least one function decl should have the target attribute specified. */
29239 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29240 return false;
29242 /* Diagnose missing target attribute if one of the decls is already
29243 multi-versioned. */
29244 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29246 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29248 if (attr2 != NULL_TREE)
29250 tree tem = fn1;
29251 fn1 = fn2;
29252 fn2 = tem;
29253 attr1 = attr2;
29255 error_at (DECL_SOURCE_LOCATION (fn2),
29256 "missing %<target%> attribute for multi-versioned %D",
29257 fn2);
29258 error_at (DECL_SOURCE_LOCATION (fn1),
29259 "previous declaration of %D", fn1);
29260 /* Prevent diagnosing of the same error multiple times. */
29261 DECL_ATTRIBUTES (fn2)
29262 = tree_cons (get_identifier ("target"),
29263 copy_node (TREE_VALUE (attr1)),
29264 DECL_ATTRIBUTES (fn2));
29266 return false;
29269 target1 = sorted_attr_string (TREE_VALUE (attr1));
29270 target2 = sorted_attr_string (TREE_VALUE (attr2));
29272 /* The sorted target strings must be different for fn1 and fn2
29273 to be versions. */
29274 if (strcmp (target1, target2) == 0)
29275 result = false;
29276 else
29277 result = true;
29279 XDELETEVEC (target1);
29280 XDELETEVEC (target2);
29282 return result;
29285 static tree
29286 ix86_mangle_decl_assembler_name (tree decl, tree id)
29288 /* For function version, add the target suffix to the assembler name. */
29289 if (TREE_CODE (decl) == FUNCTION_DECL
29290 && DECL_FUNCTION_VERSIONED (decl))
29291 id = ix86_mangle_function_version_assembler_name (decl, id);
29292 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29293 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29294 #endif
29296 return id;
29299 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29300 is true, append the full path name of the source file. */
29302 static char *
29303 make_name (tree decl, const char *suffix, bool make_unique)
29305 char *global_var_name;
29306 int name_len;
29307 const char *name;
29308 const char *unique_name = NULL;
29310 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29312 /* Get a unique name that can be used globally without any chances
29313 of collision at link time. */
29314 if (make_unique)
29315 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29317 name_len = strlen (name) + strlen (suffix) + 2;
29319 if (make_unique)
29320 name_len += strlen (unique_name) + 1;
29321 global_var_name = XNEWVEC (char, name_len);
29323 /* Use '.' to concatenate names as it is demangler friendly. */
29324 if (make_unique)
29325 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29326 suffix);
29327 else
29328 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29330 return global_var_name;
29333 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29335 /* Make a dispatcher declaration for the multi-versioned function DECL.
29336    Calls to the function DECL will be replaced with calls to the dispatcher
29337 by the front-end. Return the decl created. */
29339 static tree
29340 make_dispatcher_decl (const tree decl)
29342 tree func_decl;
29343 char *func_name;
29344 tree fn_type, func_type;
29345 bool is_uniq = false;
29347 if (TREE_PUBLIC (decl) == 0)
29348 is_uniq = true;
29350 func_name = make_name (decl, "ifunc", is_uniq);
29352 fn_type = TREE_TYPE (decl);
29353 func_type = build_function_type (TREE_TYPE (fn_type),
29354 TYPE_ARG_TYPES (fn_type));
29356 func_decl = build_fn_decl (func_name, func_type);
29357 XDELETEVEC (func_name);
29358 TREE_USED (func_decl) = 1;
29359 DECL_CONTEXT (func_decl) = NULL_TREE;
29360 DECL_INITIAL (func_decl) = error_mark_node;
29361 DECL_ARTIFICIAL (func_decl) = 1;
29362 /* Mark this func as external, the resolver will flip it again if
29363 it gets generated. */
29364 DECL_EXTERNAL (func_decl) = 1;
29365   /* IFUNCs have to be externally visible.  */
29366 TREE_PUBLIC (func_decl) = 1;
29368 return func_decl;
29371 #endif
29373 /* Returns true if DECL is multi-versioned and is the default function,
29374    that is, it is not tagged with a target-specific optimization.  */
29376 static bool
29377 is_function_default_version (const tree decl)
29379 if (TREE_CODE (decl) != FUNCTION_DECL
29380 || !DECL_FUNCTION_VERSIONED (decl))
29381 return false;
29382 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29383 gcc_assert (attr);
29384 attr = TREE_VALUE (TREE_VALUE (attr));
29385 return (TREE_CODE (attr) == STRING_CST
29386 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29389 /* Make a dispatcher declaration for the multi-versioned function DECL.
29390    Calls to the function DECL will be replaced with calls to the dispatcher
29391 by the front-end. Returns the decl of the dispatcher function. */
29393 static tree
29394 ix86_get_function_versions_dispatcher (void *decl)
29396 tree fn = (tree) decl;
29397 struct cgraph_node *node = NULL;
29398 struct cgraph_node *default_node = NULL;
29399 struct cgraph_function_version_info *node_v = NULL;
29400 struct cgraph_function_version_info *first_v = NULL;
29402 tree dispatch_decl = NULL;
29404 struct cgraph_function_version_info *default_version_info = NULL;
29406 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29408 node = cgraph_get_node (fn);
29409 gcc_assert (node != NULL);
29411 node_v = get_cgraph_node_version (node);
29412 gcc_assert (node_v != NULL);
29414 if (node_v->dispatcher_resolver != NULL)
29415 return node_v->dispatcher_resolver;
29417 /* Find the default version and make it the first node. */
29418 first_v = node_v;
29419   /* Go to the beginning of the chain. */
29420 while (first_v->prev != NULL)
29421 first_v = first_v->prev;
29422 default_version_info = first_v;
29423 while (default_version_info != NULL)
29425 if (is_function_default_version
29426 (default_version_info->this_node->symbol.decl))
29427 break;
29428 default_version_info = default_version_info->next;
29431 /* If there is no default node, just return NULL. */
29432 if (default_version_info == NULL)
29433 return NULL;
29435 /* Make default info the first node. */
29436 if (first_v != default_version_info)
29438 default_version_info->prev->next = default_version_info->next;
29439 if (default_version_info->next)
29440 default_version_info->next->prev = default_version_info->prev;
29441 first_v->prev = default_version_info;
29442 default_version_info->next = first_v;
29443 default_version_info->prev = NULL;
29446 default_node = default_version_info->this_node;
29448 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29449 if (targetm.has_ifunc_p ())
29451 struct cgraph_function_version_info *it_v = NULL;
29452 struct cgraph_node *dispatcher_node = NULL;
29453 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29455 /* Right now, the dispatching is done via ifunc. */
29456 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29458 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29459 gcc_assert (dispatcher_node != NULL);
29460 dispatcher_node->dispatcher_function = 1;
29461 dispatcher_version_info
29462 = insert_new_cgraph_node_version (dispatcher_node);
29463 dispatcher_version_info->next = default_version_info;
29464 dispatcher_node->local.finalized = 1;
29466 /* Set the dispatcher for all the versions. */
29467 it_v = default_version_info;
29468 while (it_v != NULL)
29470 it_v->dispatcher_resolver = dispatch_decl;
29471 it_v = it_v->next;
29474 else
29475 #endif
29477 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29478 "multiversioning needs ifunc which is not supported "
29479 "on this target");
29482 return dispatch_decl;
29485 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29486 it to CHAIN. */
29488 static tree
29489 make_attribute (const char *name, const char *arg_name, tree chain)
29491 tree attr_name;
29492 tree attr_arg_name;
29493 tree attr_args;
29494 tree attr;
29496 attr_name = get_identifier (name);
29497 attr_arg_name = build_string (strlen (arg_name), arg_name);
29498 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29499 attr = tree_cons (attr_name, attr_args, chain);
29500 return attr;
29503 /* Make the resolver function decl to dispatch the versions of
29504 a multi-versioned function, DEFAULT_DECL. Create an
29505 empty basic block in the resolver and store the pointer in
29506 EMPTY_BB. Return the decl of the resolver function. */
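/* Illustrative picture (not part of the original source) of the pieces
   involved for a versioned function foo in this implementation:

     foo              -- the default version
     foo.ssse3, ...   -- the other versions (see the name mangling above)
     foo.ifunc        -- DISPATCH_DECL, an ifunc whose resolver is below
     foo.resolver     -- the decl built here; its body, filled in later,
                         returns a pointer to the version to execute

   If DEFAULT_DECL is not public, the resolver name is additionally made
   unique with the file name, since the resolver must stay visible to the
   dynamic loader.  */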
29508 static tree
29509 make_resolver_func (const tree default_decl,
29510 const tree dispatch_decl,
29511 basic_block *empty_bb)
29513 char *resolver_name;
29514 tree decl, type, decl_name, t;
29515 bool is_uniq = false;
29517 /* IFUNC's have to be globally visible. So, if the default_decl is
29518 not, then the name of the IFUNC should be made unique. */
29519 if (TREE_PUBLIC (default_decl) == 0)
29520 is_uniq = true;
29522 /* Append the filename to the resolver function if the versions are
29523 not externally visible. This is because the resolver function has
29524 to be externally visible for the loader to find it. So, appending
29525 the filename will prevent conflicts with a resolver function from
29526 another module which is based on the same version name. */
29527 resolver_name = make_name (default_decl, "resolver", is_uniq);
29529 /* The resolver function should return a (void *). */
29530 type = build_function_type_list (ptr_type_node, NULL_TREE);
29532 decl = build_fn_decl (resolver_name, type);
29533 decl_name = get_identifier (resolver_name);
29534 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29536 DECL_NAME (decl) = decl_name;
29537 TREE_USED (decl) = 1;
29538 DECL_ARTIFICIAL (decl) = 1;
29539 DECL_IGNORED_P (decl) = 0;
29540 /* IFUNC resolvers have to be externally visible. */
29541 TREE_PUBLIC (decl) = 1;
29542 DECL_UNINLINABLE (decl) = 0;
29544 /* Resolver is not external, body is generated. */
29545 DECL_EXTERNAL (decl) = 0;
29546 DECL_EXTERNAL (dispatch_decl) = 0;
29548 DECL_CONTEXT (decl) = NULL_TREE;
29549 DECL_INITIAL (decl) = make_node (BLOCK);
29550 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29552 if (DECL_COMDAT_GROUP (default_decl)
29553 || TREE_PUBLIC (default_decl))
29555 /* In this case, each translation unit with a call to this
29556 versioned function will put out a resolver. Ensure it
29557 is comdat to keep just one copy. */
29558 DECL_COMDAT (decl) = 1;
29559 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29561 /* Build result decl and add to function_decl. */
29562 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29563 DECL_ARTIFICIAL (t) = 1;
29564 DECL_IGNORED_P (t) = 1;
29565 DECL_RESULT (decl) = t;
29567 gimplify_function_tree (decl);
29568 push_cfun (DECL_STRUCT_FUNCTION (decl));
29569 *empty_bb = init_lowered_empty_function (decl, false);
29571 cgraph_add_new_function (decl, true);
29572 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29574 pop_cfun ();
29576 gcc_assert (dispatch_decl != NULL);
29577 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29578 DECL_ATTRIBUTES (dispatch_decl)
29579 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29581 /* Create the alias for dispatch to resolver here. */
29582 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29583 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29584 XDELETEVEC (resolver_name);
29585 return decl;
29588 /* Generate the dispatching code body to dispatch multi-versioned function
29589 DECL. The target hook is called to process the "target" attributes and
29590 provide the code to dispatch the right function at run-time. NODE points
29591 to the dispatcher decl whose body will be created. */
29593 static tree
29594 ix86_generate_version_dispatcher_body (void *node_p)
29596 tree resolver_decl;
29597 basic_block empty_bb;
29598 vec<tree> fn_ver_vec = vNULL;
29599 tree default_ver_decl;
29600 struct cgraph_node *versn;
29601 struct cgraph_node *node;
29603 struct cgraph_function_version_info *node_version_info = NULL;
29604 struct cgraph_function_version_info *versn_info = NULL;
29606 node = (cgraph_node *)node_p;
29608 node_version_info = get_cgraph_node_version (node);
29609 gcc_assert (node->dispatcher_function
29610 && node_version_info != NULL);
29612 if (node_version_info->dispatcher_resolver)
29613 return node_version_info->dispatcher_resolver;
29615 /* The first version in the chain corresponds to the default version. */
29616 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29618 /* node is going to be an alias, so remove the finalized bit. */
29619 node->local.finalized = false;
29621 resolver_decl = make_resolver_func (default_ver_decl,
29622 node->symbol.decl, &empty_bb);
29624 node_version_info->dispatcher_resolver = resolver_decl;
29626 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29628 fn_ver_vec.create (2);
29630 for (versn_info = node_version_info->next; versn_info;
29631 versn_info = versn_info->next)
29633 versn = versn_info->this_node;
29634 /* Check for virtual functions here again, as by this time it should
29635 have been determined if this function needs a vtable index or
29636 not. This happens for methods in derived classes that override
29637 virtual methods in base classes but are not explicitly marked as
29638 virtual. */
29639 if (DECL_VINDEX (versn->symbol.decl))
29640 sorry ("Virtual function multiversioning not supported");
29642 fn_ver_vec.safe_push (versn->symbol.decl);
29645 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29646 fn_ver_vec.release ();
29647 rebuild_cgraph_edges ();
29648 pop_cfun ();
29649 return resolver_decl;
29651 /* This builds the processor_model struct type defined in
29652 libgcc/config/i386/cpuinfo.c */
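/* For reference, the struct being mirrored is declared in
   libgcc/config/i386/cpuinfo.c roughly as:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
   */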
29654 static tree
29655 build_processor_model_struct (void)
29657 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29658 "__cpu_features"};
29659 tree field = NULL_TREE, field_chain = NULL_TREE;
29660 int i;
29661 tree type = make_node (RECORD_TYPE);
29663 /* The first 3 fields are unsigned int. */
29664 for (i = 0; i < 3; ++i)
29666 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29667 get_identifier (field_name[i]), unsigned_type_node);
29668 if (field_chain != NULL_TREE)
29669 DECL_CHAIN (field) = field_chain;
29670 field_chain = field;
29673 /* The last field is an array of unsigned integers of size one. */
29674 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29675 get_identifier (field_name[3]),
29676 build_array_type (unsigned_type_node,
29677 build_index_type (size_one_node)));
29678 if (field_chain != NULL_TREE)
29679 DECL_CHAIN (field) = field_chain;
29680 field_chain = field;
29682 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29683 return type;
29686 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29688 static tree
29689 make_var_decl (tree type, const char *name)
29691 tree new_decl;
29693 new_decl = build_decl (UNKNOWN_LOCATION,
29694 VAR_DECL,
29695 get_identifier(name),
29696 type);
29698 DECL_EXTERNAL (new_decl) = 1;
29699 TREE_STATIC (new_decl) = 1;
29700 TREE_PUBLIC (new_decl) = 1;
29701 DECL_INITIAL (new_decl) = 0;
29702 DECL_ARTIFICIAL (new_decl) = 0;
29703 DECL_PRESERVE_P (new_decl) = 1;
29705 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29706 assemble_variable (new_decl, 0, 0, 0);
29708 return new_decl;
29711 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29712    into an integer check against the cpu data defined in libgcc/config/i386/cpuinfo.c.  */
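/* Illustrative folds (not part of the original source):

     __builtin_cpu_is ("amd")
       becomes   __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("sse4.2")
       becomes   __cpu_model.__cpu_features[0] & (1 << F_SSE4_2)

   each converted to int, as built by the code below.  */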
29714 static tree
29715 fold_builtin_cpu (tree fndecl, tree *args)
29717 unsigned int i;
29718 enum ix86_builtins fn_code = (enum ix86_builtins)
29719 DECL_FUNCTION_CODE (fndecl);
29720 tree param_string_cst = NULL;
29722 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29723 enum processor_features
29725 F_CMOV = 0,
29726 F_MMX,
29727 F_POPCNT,
29728 F_SSE,
29729 F_SSE2,
29730 F_SSE3,
29731 F_SSSE3,
29732 F_SSE4_1,
29733 F_SSE4_2,
29734 F_AVX,
29735 F_AVX2,
29736 F_MAX
29739   /* These are the values for vendor types, cpu types and subtypes
29740      in cpuinfo.c.  The corresponding start value must be subtracted
29741      from cpu types and subtypes before they are used.  */
29742 enum processor_model
29744 M_INTEL = 1,
29745 M_AMD,
29746 M_CPU_TYPE_START,
29747 M_INTEL_ATOM,
29748 M_INTEL_CORE2,
29749 M_INTEL_COREI7,
29750 M_AMDFAM10H,
29751 M_AMDFAM15H,
29752 M_CPU_SUBTYPE_START,
29753 M_INTEL_COREI7_NEHALEM,
29754 M_INTEL_COREI7_WESTMERE,
29755 M_INTEL_COREI7_SANDYBRIDGE,
29756 M_AMDFAM10H_BARCELONA,
29757 M_AMDFAM10H_SHANGHAI,
29758 M_AMDFAM10H_ISTANBUL,
29759 M_AMDFAM15H_BDVER1,
29760 M_AMDFAM15H_BDVER2,
29761 M_AMDFAM15H_BDVER3
29764 static struct _arch_names_table
29766 const char *const name;
29767 const enum processor_model model;
29769 const arch_names_table[] =
29771 {"amd", M_AMD},
29772 {"intel", M_INTEL},
29773 {"atom", M_INTEL_ATOM},
29774 {"core2", M_INTEL_CORE2},
29775 {"corei7", M_INTEL_COREI7},
29776 {"nehalem", M_INTEL_COREI7_NEHALEM},
29777 {"westmere", M_INTEL_COREI7_WESTMERE},
29778 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29779 {"amdfam10h", M_AMDFAM10H},
29780 {"barcelona", M_AMDFAM10H_BARCELONA},
29781 {"shanghai", M_AMDFAM10H_SHANGHAI},
29782 {"istanbul", M_AMDFAM10H_ISTANBUL},
29783 {"amdfam15h", M_AMDFAM15H},
29784 {"bdver1", M_AMDFAM15H_BDVER1},
29785 {"bdver2", M_AMDFAM15H_BDVER2},
29786 {"bdver3", M_AMDFAM15H_BDVER3},
29789 static struct _isa_names_table
29791 const char *const name;
29792 const enum processor_features feature;
29794 const isa_names_table[] =
29796 {"cmov", F_CMOV},
29797 {"mmx", F_MMX},
29798 {"popcnt", F_POPCNT},
29799 {"sse", F_SSE},
29800 {"sse2", F_SSE2},
29801 {"sse3", F_SSE3},
29802 {"ssse3", F_SSSE3},
29803 {"sse4.1", F_SSE4_1},
29804 {"sse4.2", F_SSE4_2},
29805 {"avx", F_AVX},
29806 {"avx2", F_AVX2}
29809 tree __processor_model_type = build_processor_model_struct ();
29810 tree __cpu_model_var = make_var_decl (__processor_model_type,
29811 "__cpu_model");
29813 gcc_assert ((args != NULL) && (*args != NULL));
29815 param_string_cst = *args;
29816 while (param_string_cst
29817 && TREE_CODE (param_string_cst) != STRING_CST)
29819       /* *args must be an expr that can contain other EXPRs leading to a
29820 	 STRING_CST. */
29821 if (!EXPR_P (param_string_cst))
29823 error ("Parameter to builtin must be a string constant or literal");
29824 return integer_zero_node;
29826 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29829 gcc_assert (param_string_cst);
29831 if (fn_code == IX86_BUILTIN_CPU_IS)
29833 tree ref;
29834 tree field;
29835 tree final;
29837 unsigned int field_val = 0;
29838 unsigned int NUM_ARCH_NAMES
29839 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29841 for (i = 0; i < NUM_ARCH_NAMES; i++)
29842 if (strcmp (arch_names_table[i].name,
29843 TREE_STRING_POINTER (param_string_cst)) == 0)
29844 break;
29846 if (i == NUM_ARCH_NAMES)
29848 error ("Parameter to builtin not valid: %s",
29849 TREE_STRING_POINTER (param_string_cst));
29850 return integer_zero_node;
29853 field = TYPE_FIELDS (__processor_model_type);
29854 field_val = arch_names_table[i].model;
29856 /* CPU types are stored in the next field. */
29857 if (field_val > M_CPU_TYPE_START
29858 && field_val < M_CPU_SUBTYPE_START)
29860 field = DECL_CHAIN (field);
29861 field_val -= M_CPU_TYPE_START;
29864 /* CPU subtypes are stored in the next field. */
29865 if (field_val > M_CPU_SUBTYPE_START)
29867 field = DECL_CHAIN ( DECL_CHAIN (field));
29868 field_val -= M_CPU_SUBTYPE_START;
29871 /* Get the appropriate field in __cpu_model. */
29872 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29873 field, NULL_TREE);
29875 /* Check the value. */
29876 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29877 build_int_cstu (unsigned_type_node, field_val));
29878 return build1 (CONVERT_EXPR, integer_type_node, final);
29880 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29882 tree ref;
29883 tree array_elt;
29884 tree field;
29885 tree final;
29887 unsigned int field_val = 0;
29888 unsigned int NUM_ISA_NAMES
29889 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29891 for (i = 0; i < NUM_ISA_NAMES; i++)
29892 if (strcmp (isa_names_table[i].name,
29893 TREE_STRING_POINTER (param_string_cst)) == 0)
29894 break;
29896 if (i == NUM_ISA_NAMES)
29898 error ("Parameter to builtin not valid: %s",
29899 TREE_STRING_POINTER (param_string_cst));
29900 return integer_zero_node;
29903 field = TYPE_FIELDS (__processor_model_type);
29904 /* Get the last field, which is __cpu_features. */
29905 while (DECL_CHAIN (field))
29906 field = DECL_CHAIN (field);
29908 /* Get the appropriate field: __cpu_model.__cpu_features */
29909 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29910 field, NULL_TREE);
29912 /* Access the 0th element of __cpu_features array. */
29913 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29914 integer_zero_node, NULL_TREE, NULL_TREE);
29916 field_val = (1 << isa_names_table[i].feature);
29917 /* Return __cpu_model.__cpu_features[0] & field_val */
29918 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29919 build_int_cstu (unsigned_type_node, field_val));
29920 return build1 (CONVERT_EXPR, integer_type_node, final);
29922 gcc_unreachable ();
29925 static tree
29926 ix86_fold_builtin (tree fndecl, int n_args,
29927 tree *args, bool ignore ATTRIBUTE_UNUSED)
29929 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29931 enum ix86_builtins fn_code = (enum ix86_builtins)
29932 DECL_FUNCTION_CODE (fndecl);
29933 if (fn_code == IX86_BUILTIN_CPU_IS
29934 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29936 gcc_assert (n_args == 1);
29937 return fold_builtin_cpu (fndecl, args);
29941 #ifdef SUBTARGET_FOLD_BUILTIN
29942 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29943 #endif
29945 return NULL_TREE;
29948 /* Make builtins to detect cpu type and features supported. NAME is
29949 the builtin name, CODE is the builtin code, and FTYPE is the function
29950 type of the builtin. */
29952 static void
29953 make_cpu_type_builtin (const char* name, int code,
29954 enum ix86_builtin_func_type ftype, bool is_const)
29956 tree decl;
29957 tree type;
29959 type = ix86_get_builtin_func_type (ftype);
29960 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29961 NULL, NULL_TREE);
29962 gcc_assert (decl != NULL_TREE);
29963 ix86_builtins[(int) code] = decl;
29964 TREE_READONLY (decl) = is_const;
29967 /* Make builtins to get CPU type and features supported. The created
29968    builtins are:
29970 __builtin_cpu_init (), to detect cpu type and features,
29971 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29972 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29975 static void
29976 ix86_init_platform_type_builtins (void)
29978 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29979 INT_FTYPE_VOID, false);
29980 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29981 INT_FTYPE_PCCHAR, true);
29982 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29983 INT_FTYPE_PCCHAR, true);
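/* Example use of these builtins (illustrative, not part of the original
   source; run_corei7_path, run_sse42_path and run_generic_path are
   hypothetical user functions):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       run_corei7_path ();
     else if (__builtin_cpu_supports ("sse4.2"))
       run_sse42_path ();
     else
       run_generic_path ();
   */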
29986 /* Internal method for ix86_init_builtins. */
29988 static void
29989 ix86_init_builtins_va_builtins_abi (void)
29991 tree ms_va_ref, sysv_va_ref;
29992 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29993 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29994 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29995 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29997 if (!TARGET_64BIT)
29998 return;
29999 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30000 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30001 ms_va_ref = build_reference_type (ms_va_list_type_node);
30002 sysv_va_ref =
30003 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30005 fnvoid_va_end_ms =
30006 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30007 fnvoid_va_start_ms =
30008 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30009 fnvoid_va_end_sysv =
30010 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30011 fnvoid_va_start_sysv =
30012 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30013 NULL_TREE);
30014 fnvoid_va_copy_ms =
30015 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30016 NULL_TREE);
30017 fnvoid_va_copy_sysv =
30018 build_function_type_list (void_type_node, sysv_va_ref,
30019 sysv_va_ref, NULL_TREE);
30021 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30022 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30023 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30024 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30025 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30026 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30027 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30028 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30029 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30030 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30031 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30032 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30035 static void
30036 ix86_init_builtin_types (void)
30038 tree float128_type_node, float80_type_node;
30040 /* The __float80 type. */
30041 float80_type_node = long_double_type_node;
30042 if (TYPE_MODE (float80_type_node) != XFmode)
30044 /* The __float80 type. */
30045 float80_type_node = make_node (REAL_TYPE);
30047 TYPE_PRECISION (float80_type_node) = 80;
30048 layout_type (float80_type_node);
30050 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30052 /* The __float128 type. */
30053 float128_type_node = make_node (REAL_TYPE);
30054 TYPE_PRECISION (float128_type_node) = 128;
30055 layout_type (float128_type_node);
30056 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30058 /* This macro is built by i386-builtin-types.awk. */
30059 DEFINE_BUILTIN_PRIMITIVE_TYPES;
30062 static void
30063 ix86_init_builtins (void)
30065 tree t;
30067 ix86_init_builtin_types ();
30069 /* Builtins to get CPU type and features. */
30070 ix86_init_platform_type_builtins ();
30072 /* TFmode support builtins. */
30073 def_builtin_const (0, "__builtin_infq",
30074 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30075 def_builtin_const (0, "__builtin_huge_valq",
30076 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30078 /* We will expand them to normal call if SSE isn't available since
30079 they are used by libgcc. */
30080 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30081 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30082 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30083 TREE_READONLY (t) = 1;
30084 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30086 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30087 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30088 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30089 TREE_READONLY (t) = 1;
30090 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30092 ix86_init_tm_builtins ();
30093 ix86_init_mmx_sse_builtins ();
30095 if (TARGET_LP64)
30096 ix86_init_builtins_va_builtins_abi ();
30098 #ifdef SUBTARGET_INIT_BUILTINS
30099 SUBTARGET_INIT_BUILTINS;
30100 #endif
30103 /* Return the ix86 builtin for CODE. */
30105 static tree
30106 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30108 if (code >= IX86_BUILTIN_MAX)
30109 return error_mark_node;
30111 return ix86_builtins[code];
30114 /* Errors in the source file can cause expand_expr to return const0_rtx
30115 where we expect a vector. To avoid crashing, use one of the vector
30116 clear instructions. */
30117 static rtx
30118 safe_vector_operand (rtx x, enum machine_mode mode)
30120 if (x == const0_rtx)
30121 x = CONST0_RTX (mode);
30122 return x;
30125 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30127 static rtx
30128 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30130 rtx pat;
30131 tree arg0 = CALL_EXPR_ARG (exp, 0);
30132 tree arg1 = CALL_EXPR_ARG (exp, 1);
30133 rtx op0 = expand_normal (arg0);
30134 rtx op1 = expand_normal (arg1);
30135 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30136 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30137 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30139 if (VECTOR_MODE_P (mode0))
30140 op0 = safe_vector_operand (op0, mode0);
30141 if (VECTOR_MODE_P (mode1))
30142 op1 = safe_vector_operand (op1, mode1);
30144 if (optimize || !target
30145 || GET_MODE (target) != tmode
30146 || !insn_data[icode].operand[0].predicate (target, tmode))
30147 target = gen_reg_rtx (tmode);
30149 if (GET_MODE (op1) == SImode && mode1 == TImode)
30151 rtx x = gen_reg_rtx (V4SImode);
30152 emit_insn (gen_sse2_loadd (x, op1));
30153 op1 = gen_lowpart (TImode, x);
30156 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30157 op0 = copy_to_mode_reg (mode0, op0);
30158 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30159 op1 = copy_to_mode_reg (mode1, op1);
30161 pat = GEN_FCN (icode) (target, op0, op1);
30162 if (! pat)
30163 return 0;
30165 emit_insn (pat);
30167 return target;
30170 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30172 static rtx
30173 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30174 enum ix86_builtin_func_type m_type,
30175 enum rtx_code sub_code)
30177 rtx pat;
30178 int i;
30179 int nargs;
30180 bool comparison_p = false;
30181 bool tf_p = false;
30182 bool last_arg_constant = false;
30183 int num_memory = 0;
30184 struct {
30185 rtx op;
30186 enum machine_mode mode;
30187 } args[4];
30189 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30191 switch (m_type)
30193 case MULTI_ARG_4_DF2_DI_I:
30194 case MULTI_ARG_4_DF2_DI_I1:
30195 case MULTI_ARG_4_SF2_SI_I:
30196 case MULTI_ARG_4_SF2_SI_I1:
30197 nargs = 4;
30198 last_arg_constant = true;
30199 break;
30201 case MULTI_ARG_3_SF:
30202 case MULTI_ARG_3_DF:
30203 case MULTI_ARG_3_SF2:
30204 case MULTI_ARG_3_DF2:
30205 case MULTI_ARG_3_DI:
30206 case MULTI_ARG_3_SI:
30207 case MULTI_ARG_3_SI_DI:
30208 case MULTI_ARG_3_HI:
30209 case MULTI_ARG_3_HI_SI:
30210 case MULTI_ARG_3_QI:
30211 case MULTI_ARG_3_DI2:
30212 case MULTI_ARG_3_SI2:
30213 case MULTI_ARG_3_HI2:
30214 case MULTI_ARG_3_QI2:
30215 nargs = 3;
30216 break;
30218 case MULTI_ARG_2_SF:
30219 case MULTI_ARG_2_DF:
30220 case MULTI_ARG_2_DI:
30221 case MULTI_ARG_2_SI:
30222 case MULTI_ARG_2_HI:
30223 case MULTI_ARG_2_QI:
30224 nargs = 2;
30225 break;
30227 case MULTI_ARG_2_DI_IMM:
30228 case MULTI_ARG_2_SI_IMM:
30229 case MULTI_ARG_2_HI_IMM:
30230 case MULTI_ARG_2_QI_IMM:
30231 nargs = 2;
30232 last_arg_constant = true;
30233 break;
30235 case MULTI_ARG_1_SF:
30236 case MULTI_ARG_1_DF:
30237 case MULTI_ARG_1_SF2:
30238 case MULTI_ARG_1_DF2:
30239 case MULTI_ARG_1_DI:
30240 case MULTI_ARG_1_SI:
30241 case MULTI_ARG_1_HI:
30242 case MULTI_ARG_1_QI:
30243 case MULTI_ARG_1_SI_DI:
30244 case MULTI_ARG_1_HI_DI:
30245 case MULTI_ARG_1_HI_SI:
30246 case MULTI_ARG_1_QI_DI:
30247 case MULTI_ARG_1_QI_SI:
30248 case MULTI_ARG_1_QI_HI:
30249 nargs = 1;
30250 break;
30252 case MULTI_ARG_2_DI_CMP:
30253 case MULTI_ARG_2_SI_CMP:
30254 case MULTI_ARG_2_HI_CMP:
30255 case MULTI_ARG_2_QI_CMP:
30256 nargs = 2;
30257 comparison_p = true;
30258 break;
30260 case MULTI_ARG_2_SF_TF:
30261 case MULTI_ARG_2_DF_TF:
30262 case MULTI_ARG_2_DI_TF:
30263 case MULTI_ARG_2_SI_TF:
30264 case MULTI_ARG_2_HI_TF:
30265 case MULTI_ARG_2_QI_TF:
30266 nargs = 2;
30267 tf_p = true;
30268 break;
30270 default:
30271 gcc_unreachable ();
30274 if (optimize || !target
30275 || GET_MODE (target) != tmode
30276 || !insn_data[icode].operand[0].predicate (target, tmode))
30277 target = gen_reg_rtx (tmode);
30279 gcc_assert (nargs <= 4);
30281 for (i = 0; i < nargs; i++)
30283 tree arg = CALL_EXPR_ARG (exp, i);
30284 rtx op = expand_normal (arg);
30285 int adjust = (comparison_p) ? 1 : 0;
30286 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30288 if (last_arg_constant && i == nargs - 1)
30290 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30292 enum insn_code new_icode = icode;
30293 switch (icode)
30295 case CODE_FOR_xop_vpermil2v2df3:
30296 case CODE_FOR_xop_vpermil2v4sf3:
30297 case CODE_FOR_xop_vpermil2v4df3:
30298 case CODE_FOR_xop_vpermil2v8sf3:
30299 error ("the last argument must be a 2-bit immediate");
30300 return gen_reg_rtx (tmode);
30301 case CODE_FOR_xop_rotlv2di3:
30302 new_icode = CODE_FOR_rotlv2di3;
30303 goto xop_rotl;
30304 case CODE_FOR_xop_rotlv4si3:
30305 new_icode = CODE_FOR_rotlv4si3;
30306 goto xop_rotl;
30307 case CODE_FOR_xop_rotlv8hi3:
30308 new_icode = CODE_FOR_rotlv8hi3;
30309 goto xop_rotl;
30310 case CODE_FOR_xop_rotlv16qi3:
30311 new_icode = CODE_FOR_rotlv16qi3;
30312 xop_rotl:
30313 if (CONST_INT_P (op))
30315 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30316 op = GEN_INT (INTVAL (op) & mask);
30317 gcc_checking_assert
30318 (insn_data[icode].operand[i + 1].predicate (op, mode));
30320 else
30322 gcc_checking_assert
30323 (nargs == 2
30324 && insn_data[new_icode].operand[0].mode == tmode
30325 && insn_data[new_icode].operand[1].mode == tmode
30326 && insn_data[new_icode].operand[2].mode == mode
30327 && insn_data[new_icode].operand[0].predicate
30328 == insn_data[icode].operand[0].predicate
30329 && insn_data[new_icode].operand[1].predicate
30330 == insn_data[icode].operand[1].predicate);
30331 icode = new_icode;
30332 goto non_constant;
30334 break;
30335 default:
30336 gcc_unreachable ();
30340 else
30342 non_constant:
30343 if (VECTOR_MODE_P (mode))
30344 op = safe_vector_operand (op, mode);
30346 /* If we aren't optimizing, only allow one memory operand to be
30347 generated. */
30348 if (memory_operand (op, mode))
30349 num_memory++;
30351 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30353 if (optimize
30354 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30355 || num_memory > 1)
30356 op = force_reg (mode, op);
30359 args[i].op = op;
30360 args[i].mode = mode;
30363 switch (nargs)
30365 case 1:
30366 pat = GEN_FCN (icode) (target, args[0].op);
30367 break;
30369 case 2:
30370 if (tf_p)
30371 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30372 GEN_INT ((int)sub_code));
30373 else if (! comparison_p)
30374 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30375 else
30377 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30378 args[0].op,
30379 args[1].op);
30381 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30383 break;
30385 case 3:
30386 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30387 break;
30389 case 4:
30390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30391 break;
30393 default:
30394 gcc_unreachable ();
30397 if (! pat)
30398 return 0;
30400 emit_insn (pat);
30401 return target;
30404 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30405 insns with vec_merge. */
30407 static rtx
30408 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30409 rtx target)
30411 rtx pat;
30412 tree arg0 = CALL_EXPR_ARG (exp, 0);
30413 rtx op1, op0 = expand_normal (arg0);
30414 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30415 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30417 if (optimize || !target
30418 || GET_MODE (target) != tmode
30419 || !insn_data[icode].operand[0].predicate (target, tmode))
30420 target = gen_reg_rtx (tmode);
30422 if (VECTOR_MODE_P (mode0))
30423 op0 = safe_vector_operand (op0, mode0);
30425 if ((optimize && !register_operand (op0, mode0))
30426 || !insn_data[icode].operand[1].predicate (op0, mode0))
30427 op0 = copy_to_mode_reg (mode0, op0);
30429 op1 = op0;
30430 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30431 op1 = copy_to_mode_reg (mode0, op1);
30433 pat = GEN_FCN (icode) (target, op0, op1);
30434 if (! pat)
30435 return 0;
30436 emit_insn (pat);
30437 return target;
30440 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30442 static rtx
30443 ix86_expand_sse_compare (const struct builtin_description *d,
30444 tree exp, rtx target, bool swap)
30446 rtx pat;
30447 tree arg0 = CALL_EXPR_ARG (exp, 0);
30448 tree arg1 = CALL_EXPR_ARG (exp, 1);
30449 rtx op0 = expand_normal (arg0);
30450 rtx op1 = expand_normal (arg1);
30451 rtx op2;
30452 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30453 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30454 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30455 enum rtx_code comparison = d->comparison;
30457 if (VECTOR_MODE_P (mode0))
30458 op0 = safe_vector_operand (op0, mode0);
30459 if (VECTOR_MODE_P (mode1))
30460 op1 = safe_vector_operand (op1, mode1);
30462 /* Swap operands if we have a comparison that isn't available in
30463 hardware. */
30464 if (swap)
30466 rtx tmp = gen_reg_rtx (mode1);
30467 emit_move_insn (tmp, op1);
30468 op1 = op0;
30469 op0 = tmp;
30472 if (optimize || !target
30473 || GET_MODE (target) != tmode
30474 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30475 target = gen_reg_rtx (tmode);
30477 if ((optimize && !register_operand (op0, mode0))
30478 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30479 op0 = copy_to_mode_reg (mode0, op0);
30480 if ((optimize && !register_operand (op1, mode1))
30481 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30482 op1 = copy_to_mode_reg (mode1, op1);
30484 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30485 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30486 if (! pat)
30487 return 0;
30488 emit_insn (pat);
30489 return target;
30492 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30494 static rtx
30495 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30496 rtx target)
30498 rtx pat;
30499 tree arg0 = CALL_EXPR_ARG (exp, 0);
30500 tree arg1 = CALL_EXPR_ARG (exp, 1);
30501 rtx op0 = expand_normal (arg0);
30502 rtx op1 = expand_normal (arg1);
30503 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30504 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30505 enum rtx_code comparison = d->comparison;
30507 if (VECTOR_MODE_P (mode0))
30508 op0 = safe_vector_operand (op0, mode0);
30509 if (VECTOR_MODE_P (mode1))
30510 op1 = safe_vector_operand (op1, mode1);
30512 /* Swap operands if we have a comparison that isn't available in
30513 hardware. */
30514 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30516 rtx tmp = op1;
30517 op1 = op0;
30518 op0 = tmp;
30521 target = gen_reg_rtx (SImode);
30522 emit_move_insn (target, const0_rtx);
30523 target = gen_rtx_SUBREG (QImode, target, 0);
30525 if ((optimize && !register_operand (op0, mode0))
30526 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30527 op0 = copy_to_mode_reg (mode0, op0);
30528 if ((optimize && !register_operand (op1, mode1))
30529 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30530 op1 = copy_to_mode_reg (mode1, op1);
30532 pat = GEN_FCN (d->icode) (op0, op1);
30533 if (! pat)
30534 return 0;
30535 emit_insn (pat);
30536 emit_insn (gen_rtx_SET (VOIDmode,
30537 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30538 gen_rtx_fmt_ee (comparison, QImode,
30539 SET_DEST (pat),
30540 const0_rtx)));
30542 return SUBREG_REG (target);
30545 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30547 static rtx
30548 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30549 rtx target)
30551 rtx pat;
30552 tree arg0 = CALL_EXPR_ARG (exp, 0);
30553 rtx op1, op0 = expand_normal (arg0);
30554 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30555 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30557 if (optimize || target == 0
30558 || GET_MODE (target) != tmode
30559 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30560 target = gen_reg_rtx (tmode);
30562 if (VECTOR_MODE_P (mode0))
30563 op0 = safe_vector_operand (op0, mode0);
30565 if ((optimize && !register_operand (op0, mode0))
30566 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30567 op0 = copy_to_mode_reg (mode0, op0);
30569 op1 = GEN_INT (d->comparison);
30571 pat = GEN_FCN (d->icode) (target, op0, op1);
30572 if (! pat)
30573 return 0;
30574 emit_insn (pat);
30575 return target;
30578 static rtx
30579 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30580 tree exp, rtx target)
30582 rtx pat;
30583 tree arg0 = CALL_EXPR_ARG (exp, 0);
30584 tree arg1 = CALL_EXPR_ARG (exp, 1);
30585 rtx op0 = expand_normal (arg0);
30586 rtx op1 = expand_normal (arg1);
30587 rtx op2;
30588 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30589 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30590 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30592 if (optimize || target == 0
30593 || GET_MODE (target) != tmode
30594 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30595 target = gen_reg_rtx (tmode);
30597 op0 = safe_vector_operand (op0, mode0);
30598 op1 = safe_vector_operand (op1, mode1);
30600 if ((optimize && !register_operand (op0, mode0))
30601 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30602 op0 = copy_to_mode_reg (mode0, op0);
30603 if ((optimize && !register_operand (op1, mode1))
30604 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30605 op1 = copy_to_mode_reg (mode1, op1);
30607 op2 = GEN_INT (d->comparison);
30609 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30610 if (! pat)
30611 return 0;
30612 emit_insn (pat);
30613 return target;
30616 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
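/* Illustrative note: ptest sets ZF and CF from bitwise AND / AND-NOT
   combinations of its two vector operands, and the test builtins (e.g.
   the one behind _mm_testz_si128) return a single flag.  D->COMPARISON
   together with the CC mode of the emitted pattern selects which flag is
   read back by the setcc-style sequence below.  */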
30618 static rtx
30619 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30620 rtx target)
30622 rtx pat;
30623 tree arg0 = CALL_EXPR_ARG (exp, 0);
30624 tree arg1 = CALL_EXPR_ARG (exp, 1);
30625 rtx op0 = expand_normal (arg0);
30626 rtx op1 = expand_normal (arg1);
30627 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30628 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30629 enum rtx_code comparison = d->comparison;
30631 if (VECTOR_MODE_P (mode0))
30632 op0 = safe_vector_operand (op0, mode0);
30633 if (VECTOR_MODE_P (mode1))
30634 op1 = safe_vector_operand (op1, mode1);
30636 target = gen_reg_rtx (SImode);
30637 emit_move_insn (target, const0_rtx);
30638 target = gen_rtx_SUBREG (QImode, target, 0);
30640 if ((optimize && !register_operand (op0, mode0))
30641 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30642 op0 = copy_to_mode_reg (mode0, op0);
30643 if ((optimize && !register_operand (op1, mode1))
30644 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30645 op1 = copy_to_mode_reg (mode1, op1);
30647 pat = GEN_FCN (d->icode) (op0, op1);
30648 if (! pat)
30649 return 0;
30650 emit_insn (pat);
30651 emit_insn (gen_rtx_SET (VOIDmode,
30652 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30653 gen_rtx_fmt_ee (comparison, QImode,
30654 SET_DEST (pat),
30655 const0_rtx)));
30657 return SUBREG_REG (target);
30660 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
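/* Illustrative note: the explicit-length string-compare intrinsics take
   five arguments, roughly

       idx = _mm_cmpestri (a, la, b, lb, MODE);

   where MODE must be an 8-bit immediate.  The "I" form returns the index
   result, the "M" form returns the mask result, and the flag-only
   variants (the d->flag path below) read a single EFLAGS bit; D->FLAG
   holds the machine mode of the flags register to test.  */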
30662 static rtx
30663 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30664 tree exp, rtx target)
30666 rtx pat;
30667 tree arg0 = CALL_EXPR_ARG (exp, 0);
30668 tree arg1 = CALL_EXPR_ARG (exp, 1);
30669 tree arg2 = CALL_EXPR_ARG (exp, 2);
30670 tree arg3 = CALL_EXPR_ARG (exp, 3);
30671 tree arg4 = CALL_EXPR_ARG (exp, 4);
30672 rtx scratch0, scratch1;
30673 rtx op0 = expand_normal (arg0);
30674 rtx op1 = expand_normal (arg1);
30675 rtx op2 = expand_normal (arg2);
30676 rtx op3 = expand_normal (arg3);
30677 rtx op4 = expand_normal (arg4);
30678 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30680 tmode0 = insn_data[d->icode].operand[0].mode;
30681 tmode1 = insn_data[d->icode].operand[1].mode;
30682 modev2 = insn_data[d->icode].operand[2].mode;
30683 modei3 = insn_data[d->icode].operand[3].mode;
30684 modev4 = insn_data[d->icode].operand[4].mode;
30685 modei5 = insn_data[d->icode].operand[5].mode;
30686 modeimm = insn_data[d->icode].operand[6].mode;
30688 if (VECTOR_MODE_P (modev2))
30689 op0 = safe_vector_operand (op0, modev2);
30690 if (VECTOR_MODE_P (modev4))
30691 op2 = safe_vector_operand (op2, modev4);
30693 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30694 op0 = copy_to_mode_reg (modev2, op0);
30695 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30696 op1 = copy_to_mode_reg (modei3, op1);
30697 if ((optimize && !register_operand (op2, modev4))
30698 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30699 op2 = copy_to_mode_reg (modev4, op2);
30700 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30701 op3 = copy_to_mode_reg (modei5, op3);
30703 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30705 error ("the fifth argument must be an 8-bit immediate");
30706 return const0_rtx;
30709 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30711 if (optimize || !target
30712 || GET_MODE (target) != tmode0
30713 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30714 target = gen_reg_rtx (tmode0);
30716 scratch1 = gen_reg_rtx (tmode1);
30718 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30720 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30722 if (optimize || !target
30723 || GET_MODE (target) != tmode1
30724 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30725 target = gen_reg_rtx (tmode1);
30727 scratch0 = gen_reg_rtx (tmode0);
30729 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30731 else
30733 gcc_assert (d->flag);
30735 scratch0 = gen_reg_rtx (tmode0);
30736 scratch1 = gen_reg_rtx (tmode1);
30738 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30741 if (! pat)
30742 return 0;
30744 emit_insn (pat);
30746 if (d->flag)
30748 target = gen_reg_rtx (SImode);
30749 emit_move_insn (target, const0_rtx);
30750 target = gen_rtx_SUBREG (QImode, target, 0);
30752 emit_insn
30753 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30754 gen_rtx_fmt_ee (EQ, QImode,
30755 gen_rtx_REG ((enum machine_mode) d->flag,
30756 FLAGS_REG),
30757 const0_rtx)));
30758 return SUBREG_REG (target);
30760 else
30761 return target;
30765 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
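/* Illustrative note: the implicit-length variants work the same way but
   take only three arguments, roughly _mm_cmpistri (a, b, MODE), with the
   string lengths implied by the data itself; hence the shorter operand
   list handled below.  */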
30767 static rtx
30768 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30769 tree exp, rtx target)
30771 rtx pat;
30772 tree arg0 = CALL_EXPR_ARG (exp, 0);
30773 tree arg1 = CALL_EXPR_ARG (exp, 1);
30774 tree arg2 = CALL_EXPR_ARG (exp, 2);
30775 rtx scratch0, scratch1;
30776 rtx op0 = expand_normal (arg0);
30777 rtx op1 = expand_normal (arg1);
30778 rtx op2 = expand_normal (arg2);
30779 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30781 tmode0 = insn_data[d->icode].operand[0].mode;
30782 tmode1 = insn_data[d->icode].operand[1].mode;
30783 modev2 = insn_data[d->icode].operand[2].mode;
30784 modev3 = insn_data[d->icode].operand[3].mode;
30785 modeimm = insn_data[d->icode].operand[4].mode;
30787 if (VECTOR_MODE_P (modev2))
30788 op0 = safe_vector_operand (op0, modev2);
30789 if (VECTOR_MODE_P (modev3))
30790 op1 = safe_vector_operand (op1, modev3);
30792 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30793 op0 = copy_to_mode_reg (modev2, op0);
30794 if ((optimize && !register_operand (op1, modev3))
30795 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30796 op1 = copy_to_mode_reg (modev3, op1);
30798 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30800 error ("the third argument must be an 8-bit immediate");
30801 return const0_rtx;
30804 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30806 if (optimize || !target
30807 || GET_MODE (target) != tmode0
30808 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30809 target = gen_reg_rtx (tmode0);
30811 scratch1 = gen_reg_rtx (tmode1);
30813 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30815 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30817 if (optimize || !target
30818 || GET_MODE (target) != tmode1
30819 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30820 target = gen_reg_rtx (tmode1);
30822 scratch0 = gen_reg_rtx (tmode0);
30824 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30826 else
30828 gcc_assert (d->flag);
30830 scratch0 = gen_reg_rtx (tmode0);
30831 scratch1 = gen_reg_rtx (tmode1);
30833 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30836 if (! pat)
30837 return 0;
30839 emit_insn (pat);
30841 if (d->flag)
30843 target = gen_reg_rtx (SImode);
30844 emit_move_insn (target, const0_rtx);
30845 target = gen_rtx_SUBREG (QImode, target, 0);
30847 emit_insn
30848 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30849 gen_rtx_fmt_ee (EQ, QImode,
30850 gen_rtx_REG ((enum machine_mode) d->flag,
30851 FLAGS_REG),
30852 const0_rtx)));
30853 return SUBREG_REG (target);
30855 else
30856 return target;
30859 /* Subroutine of ix86_expand_builtin to take care of insns with a
30860 variable number of operands. */
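/* Illustrative note: D->FLAG encodes an ix86_builtin_func_type, and the
   switch below derives everything from its name.  For example
   V4SF_FTYPE_V4SF_V4SF means two vector arguments (nargs = 2, dispatched
   to ix86_expand_binop_builtin when there is no comparison), the *_COUNT
   types mark a shift-count last argument, the *_INT types require
   trailing immediates (nargs_constant), and the *_CONVERT types expand in
   a different result mode (rmode).  */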
30862 static rtx
30863 ix86_expand_args_builtin (const struct builtin_description *d,
30864 tree exp, rtx target)
30866 rtx pat, real_target;
30867 unsigned int i, nargs;
30868 unsigned int nargs_constant = 0;
30869 int num_memory = 0;
30870 struct
30871 {
30872 rtx op;
30873 enum machine_mode mode;
30874 } args[4];
30875 bool last_arg_count = false;
30876 enum insn_code icode = d->icode;
30877 const struct insn_data_d *insn_p = &insn_data[icode];
30878 enum machine_mode tmode = insn_p->operand[0].mode;
30879 enum machine_mode rmode = VOIDmode;
30880 bool swap = false;
30881 enum rtx_code comparison = d->comparison;
30883 switch ((enum ix86_builtin_func_type) d->flag)
30885 case V2DF_FTYPE_V2DF_ROUND:
30886 case V4DF_FTYPE_V4DF_ROUND:
30887 case V4SF_FTYPE_V4SF_ROUND:
30888 case V8SF_FTYPE_V8SF_ROUND:
30889 case V4SI_FTYPE_V4SF_ROUND:
30890 case V8SI_FTYPE_V8SF_ROUND:
30891 return ix86_expand_sse_round (d, exp, target);
30892 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30893 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30894 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30895 case INT_FTYPE_V8SF_V8SF_PTEST:
30896 case INT_FTYPE_V4DI_V4DI_PTEST:
30897 case INT_FTYPE_V4DF_V4DF_PTEST:
30898 case INT_FTYPE_V4SF_V4SF_PTEST:
30899 case INT_FTYPE_V2DI_V2DI_PTEST:
30900 case INT_FTYPE_V2DF_V2DF_PTEST:
30901 return ix86_expand_sse_ptest (d, exp, target);
30902 case FLOAT128_FTYPE_FLOAT128:
30903 case FLOAT_FTYPE_FLOAT:
30904 case INT_FTYPE_INT:
30905 case UINT64_FTYPE_INT:
30906 case UINT16_FTYPE_UINT16:
30907 case INT64_FTYPE_INT64:
30908 case INT64_FTYPE_V4SF:
30909 case INT64_FTYPE_V2DF:
30910 case INT_FTYPE_V16QI:
30911 case INT_FTYPE_V8QI:
30912 case INT_FTYPE_V8SF:
30913 case INT_FTYPE_V4DF:
30914 case INT_FTYPE_V4SF:
30915 case INT_FTYPE_V2DF:
30916 case INT_FTYPE_V32QI:
30917 case V16QI_FTYPE_V16QI:
30918 case V8SI_FTYPE_V8SF:
30919 case V8SI_FTYPE_V4SI:
30920 case V8HI_FTYPE_V8HI:
30921 case V8HI_FTYPE_V16QI:
30922 case V8QI_FTYPE_V8QI:
30923 case V8SF_FTYPE_V8SF:
30924 case V8SF_FTYPE_V8SI:
30925 case V8SF_FTYPE_V4SF:
30926 case V8SF_FTYPE_V8HI:
30927 case V4SI_FTYPE_V4SI:
30928 case V4SI_FTYPE_V16QI:
30929 case V4SI_FTYPE_V4SF:
30930 case V4SI_FTYPE_V8SI:
30931 case V4SI_FTYPE_V8HI:
30932 case V4SI_FTYPE_V4DF:
30933 case V4SI_FTYPE_V2DF:
30934 case V4HI_FTYPE_V4HI:
30935 case V4DF_FTYPE_V4DF:
30936 case V4DF_FTYPE_V4SI:
30937 case V4DF_FTYPE_V4SF:
30938 case V4DF_FTYPE_V2DF:
30939 case V4SF_FTYPE_V4SF:
30940 case V4SF_FTYPE_V4SI:
30941 case V4SF_FTYPE_V8SF:
30942 case V4SF_FTYPE_V4DF:
30943 case V4SF_FTYPE_V8HI:
30944 case V4SF_FTYPE_V2DF:
30945 case V2DI_FTYPE_V2DI:
30946 case V2DI_FTYPE_V16QI:
30947 case V2DI_FTYPE_V8HI:
30948 case V2DI_FTYPE_V4SI:
30949 case V2DF_FTYPE_V2DF:
30950 case V2DF_FTYPE_V4SI:
30951 case V2DF_FTYPE_V4DF:
30952 case V2DF_FTYPE_V4SF:
30953 case V2DF_FTYPE_V2SI:
30954 case V2SI_FTYPE_V2SI:
30955 case V2SI_FTYPE_V4SF:
30956 case V2SI_FTYPE_V2SF:
30957 case V2SI_FTYPE_V2DF:
30958 case V2SF_FTYPE_V2SF:
30959 case V2SF_FTYPE_V2SI:
30960 case V32QI_FTYPE_V32QI:
30961 case V32QI_FTYPE_V16QI:
30962 case V16HI_FTYPE_V16HI:
30963 case V16HI_FTYPE_V8HI:
30964 case V8SI_FTYPE_V8SI:
30965 case V16HI_FTYPE_V16QI:
30966 case V8SI_FTYPE_V16QI:
30967 case V4DI_FTYPE_V16QI:
30968 case V8SI_FTYPE_V8HI:
30969 case V4DI_FTYPE_V8HI:
30970 case V4DI_FTYPE_V4SI:
30971 case V4DI_FTYPE_V2DI:
30972 nargs = 1;
30973 break;
30974 case V4SF_FTYPE_V4SF_VEC_MERGE:
30975 case V2DF_FTYPE_V2DF_VEC_MERGE:
30976 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30977 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30978 case V16QI_FTYPE_V16QI_V16QI:
30979 case V16QI_FTYPE_V8HI_V8HI:
30980 case V8QI_FTYPE_V8QI_V8QI:
30981 case V8QI_FTYPE_V4HI_V4HI:
30982 case V8HI_FTYPE_V8HI_V8HI:
30983 case V8HI_FTYPE_V16QI_V16QI:
30984 case V8HI_FTYPE_V4SI_V4SI:
30985 case V8SF_FTYPE_V8SF_V8SF:
30986 case V8SF_FTYPE_V8SF_V8SI:
30987 case V4SI_FTYPE_V4SI_V4SI:
30988 case V4SI_FTYPE_V8HI_V8HI:
30989 case V4SI_FTYPE_V4SF_V4SF:
30990 case V4SI_FTYPE_V2DF_V2DF:
30991 case V4HI_FTYPE_V4HI_V4HI:
30992 case V4HI_FTYPE_V8QI_V8QI:
30993 case V4HI_FTYPE_V2SI_V2SI:
30994 case V4DF_FTYPE_V4DF_V4DF:
30995 case V4DF_FTYPE_V4DF_V4DI:
30996 case V4SF_FTYPE_V4SF_V4SF:
30997 case V4SF_FTYPE_V4SF_V4SI:
30998 case V4SF_FTYPE_V4SF_V2SI:
30999 case V4SF_FTYPE_V4SF_V2DF:
31000 case V4SF_FTYPE_V4SF_DI:
31001 case V4SF_FTYPE_V4SF_SI:
31002 case V2DI_FTYPE_V2DI_V2DI:
31003 case V2DI_FTYPE_V16QI_V16QI:
31004 case V2DI_FTYPE_V4SI_V4SI:
31005 case V2UDI_FTYPE_V4USI_V4USI:
31006 case V2DI_FTYPE_V2DI_V16QI:
31007 case V2DI_FTYPE_V2DF_V2DF:
31008 case V2SI_FTYPE_V2SI_V2SI:
31009 case V2SI_FTYPE_V4HI_V4HI:
31010 case V2SI_FTYPE_V2SF_V2SF:
31011 case V2DF_FTYPE_V2DF_V2DF:
31012 case V2DF_FTYPE_V2DF_V4SF:
31013 case V2DF_FTYPE_V2DF_V2DI:
31014 case V2DF_FTYPE_V2DF_DI:
31015 case V2DF_FTYPE_V2DF_SI:
31016 case V2SF_FTYPE_V2SF_V2SF:
31017 case V1DI_FTYPE_V1DI_V1DI:
31018 case V1DI_FTYPE_V8QI_V8QI:
31019 case V1DI_FTYPE_V2SI_V2SI:
31020 case V32QI_FTYPE_V16HI_V16HI:
31021 case V16HI_FTYPE_V8SI_V8SI:
31022 case V32QI_FTYPE_V32QI_V32QI:
31023 case V16HI_FTYPE_V32QI_V32QI:
31024 case V16HI_FTYPE_V16HI_V16HI:
31025 case V8SI_FTYPE_V4DF_V4DF:
31026 case V8SI_FTYPE_V8SI_V8SI:
31027 case V8SI_FTYPE_V16HI_V16HI:
31028 case V4DI_FTYPE_V4DI_V4DI:
31029 case V4DI_FTYPE_V8SI_V8SI:
31030 case V4UDI_FTYPE_V8USI_V8USI:
31031 if (comparison == UNKNOWN)
31032 return ix86_expand_binop_builtin (icode, exp, target);
31033 nargs = 2;
31034 break;
31035 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31036 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31037 gcc_assert (comparison != UNKNOWN);
31038 nargs = 2;
31039 swap = true;
31040 break;
31041 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31042 case V16HI_FTYPE_V16HI_SI_COUNT:
31043 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31044 case V8SI_FTYPE_V8SI_SI_COUNT:
31045 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31046 case V4DI_FTYPE_V4DI_INT_COUNT:
31047 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31048 case V8HI_FTYPE_V8HI_SI_COUNT:
31049 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31050 case V4SI_FTYPE_V4SI_SI_COUNT:
31051 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31052 case V4HI_FTYPE_V4HI_SI_COUNT:
31053 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31054 case V2DI_FTYPE_V2DI_SI_COUNT:
31055 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31056 case V2SI_FTYPE_V2SI_SI_COUNT:
31057 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31058 case V1DI_FTYPE_V1DI_SI_COUNT:
31059 nargs = 2;
31060 last_arg_count = true;
31061 break;
31062 case UINT64_FTYPE_UINT64_UINT64:
31063 case UINT_FTYPE_UINT_UINT:
31064 case UINT_FTYPE_UINT_USHORT:
31065 case UINT_FTYPE_UINT_UCHAR:
31066 case UINT16_FTYPE_UINT16_INT:
31067 case UINT8_FTYPE_UINT8_INT:
31068 nargs = 2;
31069 break;
31070 case V2DI_FTYPE_V2DI_INT_CONVERT:
31071 nargs = 2;
31072 rmode = V1TImode;
31073 nargs_constant = 1;
31074 break;
31075 case V4DI_FTYPE_V4DI_INT_CONVERT:
31076 nargs = 2;
31077 rmode = V2TImode;
31078 nargs_constant = 1;
31079 break;
31080 case V8HI_FTYPE_V8HI_INT:
31081 case V8HI_FTYPE_V8SF_INT:
31082 case V8HI_FTYPE_V4SF_INT:
31083 case V8SF_FTYPE_V8SF_INT:
31084 case V4SI_FTYPE_V4SI_INT:
31085 case V4SI_FTYPE_V8SI_INT:
31086 case V4HI_FTYPE_V4HI_INT:
31087 case V4DF_FTYPE_V4DF_INT:
31088 case V4SF_FTYPE_V4SF_INT:
31089 case V4SF_FTYPE_V8SF_INT:
31090 case V2DI_FTYPE_V2DI_INT:
31091 case V2DF_FTYPE_V2DF_INT:
31092 case V2DF_FTYPE_V4DF_INT:
31093 case V16HI_FTYPE_V16HI_INT:
31094 case V8SI_FTYPE_V8SI_INT:
31095 case V4DI_FTYPE_V4DI_INT:
31096 case V2DI_FTYPE_V4DI_INT:
31097 nargs = 2;
31098 nargs_constant = 1;
31099 break;
31100 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31101 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31102 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31103 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31104 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31105 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31106 nargs = 3;
31107 break;
31108 case V32QI_FTYPE_V32QI_V32QI_INT:
31109 case V16HI_FTYPE_V16HI_V16HI_INT:
31110 case V16QI_FTYPE_V16QI_V16QI_INT:
31111 case V4DI_FTYPE_V4DI_V4DI_INT:
31112 case V8HI_FTYPE_V8HI_V8HI_INT:
31113 case V8SI_FTYPE_V8SI_V8SI_INT:
31114 case V8SI_FTYPE_V8SI_V4SI_INT:
31115 case V8SF_FTYPE_V8SF_V8SF_INT:
31116 case V8SF_FTYPE_V8SF_V4SF_INT:
31117 case V4SI_FTYPE_V4SI_V4SI_INT:
31118 case V4DF_FTYPE_V4DF_V4DF_INT:
31119 case V4DF_FTYPE_V4DF_V2DF_INT:
31120 case V4SF_FTYPE_V4SF_V4SF_INT:
31121 case V2DI_FTYPE_V2DI_V2DI_INT:
31122 case V4DI_FTYPE_V4DI_V2DI_INT:
31123 case V2DF_FTYPE_V2DF_V2DF_INT:
31124 nargs = 3;
31125 nargs_constant = 1;
31126 break;
31127 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31128 nargs = 3;
31129 rmode = V4DImode;
31130 nargs_constant = 1;
31131 break;
31132 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31133 nargs = 3;
31134 rmode = V2DImode;
31135 nargs_constant = 1;
31136 break;
31137 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31138 nargs = 3;
31139 rmode = DImode;
31140 nargs_constant = 1;
31141 break;
31142 case V2DI_FTYPE_V2DI_UINT_UINT:
31143 nargs = 3;
31144 nargs_constant = 2;
31145 break;
31146 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31147 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31148 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31149 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31150 nargs = 4;
31151 nargs_constant = 1;
31152 break;
31153 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31154 nargs = 4;
31155 nargs_constant = 2;
31156 break;
31157 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31158 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31159 nargs = 4;
31160 break;
31161 default:
31162 gcc_unreachable ();
31165 gcc_assert (nargs <= ARRAY_SIZE (args));
31167 if (comparison != UNKNOWN)
31169 gcc_assert (nargs == 2);
31170 return ix86_expand_sse_compare (d, exp, target, swap);
31173 if (rmode == VOIDmode || rmode == tmode)
31175 if (optimize
31176 || target == 0
31177 || GET_MODE (target) != tmode
31178 || !insn_p->operand[0].predicate (target, tmode))
31179 target = gen_reg_rtx (tmode);
31180 real_target = target;
31182 else
31184 target = gen_reg_rtx (rmode);
31185 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31188 for (i = 0; i < nargs; i++)
31190 tree arg = CALL_EXPR_ARG (exp, i);
31191 rtx op = expand_normal (arg);
31192 enum machine_mode mode = insn_p->operand[i + 1].mode;
31193 bool match = insn_p->operand[i + 1].predicate (op, mode);
31195 if (last_arg_count && (i + 1) == nargs)
31197 /* SIMD shift insns take either an 8-bit immediate or a register
31198 as the shift count, but the builtin functions take an int. If
31199 the count doesn't satisfy the operand predicate, put it in a register. */
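/* Illustrative note: e.g. for a psll/psrl style builtin called with a
   variable int count, the count is brought to SImode with
   simplify_gen_subreg and, if it still fails the operand predicate,
   copied into a register below.  */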
31200 if (!match)
31202 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31203 if (!insn_p->operand[i + 1].predicate (op, mode))
31204 op = copy_to_reg (op);
31207 else if ((nargs - i) <= nargs_constant)
31209 if (!match)
31210 switch (icode)
31212 case CODE_FOR_avx2_inserti128:
31213 case CODE_FOR_avx2_extracti128:
31214 error ("the last argument must be a 1-bit immediate");
31215 return const0_rtx;
31217 case CODE_FOR_sse4_1_roundsd:
31218 case CODE_FOR_sse4_1_roundss:
31220 case CODE_FOR_sse4_1_roundpd:
31221 case CODE_FOR_sse4_1_roundps:
31222 case CODE_FOR_avx_roundpd256:
31223 case CODE_FOR_avx_roundps256:
31225 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31226 case CODE_FOR_sse4_1_roundps_sfix:
31227 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31228 case CODE_FOR_avx_roundps_sfix256:
31230 case CODE_FOR_sse4_1_blendps:
31231 case CODE_FOR_avx_blendpd256:
31232 case CODE_FOR_avx_vpermilv4df:
31233 error ("the last argument must be a 4-bit immediate");
31234 return const0_rtx;
31236 case CODE_FOR_sse4_1_blendpd:
31237 case CODE_FOR_avx_vpermilv2df:
31238 case CODE_FOR_xop_vpermil2v2df3:
31239 case CODE_FOR_xop_vpermil2v4sf3:
31240 case CODE_FOR_xop_vpermil2v4df3:
31241 case CODE_FOR_xop_vpermil2v8sf3:
31242 error ("the last argument must be a 2-bit immediate");
31243 return const0_rtx;
31245 case CODE_FOR_avx_vextractf128v4df:
31246 case CODE_FOR_avx_vextractf128v8sf:
31247 case CODE_FOR_avx_vextractf128v8si:
31248 case CODE_FOR_avx_vinsertf128v4df:
31249 case CODE_FOR_avx_vinsertf128v8sf:
31250 case CODE_FOR_avx_vinsertf128v8si:
31251 error ("the last argument must be a 1-bit immediate");
31252 return const0_rtx;
31254 case CODE_FOR_avx_vmcmpv2df3:
31255 case CODE_FOR_avx_vmcmpv4sf3:
31256 case CODE_FOR_avx_cmpv2df3:
31257 case CODE_FOR_avx_cmpv4sf3:
31258 case CODE_FOR_avx_cmpv4df3:
31259 case CODE_FOR_avx_cmpv8sf3:
31260 error ("the last argument must be a 5-bit immediate");
31261 return const0_rtx;
31263 default:
31264 switch (nargs_constant)
31266 case 2:
31267 if ((nargs - i) == nargs_constant)
31269 error ("the next to last argument must be an 8-bit immediate");
31270 break;
31272 case 1:
31273 error ("the last argument must be an 8-bit immediate");
31274 break;
31275 default:
31276 gcc_unreachable ();
31278 return const0_rtx;
31281 else
31283 if (VECTOR_MODE_P (mode))
31284 op = safe_vector_operand (op, mode);
31286 /* If we aren't optimizing, only allow one memory operand to
31287 be generated. */
31288 if (memory_operand (op, mode))
31289 num_memory++;
31291 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31293 if (optimize || !match || num_memory > 1)
31294 op = copy_to_mode_reg (mode, op);
31296 else
31298 op = copy_to_reg (op);
31299 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31303 args[i].op = op;
31304 args[i].mode = mode;
31307 switch (nargs)
31309 case 1:
31310 pat = GEN_FCN (icode) (real_target, args[0].op);
31311 break;
31312 case 2:
31313 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31314 break;
31315 case 3:
31316 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31317 args[2].op);
31318 break;
31319 case 4:
31320 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31321 args[2].op, args[3].op);
31322 break;
31323 default:
31324 gcc_unreachable ();
31327 if (! pat)
31328 return 0;
31330 emit_insn (pat);
31331 return target;
31334 /* Subroutine of ix86_expand_builtin to take care of special insns
31335 with a variable number of operands. */
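/* Illustrative note: "special" builtins are the ones that touch memory or
   machine state.  The switch below classifies each function type as a
   load or a store and records which operand is the memory one; e.g. for
   VOID_FTYPE_PFLOAT_V4SF (a vector store through a float pointer) the
   first argument is converted to Pmode, wrapped in a MEM and used as the
   destination of the insn, which is why "memory = ARRAY_SIZE (args)"
   reserves the memory slot for the target.  */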
31337 static rtx
31338 ix86_expand_special_args_builtin (const struct builtin_description *d,
31339 tree exp, rtx target)
31341 tree arg;
31342 rtx pat, op;
31343 unsigned int i, nargs, arg_adjust, memory;
31344 struct
31345 {
31346 rtx op;
31347 enum machine_mode mode;
31348 } args[3];
31349 enum insn_code icode = d->icode;
31350 bool last_arg_constant = false;
31351 const struct insn_data_d *insn_p = &insn_data[icode];
31352 enum machine_mode tmode = insn_p->operand[0].mode;
31353 enum { load, store } klass;
31355 switch ((enum ix86_builtin_func_type) d->flag)
31357 case VOID_FTYPE_VOID:
31358 emit_insn (GEN_FCN (icode) (target));
31359 return 0;
31360 case VOID_FTYPE_UINT64:
31361 case VOID_FTYPE_UNSIGNED:
31362 nargs = 0;
31363 klass = store;
31364 memory = 0;
31365 break;
31367 case INT_FTYPE_VOID:
31368 case UINT64_FTYPE_VOID:
31369 case UNSIGNED_FTYPE_VOID:
31370 nargs = 0;
31371 klass = load;
31372 memory = 0;
31373 break;
31374 case UINT64_FTYPE_PUNSIGNED:
31375 case V2DI_FTYPE_PV2DI:
31376 case V4DI_FTYPE_PV4DI:
31377 case V32QI_FTYPE_PCCHAR:
31378 case V16QI_FTYPE_PCCHAR:
31379 case V8SF_FTYPE_PCV4SF:
31380 case V8SF_FTYPE_PCFLOAT:
31381 case V4SF_FTYPE_PCFLOAT:
31382 case V4DF_FTYPE_PCV2DF:
31383 case V4DF_FTYPE_PCDOUBLE:
31384 case V2DF_FTYPE_PCDOUBLE:
31385 case VOID_FTYPE_PVOID:
31386 nargs = 1;
31387 klass = load;
31388 memory = 0;
31389 break;
31390 case VOID_FTYPE_PV2SF_V4SF:
31391 case VOID_FTYPE_PV4DI_V4DI:
31392 case VOID_FTYPE_PV2DI_V2DI:
31393 case VOID_FTYPE_PCHAR_V32QI:
31394 case VOID_FTYPE_PCHAR_V16QI:
31395 case VOID_FTYPE_PFLOAT_V8SF:
31396 case VOID_FTYPE_PFLOAT_V4SF:
31397 case VOID_FTYPE_PDOUBLE_V4DF:
31398 case VOID_FTYPE_PDOUBLE_V2DF:
31399 case VOID_FTYPE_PLONGLONG_LONGLONG:
31400 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31401 case VOID_FTYPE_PINT_INT:
31402 nargs = 1;
31403 klass = store;
31404 /* Reserve memory operand for target. */
31405 memory = ARRAY_SIZE (args);
31406 break;
31407 case V4SF_FTYPE_V4SF_PCV2SF:
31408 case V2DF_FTYPE_V2DF_PCDOUBLE:
31409 nargs = 2;
31410 klass = load;
31411 memory = 1;
31412 break;
31413 case V8SF_FTYPE_PCV8SF_V8SI:
31414 case V4DF_FTYPE_PCV4DF_V4DI:
31415 case V4SF_FTYPE_PCV4SF_V4SI:
31416 case V2DF_FTYPE_PCV2DF_V2DI:
31417 case V8SI_FTYPE_PCV8SI_V8SI:
31418 case V4DI_FTYPE_PCV4DI_V4DI:
31419 case V4SI_FTYPE_PCV4SI_V4SI:
31420 case V2DI_FTYPE_PCV2DI_V2DI:
31421 nargs = 2;
31422 klass = load;
31423 memory = 0;
31424 break;
31425 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31426 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31427 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31428 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31429 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31430 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31431 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31432 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31433 nargs = 2;
31434 klass = store;
31435 /* Reserve memory operand for target. */
31436 memory = ARRAY_SIZE (args);
31437 break;
31438 case VOID_FTYPE_UINT_UINT_UINT:
31439 case VOID_FTYPE_UINT64_UINT_UINT:
31440 case UCHAR_FTYPE_UINT_UINT_UINT:
31441 case UCHAR_FTYPE_UINT64_UINT_UINT:
31442 nargs = 3;
31443 klass = load;
31444 memory = ARRAY_SIZE (args);
31445 last_arg_constant = true;
31446 break;
31447 default:
31448 gcc_unreachable ();
31451 gcc_assert (nargs <= ARRAY_SIZE (args));
31453 if (klass == store)
31455 arg = CALL_EXPR_ARG (exp, 0);
31456 op = expand_normal (arg);
31457 gcc_assert (target == 0);
31458 if (memory)
31460 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31461 target = gen_rtx_MEM (tmode, op);
31463 else
31464 target = force_reg (tmode, op);
31465 arg_adjust = 1;
31467 else
31469 arg_adjust = 0;
31470 if (optimize
31471 || target == 0
31472 || !register_operand (target, tmode)
31473 || GET_MODE (target) != tmode)
31474 target = gen_reg_rtx (tmode);
31477 for (i = 0; i < nargs; i++)
31479 enum machine_mode mode = insn_p->operand[i + 1].mode;
31480 bool match;
31482 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31483 op = expand_normal (arg);
31484 match = insn_p->operand[i + 1].predicate (op, mode);
31486 if (last_arg_constant && (i + 1) == nargs)
31488 if (!match)
31490 if (icode == CODE_FOR_lwp_lwpvalsi3
31491 || icode == CODE_FOR_lwp_lwpinssi3
31492 || icode == CODE_FOR_lwp_lwpvaldi3
31493 || icode == CODE_FOR_lwp_lwpinsdi3)
31494 error ("the last argument must be a 32-bit immediate");
31495 else
31496 error ("the last argument must be an 8-bit immediate");
31497 return const0_rtx;
31500 else
31502 if (i == memory)
31504 /* This must be the memory operand. */
31505 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31506 op = gen_rtx_MEM (mode, op);
31507 gcc_assert (GET_MODE (op) == mode
31508 || GET_MODE (op) == VOIDmode);
31510 else
31512 /* This must be a register. */
31513 if (VECTOR_MODE_P (mode))
31514 op = safe_vector_operand (op, mode);
31516 gcc_assert (GET_MODE (op) == mode
31517 || GET_MODE (op) == VOIDmode);
31518 op = copy_to_mode_reg (mode, op);
31522 args[i].op = op;
31523 args[i].mode = mode;
31526 switch (nargs)
31528 case 0:
31529 pat = GEN_FCN (icode) (target);
31530 break;
31531 case 1:
31532 pat = GEN_FCN (icode) (target, args[0].op);
31533 break;
31534 case 2:
31535 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31536 break;
31537 case 3:
31538 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31539 break;
31540 default:
31541 gcc_unreachable ();
31544 if (! pat)
31545 return 0;
31546 emit_insn (pat);
31547 return klass == store ? 0 : target;
31550 /* Return the integer constant in ARG. Constrain it to be in the range
31551 of the subparts of VEC_TYPE; issue an error if not. */
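/* Illustrative note: e.g. for a four-element vector type MAX is 3, so a
   non-constant selector or a selector of 4 or more is diagnosed here and
   0 is returned instead.  */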
31553 static int
31554 get_element_number (tree vec_type, tree arg)
31556 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31558 if (!host_integerp (arg, 1)
31559 || (elt = tree_low_cst (arg, 1), elt > max))
31561 error ("selector must be an integer constant in the range 0..%wi", max);
31562 return 0;
31565 return elt;
31568 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31569 ix86_expand_vector_init. We DO have language-level syntax for this, in
31570 the form of (type){ init-list }. Except that since we can't place emms
31571 instructions from inside the compiler, we can't allow the use of MMX
31572 registers unless the user explicitly asks for it. So we do *not* define
31573 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31574 we have builtins invoked by mmintrin.h that give us license to emit
31575 these sorts of instructions. */
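/* Illustrative note: for instance the _mm_set_* style MMX intrinsics in
   mmintrin.h are expected to funnel through builtins such as
   __builtin_ia32_vec_init_v2si, which this expander turns into an
   ix86_expand_vector_init call on a PARALLEL of the expanded scalar
   arguments.  */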
31577 static rtx
31578 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31580 enum machine_mode tmode = TYPE_MODE (type);
31581 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31582 int i, n_elt = GET_MODE_NUNITS (tmode);
31583 rtvec v = rtvec_alloc (n_elt);
31585 gcc_assert (VECTOR_MODE_P (tmode));
31586 gcc_assert (call_expr_nargs (exp) == n_elt);
31588 for (i = 0; i < n_elt; ++i)
31590 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31591 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31594 if (!target || !register_operand (target, tmode))
31595 target = gen_reg_rtx (tmode);
31597 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31598 return target;
31601 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31602 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31603 had a language-level syntax for referencing vector elements. */
31605 static rtx
31606 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31608 enum machine_mode tmode, mode0;
31609 tree arg0, arg1;
31610 int elt;
31611 rtx op0;
31613 arg0 = CALL_EXPR_ARG (exp, 0);
31614 arg1 = CALL_EXPR_ARG (exp, 1);
31616 op0 = expand_normal (arg0);
31617 elt = get_element_number (TREE_TYPE (arg0), arg1);
31619 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31620 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31621 gcc_assert (VECTOR_MODE_P (mode0));
31623 op0 = force_reg (mode0, op0);
31625 if (optimize || !target || !register_operand (target, tmode))
31626 target = gen_reg_rtx (tmode);
31628 ix86_expand_vector_extract (true, target, op0, elt);
31630 return target;
31633 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31634 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31635 a language-level syntax for referencing vector elements. */
31637 static rtx
31638 ix86_expand_vec_set_builtin (tree exp)
31640 enum machine_mode tmode, mode1;
31641 tree arg0, arg1, arg2;
31642 int elt;
31643 rtx op0, op1, target;
31645 arg0 = CALL_EXPR_ARG (exp, 0);
31646 arg1 = CALL_EXPR_ARG (exp, 1);
31647 arg2 = CALL_EXPR_ARG (exp, 2);
31649 tmode = TYPE_MODE (TREE_TYPE (arg0));
31650 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31651 gcc_assert (VECTOR_MODE_P (tmode));
31653 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31654 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31655 elt = get_element_number (TREE_TYPE (arg0), arg2);
31657 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31658 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31660 op0 = force_reg (tmode, op0);
31661 op1 = force_reg (mode1, op1);
31663 /* OP0 is the source of these builtin functions and shouldn't be
31664 modified. Create a copy, use it and return it as target. */
31665 target = gen_reg_rtx (tmode);
31666 emit_move_insn (target, op0);
31667 ix86_expand_vector_set (true, target, op1, elt);
31669 return target;
31672 /* Expand an expression EXP that calls a built-in function,
31673 with result going to TARGET if that's convenient
31674 (and in mode MODE if that's convenient).
31675 SUBTARGET may be used as the target for computing one of EXP's operands.
31676 IGNORE is nonzero if the value is to be ignored. */
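/* Illustrative note: the dispatch order below is (1) CPU-detection
   builtins that fold to library calls or constants, (2) a check that the
   builtin's ISA is enabled for the current function, (3) hand-expanded
   special cases (maskmov, mxcsr, monitor/mwait, rdrand/rdseed, gathers,
   ...), then (4) table-driven expansion through bdesc_special_args,
   bdesc_args, bdesc_comi, bdesc_pcmpestr, bdesc_pcmpistr and
   bdesc_multi_arg.  */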
31678 static rtx
31679 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31680 enum machine_mode mode ATTRIBUTE_UNUSED,
31681 int ignore ATTRIBUTE_UNUSED)
31683 const struct builtin_description *d;
31684 size_t i;
31685 enum insn_code icode;
31686 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31687 tree arg0, arg1, arg2, arg3, arg4;
31688 rtx op0, op1, op2, op3, op4, pat, insn;
31689 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31690 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31692 /* For CPU builtins that can be folded, fold first and expand the fold. */
31693 switch (fcode)
31695 case IX86_BUILTIN_CPU_INIT:
31697 /* Make it call __cpu_indicator_init in libgcc. */
31698 tree call_expr, fndecl, type;
31699 type = build_function_type_list (integer_type_node, NULL_TREE);
31700 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31701 call_expr = build_call_expr (fndecl, 0);
31702 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31704 case IX86_BUILTIN_CPU_IS:
31705 case IX86_BUILTIN_CPU_SUPPORTS:
31707 tree arg0 = CALL_EXPR_ARG (exp, 0);
31708 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31709 gcc_assert (fold_expr != NULL_TREE);
31710 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31714 /* Determine whether the builtin function is available under the current ISA.
31715 Originally the builtin was not created if it wasn't applicable to the
31716 current ISA based on the command-line switches. With function-specific
31717 options, we need to check in the context of the function making the call
31718 whether it is supported. */
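/* Illustrative note: e.g. calling an AVX2-only builtin from a function
   compiled without AVX2 support is rejected here at expansion time with
   the "needs isa option" diagnostic naming the switch string produced by
   ix86_target_string, instead of silently emitting an unavailable
   instruction.  */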
31719 if (ix86_builtins_isa[fcode].isa
31720 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31722 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31723 NULL, (enum fpmath_unit) 0, false);
31725 if (!opts)
31726 error ("%qE needs unknown isa option", fndecl);
31727 else
31729 gcc_assert (opts != NULL);
31730 error ("%qE needs isa option %s", fndecl, opts);
31731 free (opts);
31733 return const0_rtx;
31736 switch (fcode)
31738 case IX86_BUILTIN_MASKMOVQ:
31739 case IX86_BUILTIN_MASKMOVDQU:
31740 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31741 ? CODE_FOR_mmx_maskmovq
31742 : CODE_FOR_sse2_maskmovdqu);
31743 /* Note the arg order is different from the operand order. */
31744 arg1 = CALL_EXPR_ARG (exp, 0);
31745 arg2 = CALL_EXPR_ARG (exp, 1);
31746 arg0 = CALL_EXPR_ARG (exp, 2);
31747 op0 = expand_normal (arg0);
31748 op1 = expand_normal (arg1);
31749 op2 = expand_normal (arg2);
31750 mode0 = insn_data[icode].operand[0].mode;
31751 mode1 = insn_data[icode].operand[1].mode;
31752 mode2 = insn_data[icode].operand[2].mode;
31754 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31755 op0 = gen_rtx_MEM (mode1, op0);
31757 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31758 op0 = copy_to_mode_reg (mode0, op0);
31759 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31760 op1 = copy_to_mode_reg (mode1, op1);
31761 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31762 op2 = copy_to_mode_reg (mode2, op2);
31763 pat = GEN_FCN (icode) (op0, op1, op2);
31764 if (! pat)
31765 return 0;
31766 emit_insn (pat);
31767 return 0;
31769 case IX86_BUILTIN_LDMXCSR:
31770 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31771 target = assign_386_stack_local (SImode, SLOT_TEMP);
31772 emit_move_insn (target, op0);
31773 emit_insn (gen_sse_ldmxcsr (target));
31774 return 0;
31776 case IX86_BUILTIN_STMXCSR:
31777 target = assign_386_stack_local (SImode, SLOT_TEMP);
31778 emit_insn (gen_sse_stmxcsr (target));
31779 return copy_to_mode_reg (SImode, target);
31781 case IX86_BUILTIN_CLFLUSH:
31782 arg0 = CALL_EXPR_ARG (exp, 0);
31783 op0 = expand_normal (arg0);
31784 icode = CODE_FOR_sse2_clflush;
31785 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31786 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31788 emit_insn (gen_sse2_clflush (op0));
31789 return 0;
31791 case IX86_BUILTIN_MONITOR:
31792 arg0 = CALL_EXPR_ARG (exp, 0);
31793 arg1 = CALL_EXPR_ARG (exp, 1);
31794 arg2 = CALL_EXPR_ARG (exp, 2);
31795 op0 = expand_normal (arg0);
31796 op1 = expand_normal (arg1);
31797 op2 = expand_normal (arg2);
31798 if (!REG_P (op0))
31799 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31800 if (!REG_P (op1))
31801 op1 = copy_to_mode_reg (SImode, op1);
31802 if (!REG_P (op2))
31803 op2 = copy_to_mode_reg (SImode, op2);
31804 emit_insn (ix86_gen_monitor (op0, op1, op2));
31805 return 0;
31807 case IX86_BUILTIN_MWAIT:
31808 arg0 = CALL_EXPR_ARG (exp, 0);
31809 arg1 = CALL_EXPR_ARG (exp, 1);
31810 op0 = expand_normal (arg0);
31811 op1 = expand_normal (arg1);
31812 if (!REG_P (op0))
31813 op0 = copy_to_mode_reg (SImode, op0);
31814 if (!REG_P (op1))
31815 op1 = copy_to_mode_reg (SImode, op1);
31816 emit_insn (gen_sse3_mwait (op0, op1));
31817 return 0;
31819 case IX86_BUILTIN_VEC_INIT_V2SI:
31820 case IX86_BUILTIN_VEC_INIT_V4HI:
31821 case IX86_BUILTIN_VEC_INIT_V8QI:
31822 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31824 case IX86_BUILTIN_VEC_EXT_V2DF:
31825 case IX86_BUILTIN_VEC_EXT_V2DI:
31826 case IX86_BUILTIN_VEC_EXT_V4SF:
31827 case IX86_BUILTIN_VEC_EXT_V4SI:
31828 case IX86_BUILTIN_VEC_EXT_V8HI:
31829 case IX86_BUILTIN_VEC_EXT_V2SI:
31830 case IX86_BUILTIN_VEC_EXT_V4HI:
31831 case IX86_BUILTIN_VEC_EXT_V16QI:
31832 return ix86_expand_vec_ext_builtin (exp, target);
31834 case IX86_BUILTIN_VEC_SET_V2DI:
31835 case IX86_BUILTIN_VEC_SET_V4SF:
31836 case IX86_BUILTIN_VEC_SET_V4SI:
31837 case IX86_BUILTIN_VEC_SET_V8HI:
31838 case IX86_BUILTIN_VEC_SET_V4HI:
31839 case IX86_BUILTIN_VEC_SET_V16QI:
31840 return ix86_expand_vec_set_builtin (exp);
31842 case IX86_BUILTIN_INFQ:
31843 case IX86_BUILTIN_HUGE_VALQ:
31845 REAL_VALUE_TYPE inf;
31846 rtx tmp;
31848 real_inf (&inf);
31849 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31851 tmp = validize_mem (force_const_mem (mode, tmp));
31853 if (target == 0)
31854 target = gen_reg_rtx (mode);
31856 emit_move_insn (target, tmp);
31857 return target;
31860 case IX86_BUILTIN_RDPMC:
31861 case IX86_BUILTIN_RDTSC:
31862 case IX86_BUILTIN_RDTSCP:
31864 op0 = gen_reg_rtx (DImode);
31865 op1 = gen_reg_rtx (DImode);
31867 if (fcode == IX86_BUILTIN_RDPMC)
31869 arg0 = CALL_EXPR_ARG (exp, 0);
31870 op2 = expand_normal (arg0);
31871 if (!register_operand (op2, SImode))
31872 op2 = copy_to_mode_reg (SImode, op2);
31874 insn = (TARGET_64BIT
31875 ? gen_rdpmc_rex64 (op0, op1, op2)
31876 : gen_rdpmc (op0, op2));
31877 emit_insn (insn);
31879 else if (fcode == IX86_BUILTIN_RDTSC)
31881 insn = (TARGET_64BIT
31882 ? gen_rdtsc_rex64 (op0, op1)
31883 : gen_rdtsc (op0));
31884 emit_insn (insn);
31886 else
31888 op2 = gen_reg_rtx (SImode);
31890 insn = (TARGET_64BIT
31891 ? gen_rdtscp_rex64 (op0, op1, op2)
31892 : gen_rdtscp (op0, op2));
31893 emit_insn (insn);
31895 arg0 = CALL_EXPR_ARG (exp, 0);
31896 op4 = expand_normal (arg0);
31897 if (!address_operand (op4, VOIDmode))
31899 op4 = convert_memory_address (Pmode, op4);
31900 op4 = copy_addr_to_reg (op4);
31902 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31905 if (target == 0)
31906 target = gen_reg_rtx (mode);
31908 if (TARGET_64BIT)
31910 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31911 op1, 1, OPTAB_DIRECT);
31912 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31913 op0, 1, OPTAB_DIRECT);
31916 emit_move_insn (target, op0);
31917 return target;
31919 case IX86_BUILTIN_FXSAVE:
31920 case IX86_BUILTIN_FXRSTOR:
31921 case IX86_BUILTIN_FXSAVE64:
31922 case IX86_BUILTIN_FXRSTOR64:
31923 switch (fcode)
31925 case IX86_BUILTIN_FXSAVE:
31926 icode = CODE_FOR_fxsave;
31927 break;
31928 case IX86_BUILTIN_FXRSTOR:
31929 icode = CODE_FOR_fxrstor;
31930 break;
31931 case IX86_BUILTIN_FXSAVE64:
31932 icode = CODE_FOR_fxsave64;
31933 break;
31934 case IX86_BUILTIN_FXRSTOR64:
31935 icode = CODE_FOR_fxrstor64;
31936 break;
31937 default:
31938 gcc_unreachable ();
31941 arg0 = CALL_EXPR_ARG (exp, 0);
31942 op0 = expand_normal (arg0);
31944 if (!address_operand (op0, VOIDmode))
31946 op0 = convert_memory_address (Pmode, op0);
31947 op0 = copy_addr_to_reg (op0);
31949 op0 = gen_rtx_MEM (BLKmode, op0);
31951 pat = GEN_FCN (icode) (op0);
31952 if (pat)
31953 emit_insn (pat);
31954 return 0;
31956 case IX86_BUILTIN_XSAVE:
31957 case IX86_BUILTIN_XRSTOR:
31958 case IX86_BUILTIN_XSAVE64:
31959 case IX86_BUILTIN_XRSTOR64:
31960 case IX86_BUILTIN_XSAVEOPT:
31961 case IX86_BUILTIN_XSAVEOPT64:
31962 arg0 = CALL_EXPR_ARG (exp, 0);
31963 arg1 = CALL_EXPR_ARG (exp, 1);
31964 op0 = expand_normal (arg0);
31965 op1 = expand_normal (arg1);
31967 if (!address_operand (op0, VOIDmode))
31969 op0 = convert_memory_address (Pmode, op0);
31970 op0 = copy_addr_to_reg (op0);
31972 op0 = gen_rtx_MEM (BLKmode, op0);
31974 op1 = force_reg (DImode, op1);
31976 if (TARGET_64BIT)
31978 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31979 NULL, 1, OPTAB_DIRECT);
31980 switch (fcode)
31982 case IX86_BUILTIN_XSAVE:
31983 icode = CODE_FOR_xsave_rex64;
31984 break;
31985 case IX86_BUILTIN_XRSTOR:
31986 icode = CODE_FOR_xrstor_rex64;
31987 break;
31988 case IX86_BUILTIN_XSAVE64:
31989 icode = CODE_FOR_xsave64;
31990 break;
31991 case IX86_BUILTIN_XRSTOR64:
31992 icode = CODE_FOR_xrstor64;
31993 break;
31994 case IX86_BUILTIN_XSAVEOPT:
31995 icode = CODE_FOR_xsaveopt_rex64;
31996 break;
31997 case IX86_BUILTIN_XSAVEOPT64:
31998 icode = CODE_FOR_xsaveopt64;
31999 break;
32000 default:
32001 gcc_unreachable ();
32004 op2 = gen_lowpart (SImode, op2);
32005 op1 = gen_lowpart (SImode, op1);
32006 pat = GEN_FCN (icode) (op0, op1, op2);
32008 else
32010 switch (fcode)
32012 case IX86_BUILTIN_XSAVE:
32013 icode = CODE_FOR_xsave;
32014 break;
32015 case IX86_BUILTIN_XRSTOR:
32016 icode = CODE_FOR_xrstor;
32017 break;
32018 case IX86_BUILTIN_XSAVEOPT:
32019 icode = CODE_FOR_xsaveopt;
32020 break;
32021 default:
32022 gcc_unreachable ();
32024 pat = GEN_FCN (icode) (op0, op1);
32027 if (pat)
32028 emit_insn (pat);
32029 return 0;
32031 case IX86_BUILTIN_LLWPCB:
32032 arg0 = CALL_EXPR_ARG (exp, 0);
32033 op0 = expand_normal (arg0);
32034 icode = CODE_FOR_lwp_llwpcb;
32035 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32036 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32037 emit_insn (gen_lwp_llwpcb (op0));
32038 return 0;
32040 case IX86_BUILTIN_SLWPCB:
32041 icode = CODE_FOR_lwp_slwpcb;
32042 if (!target
32043 || !insn_data[icode].operand[0].predicate (target, Pmode))
32044 target = gen_reg_rtx (Pmode);
32045 emit_insn (gen_lwp_slwpcb (target));
32046 return target;
32048 case IX86_BUILTIN_BEXTRI32:
32049 case IX86_BUILTIN_BEXTRI64:
32050 arg0 = CALL_EXPR_ARG (exp, 0);
32051 arg1 = CALL_EXPR_ARG (exp, 1);
32052 op0 = expand_normal (arg0);
32053 op1 = expand_normal (arg1);
32054 icode = (fcode == IX86_BUILTIN_BEXTRI32
32055 ? CODE_FOR_tbm_bextri_si
32056 : CODE_FOR_tbm_bextri_di);
32057 if (!CONST_INT_P (op1))
32059 error ("last argument must be an immediate");
32060 return const0_rtx;
32062 else
32064 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32065 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32066 op1 = GEN_INT (length);
32067 op2 = GEN_INT (lsb_index);
32068 pat = GEN_FCN (icode) (target, op0, op1, op2);
32069 if (pat)
32070 emit_insn (pat);
32071 return target;
32074 case IX86_BUILTIN_RDRAND16_STEP:
32075 icode = CODE_FOR_rdrandhi_1;
32076 mode0 = HImode;
32077 goto rdrand_step;
32079 case IX86_BUILTIN_RDRAND32_STEP:
32080 icode = CODE_FOR_rdrandsi_1;
32081 mode0 = SImode;
32082 goto rdrand_step;
32084 case IX86_BUILTIN_RDRAND64_STEP:
32085 icode = CODE_FOR_rdranddi_1;
32086 mode0 = DImode;
32088 rdrand_step:
32089 op0 = gen_reg_rtx (mode0);
32090 emit_insn (GEN_FCN (icode) (op0));
32092 arg0 = CALL_EXPR_ARG (exp, 0);
32093 op1 = expand_normal (arg0);
32094 if (!address_operand (op1, VOIDmode))
32096 op1 = convert_memory_address (Pmode, op1);
32097 op1 = copy_addr_to_reg (op1);
32099 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32101 op1 = gen_reg_rtx (SImode);
32102 emit_move_insn (op1, CONST1_RTX (SImode));
32104 /* Emit SImode conditional move. */
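/* Illustrative note: rdrand reports failure by clearing CF and, per the
   ISA documentation, zeroing the destination register, so the conditional
   move below yields the constant 1 when CF is set (success) and the
   zeroed value otherwise, giving the *_step builtin its nonzero-on-success
   return value.  */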
32105 if (mode0 == HImode)
32107 op2 = gen_reg_rtx (SImode);
32108 emit_insn (gen_zero_extendhisi2 (op2, op0));
32110 else if (mode0 == SImode)
32111 op2 = op0;
32112 else
32113 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32115 if (target == 0)
32116 target = gen_reg_rtx (SImode);
32118 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32119 const0_rtx);
32120 emit_insn (gen_rtx_SET (VOIDmode, target,
32121 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32122 return target;
32124 case IX86_BUILTIN_RDSEED16_STEP:
32125 icode = CODE_FOR_rdseedhi_1;
32126 mode0 = HImode;
32127 goto rdseed_step;
32129 case IX86_BUILTIN_RDSEED32_STEP:
32130 icode = CODE_FOR_rdseedsi_1;
32131 mode0 = SImode;
32132 goto rdseed_step;
32134 case IX86_BUILTIN_RDSEED64_STEP:
32135 icode = CODE_FOR_rdseeddi_1;
32136 mode0 = DImode;
32138 rdseed_step:
32139 op0 = gen_reg_rtx (mode0);
32140 emit_insn (GEN_FCN (icode) (op0));
32142 arg0 = CALL_EXPR_ARG (exp, 0);
32143 op1 = expand_normal (arg0);
32144 if (!address_operand (op1, VOIDmode))
32146 op1 = convert_memory_address (Pmode, op1);
32147 op1 = copy_addr_to_reg (op1);
32149 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32151 op2 = gen_reg_rtx (QImode);
32153 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32154 const0_rtx);
32155 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32157 if (target == 0)
32158 target = gen_reg_rtx (SImode);
32160 emit_insn (gen_zero_extendqisi2 (target, op2));
32161 return target;
32163 case IX86_BUILTIN_ADDCARRYX32:
32164 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32165 mode0 = SImode;
32166 goto addcarryx;
32168 case IX86_BUILTIN_ADDCARRYX64:
32169 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32170 mode0 = DImode;
32172 addcarryx:
32173 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32174 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32175 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32176 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32178 op0 = gen_reg_rtx (QImode);
32180 /* Generate CF from input operand. */
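/* Illustrative note: the QImode add of C_IN and -1 (0xff) below produces
   a carry-out exactly when C_IN is nonzero, which materializes the
   incoming carry flag for the adc/adcx pattern without needing a
   separate compare.  */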
32181 op1 = expand_normal (arg0);
32182 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32183 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32185 /* Generate an ADCX (or plain ADC) instruction to compute X+Y+CF. */
32186 op2 = expand_normal (arg1);
32187 op3 = expand_normal (arg2);
32189 if (!REG_P (op2))
32190 op2 = copy_to_mode_reg (mode0, op2);
32191 if (!REG_P (op3))
32192 op3 = copy_to_mode_reg (mode0, op3);
32194 op0 = gen_reg_rtx (mode0);
32196 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32197 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32198 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32200 /* Store the result. */
32201 op4 = expand_normal (arg3);
32202 if (!address_operand (op4, VOIDmode))
32204 op4 = convert_memory_address (Pmode, op4);
32205 op4 = copy_addr_to_reg (op4);
32207 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32209 /* Return current CF value. */
32210 if (target == 0)
32211 target = gen_reg_rtx (QImode);
32213 PUT_MODE (pat, QImode);
32214 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32215 return target;
32217 case IX86_BUILTIN_GATHERSIV2DF:
32218 icode = CODE_FOR_avx2_gathersiv2df;
32219 goto gather_gen;
32220 case IX86_BUILTIN_GATHERSIV4DF:
32221 icode = CODE_FOR_avx2_gathersiv4df;
32222 goto gather_gen;
32223 case IX86_BUILTIN_GATHERDIV2DF:
32224 icode = CODE_FOR_avx2_gatherdiv2df;
32225 goto gather_gen;
32226 case IX86_BUILTIN_GATHERDIV4DF:
32227 icode = CODE_FOR_avx2_gatherdiv4df;
32228 goto gather_gen;
32229 case IX86_BUILTIN_GATHERSIV4SF:
32230 icode = CODE_FOR_avx2_gathersiv4sf;
32231 goto gather_gen;
32232 case IX86_BUILTIN_GATHERSIV8SF:
32233 icode = CODE_FOR_avx2_gathersiv8sf;
32234 goto gather_gen;
32235 case IX86_BUILTIN_GATHERDIV4SF:
32236 icode = CODE_FOR_avx2_gatherdiv4sf;
32237 goto gather_gen;
32238 case IX86_BUILTIN_GATHERDIV8SF:
32239 icode = CODE_FOR_avx2_gatherdiv8sf;
32240 goto gather_gen;
32241 case IX86_BUILTIN_GATHERSIV2DI:
32242 icode = CODE_FOR_avx2_gathersiv2di;
32243 goto gather_gen;
32244 case IX86_BUILTIN_GATHERSIV4DI:
32245 icode = CODE_FOR_avx2_gathersiv4di;
32246 goto gather_gen;
32247 case IX86_BUILTIN_GATHERDIV2DI:
32248 icode = CODE_FOR_avx2_gatherdiv2di;
32249 goto gather_gen;
32250 case IX86_BUILTIN_GATHERDIV4DI:
32251 icode = CODE_FOR_avx2_gatherdiv4di;
32252 goto gather_gen;
32253 case IX86_BUILTIN_GATHERSIV4SI:
32254 icode = CODE_FOR_avx2_gathersiv4si;
32255 goto gather_gen;
32256 case IX86_BUILTIN_GATHERSIV8SI:
32257 icode = CODE_FOR_avx2_gathersiv8si;
32258 goto gather_gen;
32259 case IX86_BUILTIN_GATHERDIV4SI:
32260 icode = CODE_FOR_avx2_gatherdiv4si;
32261 goto gather_gen;
32262 case IX86_BUILTIN_GATHERDIV8SI:
32263 icode = CODE_FOR_avx2_gatherdiv8si;
32264 goto gather_gen;
32265 case IX86_BUILTIN_GATHERALTSIV4DF:
32266 icode = CODE_FOR_avx2_gathersiv4df;
32267 goto gather_gen;
32268 case IX86_BUILTIN_GATHERALTDIV8SF:
32269 icode = CODE_FOR_avx2_gatherdiv8sf;
32270 goto gather_gen;
32271 case IX86_BUILTIN_GATHERALTSIV4DI:
32272 icode = CODE_FOR_avx2_gathersiv4di;
32273 goto gather_gen;
32274 case IX86_BUILTIN_GATHERALTDIV8SI:
32275 icode = CODE_FOR_avx2_gatherdiv8si;
32276 goto gather_gen;
32278 gather_gen:
32279 arg0 = CALL_EXPR_ARG (exp, 0);
32280 arg1 = CALL_EXPR_ARG (exp, 1);
32281 arg2 = CALL_EXPR_ARG (exp, 2);
32282 arg3 = CALL_EXPR_ARG (exp, 3);
32283 arg4 = CALL_EXPR_ARG (exp, 4);
32284 op0 = expand_normal (arg0);
32285 op1 = expand_normal (arg1);
32286 op2 = expand_normal (arg2);
32287 op3 = expand_normal (arg3);
32288 op4 = expand_normal (arg4);
32289 /* Note the arg order is different from the operand order. */
32290 mode0 = insn_data[icode].operand[1].mode;
32291 mode2 = insn_data[icode].operand[3].mode;
32292 mode3 = insn_data[icode].operand[4].mode;
32293 mode4 = insn_data[icode].operand[5].mode;
32295 if (target == NULL_RTX
32296 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32297 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32298 else
32299 subtarget = target;
32301 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32302 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32304 rtx half = gen_reg_rtx (V4SImode);
32305 if (!nonimmediate_operand (op2, V8SImode))
32306 op2 = copy_to_mode_reg (V8SImode, op2);
32307 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32308 op2 = half;
32310 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32311 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32313 rtx (*gen) (rtx, rtx);
32314 rtx half = gen_reg_rtx (mode0);
32315 if (mode0 == V4SFmode)
32316 gen = gen_vec_extract_lo_v8sf;
32317 else
32318 gen = gen_vec_extract_lo_v8si;
32319 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32320 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32321 emit_insn (gen (half, op0));
32322 op0 = half;
32323 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32324 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32325 emit_insn (gen (half, op3));
32326 op3 = half;
32329 /* Force the memory operand to use only a base register here. But
32330 we don't want to do this for the memory operands of other builtin
32331 functions. */
32332 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32334 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32335 op0 = copy_to_mode_reg (mode0, op0);
32336 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32337 op1 = copy_to_mode_reg (Pmode, op1);
32338 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32339 op2 = copy_to_mode_reg (mode2, op2);
32340 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32341 op3 = copy_to_mode_reg (mode3, op3);
32342 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32344 error ("last argument must be scale 1, 2, 4, 8");
32345 return const0_rtx;
32348 /* Optimize. If the mask is known to have the high (sign) bit of
32349 every element set, replace op0 with pc_rtx to signal that the
32350 instruction overwrites the whole destination and doesn't use its
32351 previous contents. */
32352 if (optimize)
32354 if (TREE_CODE (arg3) == VECTOR_CST)
32356 unsigned int negative = 0;
32357 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32359 tree cst = VECTOR_CST_ELT (arg3, i);
32360 if (TREE_CODE (cst) == INTEGER_CST
32361 && tree_int_cst_sign_bit (cst))
32362 negative++;
32363 else if (TREE_CODE (cst) == REAL_CST
32364 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32365 negative++;
32367 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32368 op0 = pc_rtx;
32370 else if (TREE_CODE (arg3) == SSA_NAME)
32372 /* Also recognize a mask that is built like:
32373 __v2df src = _mm_setzero_pd ();
32374 __v2df mask = _mm_cmpeq_pd (src, src);
32376 __v8sf src = _mm256_setzero_ps ();
32377 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32378 as that is a cheaper way to load all ones into
32379 a register than having to load a constant from
32380 memory. */
32381 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32382 if (is_gimple_call (def_stmt))
32384 tree fndecl = gimple_call_fndecl (def_stmt);
32385 if (fndecl
32386 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32387 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32389 case IX86_BUILTIN_CMPPD:
32390 case IX86_BUILTIN_CMPPS:
32391 case IX86_BUILTIN_CMPPD256:
32392 case IX86_BUILTIN_CMPPS256:
32393 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32394 break;
32395 /* FALLTHRU */
32396 case IX86_BUILTIN_CMPEQPD:
32397 case IX86_BUILTIN_CMPEQPS:
32398 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32399 && initializer_zerop (gimple_call_arg (def_stmt,
32400 1)))
32401 op0 = pc_rtx;
32402 break;
32403 default:
32404 break;
32410 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32411 if (! pat)
32412 return const0_rtx;
32413 emit_insn (pat);
32415 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32416 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32418 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32419 ? V4SFmode : V4SImode;
32420 if (target == NULL_RTX)
32421 target = gen_reg_rtx (tmode);
32422 if (tmode == V4SFmode)
32423 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32424 else
32425 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32427 else
32428 target = subtarget;
32430 return target;
32432 case IX86_BUILTIN_XABORT:
32433 icode = CODE_FOR_xabort;
32434 arg0 = CALL_EXPR_ARG (exp, 0);
32435 op0 = expand_normal (arg0);
32436 mode0 = insn_data[icode].operand[0].mode;
32437 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32439 error ("the argument to xabort must be an 8-bit immediate");
32440 return const0_rtx;
32442 emit_insn (gen_xabort (op0));
32443 return 0;
32445 default:
32446 break;
32449 for (i = 0, d = bdesc_special_args;
32450 i < ARRAY_SIZE (bdesc_special_args);
32451 i++, d++)
32452 if (d->code == fcode)
32453 return ix86_expand_special_args_builtin (d, exp, target);
32455 for (i = 0, d = bdesc_args;
32456 i < ARRAY_SIZE (bdesc_args);
32457 i++, d++)
32458 if (d->code == fcode)
32459 switch (fcode)
32461 case IX86_BUILTIN_FABSQ:
32462 case IX86_BUILTIN_COPYSIGNQ:
32463 if (!TARGET_SSE)
32464 /* Emit a normal call if SSE isn't available. */
32465 return expand_call (exp, target, ignore);
32466 default:
32467 return ix86_expand_args_builtin (d, exp, target);
32470 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32471 if (d->code == fcode)
32472 return ix86_expand_sse_comi (d, exp, target);
32474 for (i = 0, d = bdesc_pcmpestr;
32475 i < ARRAY_SIZE (bdesc_pcmpestr);
32476 i++, d++)
32477 if (d->code == fcode)
32478 return ix86_expand_sse_pcmpestr (d, exp, target);
32480 for (i = 0, d = bdesc_pcmpistr;
32481 i < ARRAY_SIZE (bdesc_pcmpistr);
32482 i++, d++)
32483 if (d->code == fcode)
32484 return ix86_expand_sse_pcmpistr (d, exp, target);
32486 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32487 if (d->code == fcode)
32488 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32489 (enum ix86_builtin_func_type)
32490 d->flag, d->comparison);
32492 gcc_unreachable ();
32495 /* Returns a function decl for a vectorized version of the builtin function
32496 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32497 if it is not available. */
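/* Illustrative note: e.g. when the vectorizer asks for sqrt on
   two-element double vectors (both TYPE_OUT and TYPE_IN are V2DF), the
   switch below hands back ix86_builtins[IX86_BUILTIN_SQRTPD] so the loop
   uses sqrtpd directly.  */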
32499 static tree
32500 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32501 tree type_in)
32503 enum machine_mode in_mode, out_mode;
32504 int in_n, out_n;
32505 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32507 if (TREE_CODE (type_out) != VECTOR_TYPE
32508 || TREE_CODE (type_in) != VECTOR_TYPE
32509 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32510 return NULL_TREE;
32512 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32513 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32514 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32515 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32517 switch (fn)
32519 case BUILT_IN_SQRT:
32520 if (out_mode == DFmode && in_mode == DFmode)
32522 if (out_n == 2 && in_n == 2)
32523 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32524 else if (out_n == 4 && in_n == 4)
32525 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32527 break;
32529 case BUILT_IN_SQRTF:
32530 if (out_mode == SFmode && in_mode == SFmode)
32532 if (out_n == 4 && in_n == 4)
32533 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32534 else if (out_n == 8 && in_n == 8)
32535 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32537 break;
32539 case BUILT_IN_IFLOOR:
32540 case BUILT_IN_LFLOOR:
32541 case BUILT_IN_LLFLOOR:
32542 /* The round insn does not trap on denormals. */
32543 if (flag_trapping_math || !TARGET_ROUND)
32544 break;
32546 if (out_mode == SImode && in_mode == DFmode)
32548 if (out_n == 4 && in_n == 2)
32549 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32550 else if (out_n == 8 && in_n == 4)
32551 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32553 break;
32555 case BUILT_IN_IFLOORF:
32556 case BUILT_IN_LFLOORF:
32557 case BUILT_IN_LLFLOORF:
32558 /* The round insn does not trap on denormals. */
32559 if (flag_trapping_math || !TARGET_ROUND)
32560 break;
32562 if (out_mode == SImode && in_mode == SFmode)
32564 if (out_n == 4 && in_n == 4)
32565 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32566 else if (out_n == 8 && in_n == 8)
32567 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32569 break;
32571 case BUILT_IN_ICEIL:
32572 case BUILT_IN_LCEIL:
32573 case BUILT_IN_LLCEIL:
32574 /* The round insn does not trap on denormals. */
32575 if (flag_trapping_math || !TARGET_ROUND)
32576 break;
32578 if (out_mode == SImode && in_mode == DFmode)
32580 if (out_n == 4 && in_n == 2)
32581 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32582 else if (out_n == 8 && in_n == 4)
32583 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32585 break;
32587 case BUILT_IN_ICEILF:
32588 case BUILT_IN_LCEILF:
32589 case BUILT_IN_LLCEILF:
32590 /* The round insn does not trap on denormals. */
32591 if (flag_trapping_math || !TARGET_ROUND)
32592 break;
32594 if (out_mode == SImode && in_mode == SFmode)
32596 if (out_n == 4 && in_n == 4)
32597 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32598 else if (out_n == 8 && in_n == 8)
32599 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32601 break;
32603 case BUILT_IN_IRINT:
32604 case BUILT_IN_LRINT:
32605 case BUILT_IN_LLRINT:
32606 if (out_mode == SImode && in_mode == DFmode)
32608 if (out_n == 4 && in_n == 2)
32609 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32610 else if (out_n == 8 && in_n == 4)
32611 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32613 break;
32615 case BUILT_IN_IRINTF:
32616 case BUILT_IN_LRINTF:
32617 case BUILT_IN_LLRINTF:
32618 if (out_mode == SImode && in_mode == SFmode)
32620 if (out_n == 4 && in_n == 4)
32621 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32622 else if (out_n == 8 && in_n == 8)
32623 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32625 break;
32627 case BUILT_IN_IROUND:
32628 case BUILT_IN_LROUND:
32629 case BUILT_IN_LLROUND:
32630 /* The round insn does not trap on denormals. */
32631 if (flag_trapping_math || !TARGET_ROUND)
32632 break;
32634 if (out_mode == SImode && in_mode == DFmode)
32636 if (out_n == 4 && in_n == 2)
32637 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32638 else if (out_n == 8 && in_n == 4)
32639 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32641 break;
32643 case BUILT_IN_IROUNDF:
32644 case BUILT_IN_LROUNDF:
32645 case BUILT_IN_LLROUNDF:
32646 /* The round insn does not trap on denormals. */
32647 if (flag_trapping_math || !TARGET_ROUND)
32648 break;
32650 if (out_mode == SImode && in_mode == SFmode)
32652 if (out_n == 4 && in_n == 4)
32653 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32654 else if (out_n == 8 && in_n == 8)
32655 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32657 break;
32659 case BUILT_IN_COPYSIGN:
32660 if (out_mode == DFmode && in_mode == DFmode)
32662 if (out_n == 2 && in_n == 2)
32663 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32664 else if (out_n == 4 && in_n == 4)
32665 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32667 break;
32669 case BUILT_IN_COPYSIGNF:
32670 if (out_mode == SFmode && in_mode == SFmode)
32672 if (out_n == 4 && in_n == 4)
32673 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32674 else if (out_n == 8 && in_n == 8)
32675 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32677 break;
32679 case BUILT_IN_FLOOR:
32680 /* The round insn does not trap on denormals. */
32681 if (flag_trapping_math || !TARGET_ROUND)
32682 break;
32684 if (out_mode == DFmode && in_mode == DFmode)
32686 if (out_n == 2 && in_n == 2)
32687 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32688 else if (out_n == 4 && in_n == 4)
32689 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32691 break;
32693 case BUILT_IN_FLOORF:
32694 /* The round insn does not trap on denormals. */
32695 if (flag_trapping_math || !TARGET_ROUND)
32696 break;
32698 if (out_mode == SFmode && in_mode == SFmode)
32700 if (out_n == 4 && in_n == 4)
32701 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32702 else if (out_n == 8 && in_n == 8)
32703 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32705 break;
32707 case BUILT_IN_CEIL:
32708 /* The round insn does not trap on denormals. */
32709 if (flag_trapping_math || !TARGET_ROUND)
32710 break;
32712 if (out_mode == DFmode && in_mode == DFmode)
32714 if (out_n == 2 && in_n == 2)
32715 return ix86_builtins[IX86_BUILTIN_CEILPD];
32716 else if (out_n == 4 && in_n == 4)
32717 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32719 break;
32721 case BUILT_IN_CEILF:
32722 /* The round insn does not trap on denormals. */
32723 if (flag_trapping_math || !TARGET_ROUND)
32724 break;
32726 if (out_mode == SFmode && in_mode == SFmode)
32728 if (out_n == 4 && in_n == 4)
32729 return ix86_builtins[IX86_BUILTIN_CEILPS];
32730 else if (out_n == 8 && in_n == 8)
32731 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32733 break;
32735 case BUILT_IN_TRUNC:
32736 /* The round insn does not trap on denormals. */
32737 if (flag_trapping_math || !TARGET_ROUND)
32738 break;
32740 if (out_mode == DFmode && in_mode == DFmode)
32742 if (out_n == 2 && in_n == 2)
32743 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32744 else if (out_n == 4 && in_n == 4)
32745 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32747 break;
32749 case BUILT_IN_TRUNCF:
32750 /* The round insn does not trap on denormals. */
32751 if (flag_trapping_math || !TARGET_ROUND)
32752 break;
32754 if (out_mode == SFmode && in_mode == SFmode)
32756 if (out_n == 4 && in_n == 4)
32757 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32758 else if (out_n == 8 && in_n == 8)
32759 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32761 break;
32763 case BUILT_IN_RINT:
32764 /* The round insn does not trap on denormals. */
32765 if (flag_trapping_math || !TARGET_ROUND)
32766 break;
32768 if (out_mode == DFmode && in_mode == DFmode)
32770 if (out_n == 2 && in_n == 2)
32771 return ix86_builtins[IX86_BUILTIN_RINTPD];
32772 else if (out_n == 4 && in_n == 4)
32773 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32775 break;
32777 case BUILT_IN_RINTF:
32778 /* The round insn does not trap on denormals. */
32779 if (flag_trapping_math || !TARGET_ROUND)
32780 break;
32782 if (out_mode == SFmode && in_mode == SFmode)
32784 if (out_n == 4 && in_n == 4)
32785 return ix86_builtins[IX86_BUILTIN_RINTPS];
32786 else if (out_n == 8 && in_n == 8)
32787 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32789 break;
32791 case BUILT_IN_ROUND:
32792 /* The round insn does not trap on denormals. */
32793 if (flag_trapping_math || !TARGET_ROUND)
32794 break;
32796 if (out_mode == DFmode && in_mode == DFmode)
32798 if (out_n == 2 && in_n == 2)
32799 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32800 else if (out_n == 4 && in_n == 4)
32801 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32803 break;
32805 case BUILT_IN_ROUNDF:
32806 /* The round insn does not trap on denormals. */
32807 if (flag_trapping_math || !TARGET_ROUND)
32808 break;
32810 if (out_mode == SFmode && in_mode == SFmode)
32812 if (out_n == 4 && in_n == 4)
32813 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32814 else if (out_n == 8 && in_n == 8)
32815 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32817 break;
32819 case BUILT_IN_FMA:
32820 if (out_mode == DFmode && in_mode == DFmode)
32822 if (out_n == 2 && in_n == 2)
32823 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32824 if (out_n == 4 && in_n == 4)
32825 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32827 break;
32829 case BUILT_IN_FMAF:
32830 if (out_mode == SFmode && in_mode == SFmode)
32832 if (out_n == 4 && in_n == 4)
32833 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32834 if (out_n == 8 && in_n == 8)
32835 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32837 break;
32839 default:
32840 break;
32843 /* Dispatch to a handler for a vectorization library. */
32844 if (ix86_veclib_handler)
32845 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32846 type_in);
32848 return NULL_TREE;
32851 /* Handler for an SVML-style interface to
32852 a library with vectorized intrinsics. */
32854 static tree
32855 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32857 char name[20];
32858 tree fntype, new_fndecl, args;
32859 unsigned arity;
32860 const char *bname;
32861 enum machine_mode el_mode, in_mode;
32862 int n, in_n;
32864 /* The SVML is suitable for unsafe math only. */
32865 if (!flag_unsafe_math_optimizations)
32866 return NULL_TREE;
32868 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32869 n = TYPE_VECTOR_SUBPARTS (type_out);
32870 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32871 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32872 if (el_mode != in_mode
32873 || n != in_n)
32874 return NULL_TREE;
32876 switch (fn)
32878 case BUILT_IN_EXP:
32879 case BUILT_IN_LOG:
32880 case BUILT_IN_LOG10:
32881 case BUILT_IN_POW:
32882 case BUILT_IN_TANH:
32883 case BUILT_IN_TAN:
32884 case BUILT_IN_ATAN:
32885 case BUILT_IN_ATAN2:
32886 case BUILT_IN_ATANH:
32887 case BUILT_IN_CBRT:
32888 case BUILT_IN_SINH:
32889 case BUILT_IN_SIN:
32890 case BUILT_IN_ASINH:
32891 case BUILT_IN_ASIN:
32892 case BUILT_IN_COSH:
32893 case BUILT_IN_COS:
32894 case BUILT_IN_ACOSH:
32895 case BUILT_IN_ACOS:
32896 if (el_mode != DFmode || n != 2)
32897 return NULL_TREE;
32898 break;
32900 case BUILT_IN_EXPF:
32901 case BUILT_IN_LOGF:
32902 case BUILT_IN_LOG10F:
32903 case BUILT_IN_POWF:
32904 case BUILT_IN_TANHF:
32905 case BUILT_IN_TANF:
32906 case BUILT_IN_ATANF:
32907 case BUILT_IN_ATAN2F:
32908 case BUILT_IN_ATANHF:
32909 case BUILT_IN_CBRTF:
32910 case BUILT_IN_SINHF:
32911 case BUILT_IN_SINF:
32912 case BUILT_IN_ASINHF:
32913 case BUILT_IN_ASINF:
32914 case BUILT_IN_COSHF:
32915 case BUILT_IN_COSF:
32916 case BUILT_IN_ACOSHF:
32917 case BUILT_IN_ACOSF:
32918 if (el_mode != SFmode || n != 4)
32919 return NULL_TREE;
32920 break;
32922 default:
32923 return NULL_TREE;
32926 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32928 if (fn == BUILT_IN_LOGF)
32929 strcpy (name, "vmlsLn4");
32930 else if (fn == BUILT_IN_LOG)
32931 strcpy (name, "vmldLn2");
32932 else if (n == 4)
32934 sprintf (name, "vmls%s", bname+10);
32935 name[strlen (name)-1] = '4';
32937 else
32938 sprintf (name, "vmld%s2", bname+10);
32940 /* Convert to uppercase. */
32941 name[4] &= ~0x20;
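/* A worked example of the mangling above, assuming the implicit decl
   names carry the usual "__builtin_" prefix that bname+10 skips over:
   BUILT_IN_SINF (n == 4) gives "vmlssinf" -> "vmlssin4" -> "vmlsSin4",
   and BUILT_IN_SIN gives "vmldsin2" -> "vmldSin2".  BUILT_IN_LOGF and
   BUILT_IN_LOG are special-cased earlier, presumably because the SVML
   names use "Ln" rather than "Log".  */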
32943 arity = 0;
32944 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32945 args;
32946 args = TREE_CHAIN (args))
32947 arity++;
32949 if (arity == 1)
32950 fntype = build_function_type_list (type_out, type_in, NULL);
32951 else
32952 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32954 /* Build a function declaration for the vectorized function. */
32955 new_fndecl = build_decl (BUILTINS_LOCATION,
32956 FUNCTION_DECL, get_identifier (name), fntype);
32957 TREE_PUBLIC (new_fndecl) = 1;
32958 DECL_EXTERNAL (new_fndecl) = 1;
32959 DECL_IS_NOVOPS (new_fndecl) = 1;
32960 TREE_READONLY (new_fndecl) = 1;
32962 return new_fndecl;
32965 /* Handler for an ACML-style interface to
32966 a library with vectorized intrinsics. */
32968 static tree
32969 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32971 char name[20] = "__vr.._";
32972 tree fntype, new_fndecl, args;
32973 unsigned arity;
32974 const char *bname;
32975 enum machine_mode el_mode, in_mode;
32976 int n, in_n;
32978 /* The ACML is 64-bit only and suitable for unsafe math only, as
32979 it does not correctly support parts of IEEE with the required
32980 precision such as denormals. */
32981 if (!TARGET_64BIT
32982 || !flag_unsafe_math_optimizations)
32983 return NULL_TREE;
32985 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32986 n = TYPE_VECTOR_SUBPARTS (type_out);
32987 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32988 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32989 if (el_mode != in_mode
32990 || n != in_n)
32991 return NULL_TREE;
32993 switch (fn)
32995 case BUILT_IN_SIN:
32996 case BUILT_IN_COS:
32997 case BUILT_IN_EXP:
32998 case BUILT_IN_LOG:
32999 case BUILT_IN_LOG2:
33000 case BUILT_IN_LOG10:
33001 name[4] = 'd';
33002 name[5] = '2';
33003 if (el_mode != DFmode
33004 || n != 2)
33005 return NULL_TREE;
33006 break;
33008 case BUILT_IN_SINF:
33009 case BUILT_IN_COSF:
33010 case BUILT_IN_EXPF:
33011 case BUILT_IN_POWF:
33012 case BUILT_IN_LOGF:
33013 case BUILT_IN_LOG2F:
33014 case BUILT_IN_LOG10F:
33015 name[4] = 's';
33016 name[5] = '4';
33017 if (el_mode != SFmode
33018 || n != 4)
33019 return NULL_TREE;
33020 break;
33022 default:
33023 return NULL_TREE;
33026 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33027 sprintf (name + 7, "%s", bname+10);
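/* A worked example of the mangling above: BUILT_IN_SIN becomes
   "__vrd2_sin" and BUILT_IN_SINF becomes "__vrs4_sinf", assuming the
   implicit decl names carry the usual "__builtin_" prefix that
   bname+10 skips over.  */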
33029 arity = 0;
33030 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33031 args;
33032 args = TREE_CHAIN (args))
33033 arity++;
33035 if (arity == 1)
33036 fntype = build_function_type_list (type_out, type_in, NULL);
33037 else
33038 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33040 /* Build a function declaration for the vectorized function. */
33041 new_fndecl = build_decl (BUILTINS_LOCATION,
33042 FUNCTION_DECL, get_identifier (name), fntype);
33043 TREE_PUBLIC (new_fndecl) = 1;
33044 DECL_EXTERNAL (new_fndecl) = 1;
33045 DECL_IS_NOVOPS (new_fndecl) = 1;
33046 TREE_READONLY (new_fndecl) = 1;
33048 return new_fndecl;
33051 /* Returns a decl of a function that implements gather load with
33052 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
33053 Return NULL_TREE if it is not available. */
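/* For example, a gather of V4SFmode elements with an SImode index type
   maps to IX86_BUILTIN_GATHERSIV4SF (vgatherdps), while a DImode index
   type maps to IX86_BUILTIN_GATHERDIV4SF (vgatherqps); see the switch
   below.  */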
33055 static tree
33056 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33057 const_tree index_type, int scale)
33059 bool si;
33060 enum ix86_builtins code;
33062 if (! TARGET_AVX2)
33063 return NULL_TREE;
33065 if ((TREE_CODE (index_type) != INTEGER_TYPE
33066 && !POINTER_TYPE_P (index_type))
33067 || (TYPE_MODE (index_type) != SImode
33068 && TYPE_MODE (index_type) != DImode))
33069 return NULL_TREE;
33071 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33072 return NULL_TREE;
33074 /* v*gather* insn sign extends index to pointer mode. */
33075 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33076 && TYPE_UNSIGNED (index_type))
33077 return NULL_TREE;
33079 if (scale <= 0
33080 || scale > 8
33081 || (scale & (scale - 1)) != 0)
33082 return NULL_TREE;
33084 si = TYPE_MODE (index_type) == SImode;
33085 switch (TYPE_MODE (mem_vectype))
33087 case V2DFmode:
33088 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33089 break;
33090 case V4DFmode:
33091 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33092 break;
33093 case V2DImode:
33094 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33095 break;
33096 case V4DImode:
33097 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33098 break;
33099 case V4SFmode:
33100 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33101 break;
33102 case V8SFmode:
33103 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33104 break;
33105 case V4SImode:
33106 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33107 break;
33108 case V8SImode:
33109 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33110 break;
33111 default:
33112 return NULL_TREE;
33115 return ix86_builtins[code];
33118 /* Returns a decl of a target-specific builtin that implements the
33119 reciprocal of the function FN, or NULL_TREE if it is not available. */
33121 static tree
33122 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33123 bool sqrt ATTRIBUTE_UNUSED)
33125 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33126 && flag_finite_math_only && !flag_trapping_math
33127 && flag_unsafe_math_optimizations))
33128 return NULL_TREE;
33130 if (md_fn)
33131 /* Machine dependent builtins. */
33132 switch (fn)
33134 /* Vectorized version of sqrt to rsqrt conversion. */
33135 case IX86_BUILTIN_SQRTPS_NR:
33136 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33138 case IX86_BUILTIN_SQRTPS_NR256:
33139 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33141 default:
33142 return NULL_TREE;
33144 else
33145 /* Normal builtins. */
33146 switch (fn)
33148 /* Sqrt to rsqrt conversion. */
33149 case BUILT_IN_SQRTF:
33150 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33152 default:
33153 return NULL_TREE;
33157 /* Helper for avx_vpermilps256_operand et al. This is also used by
33158 the expansion functions to turn the parallel back into a mask.
33159 The return value is 0 for no match and the imm8+1 for a match. */
33162 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33164 unsigned i, nelt = GET_MODE_NUNITS (mode);
33165 unsigned mask = 0;
33166 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33168 if (XVECLEN (par, 0) != (int) nelt)
33169 return 0;
33171 /* Validate that all of the elements are constants, and not totally
33172 out of range. Copy the data into an integral array to make the
33173 subsequent checks easier. */
33174 for (i = 0; i < nelt; ++i)
33176 rtx er = XVECEXP (par, 0, i);
33177 unsigned HOST_WIDE_INT ei;
33179 if (!CONST_INT_P (er))
33180 return 0;
33181 ei = INTVAL (er);
33182 if (ei >= nelt)
33183 return 0;
33184 ipar[i] = ei;
33187 switch (mode)
33189 case V4DFmode:
33190 /* In the 256-bit DFmode case, we can only move elements within
33191 a 128-bit lane. */
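/* For example, the within-lane swap (parallel [1 0 3 2]) produces
   mask 0x05, and the function returns 0x06 (mask + 1).  */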
33192 for (i = 0; i < 2; ++i)
33194 if (ipar[i] >= 2)
33195 return 0;
33196 mask |= ipar[i] << i;
33198 for (i = 2; i < 4; ++i)
33200 if (ipar[i] < 2)
33201 return 0;
33202 mask |= (ipar[i] - 2) << i;
33204 break;
33206 case V8SFmode:
33207 /* In the 256-bit SFmode case, we have full freedom of movement
33208 within the low 128-bit lane, but the high 128-bit lane must
33209 mirror the exact same pattern. */
33210 for (i = 0; i < 4; ++i)
33211 if (ipar[i] + 4 != ipar[i + 4])
33212 return 0;
33213 nelt = 4;
33214 /* FALLTHRU */
33216 case V2DFmode:
33217 case V4SFmode:
33218 /* In the 128-bit case, we've full freedom in the placement of
33219 the elements from the source operand. */
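/* For example, for V4SFmode each element gets two mask bits, so the
   full reversal (parallel [3 2 1 0]) produces mask 0x1b and a return
   value of 0x1c (mask + 1).  */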
33220 for (i = 0; i < nelt; ++i)
33221 mask |= ipar[i] << (i * (nelt / 2));
33222 break;
33224 default:
33225 gcc_unreachable ();
33228 /* Make sure success has a non-zero value by adding one. */
33229 return mask + 1;
33232 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33233 the expansion functions to turn the parallel back into a mask.
33234 The return value is 0 for no match and the imm8+1 for a match. */
33237 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33239 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33240 unsigned mask = 0;
33241 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33243 if (XVECLEN (par, 0) != (int) nelt)
33244 return 0;
33246 /* Validate that all of the elements are constants, and not totally
33247 out of range. Copy the data into an integral array to make the
33248 subsequent checks easier. */
33249 for (i = 0; i < nelt; ++i)
33251 rtx er = XVECEXP (par, 0, i);
33252 unsigned HOST_WIDE_INT ei;
33254 if (!CONST_INT_P (er))
33255 return 0;
33256 ei = INTVAL (er);
33257 if (ei >= 2 * nelt)
33258 return 0;
33259 ipar[i] = ei;
33262 /* Validate that each half of the permute selects consecutive elements. */
33263 for (i = 0; i < nelt2 - 1; ++i)
33264 if (ipar[i] + 1 != ipar[i + 1])
33265 return 0;
33266 for (i = nelt2; i < nelt - 1; ++i)
33267 if (ipar[i] + 1 != ipar[i + 1])
33268 return 0;
33270 /* Reconstruct the mask. */
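/* For example, for V8SFmode the parallel [8 9 10 11 0 1 2 3] selects
   the low lane of the second operand followed by the low lane of the
   first, giving mask 0x02 and a return value of 0x03 (mask + 1).  */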
33271 for (i = 0; i < 2; ++i)
33273 unsigned e = ipar[i * nelt2];
33274 if (e % nelt2)
33275 return 0;
33276 e /= nelt2;
33277 mask |= e << (i * 4);
33280 /* Make sure success has a non-zero value by adding one. */
33281 return mask + 1;
33284 /* Store OPERAND to the memory after reload is completed. This means
33285 that we can't easily use assign_stack_local. */
33287 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33289 rtx result;
33291 gcc_assert (reload_completed);
33292 if (ix86_using_red_zone ())
33294 result = gen_rtx_MEM (mode,
33295 gen_rtx_PLUS (Pmode,
33296 stack_pointer_rtx,
33297 GEN_INT (-RED_ZONE_SIZE)));
33298 emit_move_insn (result, operand);
33300 else if (TARGET_64BIT)
33302 switch (mode)
33304 case HImode:
33305 case SImode:
33306 operand = gen_lowpart (DImode, operand);
33307 /* FALLTHRU */
33308 case DImode:
33309 emit_insn (
33310 gen_rtx_SET (VOIDmode,
33311 gen_rtx_MEM (DImode,
33312 gen_rtx_PRE_DEC (DImode,
33313 stack_pointer_rtx)),
33314 operand));
33315 break;
33316 default:
33317 gcc_unreachable ();
33319 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33321 else
33323 switch (mode)
33325 case DImode:
33327 rtx operands[2];
33328 split_double_mode (mode, &operand, 1, operands, operands + 1);
33329 emit_insn (
33330 gen_rtx_SET (VOIDmode,
33331 gen_rtx_MEM (SImode,
33332 gen_rtx_PRE_DEC (Pmode,
33333 stack_pointer_rtx)),
33334 operands[1]));
33335 emit_insn (
33336 gen_rtx_SET (VOIDmode,
33337 gen_rtx_MEM (SImode,
33338 gen_rtx_PRE_DEC (Pmode,
33339 stack_pointer_rtx)),
33340 operands[0]));
33342 break;
33343 case HImode:
33344 /* Store HImodes as SImodes. */
33345 operand = gen_lowpart (SImode, operand);
33346 /* FALLTHRU */
33347 case SImode:
33348 emit_insn (
33349 gen_rtx_SET (VOIDmode,
33350 gen_rtx_MEM (GET_MODE (operand),
33351 gen_rtx_PRE_DEC (SImode,
33352 stack_pointer_rtx)),
33353 operand));
33354 break;
33355 default:
33356 gcc_unreachable ();
33358 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33360 return result;
33363 /* Free operand from the memory. */
33364 void
33365 ix86_free_from_memory (enum machine_mode mode)
33367 if (!ix86_using_red_zone ())
33369 int size;
33371 if (mode == DImode || TARGET_64BIT)
33372 size = 8;
33373 else
33374 size = 4;
33375 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33376 to pop or add instruction if registers are available. */
33377 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33378 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33379 GEN_INT (size))));
33383 /* Return a register priority for hard reg REGNO. */
33384 static int
33385 ix86_register_priority (int hard_regno)
33387 /* ebp and r13 as the base always want a displacement, and r12 as the
33388 base always wants an index. So discourage their usage in an
33389 address. */
33390 if (hard_regno == R12_REG || hard_regno == R13_REG)
33391 return 0;
33392 if (hard_regno == BP_REG)
33393 return 1;
33394 /* New x86-64 int registers result in bigger code size. Discourage
33395 them. */
33396 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33397 return 2;
33398 /* New x86-64 SSE registers result in bigger code size. Discourage
33399 them. */
33400 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33401 return 2;
33402 /* Usage of AX register results in smaller code. Prefer it. */
33403 if (hard_regno == 0)
33404 return 4;
33405 return 3;
33408 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33410 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33411 QImode must go into class Q_REGS.
33412 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33413 movdf to do mem-to-mem moves through integer regs. */
33415 static reg_class_t
33416 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33418 enum machine_mode mode = GET_MODE (x);
33420 /* We're only allowed to return a subclass of CLASS. Many of the
33421 following checks fail for NO_REGS, so eliminate that early. */
33422 if (regclass == NO_REGS)
33423 return NO_REGS;
33425 /* All classes can load zeros. */
33426 if (x == CONST0_RTX (mode))
33427 return regclass;
33429 /* Force constants into memory if we are loading a (nonzero) constant into
33430 an MMX or SSE register. This is because there are no MMX/SSE instructions
33431 to load from a constant. */
33432 if (CONSTANT_P (x)
33433 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33434 return NO_REGS;
33436 /* Prefer SSE regs only, if we can use them for math. */
33437 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33438 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33440 /* Floating-point constants need more complex checks. */
33441 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33443 /* General regs can load everything. */
33444 if (reg_class_subset_p (regclass, GENERAL_REGS))
33445 return regclass;
33447 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33448 zero above. We only want to wind up preferring 80387 registers if
33449 we plan on doing computation with them. */
33450 if (TARGET_80387
33451 && standard_80387_constant_p (x) > 0)
33453 /* Limit class to non-sse. */
33454 if (regclass == FLOAT_SSE_REGS)
33455 return FLOAT_REGS;
33456 if (regclass == FP_TOP_SSE_REGS)
33457 return FP_TOP_REG;
33458 if (regclass == FP_SECOND_SSE_REGS)
33459 return FP_SECOND_REG;
33460 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33461 return regclass;
33464 return NO_REGS;
33467 /* Generally when we see PLUS here, it's the function invariant
33468 (plus soft-fp const_int). Which can only be computed into general
33469 regs. */
33470 if (GET_CODE (x) == PLUS)
33471 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33473 /* QImode constants are easy to load, but non-constant QImode data
33474 must go into Q_REGS. */
33475 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33477 if (reg_class_subset_p (regclass, Q_REGS))
33478 return regclass;
33479 if (reg_class_subset_p (Q_REGS, regclass))
33480 return Q_REGS;
33481 return NO_REGS;
33484 return regclass;
33487 /* Discourage putting floating-point values in SSE registers unless
33488 SSE math is being used, and likewise for the 387 registers. */
33489 static reg_class_t
33490 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33492 enum machine_mode mode = GET_MODE (x);
33494 /* Restrict the output reload class to the register bank that we are doing
33495 math on. If we would like not to return a subset of CLASS, reject this
33496 alternative: if reload cannot do this, it will still use its choice. */
33497 mode = GET_MODE (x);
33498 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33499 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33501 if (X87_FLOAT_MODE_P (mode))
33503 if (regclass == FP_TOP_SSE_REGS)
33504 return FP_TOP_REG;
33505 else if (regclass == FP_SECOND_SSE_REGS)
33506 return FP_SECOND_REG;
33507 else
33508 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33511 return regclass;
33514 static reg_class_t
33515 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33516 enum machine_mode mode, secondary_reload_info *sri)
33518 /* Double-word spills from general registers to non-offsettable memory
33519 references (zero-extended addresses) require special handling. */
33520 if (TARGET_64BIT
33521 && MEM_P (x)
33522 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33523 && rclass == GENERAL_REGS
33524 && !offsettable_memref_p (x))
33526 sri->icode = (in_p
33527 ? CODE_FOR_reload_noff_load
33528 : CODE_FOR_reload_noff_store);
33529 /* Add the cost of moving address to a temporary. */
33530 sri->extra_cost = 1;
33532 return NO_REGS;
33535 /* QImode spills from non-QI registers require an
33536 intermediate register on 32-bit targets. */
33537 if (!TARGET_64BIT
33538 && !in_p && mode == QImode
33539 && (rclass == GENERAL_REGS
33540 || rclass == LEGACY_REGS
33541 || rclass == NON_Q_REGS
33542 || rclass == SIREG
33543 || rclass == DIREG
33544 || rclass == INDEX_REGS))
33546 int regno;
33548 if (REG_P (x))
33549 regno = REGNO (x);
33550 else
33551 regno = -1;
33553 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33554 regno = true_regnum (x);
33556 /* Return Q_REGS if the operand is in memory. */
33557 if (regno == -1)
33558 return Q_REGS;
33561 /* This condition handles the corner case where an expression involving
33562 pointers gets vectorized. We're trying to use the address of a
33563 stack slot as a vector initializer.
33565 (set (reg:V2DI 74 [ vect_cst_.2 ])
33566 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33568 Eventually frame gets turned into sp+offset like this:
33570 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33571 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33572 (const_int 392 [0x188]))))
33574 That later gets turned into:
33576 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33577 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33578 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33580 We'll have the following reload recorded:
33582 Reload 0: reload_in (DI) =
33583 (plus:DI (reg/f:DI 7 sp)
33584 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33585 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33586 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33587 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33588 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33589 reload_reg_rtx: (reg:V2DI 22 xmm1)
33591 Which isn't going to work since SSE instructions can't handle scalar
33592 additions. Returning GENERAL_REGS forces the addition into integer
33593 register and reload can handle subsequent reloads without problems. */
33595 if (in_p && GET_CODE (x) == PLUS
33596 && SSE_CLASS_P (rclass)
33597 && SCALAR_INT_MODE_P (mode))
33598 return GENERAL_REGS;
33600 return NO_REGS;
33603 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33605 static bool
33606 ix86_class_likely_spilled_p (reg_class_t rclass)
33608 switch (rclass)
33610 case AREG:
33611 case DREG:
33612 case CREG:
33613 case BREG:
33614 case AD_REGS:
33615 case SIREG:
33616 case DIREG:
33617 case SSE_FIRST_REG:
33618 case FP_TOP_REG:
33619 case FP_SECOND_REG:
33620 return true;
33622 default:
33623 break;
33626 return false;
33629 /* If we are copying between general and FP registers, we need a memory
33630 location. The same is true for SSE and MMX registers.
33632 To optimize register_move_cost performance, allow an inline variant.
33634 The macro can't work reliably when one of the CLASSES is a class containing
33635 registers from multiple units (SSE, MMX, integer). We avoid this by never
33636 combining those units in a single alternative in the machine description.
33637 Ensure that this constraint holds to avoid unexpected surprises.
33639 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33640 enforce these sanity checks. */
33642 static inline bool
33643 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33644 enum machine_mode mode, int strict)
33646 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33647 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33648 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33649 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33650 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33651 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33653 gcc_assert (!strict || lra_in_progress);
33654 return true;
33657 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33658 return true;
33660 /* ??? This is a lie. We do have moves between mmx/general, and for
33661 mmx/sse2. But by saying we need secondary memory we discourage the
33662 register allocator from using the mmx registers unless needed. */
33663 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33664 return true;
33666 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33668 /* SSE1 doesn't have any direct moves from other classes. */
33669 if (!TARGET_SSE2)
33670 return true;
33672 /* If the target says that inter-unit moves are more expensive
33673 than moving through memory, then don't generate them. */
33674 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
33675 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
33676 return true;
33678 /* Between SSE and general, we have moves no larger than word size. */
33679 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33680 return true;
33683 return false;
33686 bool
33687 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33688 enum machine_mode mode, int strict)
33690 return inline_secondary_memory_needed (class1, class2, mode, strict);
33693 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33695 On the 80386, this is the size of MODE in words,
33696 except in the FP regs, where a single reg is always enough. */
33698 static unsigned char
33699 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33701 if (MAYBE_INTEGER_CLASS_P (rclass))
33703 if (mode == XFmode)
33704 return (TARGET_64BIT ? 2 : 3);
33705 else if (mode == XCmode)
33706 return (TARGET_64BIT ? 4 : 6);
33707 else
33708 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33710 else
33712 if (COMPLEX_MODE_P (mode))
33713 return 2;
33714 else
33715 return 1;
33719 /* Return true if the registers in CLASS cannot represent the change from
33720 modes FROM to TO. */
33722 bool
33723 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33724 enum reg_class regclass)
33726 if (from == to)
33727 return false;
33729 /* x87 registers can't do subreg at all, as all values are reformatted
33730 to extended precision. */
33731 if (MAYBE_FLOAT_CLASS_P (regclass))
33732 return true;
33734 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33736 /* Vector registers do not support QI or HImode loads. If we don't
33737 disallow a change to these modes, reload will assume it's ok to
33738 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33739 the vec_dupv4hi pattern. */
33740 if (GET_MODE_SIZE (from) < 4)
33741 return true;
33743 /* Vector registers do not support subreg with nonzero offsets, which
33744 are otherwise valid for integer registers. Since we can't see
33745 whether we have a nonzero offset from here, prohibit all
33746 nonparadoxical subregs changing size. */
33747 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33748 return true;
33751 return false;
33754 /* Return the cost of moving data of mode M between a
33755 register and memory. A value of 2 is the default; this cost is
33756 relative to those in `REGISTER_MOVE_COST'.
33758 This function is used extensively by register_move_cost, which is used to
33759 build tables at startup. Make it inline in this case.
33760 When IN is 2, return maximum of in and out move cost.
33762 If moving between registers and memory is more expensive than
33763 between two registers, you should define this macro to express the
33764 relative cost.
33766 Also model the increased cost of moving QImode registers in non
33767 Q_REGS classes. */
33769 static inline int
33770 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33771 int in)
33773 int cost;
33774 if (FLOAT_CLASS_P (regclass))
33776 int index;
33777 switch (mode)
33779 case SFmode:
33780 index = 0;
33781 break;
33782 case DFmode:
33783 index = 1;
33784 break;
33785 case XFmode:
33786 index = 2;
33787 break;
33788 default:
33789 return 100;
33791 if (in == 2)
33792 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33793 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33795 if (SSE_CLASS_P (regclass))
33797 int index;
33798 switch (GET_MODE_SIZE (mode))
33800 case 4:
33801 index = 0;
33802 break;
33803 case 8:
33804 index = 1;
33805 break;
33806 case 16:
33807 index = 2;
33808 break;
33809 default:
33810 return 100;
33812 if (in == 2)
33813 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33814 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33816 if (MMX_CLASS_P (regclass))
33818 int index;
33819 switch (GET_MODE_SIZE (mode))
33821 case 4:
33822 index = 0;
33823 break;
33824 case 8:
33825 index = 1;
33826 break;
33827 default:
33828 return 100;
33830 if (in == 2)
33831 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33832 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33834 switch (GET_MODE_SIZE (mode))
33836 case 1:
33837 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33839 if (!in)
33840 return ix86_cost->int_store[0];
33841 if (TARGET_PARTIAL_REG_DEPENDENCY
33842 && optimize_function_for_speed_p (cfun))
33843 cost = ix86_cost->movzbl_load;
33844 else
33845 cost = ix86_cost->int_load[0];
33846 if (in == 2)
33847 return MAX (cost, ix86_cost->int_store[0]);
33848 return cost;
33850 else
33852 if (in == 2)
33853 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33854 if (in)
33855 return ix86_cost->movzbl_load;
33856 else
33857 return ix86_cost->int_store[0] + 4;
33859 break;
33860 case 2:
33861 if (in == 2)
33862 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33863 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33864 default:
33865 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
33866 if (mode == TFmode)
33867 mode = XFmode;
33868 if (in == 2)
33869 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33870 else if (in)
33871 cost = ix86_cost->int_load[2];
33872 else
33873 cost = ix86_cost->int_store[2];
33874 return (cost * (((int) GET_MODE_SIZE (mode)
33875 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33879 static int
33880 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33881 bool in)
33883 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33887 /* Return the cost of moving data from a register in class CLASS1 to
33888 one in class CLASS2.
33890 It is not required that the cost always equal 2 when FROM is the same as TO;
33891 on some machines it is expensive to move between registers if they are not
33892 general registers. */
33894 static int
33895 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33896 reg_class_t class2_i)
33898 enum reg_class class1 = (enum reg_class) class1_i;
33899 enum reg_class class2 = (enum reg_class) class2_i;
33901 /* In case we require secondary memory, compute cost of the store followed
33902 by load. In order to avoid bad register allocation choices, we need
33903 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33905 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33907 int cost = 1;
33909 cost += inline_memory_move_cost (mode, class1, 2);
33910 cost += inline_memory_move_cost (mode, class2, 2);
33912 /* In case of copying from a general purpose register we may emit multiple
33913 stores followed by a single load, causing a memory size mismatch stall.
33914 Count this as an arbitrarily high cost of 20. */
33915 if (targetm.class_max_nregs (class1, mode)
33916 > targetm.class_max_nregs (class2, mode))
33917 cost += 20;
33919 /* In the case of FP/MMX moves, the registers actually overlap, and we
33920 have to switch modes in order to treat them differently. */
33921 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33922 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33923 cost += 20;
33925 return cost;
33928 /* Moves between SSE/MMX and integer unit are expensive. */
33929 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33930 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33932 /* ??? By keeping the returned value relatively high, we limit the number
33933 of moves between integer and MMX/SSE registers for all targets.
33934 Additionally, a high value prevents problems with x86_modes_tieable_p(),
33935 where integer modes in MMX/SSE registers are not tieable
33936 because of missing QImode and HImode moves to, from or between
33937 MMX/SSE registers. */
33938 return MAX (8, ix86_cost->mmxsse_to_integer);
33940 if (MAYBE_FLOAT_CLASS_P (class1))
33941 return ix86_cost->fp_move;
33942 if (MAYBE_SSE_CLASS_P (class1))
33943 return ix86_cost->sse_move;
33944 if (MAYBE_MMX_CLASS_P (class1))
33945 return ix86_cost->mmx_move;
33946 return 2;
33949 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33950 MODE. */
33952 bool
33953 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33955 /* Flags, and only flags, can hold CCmode values. */
33956 if (CC_REGNO_P (regno))
33957 return GET_MODE_CLASS (mode) == MODE_CC;
33958 if (GET_MODE_CLASS (mode) == MODE_CC
33959 || GET_MODE_CLASS (mode) == MODE_RANDOM
33960 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33961 return false;
33962 if (STACK_REGNO_P (regno))
33963 return VALID_FP_MODE_P (mode);
33964 if (SSE_REGNO_P (regno))
33966 /* We implement the move patterns for all vector modes into and
33967 out of SSE registers, even when no operation instructions
33968 are available. OImode move is available only when AVX is
33969 enabled. */
33970 return ((TARGET_AVX && mode == OImode)
33971 || VALID_AVX256_REG_MODE (mode)
33972 || VALID_SSE_REG_MODE (mode)
33973 || VALID_SSE2_REG_MODE (mode)
33974 || VALID_MMX_REG_MODE (mode)
33975 || VALID_MMX_REG_MODE_3DNOW (mode));
33977 if (MMX_REGNO_P (regno))
33979 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33980 so if the register is available at all, then we can move data of
33981 the given mode into or out of it. */
33982 return (VALID_MMX_REG_MODE (mode)
33983 || VALID_MMX_REG_MODE_3DNOW (mode));
33986 if (mode == QImode)
33988 /* Take care with QImode values - they can be in non-QI regs,
33989 but then they do cause partial register stalls. */
33990 if (ANY_QI_REGNO_P (regno))
33991 return true;
33992 if (!TARGET_PARTIAL_REG_STALL)
33993 return true;
33994 /* LRA checks if the hard register is OK for the given mode.
33995 QImode values can live in non-QI regs, so we allow all
33996 registers here. */
33997 if (lra_in_progress)
33998 return true;
33999 return !can_create_pseudo_p ();
34001 /* We handle both integer and floats in the general purpose registers. */
34002 else if (VALID_INT_MODE_P (mode))
34003 return true;
34004 else if (VALID_FP_MODE_P (mode))
34005 return true;
34006 else if (VALID_DFP_MODE_P (mode))
34007 return true;
34008 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34009 on to use that value in smaller contexts, this can easily force a
34010 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34011 supporting DImode, allow it. */
34012 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34013 return true;
34015 return false;
34018 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34019 tieable integer mode. */
34021 static bool
34022 ix86_tieable_integer_mode_p (enum machine_mode mode)
34024 switch (mode)
34026 case HImode:
34027 case SImode:
34028 return true;
34030 case QImode:
34031 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34033 case DImode:
34034 return TARGET_64BIT;
34036 default:
34037 return false;
34041 /* Return true if MODE1 is accessible in a register that can hold MODE2
34042 without copying. That is, all register classes that can hold MODE2
34043 can also hold MODE1. */
34045 bool
34046 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34048 if (mode1 == mode2)
34049 return true;
34051 if (ix86_tieable_integer_mode_p (mode1)
34052 && ix86_tieable_integer_mode_p (mode2))
34053 return true;
34055 /* MODE2 being XFmode implies fp stack or general regs, which means we
34056 can tie any smaller floating point modes to it. Note that we do not
34057 tie this with TFmode. */
34058 if (mode2 == XFmode)
34059 return mode1 == SFmode || mode1 == DFmode;
34061 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34062 that we can tie it with SFmode. */
34063 if (mode2 == DFmode)
34064 return mode1 == SFmode;
34066 /* If MODE2 is only appropriate for an SSE register, then tie with
34067 any other mode acceptable to SSE registers. */
34068 if (GET_MODE_SIZE (mode2) == 32
34069 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34070 return (GET_MODE_SIZE (mode1) == 32
34071 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34072 if (GET_MODE_SIZE (mode2) == 16
34073 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34074 return (GET_MODE_SIZE (mode1) == 16
34075 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34077 /* If MODE2 is appropriate for an MMX register, then tie
34078 with any other mode acceptable to MMX registers. */
34079 if (GET_MODE_SIZE (mode2) == 8
34080 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34081 return (GET_MODE_SIZE (mode1) == 8
34082 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34084 return false;
34087 /* Return the cost of moving between two registers of mode MODE. */
34089 static int
34090 ix86_set_reg_reg_cost (enum machine_mode mode)
34092 unsigned int units = UNITS_PER_WORD;
34094 switch (GET_MODE_CLASS (mode))
34096 default:
34097 break;
34099 case MODE_CC:
34100 units = GET_MODE_SIZE (CCmode);
34101 break;
34103 case MODE_FLOAT:
34104 if ((TARGET_SSE && mode == TFmode)
34105 || (TARGET_80387 && mode == XFmode)
34106 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34107 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34108 units = GET_MODE_SIZE (mode);
34109 break;
34111 case MODE_COMPLEX_FLOAT:
34112 if ((TARGET_SSE && mode == TCmode)
34113 || (TARGET_80387 && mode == XCmode)
34114 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34115 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34116 units = GET_MODE_SIZE (mode);
34117 break;
34119 case MODE_VECTOR_INT:
34120 case MODE_VECTOR_FLOAT:
34121 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34122 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34123 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34124 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34125 units = GET_MODE_SIZE (mode);
34128 /* Return the cost of moving between two registers of mode MODE,
34129 assuming that the move will be in pieces of at most UNITS bytes. */
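/* For example, with a 64-bit UNITS_PER_WORD a TImode copy comes out as
   COSTS_N_INSNS (2), while SImode and DImode copies come out as
   COSTS_N_INSNS (1).  */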
34130 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34133 /* Compute a (partial) cost for rtx X. Return true if the complete
34134 cost has been computed, and false if subexpressions should be
34135 scanned. In either case, *TOTAL contains the cost result. */
34137 static bool
34138 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34139 bool speed)
34141 enum rtx_code code = (enum rtx_code) code_i;
34142 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34143 enum machine_mode mode = GET_MODE (x);
34144 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34146 switch (code)
34148 case SET:
34149 if (register_operand (SET_DEST (x), VOIDmode)
34150 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34152 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34153 return true;
34155 return false;
34157 case CONST_INT:
34158 case CONST:
34159 case LABEL_REF:
34160 case SYMBOL_REF:
34161 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34162 *total = 3;
34163 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34164 *total = 2;
34165 else if (flag_pic && SYMBOLIC_CONST (x)
34166 && (!TARGET_64BIT
34167 || (GET_CODE (x) != LABEL_REF
34168 && (GET_CODE (x) != SYMBOL_REF
34169 || !SYMBOL_REF_LOCAL_P (x)))))
34170 *total = 1;
34171 else
34172 *total = 0;
34173 return true;
34175 case CONST_DOUBLE:
34176 if (mode == VOIDmode)
34178 *total = 0;
34179 return true;
34181 switch (standard_80387_constant_p (x))
34183 case 1: /* 0.0 */
34184 *total = 1;
34185 return true;
34186 default: /* Other constants */
34187 *total = 2;
34188 return true;
34189 case 0:
34190 case -1:
34191 break;
34193 if (SSE_FLOAT_MODE_P (mode))
34195 case CONST_VECTOR:
34196 switch (standard_sse_constant_p (x))
34198 case 0:
34199 break;
34200 case 1: /* 0: xor eliminates false dependency */
34201 *total = 0;
34202 return true;
34203 default: /* -1: cmp contains false dependency */
34204 *total = 1;
34205 return true;
34208 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34209 it'll probably end up. Add a penalty for size. */
34210 *total = (COSTS_N_INSNS (1)
34211 + (flag_pic != 0 && !TARGET_64BIT)
34212 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34213 return true;
34215 case ZERO_EXTEND:
34216 /* The zero extension is often completely free on x86_64, so make
34217 it as cheap as possible. */
34218 if (TARGET_64BIT && mode == DImode
34219 && GET_MODE (XEXP (x, 0)) == SImode)
34220 *total = 1;
34221 else if (TARGET_ZERO_EXTEND_WITH_AND)
34222 *total = cost->add;
34223 else
34224 *total = cost->movzx;
34225 return false;
34227 case SIGN_EXTEND:
34228 *total = cost->movsx;
34229 return false;
34231 case ASHIFT:
34232 if (SCALAR_INT_MODE_P (mode)
34233 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34234 && CONST_INT_P (XEXP (x, 1)))
34236 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34237 if (value == 1)
34239 *total = cost->add;
34240 return false;
34242 if ((value == 2 || value == 3)
34243 && cost->lea <= cost->shift_const)
34245 *total = cost->lea;
34246 return false;
34249 /* FALLTHRU */
34251 case ROTATE:
34252 case ASHIFTRT:
34253 case LSHIFTRT:
34254 case ROTATERT:
34255 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34257 /* ??? Should be SSE vector operation cost. */
34258 /* At least for published AMD latencies, this really is the same
34259 as the latency for a simple fpu operation like fabs. */
34260 /* V*QImode is emulated with 1-11 insns. */
34261 if (mode == V16QImode || mode == V32QImode)
34263 int count = 11;
34264 if (TARGET_XOP && mode == V16QImode)
34266 /* For XOP we use vpshab, which requires a broadcast of the
34267 value to the variable shift insn. For constants this
34268 means a V16Q const in mem; even when we can perform the
34269 shift with one insn set the cost to prefer paddb. */
34270 if (CONSTANT_P (XEXP (x, 1)))
34272 *total = (cost->fabs
34273 + rtx_cost (XEXP (x, 0), code, 0, speed)
34274 + (speed ? 2 : COSTS_N_BYTES (16)));
34275 return true;
34277 count = 3;
34279 else if (TARGET_SSSE3)
34280 count = 7;
34281 *total = cost->fabs * count;
34283 else
34284 *total = cost->fabs;
34286 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34288 if (CONST_INT_P (XEXP (x, 1)))
34290 if (INTVAL (XEXP (x, 1)) > 32)
34291 *total = cost->shift_const + COSTS_N_INSNS (2);
34292 else
34293 *total = cost->shift_const * 2;
34295 else
34297 if (GET_CODE (XEXP (x, 1)) == AND)
34298 *total = cost->shift_var * 2;
34299 else
34300 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34303 else
34305 if (CONST_INT_P (XEXP (x, 1)))
34306 *total = cost->shift_const;
34307 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34308 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34310 /* Return the cost after shift-and truncation. */
34311 *total = cost->shift_var;
34312 return true;
34314 else
34315 *total = cost->shift_var;
34317 return false;
34319 case FMA:
34321 rtx sub;
34323 gcc_assert (FLOAT_MODE_P (mode));
34324 gcc_assert (TARGET_FMA || TARGET_FMA4);
34326 /* ??? SSE scalar/vector cost should be used here. */
34327 /* ??? Bald assumption that fma has the same cost as fmul. */
34328 *total = cost->fmul;
34329 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34331 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34332 sub = XEXP (x, 0);
34333 if (GET_CODE (sub) == NEG)
34334 sub = XEXP (sub, 0);
34335 *total += rtx_cost (sub, FMA, 0, speed);
34337 sub = XEXP (x, 2);
34338 if (GET_CODE (sub) == NEG)
34339 sub = XEXP (sub, 0);
34340 *total += rtx_cost (sub, FMA, 2, speed);
34341 return true;
34344 case MULT:
34345 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34347 /* ??? SSE scalar cost should be used here. */
34348 *total = cost->fmul;
34349 return false;
34351 else if (X87_FLOAT_MODE_P (mode))
34353 *total = cost->fmul;
34354 return false;
34356 else if (FLOAT_MODE_P (mode))
34358 /* ??? SSE vector cost should be used here. */
34359 *total = cost->fmul;
34360 return false;
34362 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34364 /* V*QImode is emulated with 7-13 insns. */
34365 if (mode == V16QImode || mode == V32QImode)
34367 int extra = 11;
34368 if (TARGET_XOP && mode == V16QImode)
34369 extra = 5;
34370 else if (TARGET_SSSE3)
34371 extra = 6;
34372 *total = cost->fmul * 2 + cost->fabs * extra;
34374 /* V*DImode is emulated with 5-8 insns. */
34375 else if (mode == V2DImode || mode == V4DImode)
34377 if (TARGET_XOP && mode == V2DImode)
34378 *total = cost->fmul * 2 + cost->fabs * 3;
34379 else
34380 *total = cost->fmul * 3 + cost->fabs * 5;
34382 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34383 insns, including two PMULUDQ. */
34384 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34385 *total = cost->fmul * 2 + cost->fabs * 5;
34386 else
34387 *total = cost->fmul;
34388 return false;
34390 else
34392 rtx op0 = XEXP (x, 0);
34393 rtx op1 = XEXP (x, 1);
34394 int nbits;
34395 if (CONST_INT_P (XEXP (x, 1)))
34397 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34398 for (nbits = 0; value != 0; value &= value - 1)
34399 nbits++;
34401 else
34402 /* This is arbitrary. */
34403 nbits = 7;
34405 /* Compute costs correctly for widening multiplication. */
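/* For example, (mult:DI (sign_extend:DI (reg:SI)) (sign_extend:DI (reg:SI)))
   is a single widening multiply, so the cost below is computed for the
   narrower SImode rather than for DImode.  */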
34406 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34407 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34408 == GET_MODE_SIZE (mode))
34410 int is_mulwiden = 0;
34411 enum machine_mode inner_mode = GET_MODE (op0);
34413 if (GET_CODE (op0) == GET_CODE (op1))
34414 is_mulwiden = 1, op1 = XEXP (op1, 0);
34415 else if (CONST_INT_P (op1))
34417 if (GET_CODE (op0) == SIGN_EXTEND)
34418 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34419 == INTVAL (op1);
34420 else
34421 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34424 if (is_mulwiden)
34425 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34428 *total = (cost->mult_init[MODE_INDEX (mode)]
34429 + nbits * cost->mult_bit
34430 + rtx_cost (op0, outer_code, opno, speed)
34431 + rtx_cost (op1, outer_code, opno, speed));
34433 return true;
34436 case DIV:
34437 case UDIV:
34438 case MOD:
34439 case UMOD:
34440 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34441 /* ??? SSE cost should be used here. */
34442 *total = cost->fdiv;
34443 else if (X87_FLOAT_MODE_P (mode))
34444 *total = cost->fdiv;
34445 else if (FLOAT_MODE_P (mode))
34446 /* ??? SSE vector cost should be used here. */
34447 *total = cost->fdiv;
34448 else
34449 *total = cost->divide[MODE_INDEX (mode)];
34450 return false;
34452 case PLUS:
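/* Address-like sums such as
   (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 12)),
   i.e. base + index*scale + displacement, can be synthesized by a
   single lea; the cases below cost them accordingly.  */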
34453 if (GET_MODE_CLASS (mode) == MODE_INT
34454 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34456 if (GET_CODE (XEXP (x, 0)) == PLUS
34457 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34458 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34459 && CONSTANT_P (XEXP (x, 1)))
34461 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34462 if (val == 2 || val == 4 || val == 8)
34464 *total = cost->lea;
34465 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34466 outer_code, opno, speed);
34467 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34468 outer_code, opno, speed);
34469 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34470 return true;
34473 else if (GET_CODE (XEXP (x, 0)) == MULT
34474 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34476 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34477 if (val == 2 || val == 4 || val == 8)
34479 *total = cost->lea;
34480 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34481 outer_code, opno, speed);
34482 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34483 return true;
34486 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34488 *total = cost->lea;
34489 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34490 outer_code, opno, speed);
34491 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34492 outer_code, opno, speed);
34493 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34494 return true;
34497 /* FALLTHRU */
34499 case MINUS:
34500 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34502 /* ??? SSE cost should be used here. */
34503 *total = cost->fadd;
34504 return false;
34506 else if (X87_FLOAT_MODE_P (mode))
34508 *total = cost->fadd;
34509 return false;
34511 else if (FLOAT_MODE_P (mode))
34513 /* ??? SSE vector cost should be used here. */
34514 *total = cost->fadd;
34515 return false;
34517 /* FALLTHRU */
34519 case AND:
34520 case IOR:
34521 case XOR:
34522 if (GET_MODE_CLASS (mode) == MODE_INT
34523 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34525 *total = (cost->add * 2
34526 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34527 << (GET_MODE (XEXP (x, 0)) != DImode))
34528 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34529 << (GET_MODE (XEXP (x, 1)) != DImode)));
34530 return true;
34532 /* FALLTHRU */
34534 case NEG:
34535 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34537 /* ??? SSE cost should be used here. */
34538 *total = cost->fchs;
34539 return false;
34541 else if (X87_FLOAT_MODE_P (mode))
34543 *total = cost->fchs;
34544 return false;
34546 else if (FLOAT_MODE_P (mode))
34548 /* ??? SSE vector cost should be used here. */
34549 *total = cost->fchs;
34550 return false;
34552 /* FALLTHRU */
34554 case NOT:
34555 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34557 /* ??? Should be SSE vector operation cost. */
34558 /* At least for published AMD latencies, this really is the same
34559 as the latency for a simple fpu operation like fabs. */
34560 *total = cost->fabs;
34562 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34563 *total = cost->add * 2;
34564 else
34565 *total = cost->add;
34566 return false;
34568 case COMPARE:
34569 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34570 && XEXP (XEXP (x, 0), 1) == const1_rtx
34571 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34572 && XEXP (x, 1) == const0_rtx)
34574 /* This kind of construct is implemented using test[bwl].
34575 Treat it as if we had an AND. */
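/* E.g. (compare (zero_extract x (const_int 1) (const_int 3)) (const_int 0))
   tests a single bit and can be emitted as "testb $8, x" (or testw/testl
   for wider operands), hence the AND-like cost below.  */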
34576 *total = (cost->add
34577 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34578 + rtx_cost (const1_rtx, outer_code, opno, speed));
34579 return true;
34581 return false;
34583 case FLOAT_EXTEND:
34584 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34585 *total = 0;
34586 return false;
34588 case ABS:
34589 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34590 /* ??? SSE cost should be used here. */
34591 *total = cost->fabs;
34592 else if (X87_FLOAT_MODE_P (mode))
34593 *total = cost->fabs;
34594 else if (FLOAT_MODE_P (mode))
34595 /* ??? SSE vector cost should be used here. */
34596 *total = cost->fabs;
34597 return false;
34599 case SQRT:
34600 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34601 /* ??? SSE cost should be used here. */
34602 *total = cost->fsqrt;
34603 else if (X87_FLOAT_MODE_P (mode))
34604 *total = cost->fsqrt;
34605 else if (FLOAT_MODE_P (mode))
34606 /* ??? SSE vector cost should be used here. */
34607 *total = cost->fsqrt;
34608 return false;
34610 case UNSPEC:
34611 if (XINT (x, 1) == UNSPEC_TP)
34612 *total = 0;
34613 return false;
34615 case VEC_SELECT:
34616 case VEC_CONCAT:
34617 case VEC_MERGE:
34618 case VEC_DUPLICATE:
34619 /* ??? Assume all of these vector manipulation patterns are
34620 recognizable, in which case they all pretty much have the
34621 same cost. */
34622 *total = cost->fabs;
34623 return true;
34625 default:
34626 return false;
34630 #if TARGET_MACHO
34632 static int current_machopic_label_num;
34634 /* Given a symbol name and its associated stub, write out the
34635 definition of the stub. */
34637 void
34638 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34640 unsigned int length;
34641 char *binder_name, *symbol_name, lazy_ptr_name[32];
34642 int label = ++current_machopic_label_num;
34644 /* For 64-bit we shouldn't get here. */
34645 gcc_assert (!TARGET_64BIT);
34647 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34648 symb = targetm.strip_name_encoding (symb);
34650 length = strlen (stub);
34651 binder_name = XALLOCAVEC (char, length + 32);
34652 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34654 length = strlen (symb);
34655 symbol_name = XALLOCAVEC (char, length + 32);
34656 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34658 sprintf (lazy_ptr_name, "L%d$lz", label);
34660 if (MACHOPIC_ATT_STUB)
34661 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34662 else if (MACHOPIC_PURE)
34663 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34664 else
34665 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34667 fprintf (file, "%s:\n", stub);
34668 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34670 if (MACHOPIC_ATT_STUB)
34672 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34674 else if (MACHOPIC_PURE)
34676 /* PIC stub. */
34677 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34678 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34679 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34680 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34681 label, lazy_ptr_name, label);
34682 fprintf (file, "\tjmp\t*%%ecx\n");
34684 else
34685 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34687 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34688 it needs no stub-binding-helper. */
34689 if (MACHOPIC_ATT_STUB)
34690 return;
34692 fprintf (file, "%s:\n", binder_name);
34694 if (MACHOPIC_PURE)
34696 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34697 fprintf (file, "\tpushl\t%%ecx\n");
34699 else
34700 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34702 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34704 /* N.B. Keep the correspondence of these
34705 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34706 old-pic/new-pic/non-pic stubs; altering this will break
34707 compatibility with existing dylibs. */
34708 if (MACHOPIC_PURE)
34710 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34711 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34713 else
34714 /* 16-byte -mdynamic-no-pic stub. */
34715 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34717 fprintf (file, "%s:\n", lazy_ptr_name);
34718 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34719 fprintf (file, ASM_LONG "%s\n", binder_name);
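/* For reference, in the MACHOPIC_PURE case the code above emits roughly
   the following (schematic; N is the machopic label number, and the
   get_pc_thunk call comes from output_set_got):

	<stub>:
		.indirect_symbol <symbol>
		call	___<cpu>.get_pc_thunk.cx
	LPC$N:	movl	L<N>$lz-LPC$N(%ecx),%ecx
		jmp	*%ecx
	<binder>:
		lea	L<N>$lz-<binder>(%ecx),%ecx
		pushl	%ecx
		jmp	dyld_stub_binding_helper
	L<N>$lz:
		.indirect_symbol <symbol>
		.long	<binder>
*/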
34721 #endif /* TARGET_MACHO */
34723 /* Order the registers for register allocator. */
34725 void
34726 x86_order_regs_for_local_alloc (void)
34728 int pos = 0;
34729 int i;
34731 /* First allocate the local general purpose registers. */
34732 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34733 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34734 reg_alloc_order [pos++] = i;
34736 /* Global general purpose registers. */
34737 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34738 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34739 reg_alloc_order [pos++] = i;
34741 /* x87 registers come first in case we are doing FP math
34742 using them. */
34743 if (!TARGET_SSE_MATH)
34744 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34745 reg_alloc_order [pos++] = i;
34747 /* SSE registers. */
34748 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34749 reg_alloc_order [pos++] = i;
34750 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34751 reg_alloc_order [pos++] = i;
34753 /* x87 registers. */
34754 if (TARGET_SSE_MATH)
34755 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34756 reg_alloc_order [pos++] = i;
34758 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34759 reg_alloc_order [pos++] = i;
34761 /* Initialize the rest of the array, as we do not allocate some registers
34762 at all. */
34763 while (pos < FIRST_PSEUDO_REGISTER)
34764 reg_alloc_order [pos++] = 0;
34767 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34768 in struct attribute_spec.handler. */
34769 static tree
34770 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34771 tree args,
34772 int flags ATTRIBUTE_UNUSED,
34773 bool *no_add_attrs)
34775 if (TREE_CODE (*node) != FUNCTION_TYPE
34776 && TREE_CODE (*node) != METHOD_TYPE
34777 && TREE_CODE (*node) != FIELD_DECL
34778 && TREE_CODE (*node) != TYPE_DECL)
34780 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34781 name);
34782 *no_add_attrs = true;
34783 return NULL_TREE;
34785 if (TARGET_64BIT)
34787 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34788 name);
34789 *no_add_attrs = true;
34790 return NULL_TREE;
34792 if (is_attribute_p ("callee_pop_aggregate_return", name))
34794 tree cst;
34796 cst = TREE_VALUE (args);
34797 if (TREE_CODE (cst) != INTEGER_CST)
34799 warning (OPT_Wattributes,
34800 "%qE attribute requires an integer constant argument",
34801 name);
34802 *no_add_attrs = true;
34804 else if (compare_tree_int (cst, 0) != 0
34805 && compare_tree_int (cst, 1) != 0)
34807 warning (OPT_Wattributes,
34808 "argument to %qE attribute is neither zero, nor one",
34809 name);
34810 *no_add_attrs = true;
34813 return NULL_TREE;
34816 return NULL_TREE;
34819 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34820 struct attribute_spec.handler. */
34821 static tree
34822 ix86_handle_abi_attribute (tree *node, tree name,
34823 tree args ATTRIBUTE_UNUSED,
34824 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34826 if (TREE_CODE (*node) != FUNCTION_TYPE
34827 && TREE_CODE (*node) != METHOD_TYPE
34828 && TREE_CODE (*node) != FIELD_DECL
34829 && TREE_CODE (*node) != TYPE_DECL)
34831 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34832 name);
34833 *no_add_attrs = true;
34834 return NULL_TREE;
34837 /* Can combine regparm with all attributes but fastcall. */
34838 if (is_attribute_p ("ms_abi", name))
34840 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34842 error ("ms_abi and sysv_abi attributes are not compatible");
34845 return NULL_TREE;
34847 else if (is_attribute_p ("sysv_abi", name))
34849 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34851 error ("ms_abi and sysv_abi attributes are not compatible");
34854 return NULL_TREE;
34857 return NULL_TREE;
34860 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34861 struct attribute_spec.handler. */
34862 static tree
34863 ix86_handle_struct_attribute (tree *node, tree name,
34864 tree args ATTRIBUTE_UNUSED,
34865 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34867 tree *type = NULL;
34868 if (DECL_P (*node))
34870 if (TREE_CODE (*node) == TYPE_DECL)
34871 type = &TREE_TYPE (*node);
34873 else
34874 type = node;
34876 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34878 warning (OPT_Wattributes, "%qE attribute ignored",
34879 name);
34880 *no_add_attrs = true;
34883 else if ((is_attribute_p ("ms_struct", name)
34884 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34885 || ((is_attribute_p ("gcc_struct", name)
34886 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34888 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34889 name);
34890 *no_add_attrs = true;
34893 return NULL_TREE;
34896 static tree
34897 ix86_handle_fndecl_attribute (tree *node, tree name,
34898 tree args ATTRIBUTE_UNUSED,
34899 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34901 if (TREE_CODE (*node) != FUNCTION_DECL)
34903 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34904 name);
34905 *no_add_attrs = true;
34907 return NULL_TREE;
34910 static bool
34911 ix86_ms_bitfield_layout_p (const_tree record_type)
34913 return ((TARGET_MS_BITFIELD_LAYOUT
34914 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34915 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34918 /* Returns an expression indicating where the this parameter is
34919 located on entry to the FUNCTION. */
34921 static rtx
34922 x86_this_parameter (tree function)
34924 tree type = TREE_TYPE (function);
34925 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34926 int nregs;
34928 if (TARGET_64BIT)
34930 const int *parm_regs;
34932 if (ix86_function_type_abi (type) == MS_ABI)
34933 parm_regs = x86_64_ms_abi_int_parameter_registers;
34934 else
34935 parm_regs = x86_64_int_parameter_registers;
34936 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34939 nregs = ix86_function_regparm (type, function);
34941 if (nregs > 0 && !stdarg_p (type))
34943 int regno;
34944 unsigned int ccvt = ix86_get_callcvt (type);
34946 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34947 regno = aggr ? DX_REG : CX_REG;
34948 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34950 regno = CX_REG;
34951 if (aggr)
34952 return gen_rtx_MEM (SImode,
34953 plus_constant (Pmode, stack_pointer_rtx, 4));
34955 else
34957 regno = AX_REG;
34958 if (aggr)
34960 regno = DX_REG;
34961 if (nregs == 1)
34962 return gen_rtx_MEM (SImode,
34963 plus_constant (Pmode,
34964 stack_pointer_rtx, 4));
34967 return gen_rtx_REG (SImode, regno);
34970 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34971 aggr ? 8 : 4));
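/* Summarizing the cases above: in 64-bit mode `this' arrives in the first
   integer parameter register, or the second one when a hidden return
   pointer is passed first (AGGR).  In 32-bit mode it is %ecx for fastcall
   (%edx with a hidden return pointer), %ecx for thiscall (4(%esp) with a
   hidden return pointer), %eax or %edx for regparm functions, and
   otherwise the stack slot at 4(%esp) or 8(%esp).  */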
34974 /* Determine whether x86_output_mi_thunk can succeed. */
34976 static bool
34977 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34978 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34979 HOST_WIDE_INT vcall_offset, const_tree function)
34981 /* 64-bit can handle anything. */
34982 if (TARGET_64BIT)
34983 return true;
34985 /* For 32-bit, everything's fine if we have one free register. */
34986 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34987 return true;
34989 /* Need a free register for vcall_offset. */
34990 if (vcall_offset)
34991 return false;
34993 /* Need a free register for GOT references. */
34994 if (flag_pic && !targetm.binds_local_p (function))
34995 return false;
34997 /* Otherwise ok. */
34998 return true;
35001 /* Output the assembler code for a thunk function. THUNK_DECL is the
35002 declaration for the thunk function itself, FUNCTION is the decl for
35003 the target function. DELTA is an immediate constant offset to be
35004 added to THIS. If VCALL_OFFSET is nonzero, the word at
35005 *(*this + vcall_offset) should be added to THIS. */
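/* Equivalently, the thunk emitted below behaves roughly like this
   illustrative pseudo-code:

	this += DELTA;
	if (VCALL_OFFSET)
	  this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
	goto FUNCTION;		(a tail call)

   i.e. the usual `this'-adjusting thunk, typically needed for virtual
   calls in the presence of multiple inheritance.  */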
35007 static void
35008 x86_output_mi_thunk (FILE *file,
35009 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35010 HOST_WIDE_INT vcall_offset, tree function)
35012 rtx this_param = x86_this_parameter (function);
35013 rtx this_reg, tmp, fnaddr;
35014 unsigned int tmp_regno;
35016 if (TARGET_64BIT)
35017 tmp_regno = R10_REG;
35018 else
35020 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35021 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35022 tmp_regno = AX_REG;
35023 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35024 tmp_regno = DX_REG;
35025 else
35026 tmp_regno = CX_REG;
35029 emit_note (NOTE_INSN_PROLOGUE_END);
35031 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35032 pull it in now and let DELTA benefit. */
35033 if (REG_P (this_param))
35034 this_reg = this_param;
35035 else if (vcall_offset)
35037 /* Put the this parameter into %eax. */
35038 this_reg = gen_rtx_REG (Pmode, AX_REG);
35039 emit_move_insn (this_reg, this_param);
35041 else
35042 this_reg = NULL_RTX;
35044 /* Adjust the this parameter by a fixed constant. */
35045 if (delta)
35047 rtx delta_rtx = GEN_INT (delta);
35048 rtx delta_dst = this_reg ? this_reg : this_param;
35050 if (TARGET_64BIT)
35052 if (!x86_64_general_operand (delta_rtx, Pmode))
35054 tmp = gen_rtx_REG (Pmode, tmp_regno);
35055 emit_move_insn (tmp, delta_rtx);
35056 delta_rtx = tmp;
35060 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35063 /* Adjust the this parameter by a value stored in the vtable. */
35064 if (vcall_offset)
35066 rtx vcall_addr, vcall_mem, this_mem;
35068 tmp = gen_rtx_REG (Pmode, tmp_regno);
35070 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35071 if (Pmode != ptr_mode)
35072 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35073 emit_move_insn (tmp, this_mem);
35075 /* Adjust the this parameter. */
35076 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35077 if (TARGET_64BIT
35078 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35080 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35081 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35082 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35085 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35086 if (Pmode != ptr_mode)
35087 emit_insn (gen_addsi_1_zext (this_reg,
35088 gen_rtx_REG (ptr_mode,
35089 REGNO (this_reg)),
35090 vcall_mem));
35091 else
35092 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35095 /* If necessary, drop THIS back to its stack slot. */
35096 if (this_reg && this_reg != this_param)
35097 emit_move_insn (this_param, this_reg);
35099 fnaddr = XEXP (DECL_RTL (function), 0);
35100 if (TARGET_64BIT)
35102 if (!flag_pic || targetm.binds_local_p (function)
35103 || DEFAULT_ABI == MS_ABI)
35105 else
35107 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35108 tmp = gen_rtx_CONST (Pmode, tmp);
35109 fnaddr = gen_rtx_MEM (Pmode, tmp);
35112 else
35114 if (!flag_pic || targetm.binds_local_p (function))
35116 #if TARGET_MACHO
35117 else if (TARGET_MACHO)
35119 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35120 fnaddr = XEXP (fnaddr, 0);
35122 #endif /* TARGET_MACHO */
35123 else
35125 tmp = gen_rtx_REG (Pmode, CX_REG);
35126 output_set_got (tmp, NULL_RTX);
35128 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35129 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35130 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35134 /* Our sibling call patterns do not allow memories, because we have no
35135 predicate that can distinguish between frame and non-frame memory.
35136 For our purposes here, we can get away with (ab)using a jump pattern,
35137 because we're going to do no optimization. */
35138 if (MEM_P (fnaddr))
35139 emit_jump_insn (gen_indirect_jump (fnaddr));
35140 else
35142 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35143 fnaddr = legitimize_pic_address (fnaddr,
35144 gen_rtx_REG (Pmode, tmp_regno));
35146 if (!sibcall_insn_operand (fnaddr, word_mode))
35148 tmp = gen_rtx_REG (word_mode, tmp_regno);
35149 if (GET_MODE (fnaddr) != word_mode)
35150 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35151 emit_move_insn (tmp, fnaddr);
35152 fnaddr = tmp;
35155 tmp = gen_rtx_MEM (QImode, fnaddr);
35156 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35157 tmp = emit_call_insn (tmp);
35158 SIBLING_CALL_P (tmp) = 1;
35160 emit_barrier ();
35162 /* Emit just enough of rest_of_compilation to get the insns emitted.
35163 Note that use_thunk calls assemble_start_function et al. */
35164 tmp = get_insns ();
35165 shorten_branches (tmp);
35166 final_start_function (tmp, file, 1);
35167 final (tmp, file, 1);
35168 final_end_function ();
35171 static void
35172 x86_file_start (void)
35174 default_file_start ();
35175 #if TARGET_MACHO
35176 darwin_file_start ();
35177 #endif
35178 if (X86_FILE_START_VERSION_DIRECTIVE)
35179 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35180 if (X86_FILE_START_FLTUSED)
35181 fputs ("\t.global\t__fltused\n", asm_out_file);
35182 if (ix86_asm_dialect == ASM_INTEL)
35183 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35187 x86_field_alignment (tree field, int computed)
35189 enum machine_mode mode;
35190 tree type = TREE_TYPE (field);
35192 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35193 return computed;
35194 mode = TYPE_MODE (strip_array_types (type));
35195 if (mode == DFmode || mode == DCmode
35196 || GET_MODE_CLASS (mode) == MODE_INT
35197 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35198 return MIN (32, computed);
35199 return computed;
35202 /* Output assembler code to FILE to increment profiler label # LABELNO
35203 for profiling a function entry. */
35204 void
35205 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35207 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35208 : MCOUNT_NAME);
35210 if (TARGET_64BIT)
35212 #ifndef NO_PROFILE_COUNTERS
35213 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35214 #endif
35216 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35217 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35218 else
35219 fprintf (file, "\tcall\t%s\n", mcount_name);
35221 else if (flag_pic)
35223 #ifndef NO_PROFILE_COUNTERS
35224 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35225 LPREFIX, labelno);
35226 #endif
35227 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35229 else
35231 #ifndef NO_PROFILE_COUNTERS
35232 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35233 LPREFIX, labelno);
35234 #endif
35235 fprintf (file, "\tcall\t%s\n", mcount_name);
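/* For reference, the 64-bit PIC branch above emits

	call	*mcount@GOTPCREL(%rip)

   (preceded by a leaq of the profile counter unless NO_PROFILE_COUNTERS is
   defined), while the other branches emit a direct "call mcount"; the
   actual symbol comes from MCOUNT_NAME, or MCOUNT_NAME_BEFORE_PROLOGUE
   when flag_fentry is set.  */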
35239 /* We don't have exact information about the insn sizes, but we may assume
35240 quite safely that we are informed about all 1 byte insns and memory
35241 address sizes. This is enough to eliminate unnecessary padding in
35242 99% of cases. */
35244 static int
35245 min_insn_size (rtx insn)
35247 int l = 0, len;
35249 if (!INSN_P (insn) || !active_insn_p (insn))
35250 return 0;
35252 /* Discard alignments we've emitted, and jump instructions. */
35253 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35254 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35255 return 0;
35257 /* Important case - calls are always 5 bytes.
35258 It is common to have many calls in a row. */
35259 if (CALL_P (insn)
35260 && symbolic_reference_mentioned_p (PATTERN (insn))
35261 && !SIBLING_CALL_P (insn))
35262 return 5;
35263 len = get_attr_length (insn);
35264 if (len <= 1)
35265 return 1;
35267 /* For normal instructions we rely on get_attr_length being exact,
35268 with a few exceptions. */
35269 if (!JUMP_P (insn))
35271 enum attr_type type = get_attr_type (insn);
35273 switch (type)
35275 case TYPE_MULTI:
35276 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35277 || asm_noperands (PATTERN (insn)) >= 0)
35278 return 0;
35279 break;
35280 case TYPE_OTHER:
35281 case TYPE_FCMP:
35282 break;
35283 default:
35284 /* Otherwise trust get_attr_length. */
35285 return len;
35288 l = get_attr_length_address (insn);
35289 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35290 l = 4;
35292 if (l)
35293 return 1+l;
35294 else
35295 return 2;
35298 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35300 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
35301 16 byte window. */
35303 static void
35304 ix86_avoid_jump_mispredicts (void)
35306 rtx insn, start = get_insns ();
35307 int nbytes = 0, njumps = 0;
35308 int isjump = 0;
35310 /* Look for all minimal intervals of instructions containing 4 jumps.
35311 The intervals are bounded by START and INSN. NBYTES is the total
35312 size of instructions in the interval including INSN and not including
35313 START. When NBYTES is smaller than 16 bytes, it is possible
35314 that the end of START and INSN end up in the same 16 byte page.
35316 The smallest offset in the page at which INSN can start is the case where
35317 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35318 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN). */
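/* Worked example: if the interval from START to INSN currently spans
   NBYTES = 12 bytes and INSN itself is estimated at 3 bytes, the pad
   emitted below is 15 - 12 + 3 = 6 bytes, which pushes INSN out of the
   16 byte window that still contains the end of START.  */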
35320 for (insn = start; insn; insn = NEXT_INSN (insn))
35322 int min_size;
35324 if (LABEL_P (insn))
35326 int align = label_to_alignment (insn);
35327 int max_skip = label_to_max_skip (insn);
35329 if (max_skip > 15)
35330 max_skip = 15;
35331 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35332 already in the current 16 byte page, because otherwise
35333 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35334 bytes to reach 16 byte boundary. */
35335 if (align <= 0
35336 || (align <= 3 && max_skip != (1 << align) - 1))
35337 max_skip = 0;
35338 if (dump_file)
35339 fprintf (dump_file, "Label %i with max_skip %i\n",
35340 INSN_UID (insn), max_skip);
35341 if (max_skip)
35343 while (nbytes + max_skip >= 16)
35345 start = NEXT_INSN (start);
35346 if (JUMP_P (start) || CALL_P (start))
35347 njumps--, isjump = 1;
35348 else
35349 isjump = 0;
35350 nbytes -= min_insn_size (start);
35353 continue;
35356 min_size = min_insn_size (insn);
35357 nbytes += min_size;
35358 if (dump_file)
35359 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35360 INSN_UID (insn), min_size);
35361 if (JUMP_P (insn) || CALL_P (insn))
35362 njumps++;
35363 else
35364 continue;
35366 while (njumps > 3)
35368 start = NEXT_INSN (start);
35369 if (JUMP_P (start) || CALL_P (start))
35370 njumps--, isjump = 1;
35371 else
35372 isjump = 0;
35373 nbytes -= min_insn_size (start);
35375 gcc_assert (njumps >= 0);
35376 if (dump_file)
35377 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35378 INSN_UID (start), INSN_UID (insn), nbytes);
35380 if (njumps == 3 && isjump && nbytes < 16)
35382 int padsize = 15 - nbytes + min_insn_size (insn);
35384 if (dump_file)
35385 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35386 INSN_UID (insn), padsize);
35387 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35391 #endif
35393 /* AMD Athlon works faster
35394 when RET is not the destination of a conditional jump or directly preceded
35395 by another jump instruction. We avoid the penalty by inserting a NOP just
35396 before the RET instruction in such cases. */
35397 static void
35398 ix86_pad_returns (void)
35400 edge e;
35401 edge_iterator ei;
35403 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35405 basic_block bb = e->src;
35406 rtx ret = BB_END (bb);
35407 rtx prev;
35408 bool replace = false;
35410 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35411 || optimize_bb_for_size_p (bb))
35412 continue;
35413 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35414 if (active_insn_p (prev) || LABEL_P (prev))
35415 break;
35416 if (prev && LABEL_P (prev))
35418 edge e;
35419 edge_iterator ei;
35421 FOR_EACH_EDGE (e, ei, bb->preds)
35422 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35423 && !(e->flags & EDGE_FALLTHRU))
35424 replace = true;
35426 if (!replace)
35428 prev = prev_active_insn (ret);
35429 if (prev
35430 && ((JUMP_P (prev) && any_condjump_p (prev))
35431 || CALL_P (prev)))
35432 replace = true;
35433 /* Empty functions get a branch mispredict even when
35434 the jump destination is not visible to us. */
35435 if (!prev && !optimize_function_for_size_p (cfun))
35436 replace = true;
35438 if (replace)
35440 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35441 delete_insn (ret);
35446 /* Count the minimum number of instructions in BB. Return 4 if the
35447 number of instructions >= 4. */
35449 static int
35450 ix86_count_insn_bb (basic_block bb)
35452 rtx insn;
35453 int insn_count = 0;
35455 /* Count number of instructions in this block. Return 4 if the number
35456 of instructions >= 4. */
35457 FOR_BB_INSNS (bb, insn)
35459 /* Only happens in exit blocks. */
35460 if (JUMP_P (insn)
35461 && ANY_RETURN_P (PATTERN (insn)))
35462 break;
35464 if (NONDEBUG_INSN_P (insn)
35465 && GET_CODE (PATTERN (insn)) != USE
35466 && GET_CODE (PATTERN (insn)) != CLOBBER)
35468 insn_count++;
35469 if (insn_count >= 4)
35470 return insn_count;
35474 return insn_count;
35478 /* Count the minimum number of instructions in code path in BB.
35479 Return 4 if the number of instructions >= 4. */
35481 static int
35482 ix86_count_insn (basic_block bb)
35484 edge e;
35485 edge_iterator ei;
35486 int min_prev_count;
35488 /* Only bother counting instructions along paths with no
35489 more than 2 basic blocks between entry and exit. Given
35490 that BB has an edge to exit, determine if a predecessor
35491 of BB has an edge from entry. If so, compute the number
35492 of instructions in the predecessor block. If there
35493 happen to be multiple such blocks, compute the minimum. */
35494 min_prev_count = 4;
35495 FOR_EACH_EDGE (e, ei, bb->preds)
35497 edge prev_e;
35498 edge_iterator prev_ei;
35500 if (e->src == ENTRY_BLOCK_PTR)
35502 min_prev_count = 0;
35503 break;
35505 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35507 if (prev_e->src == ENTRY_BLOCK_PTR)
35509 int count = ix86_count_insn_bb (e->src);
35510 if (count < min_prev_count)
35511 min_prev_count = count;
35512 break;
35517 if (min_prev_count < 4)
35518 min_prev_count += ix86_count_insn_bb (bb);
35520 return min_prev_count;
35523 /* Pad short function to 4 instructions. */
35525 static void
35526 ix86_pad_short_function (void)
35528 edge e;
35529 edge_iterator ei;
35531 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35533 rtx ret = BB_END (e->src);
35534 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35536 int insn_count = ix86_count_insn (e->src);
35538 /* Pad short function. */
35539 if (insn_count < 4)
35541 rtx insn = ret;
35543 /* Find epilogue. */
35544 while (insn
35545 && (!NOTE_P (insn)
35546 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35547 insn = PREV_INSN (insn);
35549 if (!insn)
35550 insn = ret;
35552 /* Two NOPs count as one instruction. */
35553 insn_count = 2 * (4 - insn_count);
35554 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
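/* For example, a path containing only 2 real instructions gets
   2 * (4 - 2) = 4 NOPs emitted before the epilogue; since two NOPs count
   as one instruction, this brings the effective count up to the
   4-instruction minimum.  */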
35560 /* Implement machine specific optimizations. We implement padding of returns
35561 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
35562 static void
35563 ix86_reorg (void)
35565 /* We are freeing block_for_insn in the toplev to keep compatibility
35566 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35567 compute_bb_for_insn ();
35569 if (optimize && optimize_function_for_speed_p (cfun))
35571 if (TARGET_PAD_SHORT_FUNCTION)
35572 ix86_pad_short_function ();
35573 else if (TARGET_PAD_RETURNS)
35574 ix86_pad_returns ();
35575 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35576 if (TARGET_FOUR_JUMP_LIMIT)
35577 ix86_avoid_jump_mispredicts ();
35578 #endif
35582 /* Return nonzero when a QImode register that must be represented via a REX
35583 prefix is used. */
35584 bool
35585 x86_extended_QIreg_mentioned_p (rtx insn)
35587 int i;
35588 extract_insn_cached (insn);
35589 for (i = 0; i < recog_data.n_operands; i++)
35590 if (GENERAL_REG_P (recog_data.operand[i])
35591 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35592 return true;
35593 return false;
35596 /* Return nonzero when P points to a register encoded via a REX prefix.
35597 Called via for_each_rtx. */
35598 static int
35599 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35601 unsigned int regno;
35602 if (!REG_P (*p))
35603 return 0;
35604 regno = REGNO (*p);
35605 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35608 /* Return true when INSN mentions register that must be encoded using REX
35609 prefix. */
35610 bool
35611 x86_extended_reg_mentioned_p (rtx insn)
35613 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35614 extended_reg_mentioned_1, NULL);
35617 /* If profitable, negate (without causing overflow) integer constant
35618 of mode MODE at location LOC. Return true in this case. */
35619 bool
35620 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35622 HOST_WIDE_INT val;
35624 if (!CONST_INT_P (*loc))
35625 return false;
35627 switch (mode)
35629 case DImode:
35630 /* DImode x86_64 constants must fit in 32 bits. */
35631 gcc_assert (x86_64_immediate_operand (*loc, mode));
35633 mode = SImode;
35634 break;
35636 case SImode:
35637 case HImode:
35638 case QImode:
35639 break;
35641 default:
35642 gcc_unreachable ();
35645 /* Avoid overflows. */
35646 if (mode_signbit_p (mode, *loc))
35647 return false;
35649 val = INTVAL (*loc);
35651 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35652 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35653 if ((val < 0 && val != -128)
35654 || val == 128)
35656 *loc = GEN_INT (-val);
35657 return true;
35660 return false;
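/* For example, callers use this to turn "addl $-4, %eax" into the prettier
   "subl $4, %eax", and to replace an operand of 128 with -128 (swapping the
   operation), since -128 fits in a sign-extended 8-bit immediate while 128
   does not.  */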
35663 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35664 optabs would emit if we didn't have TFmode patterns. */
35666 void
35667 x86_emit_floatuns (rtx operands[2])
35669 rtx neglab, donelab, i0, i1, f0, in, out;
35670 enum machine_mode mode, inmode;
35672 inmode = GET_MODE (operands[1]);
35673 gcc_assert (inmode == SImode || inmode == DImode);
35675 out = operands[0];
35676 in = force_reg (inmode, operands[1]);
35677 mode = GET_MODE (out);
35678 neglab = gen_label_rtx ();
35679 donelab = gen_label_rtx ();
35680 f0 = gen_reg_rtx (mode);
35682 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35684 expand_float (out, in, 0);
35686 emit_jump_insn (gen_jump (donelab));
35687 emit_barrier ();
35689 emit_label (neglab);
35691 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35692 1, OPTAB_DIRECT);
35693 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35694 1, OPTAB_DIRECT);
35695 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35697 expand_float (f0, i0, 0);
35699 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35701 emit_label (donelab);
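/* The code generated above corresponds roughly to this illustrative sketch:

	if ((signed) in >= 0)
	  out = (float_type) in;		plain signed conversion
	else
	  {
	    i0 = (in >> 1) | (in & 1);		halve, keeping the sticky low bit
	    out = (float_type) i0;
	    out = out + out;			double back; rounds only once
	  }
*/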
35704 /* AVX2 supports 32-byte integer vector operations,
35705 so the longest vector we are faced with is V32QImode. */
35706 #define MAX_VECT_LEN 32
35708 struct expand_vec_perm_d
35710 rtx target, op0, op1;
35711 unsigned char perm[MAX_VECT_LEN];
35712 enum machine_mode vmode;
35713 unsigned char nelt;
35714 bool one_operand_p;
35715 bool testing_p;
35718 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35719 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35720 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35722 /* Get a vector mode of the same size as the original but with elements
35723 twice as wide. This is only guaranteed to apply to integral vectors. */
35725 static inline enum machine_mode
35726 get_mode_wider_vector (enum machine_mode o)
35728 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35729 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35730 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35731 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35732 return n;
35735 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35736 with all elements equal to VAR. Return true if successful. */
35738 static bool
35739 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35740 rtx target, rtx val)
35742 bool ok;
35744 switch (mode)
35746 case V2SImode:
35747 case V2SFmode:
35748 if (!mmx_ok)
35749 return false;
35750 /* FALLTHRU */
35752 case V4DFmode:
35753 case V4DImode:
35754 case V8SFmode:
35755 case V8SImode:
35756 case V2DFmode:
35757 case V2DImode:
35758 case V4SFmode:
35759 case V4SImode:
35761 rtx insn, dup;
35763 /* First attempt to recognize VAL as-is. */
35764 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35765 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35766 if (recog_memoized (insn) < 0)
35768 rtx seq;
35769 /* If that fails, force VAL into a register. */
35771 start_sequence ();
35772 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35773 seq = get_insns ();
35774 end_sequence ();
35775 if (seq)
35776 emit_insn_before (seq, insn);
35778 ok = recog_memoized (insn) >= 0;
35779 gcc_assert (ok);
35782 return true;
35784 case V4HImode:
35785 if (!mmx_ok)
35786 return false;
35787 if (TARGET_SSE || TARGET_3DNOW_A)
35789 rtx x;
35791 val = gen_lowpart (SImode, val);
35792 x = gen_rtx_TRUNCATE (HImode, val);
35793 x = gen_rtx_VEC_DUPLICATE (mode, x);
35794 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35795 return true;
35797 goto widen;
35799 case V8QImode:
35800 if (!mmx_ok)
35801 return false;
35802 goto widen;
35804 case V8HImode:
35805 if (TARGET_SSE2)
35807 struct expand_vec_perm_d dperm;
35808 rtx tmp1, tmp2;
35810 permute:
35811 memset (&dperm, 0, sizeof (dperm));
35812 dperm.target = target;
35813 dperm.vmode = mode;
35814 dperm.nelt = GET_MODE_NUNITS (mode);
35815 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35816 dperm.one_operand_p = true;
35818 /* Extend to SImode using a paradoxical SUBREG. */
35819 tmp1 = gen_reg_rtx (SImode);
35820 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35822 /* Insert the SImode value as low element of a V4SImode vector. */
35823 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35824 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35826 ok = (expand_vec_perm_1 (&dperm)
35827 || expand_vec_perm_broadcast_1 (&dperm));
35828 gcc_assert (ok);
35829 return ok;
35831 goto widen;
35833 case V16QImode:
35834 if (TARGET_SSE2)
35835 goto permute;
35836 goto widen;
35838 widen:
35839 /* Replicate the value once into the next wider mode and recurse. */
35841 enum machine_mode smode, wsmode, wvmode;
35842 rtx x;
35844 smode = GET_MODE_INNER (mode);
35845 wvmode = get_mode_wider_vector (mode);
35846 wsmode = GET_MODE_INNER (wvmode);
35848 val = convert_modes (wsmode, smode, val, true);
35849 x = expand_simple_binop (wsmode, ASHIFT, val,
35850 GEN_INT (GET_MODE_BITSIZE (smode)),
35851 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35852 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35854 x = gen_lowpart (wvmode, target);
35855 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35856 gcc_assert (ok);
35857 return ok;
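/* For instance, when this widening path is used to broadcast the QImode
   value 0xAB into V16QImode, VAL is first widened to the HImode value
   0xABAB via (val << 8) | val, and we then recurse to broadcast that into
   V8HImode.  */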
35860 case V16HImode:
35861 case V32QImode:
35863 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35864 rtx x = gen_reg_rtx (hvmode);
35866 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35867 gcc_assert (ok);
35869 x = gen_rtx_VEC_CONCAT (mode, x, x);
35870 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35872 return true;
35874 default:
35875 return false;
35879 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35880 whose ONE_VAR element is VAR, and other elements are zero. Return true
35881 if successful. */
35883 static bool
35884 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35885 rtx target, rtx var, int one_var)
35887 enum machine_mode vsimode;
35888 rtx new_target;
35889 rtx x, tmp;
35890 bool use_vector_set = false;
35892 switch (mode)
35894 case V2DImode:
35895 /* For SSE4.1, we normally use vector set. But if the second
35896 element is zero and inter-unit moves are OK, we use movq
35897 instead. */
35898 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
35899 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
35900 && one_var == 0));
35901 break;
35902 case V16QImode:
35903 case V4SImode:
35904 case V4SFmode:
35905 use_vector_set = TARGET_SSE4_1;
35906 break;
35907 case V8HImode:
35908 use_vector_set = TARGET_SSE2;
35909 break;
35910 case V4HImode:
35911 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35912 break;
35913 case V32QImode:
35914 case V16HImode:
35915 case V8SImode:
35916 case V8SFmode:
35917 case V4DFmode:
35918 use_vector_set = TARGET_AVX;
35919 break;
35920 case V4DImode:
35921 /* Use ix86_expand_vector_set in 64bit mode only. */
35922 use_vector_set = TARGET_AVX && TARGET_64BIT;
35923 break;
35924 default:
35925 break;
35928 if (use_vector_set)
35930 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35931 var = force_reg (GET_MODE_INNER (mode), var);
35932 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35933 return true;
35936 switch (mode)
35938 case V2SFmode:
35939 case V2SImode:
35940 if (!mmx_ok)
35941 return false;
35942 /* FALLTHRU */
35944 case V2DFmode:
35945 case V2DImode:
35946 if (one_var != 0)
35947 return false;
35948 var = force_reg (GET_MODE_INNER (mode), var);
35949 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35950 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35951 return true;
35953 case V4SFmode:
35954 case V4SImode:
35955 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35956 new_target = gen_reg_rtx (mode);
35957 else
35958 new_target = target;
35959 var = force_reg (GET_MODE_INNER (mode), var);
35960 x = gen_rtx_VEC_DUPLICATE (mode, var);
35961 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35962 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35963 if (one_var != 0)
35965 /* We need to shuffle the value to the correct position, so
35966 create a new pseudo to store the intermediate result. */
35968 /* With SSE2, we can use the integer shuffle insns. */
35969 if (mode != V4SFmode && TARGET_SSE2)
35971 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35972 const1_rtx,
35973 GEN_INT (one_var == 1 ? 0 : 1),
35974 GEN_INT (one_var == 2 ? 0 : 1),
35975 GEN_INT (one_var == 3 ? 0 : 1)));
35976 if (target != new_target)
35977 emit_move_insn (target, new_target);
35978 return true;
35981 /* Otherwise convert the intermediate result to V4SFmode and
35982 use the SSE1 shuffle instructions. */
35983 if (mode != V4SFmode)
35985 tmp = gen_reg_rtx (V4SFmode);
35986 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35988 else
35989 tmp = new_target;
35991 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35992 const1_rtx,
35993 GEN_INT (one_var == 1 ? 0 : 1),
35994 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35995 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35997 if (mode != V4SFmode)
35998 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35999 else if (tmp != target)
36000 emit_move_insn (target, tmp);
36002 else if (target != new_target)
36003 emit_move_insn (target, new_target);
36004 return true;
36006 case V8HImode:
36007 case V16QImode:
36008 vsimode = V4SImode;
36009 goto widen;
36010 case V4HImode:
36011 case V8QImode:
36012 if (!mmx_ok)
36013 return false;
36014 vsimode = V2SImode;
36015 goto widen;
36016 widen:
36017 if (one_var != 0)
36018 return false;
36020 /* Zero extend the variable element to SImode and recurse. */
36021 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36023 x = gen_reg_rtx (vsimode);
36024 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36025 var, one_var))
36026 gcc_unreachable ();
36028 emit_move_insn (target, gen_lowpart (mode, x));
36029 return true;
36031 default:
36032 return false;
36036 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36037 consisting of the values in VALS. It is known that all elements
36038 except ONE_VAR are constants. Return true if successful. */
36040 static bool
36041 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36042 rtx target, rtx vals, int one_var)
36044 rtx var = XVECEXP (vals, 0, one_var);
36045 enum machine_mode wmode;
36046 rtx const_vec, x;
36048 const_vec = copy_rtx (vals);
36049 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36050 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36052 switch (mode)
36054 case V2DFmode:
36055 case V2DImode:
36056 case V2SFmode:
36057 case V2SImode:
36058 /* For the two element vectors, it's just as easy to use
36059 the general case. */
36060 return false;
36062 case V4DImode:
36063 /* Use ix86_expand_vector_set in 64bit mode only. */
36064 if (!TARGET_64BIT)
36065 return false;
36066 case V4DFmode:
36067 case V8SFmode:
36068 case V8SImode:
36069 case V16HImode:
36070 case V32QImode:
36071 case V4SFmode:
36072 case V4SImode:
36073 case V8HImode:
36074 case V4HImode:
36075 break;
36077 case V16QImode:
36078 if (TARGET_SSE4_1)
36079 break;
36080 wmode = V8HImode;
36081 goto widen;
36082 case V8QImode:
36083 wmode = V4HImode;
36084 goto widen;
36085 widen:
36086 /* There's no way to set one QImode entry easily. Combine
36087 the variable value with its adjacent constant value, and
36088 promote to an HImode set. */
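/* For example, with ONE_VAR == 3 in a V8QImode vector the variable byte is
   paired with the constant in element 2: the variable is shifted into the
   high half of an HImode word, the constant's low byte is IORed in, and the
   combined word is stored with a single V4HImode vector set at index 1.  */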
36089 x = XVECEXP (vals, 0, one_var ^ 1);
36090 if (one_var & 1)
36092 var = convert_modes (HImode, QImode, var, true);
36093 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36094 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36095 x = GEN_INT (INTVAL (x) & 0xff);
36097 else
36099 var = convert_modes (HImode, QImode, var, true);
36100 x = gen_int_mode (INTVAL (x) << 8, HImode);
36102 if (x != const0_rtx)
36103 var = expand_simple_binop (HImode, IOR, var, x, var,
36104 1, OPTAB_LIB_WIDEN);
36106 x = gen_reg_rtx (wmode);
36107 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36108 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36110 emit_move_insn (target, gen_lowpart (mode, x));
36111 return true;
36113 default:
36114 return false;
36117 emit_move_insn (target, const_vec);
36118 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36119 return true;
36122 /* A subroutine of ix86_expand_vector_init_general. Use vector
36123 concatenate to handle the most general case: all values variable,
36124 and none identical. */
36126 static void
36127 ix86_expand_vector_init_concat (enum machine_mode mode,
36128 rtx target, rtx *ops, int n)
36130 enum machine_mode cmode, hmode = VOIDmode;
36131 rtx first[8], second[4];
36132 rtvec v;
36133 int i, j;
36135 switch (n)
36137 case 2:
36138 switch (mode)
36140 case V8SImode:
36141 cmode = V4SImode;
36142 break;
36143 case V8SFmode:
36144 cmode = V4SFmode;
36145 break;
36146 case V4DImode:
36147 cmode = V2DImode;
36148 break;
36149 case V4DFmode:
36150 cmode = V2DFmode;
36151 break;
36152 case V4SImode:
36153 cmode = V2SImode;
36154 break;
36155 case V4SFmode:
36156 cmode = V2SFmode;
36157 break;
36158 case V2DImode:
36159 cmode = DImode;
36160 break;
36161 case V2SImode:
36162 cmode = SImode;
36163 break;
36164 case V2DFmode:
36165 cmode = DFmode;
36166 break;
36167 case V2SFmode:
36168 cmode = SFmode;
36169 break;
36170 default:
36171 gcc_unreachable ();
36174 if (!register_operand (ops[1], cmode))
36175 ops[1] = force_reg (cmode, ops[1]);
36176 if (!register_operand (ops[0], cmode))
36177 ops[0] = force_reg (cmode, ops[0]);
36178 emit_insn (gen_rtx_SET (VOIDmode, target,
36179 gen_rtx_VEC_CONCAT (mode, ops[0],
36180 ops[1])));
36181 break;
36183 case 4:
36184 switch (mode)
36186 case V4DImode:
36187 cmode = V2DImode;
36188 break;
36189 case V4DFmode:
36190 cmode = V2DFmode;
36191 break;
36192 case V4SImode:
36193 cmode = V2SImode;
36194 break;
36195 case V4SFmode:
36196 cmode = V2SFmode;
36197 break;
36198 default:
36199 gcc_unreachable ();
36201 goto half;
36203 case 8:
36204 switch (mode)
36206 case V8SImode:
36207 cmode = V2SImode;
36208 hmode = V4SImode;
36209 break;
36210 case V8SFmode:
36211 cmode = V2SFmode;
36212 hmode = V4SFmode;
36213 break;
36214 default:
36215 gcc_unreachable ();
36217 goto half;
36219 half:
36220 /* FIXME: We process inputs backward to help RA. PR 36222. */
36221 i = n - 1;
36222 j = (n >> 1) - 1;
36223 for (; i > 0; i -= 2, j--)
36225 first[j] = gen_reg_rtx (cmode);
36226 v = gen_rtvec (2, ops[i - 1], ops[i]);
36227 ix86_expand_vector_init (false, first[j],
36228 gen_rtx_PARALLEL (cmode, v));
36231 n >>= 1;
36232 if (n > 2)
36234 gcc_assert (hmode != VOIDmode);
36235 for (i = j = 0; i < n; i += 2, j++)
36237 second[j] = gen_reg_rtx (hmode);
36238 ix86_expand_vector_init_concat (hmode, second [j],
36239 &first [i], 2);
36241 n >>= 1;
36242 ix86_expand_vector_init_concat (mode, target, second, n);
36244 else
36245 ix86_expand_vector_init_concat (mode, target, first, n);
36246 break;
36248 default:
36249 gcc_unreachable ();
36253 /* A subroutine of ix86_expand_vector_init_general. Use vector
36254 interleave to handle the most general case: all values variable,
36255 and none identical. */
36257 static void
36258 ix86_expand_vector_init_interleave (enum machine_mode mode,
36259 rtx target, rtx *ops, int n)
36261 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36262 int i, j;
36263 rtx op0, op1;
36264 rtx (*gen_load_even) (rtx, rtx, rtx);
36265 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36266 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36268 switch (mode)
36270 case V8HImode:
36271 gen_load_even = gen_vec_setv8hi;
36272 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36273 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36274 inner_mode = HImode;
36275 first_imode = V4SImode;
36276 second_imode = V2DImode;
36277 third_imode = VOIDmode;
36278 break;
36279 case V16QImode:
36280 gen_load_even = gen_vec_setv16qi;
36281 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36282 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36283 inner_mode = QImode;
36284 first_imode = V8HImode;
36285 second_imode = V4SImode;
36286 third_imode = V2DImode;
36287 break;
36288 default:
36289 gcc_unreachable ();
36292 for (i = 0; i < n; i++)
36294 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36295 op0 = gen_reg_rtx (SImode);
36296 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36298 /* Insert the SImode value as low element of a V4SImode vector. */
36299 op1 = gen_reg_rtx (V4SImode);
36300 op0 = gen_rtx_VEC_MERGE (V4SImode,
36301 gen_rtx_VEC_DUPLICATE (V4SImode,
36302 op0),
36303 CONST0_RTX (V4SImode),
36304 const1_rtx);
36305 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36307 /* Cast the V4SImode vector back to a vector in the original mode. */
36308 op0 = gen_reg_rtx (mode);
36309 emit_move_insn (op0, gen_lowpart (mode, op1));
36311 /* Load even elements into the second position. */
36312 emit_insn (gen_load_even (op0,
36313 force_reg (inner_mode,
36314 ops [i + i + 1]),
36315 const1_rtx));
36317 /* Cast vector to FIRST_IMODE vector. */
36318 ops[i] = gen_reg_rtx (first_imode);
36319 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36322 /* Interleave low FIRST_IMODE vectors. */
36323 for (i = j = 0; i < n; i += 2, j++)
36325 op0 = gen_reg_rtx (first_imode);
36326 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36328 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36329 ops[j] = gen_reg_rtx (second_imode);
36330 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36333 /* Interleave low SECOND_IMODE vectors. */
36334 switch (second_imode)
36336 case V4SImode:
36337 for (i = j = 0; i < n / 2; i += 2, j++)
36339 op0 = gen_reg_rtx (second_imode);
36340 emit_insn (gen_interleave_second_low (op0, ops[i],
36341 ops[i + 1]));
36343 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36344 vector. */
36345 ops[j] = gen_reg_rtx (third_imode);
36346 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36348 second_imode = V2DImode;
36349 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36350 /* FALLTHRU */
36352 case V2DImode:
36353 op0 = gen_reg_rtx (second_imode);
36354 emit_insn (gen_interleave_second_low (op0, ops[0],
36355 ops[1]));
36357 /* Cast the SECOND_IMODE vector back to a vector in the original
36358 mode. */
36359 emit_insn (gen_rtx_SET (VOIDmode, target,
36360 gen_lowpart (mode, op0)));
36361 break;
36363 default:
36364 gcc_unreachable ();
36368 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36369 all values variable, and none identical. */
36371 static void
36372 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36373 rtx target, rtx vals)
36375 rtx ops[32], op0, op1;
36376 enum machine_mode half_mode = VOIDmode;
36377 int n, i;
36379 switch (mode)
36381 case V2SFmode:
36382 case V2SImode:
36383 if (!mmx_ok && !TARGET_SSE)
36384 break;
36385 /* FALLTHRU */
36387 case V8SFmode:
36388 case V8SImode:
36389 case V4DFmode:
36390 case V4DImode:
36391 case V4SFmode:
36392 case V4SImode:
36393 case V2DFmode:
36394 case V2DImode:
36395 n = GET_MODE_NUNITS (mode);
36396 for (i = 0; i < n; i++)
36397 ops[i] = XVECEXP (vals, 0, i);
36398 ix86_expand_vector_init_concat (mode, target, ops, n);
36399 return;
36401 case V32QImode:
36402 half_mode = V16QImode;
36403 goto half;
36405 case V16HImode:
36406 half_mode = V8HImode;
36407 goto half;
36409 half:
36410 n = GET_MODE_NUNITS (mode);
36411 for (i = 0; i < n; i++)
36412 ops[i] = XVECEXP (vals, 0, i);
36413 op0 = gen_reg_rtx (half_mode);
36414 op1 = gen_reg_rtx (half_mode);
36415 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36416 n >> 2);
36417 ix86_expand_vector_init_interleave (half_mode, op1,
36418 &ops [n >> 1], n >> 2);
36419 emit_insn (gen_rtx_SET (VOIDmode, target,
36420 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36421 return;
36423 case V16QImode:
36424 if (!TARGET_SSE4_1)
36425 break;
36426 /* FALLTHRU */
36428 case V8HImode:
36429 if (!TARGET_SSE2)
36430 break;
36432 /* Don't use ix86_expand_vector_init_interleave if we can't
36433 move from GPR to SSE register directly. */
36434 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
36435 break;
36437 n = GET_MODE_NUNITS (mode);
36438 for (i = 0; i < n; i++)
36439 ops[i] = XVECEXP (vals, 0, i);
36440 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36441 return;
36443 case V4HImode:
36444 case V8QImode:
36445 break;
36447 default:
36448 gcc_unreachable ();
36452 int i, j, n_elts, n_words, n_elt_per_word;
36453 enum machine_mode inner_mode;
36454 rtx words[4], shift;
36456 inner_mode = GET_MODE_INNER (mode);
36457 n_elts = GET_MODE_NUNITS (mode);
36458 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36459 n_elt_per_word = n_elts / n_words;
36460 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36462 for (i = 0; i < n_words; ++i)
36464 rtx word = NULL_RTX;
36466 for (j = 0; j < n_elt_per_word; ++j)
36468 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36469 elt = convert_modes (word_mode, inner_mode, elt, true);
36471 if (j == 0)
36472 word = elt;
36473 else
36475 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36476 word, 1, OPTAB_LIB_WIDEN);
36477 word = expand_simple_binop (word_mode, IOR, word, elt,
36478 word, 1, OPTAB_LIB_WIDEN);
36482 words[i] = word;
36485 if (n_words == 1)
36486 emit_move_insn (target, gen_lowpart (mode, words[0]));
36487 else if (n_words == 2)
36489 rtx tmp = gen_reg_rtx (mode);
36490 emit_clobber (tmp);
36491 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36492 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36493 emit_move_insn (target, tmp);
36495 else if (n_words == 4)
36497 rtx tmp = gen_reg_rtx (V4SImode);
36498 gcc_assert (word_mode == SImode);
36499 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36500 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36501 emit_move_insn (target, gen_lowpart (mode, tmp));
36503 else
36504 gcc_unreachable ();
36508 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36509 instructions unless MMX_OK is true. */
36511 void
36512 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36514 enum machine_mode mode = GET_MODE (target);
36515 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36516 int n_elts = GET_MODE_NUNITS (mode);
36517 int n_var = 0, one_var = -1;
36518 bool all_same = true, all_const_zero = true;
36519 int i;
36520 rtx x;
36522 for (i = 0; i < n_elts; ++i)
36524 x = XVECEXP (vals, 0, i);
36525 if (!(CONST_INT_P (x)
36526 || GET_CODE (x) == CONST_DOUBLE
36527 || GET_CODE (x) == CONST_FIXED))
36528 n_var++, one_var = i;
36529 else if (x != CONST0_RTX (inner_mode))
36530 all_const_zero = false;
36531 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36532 all_same = false;
36535 /* Constants are best loaded from the constant pool. */
36536 if (n_var == 0)
36538 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36539 return;
36542 /* If all values are identical, broadcast the value. */
36543 if (all_same
36544 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36545 XVECEXP (vals, 0, 0)))
36546 return;
36548 /* Values where only one field is non-constant are best loaded from
36549 the pool and overwritten via move later. */
36550 if (n_var == 1)
36552 if (all_const_zero
36553 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36554 XVECEXP (vals, 0, one_var),
36555 one_var))
36556 return;
36558 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36559 return;
36562 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36565 void
36566 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36568 enum machine_mode mode = GET_MODE (target);
36569 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36570 enum machine_mode half_mode;
36571 bool use_vec_merge = false;
36572 rtx tmp;
36573 static rtx (*gen_extract[6][2]) (rtx, rtx)
36575 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36576 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36577 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36578 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36579 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36580 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36582 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36584 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36585 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36586 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36587 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36588 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36589 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36591 int i, j, n;
36593 switch (mode)
36595 case V2SFmode:
36596 case V2SImode:
36597 if (mmx_ok)
36599 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36600 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36601 if (elt == 0)
36602 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36603 else
36604 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36605 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36606 return;
36608 break;
36610 case V2DImode:
36611 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36612 if (use_vec_merge)
36613 break;
36615 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36616 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36617 if (elt == 0)
36618 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36619 else
36620 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36621 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36622 return;
36624 case V2DFmode:
36626 rtx op0, op1;
36628 /* For the two element vectors, we implement a VEC_CONCAT with
36629 the extraction of the other element. */
36631 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36632 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36634 if (elt == 0)
36635 op0 = val, op1 = tmp;
36636 else
36637 op0 = tmp, op1 = val;
36639 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36640 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36642 return;
36644 case V4SFmode:
36645 use_vec_merge = TARGET_SSE4_1;
36646 if (use_vec_merge)
36647 break;
36649 switch (elt)
36651 case 0:
36652 use_vec_merge = true;
36653 break;
36655 case 1:
36656 /* tmp = target = A B C D */
36657 tmp = copy_to_reg (target);
36658 /* target = A A B B */
36659 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36660 /* target = X A B B */
36661 ix86_expand_vector_set (false, target, val, 0);
36662 /* target = A X C D */
36663 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36664 const1_rtx, const0_rtx,
36665 GEN_INT (2+4), GEN_INT (3+4)));
36666 return;
36668 case 2:
36669 /* tmp = target = A B C D */
36670 tmp = copy_to_reg (target);
36671 /* tmp = X B C D */
36672 ix86_expand_vector_set (false, tmp, val, 0);
36673 /* target = A B X D */
36674 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36675 const0_rtx, const1_rtx,
36676 GEN_INT (0+4), GEN_INT (3+4)));
36677 return;
36679 case 3:
36680 /* tmp = target = A B C D */
36681 tmp = copy_to_reg (target);
36682 /* tmp = X B C D */
36683 ix86_expand_vector_set (false, tmp, val, 0);
36684 /* target = A B C X */
36685 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36686 const0_rtx, const1_rtx,
36687 GEN_INT (2+4), GEN_INT (0+4)));
36688 return;
36690 default:
36691 gcc_unreachable ();
36693 break;
36695 case V4SImode:
36696 use_vec_merge = TARGET_SSE4_1;
36697 if (use_vec_merge)
36698 break;
36700 /* Element 0 handled by vec_merge below. */
36701 if (elt == 0)
36703 use_vec_merge = true;
36704 break;
36707 if (TARGET_SSE2)
36709 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36710 store into element 0, then shuffle them back. */
36712 rtx order[4];
36714 order[0] = GEN_INT (elt);
36715 order[1] = const1_rtx;
36716 order[2] = const2_rtx;
36717 order[3] = GEN_INT (3);
36718 order[elt] = const0_rtx;
36720 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36721 order[1], order[2], order[3]));
36723 ix86_expand_vector_set (false, target, val, 0);
36725 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36726 order[1], order[2], order[3]));
36728 else
36730 /* For SSE1, we have to reuse the V4SF code. */
36731 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36732 gen_lowpart (SFmode, val), elt);
36734 return;
36736 case V8HImode:
36737 use_vec_merge = TARGET_SSE2;
36738 break;
36739 case V4HImode:
36740 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36741 break;
36743 case V16QImode:
36744 use_vec_merge = TARGET_SSE4_1;
36745 break;
36747 case V8QImode:
36748 break;
36750 case V32QImode:
36751 half_mode = V16QImode;
36752 j = 0;
36753 n = 16;
36754 goto half;
36756 case V16HImode:
36757 half_mode = V8HImode;
36758 j = 1;
36759 n = 8;
36760 goto half;
36762 case V8SImode:
36763 half_mode = V4SImode;
36764 j = 2;
36765 n = 4;
36766 goto half;
36768 case V4DImode:
36769 half_mode = V2DImode;
36770 j = 3;
36771 n = 2;
36772 goto half;
36774 case V8SFmode:
36775 half_mode = V4SFmode;
36776 j = 4;
36777 n = 4;
36778 goto half;
36780 case V4DFmode:
36781 half_mode = V2DFmode;
36782 j = 5;
36783 n = 2;
36784 goto half;
36786 half:
36787 /* Compute offset. */
36788 i = elt / n;
36789 elt %= n;
36791 gcc_assert (i <= 1);
36793 /* Extract the half. */
36794 tmp = gen_reg_rtx (half_mode);
36795 emit_insn (gen_extract[j][i] (tmp, target));
36797 /* Put val in tmp at elt. */
36798 ix86_expand_vector_set (false, tmp, val, elt);
36800 /* Put it back. */
36801 emit_insn (gen_insert[j][i] (target, target, tmp));
36802 return;
36804 default:
36805 break;
36808 if (use_vec_merge)
36810 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36811 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36812 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36814 else
36816 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36818 emit_move_insn (mem, target);
36820 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36821 emit_move_insn (tmp, val);
36823 emit_move_insn (target, mem);
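/* Editorial sketch, not part of GCC: the stack-temporary fallback above is
   roughly equivalent to this plain C, shown for a four-element float vector.
   The v4sf typedef is local to the sketch.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

static v4sf
set_element_via_memory (v4sf target, float val, int elt)
{
  float mem[4];
  __builtin_memcpy (mem, &target, sizeof mem);  /* spill the vector */
  mem[elt] = val;                               /* overwrite one element */
  __builtin_memcpy (&target, mem, sizeof mem);  /* reload the vector */
  return target;
}
#endif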
36827 void
36828 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36830 enum machine_mode mode = GET_MODE (vec);
36831 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36832 bool use_vec_extr = false;
36833 rtx tmp;
36835 switch (mode)
36837 case V2SImode:
36838 case V2SFmode:
36839 if (!mmx_ok)
36840 break;
36841 /* FALLTHRU */
36843 case V2DFmode:
36844 case V2DImode:
36845 use_vec_extr = true;
36846 break;
36848 case V4SFmode:
36849 use_vec_extr = TARGET_SSE4_1;
36850 if (use_vec_extr)
36851 break;
36853 switch (elt)
36855 case 0:
36856 tmp = vec;
36857 break;
36859 case 1:
36860 case 3:
36861 tmp = gen_reg_rtx (mode);
36862 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36863 GEN_INT (elt), GEN_INT (elt),
36864 GEN_INT (elt+4), GEN_INT (elt+4)));
36865 break;
36867 case 2:
36868 tmp = gen_reg_rtx (mode);
36869 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36870 break;
36872 default:
36873 gcc_unreachable ();
36875 vec = tmp;
36876 use_vec_extr = true;
36877 elt = 0;
36878 break;
36880 case V4SImode:
36881 use_vec_extr = TARGET_SSE4_1;
36882 if (use_vec_extr)
36883 break;
36885 if (TARGET_SSE2)
36887 switch (elt)
36889 case 0:
36890 tmp = vec;
36891 break;
36893 case 1:
36894 case 3:
36895 tmp = gen_reg_rtx (mode);
36896 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36897 GEN_INT (elt), GEN_INT (elt),
36898 GEN_INT (elt), GEN_INT (elt)));
36899 break;
36901 case 2:
36902 tmp = gen_reg_rtx (mode);
36903 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36904 break;
36906 default:
36907 gcc_unreachable ();
36909 vec = tmp;
36910 use_vec_extr = true;
36911 elt = 0;
36913 else
36915 /* For SSE1, we have to reuse the V4SF code. */
36916 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36917 gen_lowpart (V4SFmode, vec), elt);
36918 return;
36920 break;
36922 case V8HImode:
36923 use_vec_extr = TARGET_SSE2;
36924 break;
36925 case V4HImode:
36926 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36927 break;
36929 case V16QImode:
36930 use_vec_extr = TARGET_SSE4_1;
36931 break;
36933 case V8SFmode:
36934 if (TARGET_AVX)
36936 tmp = gen_reg_rtx (V4SFmode);
36937 if (elt < 4)
36938 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36939 else
36940 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36941 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36942 return;
36944 break;
36946 case V4DFmode:
36947 if (TARGET_AVX)
36949 tmp = gen_reg_rtx (V2DFmode);
36950 if (elt < 2)
36951 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36952 else
36953 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36954 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36955 return;
36957 break;
36959 case V32QImode:
36960 if (TARGET_AVX)
36962 tmp = gen_reg_rtx (V16QImode);
36963 if (elt < 16)
36964 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36965 else
36966 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36967 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36968 return;
36970 break;
36972 case V16HImode:
36973 if (TARGET_AVX)
36975 tmp = gen_reg_rtx (V8HImode);
36976 if (elt < 8)
36977 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36978 else
36979 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36980 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36981 return;
36983 break;
36985 case V8SImode:
36986 if (TARGET_AVX)
36988 tmp = gen_reg_rtx (V4SImode);
36989 if (elt < 4)
36990 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36991 else
36992 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36993 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36994 return;
36996 break;
36998 case V4DImode:
36999 if (TARGET_AVX)
37001 tmp = gen_reg_rtx (V2DImode);
37002 if (elt < 2)
37003 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37004 else
37005 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37006 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37007 return;
37009 break;
37011 case V8QImode:
37012 /* ??? Could extract the appropriate HImode element and shift. */
37013 default:
37014 break;
37017 if (use_vec_extr)
37019 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37020 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37022 /* Let the rtl optimizers know about the zero extension performed. */
37023 if (inner_mode == QImode || inner_mode == HImode)
37025 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37026 target = gen_lowpart (SImode, target);
37029 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37031 else
37033 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37035 emit_move_insn (mem, vec);
37037 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37038 emit_move_insn (target, tmp);
37042 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37043 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37044 The upper bits of DEST are undefined, though they shouldn't cause
37045 exceptions (some bits from src or all zeros are ok). */
37047 static void
37048 emit_reduc_half (rtx dest, rtx src, int i)
37050 rtx tem;
37051 switch (GET_MODE (src))
37053 case V4SFmode:
37054 if (i == 128)
37055 tem = gen_sse_movhlps (dest, src, src);
37056 else
37057 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37058 GEN_INT (1 + 4), GEN_INT (1 + 4));
37059 break;
37060 case V2DFmode:
37061 tem = gen_vec_interleave_highv2df (dest, src, src);
37062 break;
37063 case V16QImode:
37064 case V8HImode:
37065 case V4SImode:
37066 case V2DImode:
37067 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37068 gen_lowpart (V1TImode, src),
37069 GEN_INT (i / 2));
37070 break;
37071 case V8SFmode:
37072 if (i == 256)
37073 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37074 else
37075 tem = gen_avx_shufps256 (dest, src, src,
37076 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37077 break;
37078 case V4DFmode:
37079 if (i == 256)
37080 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37081 else
37082 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37083 break;
37084 case V32QImode:
37085 case V16HImode:
37086 case V8SImode:
37087 case V4DImode:
37088 if (i == 256)
37089 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37090 gen_lowpart (V4DImode, src),
37091 gen_lowpart (V4DImode, src),
37092 const1_rtx);
37093 else
37094 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37095 gen_lowpart (V2TImode, src),
37096 GEN_INT (i / 2));
37097 break;
37098 default:
37099 gcc_unreachable ();
37101 emit_insn (tem);
37104 /* Expand a vector reduction. FN is the binary pattern to reduce;
37105 DEST is the destination; IN is the input vector. */
37107 void
37108 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37110 rtx half, dst, vec = in;
37111 enum machine_mode mode = GET_MODE (in);
37112 int i;
37114 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37115 if (TARGET_SSE4_1
37116 && mode == V8HImode
37117 && fn == gen_uminv8hi3)
37119 emit_insn (gen_sse4_1_phminposuw (dest, in));
37120 return;
37123 for (i = GET_MODE_BITSIZE (mode);
37124 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37125 i >>= 1)
37127 half = gen_reg_rtx (mode);
37128 emit_reduc_half (half, vec, i);
37129 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37130 dst = dest;
37131 else
37132 dst = gen_reg_rtx (mode);
37133 emit_insn (fn (dst, half, vec));
37134 vec = dst;
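/* Editorial sketch, not part of GCC: the loop above halves the reduction
   width on each iteration (combining the vector with a copy shifted by half
   its width), so a max-reduction over eight ints behaves like this scalar
   model, with the result landing in element 0 just as in the vector case.  */
#if 0
static int
reduce_max_sketch (int v[8])
{
  for (int half = 4; half >= 1; half /= 2)   /* combine with the shifted half */
    for (int i = 0; i < half; i++)
      v[i] = v[i] > v[i + half] ? v[i] : v[i + half];
  return v[0];                               /* reduction result */
}
#endif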
37138 /* Target hook for scalar_mode_supported_p. */
37139 static bool
37140 ix86_scalar_mode_supported_p (enum machine_mode mode)
37142 if (DECIMAL_FLOAT_MODE_P (mode))
37143 return default_decimal_float_supported_p ();
37144 else if (mode == TFmode)
37145 return true;
37146 else
37147 return default_scalar_mode_supported_p (mode);
37150 /* Implements target hook vector_mode_supported_p. */
37151 static bool
37152 ix86_vector_mode_supported_p (enum machine_mode mode)
37154 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37155 return true;
37156 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37157 return true;
37158 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37159 return true;
37160 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37161 return true;
37162 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37163 return true;
37164 return false;
37167 /* Target hook for c_mode_for_suffix. */
37168 static enum machine_mode
37169 ix86_c_mode_for_suffix (char suffix)
37171 if (suffix == 'q')
37172 return TFmode;
37173 if (suffix == 'w')
37174 return XFmode;
37176 return VOIDmode;
37179 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37181 We do this in the new i386 backend to maintain source compatibility
37182 with the old cc0-based compiler. */
37184 static tree
37185 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37186 tree inputs ATTRIBUTE_UNUSED,
37187 tree clobbers)
37189 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37190 clobbers);
37191 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37192 clobbers);
37193 return clobbers;
37196 /* Implements target hook targetm.encode_section_info. */
37198 static void ATTRIBUTE_UNUSED
37199 ix86_encode_section_info (tree decl, rtx rtl, int first)
37201 default_encode_section_info (decl, rtl, first);
37203 if (TREE_CODE (decl) == VAR_DECL
37204 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37205 && ix86_in_large_data_p (decl))
37206 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37209 /* Worker function for REVERSE_CONDITION. */
37211 enum rtx_code
37212 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37214 return (mode != CCFPmode && mode != CCFPUmode
37215 ? reverse_condition (code)
37216 : reverse_condition_maybe_unordered (code));
37219 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37220 to OPERANDS[0]. */
37222 const char *
37223 output_387_reg_move (rtx insn, rtx *operands)
37225 if (REG_P (operands[0]))
37227 if (REG_P (operands[1])
37228 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37230 if (REGNO (operands[0]) == FIRST_STACK_REG)
37231 return output_387_ffreep (operands, 0);
37232 return "fstp\t%y0";
37234 if (STACK_TOP_P (operands[0]))
37235 return "fld%Z1\t%y1";
37236 return "fst\t%y0";
37238 else if (MEM_P (operands[0]))
37240 gcc_assert (REG_P (operands[1]));
37241 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37242 return "fstp%Z0\t%y0";
37243 else
37245 /* There is no non-popping store to memory for XFmode.
37246 So if we need one, follow the store with a load. */
37247 if (GET_MODE (operands[0]) == XFmode)
37248 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37249 else
37250 return "fst%Z0\t%y0";
37253 else
37254 gcc_unreachable();
37257 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37258 FP status register is set. */
37260 void
37261 ix86_emit_fp_unordered_jump (rtx label)
37263 rtx reg = gen_reg_rtx (HImode);
37264 rtx temp;
37266 emit_insn (gen_x86_fnstsw_1 (reg));
37268 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37270 emit_insn (gen_x86_sahf_1 (reg));
37272 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37273 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37275 else
37277 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37279 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37280 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37283 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37284 gen_rtx_LABEL_REF (VOIDmode, label),
37285 pc_rtx);
37286 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37288 emit_jump_insn (temp);
37289 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37292 /* Output code to perform a log1p XFmode calculation. */
37294 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37296 rtx label1 = gen_label_rtx ();
37297 rtx label2 = gen_label_rtx ();
37299 rtx tmp = gen_reg_rtx (XFmode);
37300 rtx tmp2 = gen_reg_rtx (XFmode);
37301 rtx test;
37303 emit_insn (gen_absxf2 (tmp, op1));
37304 test = gen_rtx_GE (VOIDmode, tmp,
37305 CONST_DOUBLE_FROM_REAL_VALUE (
37306 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37307 XFmode));
37308 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37310 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37311 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37312 emit_jump (label2);
37314 emit_label (label1);
37315 emit_move_insn (tmp, CONST1_RTX (XFmode));
37316 emit_insn (gen_addxf3 (tmp, op1, tmp));
37317 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37318 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37320 emit_label (label2);
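/* Editorial note, not part of GCC: the branch threshold above,
   0.29289321881345247..., is 1 - sqrt(2)/2, the documented input limit of
   the x87 fyl2xp1 instruction (|x| < 1 - sqrt(2)/2).  Below the threshold,
   log1p is computed directly as ln(2) * log2(1 + x) via fyl2xp1 with the
   fldln2 constant; above it, 1 + x is formed explicitly and fyl2x is used,
   where the rounding in the addition no longer hurts accuracy.  */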
37323 /* Emit code for round calculation. */
37324 void ix86_emit_i387_round (rtx op0, rtx op1)
37326 enum machine_mode inmode = GET_MODE (op1);
37327 enum machine_mode outmode = GET_MODE (op0);
37328 rtx e1, e2, res, tmp, tmp1, half;
37329 rtx scratch = gen_reg_rtx (HImode);
37330 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37331 rtx jump_label = gen_label_rtx ();
37332 rtx insn;
37333 rtx (*gen_abs) (rtx, rtx);
37334 rtx (*gen_neg) (rtx, rtx);
37336 switch (inmode)
37338 case SFmode:
37339 gen_abs = gen_abssf2;
37340 break;
37341 case DFmode:
37342 gen_abs = gen_absdf2;
37343 break;
37344 case XFmode:
37345 gen_abs = gen_absxf2;
37346 break;
37347 default:
37348 gcc_unreachable ();
37351 switch (outmode)
37353 case SFmode:
37354 gen_neg = gen_negsf2;
37355 break;
37356 case DFmode:
37357 gen_neg = gen_negdf2;
37358 break;
37359 case XFmode:
37360 gen_neg = gen_negxf2;
37361 break;
37362 case HImode:
37363 gen_neg = gen_neghi2;
37364 break;
37365 case SImode:
37366 gen_neg = gen_negsi2;
37367 break;
37368 case DImode:
37369 gen_neg = gen_negdi2;
37370 break;
37371 default:
37372 gcc_unreachable ();
37375 e1 = gen_reg_rtx (inmode);
37376 e2 = gen_reg_rtx (inmode);
37377 res = gen_reg_rtx (outmode);
37379 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37381 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37383 /* scratch = fxam(op1) */
37384 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37385 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37386 UNSPEC_FXAM)));
37387 /* e1 = fabs(op1) */
37388 emit_insn (gen_abs (e1, op1));
37390 /* e2 = e1 + 0.5 */
37391 half = force_reg (inmode, half);
37392 emit_insn (gen_rtx_SET (VOIDmode, e2,
37393 gen_rtx_PLUS (inmode, e1, half)));
37395 /* res = floor(e2) */
37396 if (inmode != XFmode)
37398 tmp1 = gen_reg_rtx (XFmode);
37400 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37401 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37403 else
37404 tmp1 = e2;
37406 switch (outmode)
37408 case SFmode:
37409 case DFmode:
37411 rtx tmp0 = gen_reg_rtx (XFmode);
37413 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37415 emit_insn (gen_rtx_SET (VOIDmode, res,
37416 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37417 UNSPEC_TRUNC_NOOP)));
37419 break;
37420 case XFmode:
37421 emit_insn (gen_frndintxf2_floor (res, tmp1));
37422 break;
37423 case HImode:
37424 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37425 break;
37426 case SImode:
37427 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37428 break;
37429 case DImode:
37430 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37431 break;
37432 default:
37433 gcc_unreachable ();
37436 /* flags = signbit(a) */
37437 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37439 /* if (flags) then res = -res */
37440 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37441 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37442 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37443 pc_rtx);
37444 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37445 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37446 JUMP_LABEL (insn) = jump_label;
37448 emit_insn (gen_neg (res, res));
37450 emit_label (jump_label);
37451 LABEL_NUSES (jump_label) = 1;
37453 emit_move_insn (op0, res);
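/* Editorial sketch, not part of GCC: a scalar C model of the x87 round
   sequence above, with floor () standing in for the frndint/fist step done
   with the rounding mode forced down.  */
#if 0
#include <math.h>

static double
i387_round_sketch (double a)
{
  double r = floor (fabs (a) + 0.5);   /* magnitude, rounded half away from zero */
  return signbit (a) ? -r : r;         /* reapply the sign probed by fxam above */
}
#endif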
37456 /* Output code to perform a Newton-Raphson approximation of a single precision
37457 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37459 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37461 rtx x0, x1, e0, e1;
37463 x0 = gen_reg_rtx (mode);
37464 e0 = gen_reg_rtx (mode);
37465 e1 = gen_reg_rtx (mode);
37466 x1 = gen_reg_rtx (mode);
37468 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37470 b = force_reg (mode, b);
37472 /* x0 = rcp(b) estimate */
37473 emit_insn (gen_rtx_SET (VOIDmode, x0,
37474 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37475 UNSPEC_RCP)));
37476 /* e0 = x0 * b */
37477 emit_insn (gen_rtx_SET (VOIDmode, e0,
37478 gen_rtx_MULT (mode, x0, b)));
37480 /* e0 = x0 * e0 */
37481 emit_insn (gen_rtx_SET (VOIDmode, e0,
37482 gen_rtx_MULT (mode, x0, e0)));
37484 /* e1 = x0 + x0 */
37485 emit_insn (gen_rtx_SET (VOIDmode, e1,
37486 gen_rtx_PLUS (mode, x0, x0)));
37488 /* x1 = e1 - e0 */
37489 emit_insn (gen_rtx_SET (VOIDmode, x1,
37490 gen_rtx_MINUS (mode, e1, e0)));
37492 /* res = a * x1 */
37493 emit_insn (gen_rtx_SET (VOIDmode, res,
37494 gen_rtx_MULT (mode, a, x1)));
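/* Editorial sketch, not part of GCC: the same refinement in scalar C.  The
   parameter x0 stands for the low-precision RCPSS/RCPPS hardware estimate of
   1/b, which the sketch does not model.  */
#if 0
static float
swdiv_sketch (float a, float b, float x0)
{
  float e0 = x0 * b;        /* b * rcp(b) */
  e0 = x0 * e0;             /* b * rcp(b) * rcp(b) */
  float e1 = x0 + x0;       /* rcp(b) + rcp(b) */
  float x1 = e1 - e0;       /* one Newton-Raphson step toward 1/b */
  return a * x1;            /* a / b ~= a * x1 */
}
#endif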
37497 /* Output code to perform a Newton-Raphson approximation of a
37498 single precision floating point [reciprocal] square root. */
37500 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37501 bool recip)
37503 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37504 REAL_VALUE_TYPE r;
37506 x0 = gen_reg_rtx (mode);
37507 e0 = gen_reg_rtx (mode);
37508 e1 = gen_reg_rtx (mode);
37509 e2 = gen_reg_rtx (mode);
37510 e3 = gen_reg_rtx (mode);
37512 real_from_integer (&r, VOIDmode, -3, -1, 0);
37513 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37515 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37516 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37518 if (VECTOR_MODE_P (mode))
37520 mthree = ix86_build_const_vector (mode, true, mthree);
37521 mhalf = ix86_build_const_vector (mode, true, mhalf);
37524 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37525 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37527 a = force_reg (mode, a);
37529 /* x0 = rsqrt(a) estimate */
37530 emit_insn (gen_rtx_SET (VOIDmode, x0,
37531 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37532 UNSPEC_RSQRT)));
37534 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
37535 if (!recip)
37537 rtx zero, mask;
37539 zero = gen_reg_rtx (mode);
37540 mask = gen_reg_rtx (mode);
37542 zero = force_reg (mode, CONST0_RTX(mode));
37543 emit_insn (gen_rtx_SET (VOIDmode, mask,
37544 gen_rtx_NE (mode, zero, a)));
37546 emit_insn (gen_rtx_SET (VOIDmode, x0,
37547 gen_rtx_AND (mode, x0, mask)));
37550 /* e0 = x0 * a */
37551 emit_insn (gen_rtx_SET (VOIDmode, e0,
37552 gen_rtx_MULT (mode, x0, a)));
37553 /* e1 = e0 * x0 */
37554 emit_insn (gen_rtx_SET (VOIDmode, e1,
37555 gen_rtx_MULT (mode, e0, x0)));
37557 /* e2 = e1 - 3. */
37558 mthree = force_reg (mode, mthree);
37559 emit_insn (gen_rtx_SET (VOIDmode, e2,
37560 gen_rtx_PLUS (mode, e1, mthree)));
37562 mhalf = force_reg (mode, mhalf);
37563 if (recip)
37564 /* e3 = -.5 * x0 */
37565 emit_insn (gen_rtx_SET (VOIDmode, e3,
37566 gen_rtx_MULT (mode, x0, mhalf)));
37567 else
37568 /* e3 = -.5 * e0 */
37569 emit_insn (gen_rtx_SET (VOIDmode, e3,
37570 gen_rtx_MULT (mode, e0, mhalf)));
37571 /* ret = e2 * e3 */
37572 emit_insn (gen_rtx_SET (VOIDmode, res,
37573 gen_rtx_MULT (mode, e2, e3)));
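/* Editorial sketch, not part of GCC: the rsqrt refinement above in scalar C.
   The parameter x0 stands for the RSQRTSS/RSQRTPS hardware estimate of
   1/sqrt(a); recip selects between the rsqrt(a) and sqrt(a) results.  */
#if 0
static float
swsqrt_sketch (float a, float x0, int recip)
{
  float e0 = x0 * a;                     /* a * rsqrt(a) */
  float e1 = e0 * x0;                    /* a * rsqrt(a)^2, ideally 1.0 */
  float e2 = e1 - 3.0f;                  /* e1 - 3 */
  float e3 = (recip ? x0 : e0) * -0.5f;  /* -.5 * x0  or  -.5 * e0 */
  return e2 * e3;                        /* refined rsqrt(a) or sqrt(a) */
}
#endif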
37576 #ifdef TARGET_SOLARIS
37577 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37579 static void
37580 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37581 tree decl)
37583 /* With Binutils 2.15, the "@unwind" marker must be specified on
37584 every occurrence of the ".eh_frame" section, not just the first
37585 one. */
37586 if (TARGET_64BIT
37587 && strcmp (name, ".eh_frame") == 0)
37589 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37590 flags & SECTION_WRITE ? "aw" : "a");
37591 return;
37594 #ifndef USE_GAS
37595 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37597 solaris_elf_asm_comdat_section (name, flags, decl);
37598 return;
37600 #endif
37602 default_elf_asm_named_section (name, flags, decl);
37604 #endif /* TARGET_SOLARIS */
37606 /* Return the mangling of TYPE if it is an extended fundamental type. */
37608 static const char *
37609 ix86_mangle_type (const_tree type)
37611 type = TYPE_MAIN_VARIANT (type);
37613 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37614 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37615 return NULL;
37617 switch (TYPE_MODE (type))
37619 case TFmode:
37620 /* __float128 is "g". */
37621 return "g";
37622 case XFmode:
37623 /* "long double" or __float80 is "e". */
37624 return "e";
37625 default:
37626 return NULL;
37630 /* For 32-bit code we can save PIC register setup by using
37631 __stack_chk_fail_local hidden function instead of calling
37632 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37633 register, so it is better to call __stack_chk_fail directly. */
37635 static tree ATTRIBUTE_UNUSED
37636 ix86_stack_protect_fail (void)
37638 return TARGET_64BIT
37639 ? default_external_stack_protect_fail ()
37640 : default_hidden_stack_protect_fail ();
37643 /* Select a format to encode pointers in exception handling data. CODE
37644 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37645 true if the symbol may be affected by dynamic relocations.
37647 ??? All x86 object file formats are capable of representing this.
37648 After all, the relocation needed is the same as for the call insn.
37649 Whether or not a particular assembler allows us to enter such, I
37650 guess we'll have to see. */
37651 int
37652 asm_preferred_eh_data_format (int code, int global)
37654 if (flag_pic)
37656 int type = DW_EH_PE_sdata8;
37657 if (!TARGET_64BIT
37658 || ix86_cmodel == CM_SMALL_PIC
37659 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37660 type = DW_EH_PE_sdata4;
37661 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37663 if (ix86_cmodel == CM_SMALL
37664 || (ix86_cmodel == CM_MEDIUM && code))
37665 return DW_EH_PE_udata4;
37666 return DW_EH_PE_absptr;
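/* Editorial note, not part of GCC: for PIC code the encoding chosen above is
   a PC-relative signed offset (4 bytes, or 8 for the large 64-bit code
   models), with DW_EH_PE_indirect added for symbols that may need dynamic
   relocations so the relocation lands on a data pointer rather than inside
   .eh_frame; non-PIC small/medium models use plain 4-byte values and
   everything else falls back to absolute pointers.  */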
37669 /* Expand copysign from SIGN to the positive value ABS_VALUE
37670 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37671 the sign-bit. */
37672 static void
37673 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37675 enum machine_mode mode = GET_MODE (sign);
37676 rtx sgn = gen_reg_rtx (mode);
37677 if (mask == NULL_RTX)
37679 enum machine_mode vmode;
37681 if (mode == SFmode)
37682 vmode = V4SFmode;
37683 else if (mode == DFmode)
37684 vmode = V2DFmode;
37685 else
37686 vmode = mode;
37688 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37689 if (!VECTOR_MODE_P (mode))
37691 /* We need to generate a scalar mode mask in this case. */
37692 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37693 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37694 mask = gen_reg_rtx (mode);
37695 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37698 else
37699 mask = gen_rtx_NOT (mode, mask);
37700 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37701 gen_rtx_AND (mode, mask, sign)));
37702 emit_insn (gen_rtx_SET (VOIDmode, result,
37703 gen_rtx_IOR (mode, abs_value, sgn)));
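/* Editorial sketch, not part of GCC: the bitwise copysign built above,
   written for scalar float.  ABS_VALUE is assumed non-negative, as in the
   callers below.  */
#if 0
#include <stdint.h>
#include <string.h>

static float
copysign_to_positive_sketch (float abs_value, float sign)
{
  uint32_t a, s;
  memcpy (&a, &abs_value, sizeof a);
  memcpy (&s, &sign, sizeof s);
  a |= s & 0x80000000u;             /* OR in the sign bit of SIGN */
  memcpy (&abs_value, &a, sizeof a);
  return abs_value;
}
#endif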
37706 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37707 mask for masking out the sign-bit is stored in *SMASK, if that is
37708 non-null. */
37709 static rtx
37710 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37712 enum machine_mode vmode, mode = GET_MODE (op0);
37713 rtx xa, mask;
37715 xa = gen_reg_rtx (mode);
37716 if (mode == SFmode)
37717 vmode = V4SFmode;
37718 else if (mode == DFmode)
37719 vmode = V2DFmode;
37720 else
37721 vmode = mode;
37722 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37723 if (!VECTOR_MODE_P (mode))
37725 /* We need to generate a scalar mode mask in this case. */
37726 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37727 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37728 mask = gen_reg_rtx (mode);
37729 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37731 emit_insn (gen_rtx_SET (VOIDmode, xa,
37732 gen_rtx_AND (mode, op0, mask)));
37734 if (smask)
37735 *smask = mask;
37737 return xa;
37740 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37741 swapping the operands if SWAP_OPERANDS is true. The expanded
37742 code is a forward jump to a newly created label in case the
37743 comparison is true. The generated label rtx is returned. */
37744 static rtx
37745 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37746 bool swap_operands)
37748 rtx label, tmp;
37750 if (swap_operands)
37752 tmp = op0;
37753 op0 = op1;
37754 op1 = tmp;
37757 label = gen_label_rtx ();
37758 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37759 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37760 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37761 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37762 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37763 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37764 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37765 JUMP_LABEL (tmp) = label;
37767 return label;
37770 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37771 using comparison code CODE. Operands are swapped for the comparison if
37772 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37773 static rtx
37774 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37775 bool swap_operands)
37777 rtx (*insn)(rtx, rtx, rtx, rtx);
37778 enum machine_mode mode = GET_MODE (op0);
37779 rtx mask = gen_reg_rtx (mode);
37781 if (swap_operands)
37783 rtx tmp = op0;
37784 op0 = op1;
37785 op1 = tmp;
37788 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37790 emit_insn (insn (mask, op0, op1,
37791 gen_rtx_fmt_ee (code, mode, op0, op1)));
37792 return mask;
37795 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37796 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37797 static rtx
37798 ix86_gen_TWO52 (enum machine_mode mode)
37800 REAL_VALUE_TYPE TWO52r;
37801 rtx TWO52;
37803 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37804 TWO52 = const_double_from_real_value (TWO52r, mode);
37805 TWO52 = force_reg (mode, TWO52);
37807 return TWO52;
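/* Editorial note, not part of GCC: despite the TWO52 name, the value is
   2**23 for SFmode and 2**52 for DFmode, i.e. the first power of two at
   which the format has no fraction bits left.  Adding and then subtracting
   it from a value of smaller magnitude rounds that value to an integer in
   the current rounding mode, which is what the rint/floor/ceil/trunc
   expanders below rely on.  */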
37810 /* Expand SSE sequence for computing lround from OP1 storing
37811 into OP0. */
37812 void
37813 ix86_expand_lround (rtx op0, rtx op1)
37815 /* C code for the stuff we're doing below:
37816 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37817 return (long)tmp;
37819 enum machine_mode mode = GET_MODE (op1);
37820 const struct real_format *fmt;
37821 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37822 rtx adj;
37824 /* load nextafter (0.5, 0.0) */
37825 fmt = REAL_MODE_FORMAT (mode);
37826 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37827 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37829 /* adj = copysign (0.5, op1) */
37830 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37831 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37833 /* adj = op1 + adj */
37834 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37836 /* op0 = (imode)adj */
37837 expand_fix (op0, adj, 0);
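/* Editorial note, not part of GCC: pred_half above is nextafter (0.5, 0.0),
   the largest representable value strictly below 0.5.  Using it instead of
   an exact 0.5 keeps inputs such as the largest double below 0.5
   (0.49999999999999994) from rounding up to 1.0 in the addition and then
   truncating to 1; the round expanders further below load the same
   constant for the same reason.  */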
37840 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37841 into OPERAND0. */
37842 void
37843 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37845 /* C code for the stuff we're doing below (for do_floor):
37846 xi = (long)op1;
37847 xi -= (double)xi > op1 ? 1 : 0;
37848 return xi;
37850 enum machine_mode fmode = GET_MODE (op1);
37851 enum machine_mode imode = GET_MODE (op0);
37852 rtx ireg, freg, label, tmp;
37854 /* reg = (long)op1 */
37855 ireg = gen_reg_rtx (imode);
37856 expand_fix (ireg, op1, 0);
37858 /* freg = (double)reg */
37859 freg = gen_reg_rtx (fmode);
37860 expand_float (freg, ireg, 0);
37862 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37863 label = ix86_expand_sse_compare_and_jump (UNLE,
37864 freg, op1, !do_floor);
37865 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37866 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37867 emit_move_insn (ireg, tmp);
37869 emit_label (label);
37870 LABEL_NUSES (label) = 1;
37872 emit_move_insn (op0, ireg);
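/* Editorial sketch, not part of GCC: the do_floor case above in scalar C;
   the ceil case flips the comparison and adds 1 instead.  */
#if 0
static long
lfloor_sketch (double x)
{
  long xi = (long) x;       /* truncates toward zero */
  if ((double) xi > x)      /* truncation overshot for negative x */
    xi -= 1;                /* step down to the floor */
  return xi;
}
#endif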
37875 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37876 result in OPERAND0. */
37877 void
37878 ix86_expand_rint (rtx operand0, rtx operand1)
37880 /* C code for the stuff we're doing below:
37881 xa = fabs (operand1);
37882 if (!isless (xa, 2**52))
37883 return operand1;
37884 xa = xa + 2**52 - 2**52;
37885 return copysign (xa, operand1);
37887 enum machine_mode mode = GET_MODE (operand0);
37888 rtx res, xa, label, TWO52, mask;
37890 res = gen_reg_rtx (mode);
37891 emit_move_insn (res, operand1);
37893 /* xa = abs (operand1) */
37894 xa = ix86_expand_sse_fabs (res, &mask);
37896 /* if (!isless (xa, TWO52)) goto label; */
37897 TWO52 = ix86_gen_TWO52 (mode);
37898 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37900 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37901 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37903 ix86_sse_copysign_to_positive (res, xa, res, mask);
37905 emit_label (label);
37906 LABEL_NUSES (label) = 1;
37908 emit_move_insn (operand0, res);
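/* Editorial sketch, not part of GCC: the rint expansion above in scalar C
   (double shown; for float, TWO52 would be 0x1p23).  */
#if 0
static double
rint_sketch (double x)
{
  const double TWO52 = 0x1p52;
  double xa = __builtin_fabs (x);
  if (!(xa < TWO52))                    /* large, Inf or NaN: already integral */
    return x;
  xa = (xa + TWO52) - TWO52;            /* round to integer, current mode */
  return __builtin_copysign (xa, x);    /* restore sign, keeping -0.0 */
}
#endif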
37911 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37912 into OPERAND0. */
37913 void
37914 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37916 /* C code for the stuff we expand below.
37917 double xa = fabs (x), x2;
37918 if (!isless (xa, TWO52))
37919 return x;
37920 xa = xa + TWO52 - TWO52;
37921 x2 = copysign (xa, x);
37922 Compensate. Floor:
37923 if (x2 > x)
37924 x2 -= 1;
37925 Compensate. Ceil:
37926 if (x2 < x)
37927 x2 -= -1;
37928 return x2;
37930 enum machine_mode mode = GET_MODE (operand0);
37931 rtx xa, TWO52, tmp, label, one, res, mask;
37933 TWO52 = ix86_gen_TWO52 (mode);
37935 /* Temporary for holding the result, initialized to the input
37936 operand to ease control flow. */
37937 res = gen_reg_rtx (mode);
37938 emit_move_insn (res, operand1);
37940 /* xa = abs (operand1) */
37941 xa = ix86_expand_sse_fabs (res, &mask);
37943 /* if (!isless (xa, TWO52)) goto label; */
37944 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37946 /* xa = xa + TWO52 - TWO52; */
37947 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37948 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37950 /* xa = copysign (xa, operand1) */
37951 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37953 /* generate 1.0 or -1.0 */
37954 one = force_reg (mode,
37955 const_double_from_real_value (do_floor
37956 ? dconst1 : dconstm1, mode));
37958 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37959 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37960 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37961 gen_rtx_AND (mode, one, tmp)));
37962 /* We always need to subtract here to preserve signed zero. */
37963 tmp = expand_simple_binop (mode, MINUS,
37964 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37965 emit_move_insn (res, tmp);
37967 emit_label (label);
37968 LABEL_NUSES (label) = 1;
37970 emit_move_insn (operand0, res);
37973 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37974 into OPERAND0. */
37975 void
37976 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37978 /* C code for the stuff we expand below.
37979 double xa = fabs (x), x2;
37980 if (!isless (xa, TWO52))
37981 return x;
37982 x2 = (double)(long)x;
37983 Compensate. Floor:
37984 if (x2 > x)
37985 x2 -= 1;
37986 Compensate. Ceil:
37987 if (x2 < x)
37988 x2 += 1;
37989 if (HONOR_SIGNED_ZEROS (mode))
37990 return copysign (x2, x);
37991 return x2;
37993 enum machine_mode mode = GET_MODE (operand0);
37994 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37996 TWO52 = ix86_gen_TWO52 (mode);
37998 /* Temporary for holding the result, initialized to the input
37999 operand to ease control flow. */
38000 res = gen_reg_rtx (mode);
38001 emit_move_insn (res, operand1);
38003 /* xa = abs (operand1) */
38004 xa = ix86_expand_sse_fabs (res, &mask);
38006 /* if (!isless (xa, TWO52)) goto label; */
38007 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38009 /* xa = (double)(long)x */
38010 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38011 expand_fix (xi, res, 0);
38012 expand_float (xa, xi, 0);
38014 /* generate 1.0 */
38015 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38017 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38018 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38019 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38020 gen_rtx_AND (mode, one, tmp)));
38021 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38022 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38023 emit_move_insn (res, tmp);
38025 if (HONOR_SIGNED_ZEROS (mode))
38026 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38028 emit_label (label);
38029 LABEL_NUSES (label) = 1;
38031 emit_move_insn (operand0, res);
38034 /* Expand SSE sequence for computing round from OPERAND1 storing
38035 into OPERAND0. Sequence that works without relying on DImode truncation
38036 via cvttsd2siq that is only available on 64bit targets. */
38037 void
38038 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38040 /* C code for the stuff we expand below.
38041 double xa = fabs (x), xa2, x2;
38042 if (!isless (xa, TWO52))
38043 return x;
38044 Using the absolute value and copying back sign makes
38045 -0.0 -> -0.0 correct.
38046 xa2 = xa + TWO52 - TWO52;
38047 Compensate.
38048 dxa = xa2 - xa;
38049 if (dxa <= -0.5)
38050 xa2 += 1;
38051 else if (dxa > 0.5)
38052 xa2 -= 1;
38053 x2 = copysign (xa2, x);
38054 return x2;
38056 enum machine_mode mode = GET_MODE (operand0);
38057 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38059 TWO52 = ix86_gen_TWO52 (mode);
38061 /* Temporary for holding the result, initialized to the input
38062 operand to ease control flow. */
38063 res = gen_reg_rtx (mode);
38064 emit_move_insn (res, operand1);
38066 /* xa = abs (operand1) */
38067 xa = ix86_expand_sse_fabs (res, &mask);
38069 /* if (!isless (xa, TWO52)) goto label; */
38070 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38072 /* xa2 = xa + TWO52 - TWO52; */
38073 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38074 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38076 /* dxa = xa2 - xa; */
38077 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38079 /* generate 0.5, 1.0 and -0.5 */
38080 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38081 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38082 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38083 0, OPTAB_DIRECT);
38085 /* Compensate. */
38086 tmp = gen_reg_rtx (mode);
38087 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38088 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38089 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38090 gen_rtx_AND (mode, one, tmp)));
38091 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38092 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38093 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38094 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38095 gen_rtx_AND (mode, one, tmp)));
38096 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38098 /* res = copysign (xa2, operand1) */
38099 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38101 emit_label (label);
38102 LABEL_NUSES (label) = 1;
38104 emit_move_insn (operand0, res);
38107 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38108 into OPERAND0. */
38109 void
38110 ix86_expand_trunc (rtx operand0, rtx operand1)
38112 /* C code for SSE variant we expand below.
38113 double xa = fabs (x), x2;
38114 if (!isless (xa, TWO52))
38115 return x;
38116 x2 = (double)(long)x;
38117 if (HONOR_SIGNED_ZEROS (mode))
38118 return copysign (x2, x);
38119 return x2;
38121 enum machine_mode mode = GET_MODE (operand0);
38122 rtx xa, xi, TWO52, label, res, mask;
38124 TWO52 = ix86_gen_TWO52 (mode);
38126 /* Temporary for holding the result, initialized to the input
38127 operand to ease control flow. */
38128 res = gen_reg_rtx (mode);
38129 emit_move_insn (res, operand1);
38131 /* xa = abs (operand1) */
38132 xa = ix86_expand_sse_fabs (res, &mask);
38134 /* if (!isless (xa, TWO52)) goto label; */
38135 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38137 /* x = (double)(long)x */
38138 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38139 expand_fix (xi, res, 0);
38140 expand_float (res, xi, 0);
38142 if (HONOR_SIGNED_ZEROS (mode))
38143 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38145 emit_label (label);
38146 LABEL_NUSES (label) = 1;
38148 emit_move_insn (operand0, res);
38151 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38152 into OPERAND0. */
38153 void
38154 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38156 enum machine_mode mode = GET_MODE (operand0);
38157 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38159 /* C code for SSE variant we expand below.
38160 double xa = fabs (x), x2;
38161 if (!isless (xa, TWO52))
38162 return x;
38163 xa2 = xa + TWO52 - TWO52;
38164 Compensate:
38165 if (xa2 > xa)
38166 xa2 -= 1.0;
38167 x2 = copysign (xa2, x);
38168 return x2;
38171 TWO52 = ix86_gen_TWO52 (mode);
38173 /* Temporary for holding the result, initialized to the input
38174 operand to ease control flow. */
38175 res = gen_reg_rtx (mode);
38176 emit_move_insn (res, operand1);
38178 /* xa = abs (operand1) */
38179 xa = ix86_expand_sse_fabs (res, &smask);
38181 /* if (!isless (xa, TWO52)) goto label; */
38182 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38184 /* res = xa + TWO52 - TWO52; */
38185 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38186 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38187 emit_move_insn (res, tmp);
38189 /* generate 1.0 */
38190 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38192 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38193 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38194 emit_insn (gen_rtx_SET (VOIDmode, mask,
38195 gen_rtx_AND (mode, mask, one)));
38196 tmp = expand_simple_binop (mode, MINUS,
38197 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38198 emit_move_insn (res, tmp);
38200 /* res = copysign (res, operand1) */
38201 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38203 emit_label (label);
38204 LABEL_NUSES (label) = 1;
38206 emit_move_insn (operand0, res);
38209 /* Expand SSE sequence for computing round from OPERAND1 storing
38210 into OPERAND0. */
38211 void
38212 ix86_expand_round (rtx operand0, rtx operand1)
38214 /* C code for the stuff we're doing below:
38215 double xa = fabs (x);
38216 if (!isless (xa, TWO52))
38217 return x;
38218 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38219 return copysign (xa, x);
38221 enum machine_mode mode = GET_MODE (operand0);
38222 rtx res, TWO52, xa, label, xi, half, mask;
38223 const struct real_format *fmt;
38224 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38226 /* Temporary for holding the result, initialized to the input
38227 operand to ease control flow. */
38228 res = gen_reg_rtx (mode);
38229 emit_move_insn (res, operand1);
38231 TWO52 = ix86_gen_TWO52 (mode);
38232 xa = ix86_expand_sse_fabs (res, &mask);
38233 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38235 /* load nextafter (0.5, 0.0) */
38236 fmt = REAL_MODE_FORMAT (mode);
38237 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38238 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38240 /* xa = xa + 0.5 */
38241 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38242 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38244 /* xa = (double)(int64_t)xa */
38245 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38246 expand_fix (xi, xa, 0);
38247 expand_float (xa, xi, 0);
38249 /* res = copysign (xa, operand1) */
38250 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38252 emit_label (label);
38253 LABEL_NUSES (label) = 1;
38255 emit_move_insn (operand0, res);
38258 /* Expand SSE sequence for computing round
38259 from OP1 storing into OP0 using sse4 round insn. */
38260 void
38261 ix86_expand_round_sse4 (rtx op0, rtx op1)
38263 enum machine_mode mode = GET_MODE (op0);
38264 rtx e1, e2, res, half;
38265 const struct real_format *fmt;
38266 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38267 rtx (*gen_copysign) (rtx, rtx, rtx);
38268 rtx (*gen_round) (rtx, rtx, rtx);
38270 switch (mode)
38272 case SFmode:
38273 gen_copysign = gen_copysignsf3;
38274 gen_round = gen_sse4_1_roundsf2;
38275 break;
38276 case DFmode:
38277 gen_copysign = gen_copysigndf3;
38278 gen_round = gen_sse4_1_rounddf2;
38279 break;
38280 default:
38281 gcc_unreachable ();
38284 /* round (a) = trunc (a + copysign (0.5, a)) */
38286 /* load nextafter (0.5, 0.0) */
38287 fmt = REAL_MODE_FORMAT (mode);
38288 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38289 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38290 half = const_double_from_real_value (pred_half, mode);
38292 /* e1 = copysign (0.5, op1) */
38293 e1 = gen_reg_rtx (mode);
38294 emit_insn (gen_copysign (e1, half, op1));
38296 /* e2 = op1 + e1 */
38297 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38299 /* res = trunc (e2) */
38300 res = gen_reg_rtx (mode);
38301 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38303 emit_move_insn (op0, res);
38307 /* Table of valid machine attributes. */
38308 static const struct attribute_spec ix86_attribute_table[] =
38310 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38311 affects_type_identity } */
38312 /* Stdcall attribute says callee is responsible for popping arguments
38313 if they are not variable. */
38314 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38315 true },
38316 /* Fastcall attribute says callee is responsible for popping arguments
38317 if they are not variable. */
38318 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38319 true },
38320 /* Thiscall attribute says callee is responsible for popping arguments
38321 if they are not variable. */
38322 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38323 true },
38324 /* Cdecl attribute says the callee is a normal C declaration */
38325 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38326 true },
38327 /* Regparm attribute specifies how many integer arguments are to be
38328 passed in registers. */
38329 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38330 true },
38331 /* Sseregparm attribute says we are using x86_64 calling conventions
38332 for FP arguments. */
38333 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38334 true },
38335 /* The transactional memory builtins are implicitly regparm or fastcall
38336 depending on the ABI. Override the generic do-nothing attribute that
38337 these builtins were declared with. */
38338 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38339 true },
38340 /* force_align_arg_pointer says this function realigns the stack at entry. */
38341 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38342 false, true, true, ix86_handle_cconv_attribute, false },
38343 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38344 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38345 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38346 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38347 false },
38348 #endif
38349 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38350 false },
38351 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38352 false },
38353 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38354 SUBTARGET_ATTRIBUTE_TABLE,
38355 #endif
38356 /* ms_abi and sysv_abi calling convention function attributes. */
38357 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38358 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38359 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38360 false },
38361 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38362 ix86_handle_callee_pop_aggregate_return, true },
38363 /* End element. */
38364 { NULL, 0, 0, false, false, false, NULL, false }
38367 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38368 static int
38369 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38370 tree vectype,
38371 int misalign ATTRIBUTE_UNUSED)
38373 unsigned elements;
38375 switch (type_of_cost)
38377 case scalar_stmt:
38378 return ix86_cost->scalar_stmt_cost;
38380 case scalar_load:
38381 return ix86_cost->scalar_load_cost;
38383 case scalar_store:
38384 return ix86_cost->scalar_store_cost;
38386 case vector_stmt:
38387 return ix86_cost->vec_stmt_cost;
38389 case vector_load:
38390 return ix86_cost->vec_align_load_cost;
38392 case vector_store:
38393 return ix86_cost->vec_store_cost;
38395 case vec_to_scalar:
38396 return ix86_cost->vec_to_scalar_cost;
38398 case scalar_to_vec:
38399 return ix86_cost->scalar_to_vec_cost;
38401 case unaligned_load:
38402 case unaligned_store:
38403 return ix86_cost->vec_unalign_load_cost;
38405 case cond_branch_taken:
38406 return ix86_cost->cond_taken_branch_cost;
38408 case cond_branch_not_taken:
38409 return ix86_cost->cond_not_taken_branch_cost;
38411 case vec_perm:
38412 case vec_promote_demote:
38413 return ix86_cost->vec_stmt_cost;
38415 case vec_construct:
38416 elements = TYPE_VECTOR_SUBPARTS (vectype);
38417 return elements / 2 + 1;
38419 default:
38420 gcc_unreachable ();
38424 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38425 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38426 insn every time. */
38428 static GTY(()) rtx vselect_insn;
38430 /* Initialize vselect_insn. */
38432 static void
38433 init_vselect_insn (void)
38435 unsigned i;
38436 rtx x;
38438 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38439 for (i = 0; i < MAX_VECT_LEN; ++i)
38440 XVECEXP (x, 0, i) = const0_rtx;
38441 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38442 const0_rtx), x);
38443 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38444 start_sequence ();
38445 vselect_insn = emit_insn (x);
38446 end_sequence ();
38449 /* Construct (set target (vec_select op0 (parallel perm))) and
38450 return true if that's a valid instruction in the active ISA. */
38452 static bool
38453 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38454 unsigned nelt, bool testing_p)
38456 unsigned int i;
38457 rtx x, save_vconcat;
38458 int icode;
38460 if (vselect_insn == NULL_RTX)
38461 init_vselect_insn ();
38463 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38464 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38465 for (i = 0; i < nelt; ++i)
38466 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38467 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38468 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38469 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38470 SET_DEST (PATTERN (vselect_insn)) = target;
38471 icode = recog_memoized (vselect_insn);
38473 if (icode >= 0 && !testing_p)
38474 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38476 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38477 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38478 INSN_CODE (vselect_insn) = -1;
38480 return icode >= 0;
38483 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38485 static bool
38486 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38487 const unsigned char *perm, unsigned nelt,
38488 bool testing_p)
38490 enum machine_mode v2mode;
38491 rtx x;
38492 bool ok;
38494 if (vselect_insn == NULL_RTX)
38495 init_vselect_insn ();
38497 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38498 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38499 PUT_MODE (x, v2mode);
38500 XEXP (x, 0) = op0;
38501 XEXP (x, 1) = op1;
38502 ok = expand_vselect (target, x, perm, nelt, testing_p);
38503 XEXP (x, 0) = const0_rtx;
38504 XEXP (x, 1) = const0_rtx;
38505 return ok;
38508 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38509 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38511 static bool
38512 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38514 enum machine_mode vmode = d->vmode;
38515 unsigned i, mask, nelt = d->nelt;
38516 rtx target, op0, op1, x;
38517 rtx rperm[32], vperm;
38519 if (d->one_operand_p)
38520 return false;
38521 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38523 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38525 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38527 else
38528 return false;
38530 /* This is a blend, not a permute. Elements must stay in their
38531 respective lanes. */
38532 for (i = 0; i < nelt; ++i)
38534 unsigned e = d->perm[i];
38535 if (!(e == i || e == i + nelt))
38536 return false;
38539 if (d->testing_p)
38540 return true;
38542 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38543 decision should be extracted elsewhere, so that we only try that
38544 sequence once all budget==3 options have been tried. */
38545 target = d->target;
38546 op0 = d->op0;
38547 op1 = d->op1;
38548 mask = 0;
38550 switch (vmode)
38552 case V4DFmode:
38553 case V8SFmode:
38554 case V2DFmode:
38555 case V4SFmode:
38556 case V8HImode:
38557 case V8SImode:
38558 for (i = 0; i < nelt; ++i)
38559 mask |= (d->perm[i] >= nelt) << i;
38560 break;
38562 case V2DImode:
38563 for (i = 0; i < 2; ++i)
38564 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38565 vmode = V8HImode;
38566 goto do_subreg;
38568 case V4SImode:
38569 for (i = 0; i < 4; ++i)
38570 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38571 vmode = V8HImode;
38572 goto do_subreg;
38574 case V16QImode:
38575 /* See if bytes move in pairs so we can use pblendw with
38576 an immediate argument, rather than pblendvb with a vector
38577 argument. */
38578 for (i = 0; i < 16; i += 2)
38579 if (d->perm[i] + 1 != d->perm[i + 1])
38581 use_pblendvb:
38582 for (i = 0; i < nelt; ++i)
38583 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38585 finish_pblendvb:
38586 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38587 vperm = force_reg (vmode, vperm);
38589 if (GET_MODE_SIZE (vmode) == 16)
38590 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38591 else
38592 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38593 return true;
38596 for (i = 0; i < 8; ++i)
38597 mask |= (d->perm[i * 2] >= 16) << i;
38598 vmode = V8HImode;
38599 /* FALLTHRU */
38601 do_subreg:
38602 target = gen_lowpart (vmode, target);
38603 op0 = gen_lowpart (vmode, op0);
38604 op1 = gen_lowpart (vmode, op1);
38605 break;
38607 case V32QImode:
38608 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38609 for (i = 0; i < 32; i += 2)
38610 if (d->perm[i] + 1 != d->perm[i + 1])
38611 goto use_pblendvb;
38612 /* See if bytes move in quadruplets. If yes, vpblendd
38613 with immediate can be used. */
38614 for (i = 0; i < 32; i += 4)
38615 if (d->perm[i] + 2 != d->perm[i + 2])
38616 break;
38617 if (i < 32)
38619 /* See if bytes move the same in both lanes. If yes,
38620 vpblendw with immediate can be used. */
38621 for (i = 0; i < 16; i += 2)
38622 if (d->perm[i] + 16 != d->perm[i + 16])
38623 goto use_pblendvb;
38625 /* Use vpblendw. */
38626 for (i = 0; i < 16; ++i)
38627 mask |= (d->perm[i * 2] >= 32) << i;
38628 vmode = V16HImode;
38629 goto do_subreg;
38632 /* Use vpblendd. */
38633 for (i = 0; i < 8; ++i)
38634 mask |= (d->perm[i * 4] >= 32) << i;
38635 vmode = V8SImode;
38636 goto do_subreg;
38638 case V16HImode:
38639 /* See if words move in pairs. If yes, vpblendd can be used. */
38640 for (i = 0; i < 16; i += 2)
38641 if (d->perm[i] + 1 != d->perm[i + 1])
38642 break;
38643 if (i < 16)
38645 /* See if words move the same in both lanes. If not,
38646 vpblendvb must be used. */
38647 for (i = 0; i < 8; i++)
38648 if (d->perm[i] + 8 != d->perm[i + 8])
38650 /* Use vpblendvb. */
38651 for (i = 0; i < 32; ++i)
38652 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38654 vmode = V32QImode;
38655 nelt = 32;
38656 target = gen_lowpart (vmode, target);
38657 op0 = gen_lowpart (vmode, op0);
38658 op1 = gen_lowpart (vmode, op1);
38659 goto finish_pblendvb;
38662 /* Use vpblendw. */
38663 for (i = 0; i < 16; ++i)
38664 mask |= (d->perm[i] >= 16) << i;
38665 break;
38668 /* Use vpblendd. */
38669 for (i = 0; i < 8; ++i)
38670 mask |= (d->perm[i * 2] >= 16) << i;
38671 vmode = V8SImode;
38672 goto do_subreg;
38674 case V4DImode:
38675 /* Use vpblendd. */
38676 for (i = 0; i < 4; ++i)
38677 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38678 vmode = V8SImode;
38679 goto do_subreg;
38681 default:
38682 gcc_unreachable ();
38685 /* This matches five different patterns with the different modes. */
38686 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38687 x = gen_rtx_SET (VOIDmode, target, x);
38688 emit_insn (x);
38690 return true;
38693 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38694 in terms of the variable form of vpermilps.
38696 Note that we will have already failed the immediate input vpermilps,
38697 which requires that the high and low part shuffle be identical; the
38698 variable form doesn't require that. */
38700 static bool
38701 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38703 rtx rperm[8], vperm;
38704 unsigned i;
38706 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38707 return false;
38710 /* We can only permute within each 128-bit lane. */
38710 for (i = 0; i < 8; ++i)
38712 unsigned e = d->perm[i];
38713 if (i < 4 ? e >= 4 : e < 4)
38714 return false;
38717 if (d->testing_p)
38718 return true;
38720 for (i = 0; i < 8; ++i)
38722 unsigned e = d->perm[i];
38724 /* Within each 128-bit lane, the elements of op0 are numbered
38725 from 0 and the elements of op1 are numbered from 4. */
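/* The variable vpermilps control uses only bits [1:0] of each dword,
   so E only needs to be reduced far enough that those bits give the
   desired in-lane index.  */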
38726 if (e >= 8 + 4)
38727 e -= 8;
38728 else if (e >= 4)
38729 e -= 4;
38731 rperm[i] = GEN_INT (e);
38734 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38735 vperm = force_reg (V8SImode, vperm);
38736 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38738 return true;
38741 /* Return true if permutation D can be performed as VMODE permutation
38742 instead. */
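/* For example, a V32QImode permutation whose bytes move in aligned
   groups of eight is also a valid V4DImode permutation and, for a
   single operand, can then be carried out by one vpermq.  */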
38744 static bool
38745 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38747 unsigned int i, j, chunk;
38749 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38750 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38751 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38752 return false;
38754 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38755 return true;
38757 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38758 for (i = 0; i < d->nelt; i += chunk)
38759 if (d->perm[i] & (chunk - 1))
38760 return false;
38761 else
38762 for (j = 1; j < chunk; ++j)
38763 if (d->perm[i] + j != d->perm[i + j])
38764 return false;
38766 return true;
38769 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38770 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38772 static bool
38773 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38775 unsigned i, nelt, eltsz, mask;
38776 unsigned char perm[32];
38777 enum machine_mode vmode = V16QImode;
38778 rtx rperm[32], vperm, target, op0, op1;
38780 nelt = d->nelt;
38782 if (!d->one_operand_p)
38784 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38786 if (TARGET_AVX2
38787 && valid_perm_using_mode_p (V2TImode, d))
38789 if (d->testing_p)
38790 return true;
38792 /* Use vperm2i128 insn. The pattern uses
38793 V4DImode instead of V2TImode. */
38794 target = gen_lowpart (V4DImode, d->target);
38795 op0 = gen_lowpart (V4DImode, d->op0);
38796 op1 = gen_lowpart (V4DImode, d->op1);
38797 rperm[0]
38798 = GEN_INT ((d->perm[0] / (nelt / 2))
38799 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38800 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38801 return true;
38803 return false;
38806 else
38808 if (GET_MODE_SIZE (d->vmode) == 16)
38810 if (!TARGET_SSSE3)
38811 return false;
38813 else if (GET_MODE_SIZE (d->vmode) == 32)
38815 if (!TARGET_AVX2)
38816 return false;
38818 /* V4DImode should already be handled through
38819 expand_vselect by the vpermq instruction. */
38820 gcc_assert (d->vmode != V4DImode);
38822 vmode = V32QImode;
38823 if (d->vmode == V8SImode
38824 || d->vmode == V16HImode
38825 || d->vmode == V32QImode)
38827 /* First see if vpermq can be used for
38828 V8SImode/V16HImode/V32QImode. */
38829 if (valid_perm_using_mode_p (V4DImode, d))
38831 for (i = 0; i < 4; i++)
38832 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38833 if (d->testing_p)
38834 return true;
38835 return expand_vselect (gen_lowpart (V4DImode, d->target),
38836 gen_lowpart (V4DImode, d->op0),
38837 perm, 4, false);
38840 /* Next see if vpermd can be used. */
38841 if (valid_perm_using_mode_p (V8SImode, d))
38842 vmode = V8SImode;
38844 /* Or if vpermps can be used. */
38845 else if (d->vmode == V8SFmode)
38846 vmode = V8SImode;
38848 if (vmode == V32QImode)
38850 /* vpshufb only works within 128-bit lanes; it is not
38851 possible to shuffle bytes between the lanes. */
38852 for (i = 0; i < nelt; ++i)
38853 if ((d->perm[i] ^ i) & (nelt / 2))
38854 return false;
38857 else
38858 return false;
38861 if (d->testing_p)
38862 return true;
38864 if (vmode == V8SImode)
38865 for (i = 0; i < 8; ++i)
38866 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38867 else
38869 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38870 if (!d->one_operand_p)
38871 mask = 2 * nelt - 1;
38872 else if (vmode == V16QImode)
38873 mask = nelt - 1;
38874 else
38875 mask = nelt / 2 - 1;
38877 for (i = 0; i < nelt; ++i)
38879 unsigned j, e = d->perm[i] & mask;
38880 for (j = 0; j < eltsz; ++j)
38881 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38885 vperm = gen_rtx_CONST_VECTOR (vmode,
38886 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38887 vperm = force_reg (vmode, vperm);
38889 target = gen_lowpart (vmode, d->target);
38890 op0 = gen_lowpart (vmode, d->op0);
38891 if (d->one_operand_p)
38893 if (vmode == V16QImode)
38894 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38895 else if (vmode == V32QImode)
38896 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38897 else if (vmode == V8SFmode)
38898 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38899 else
38900 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38902 else
38904 op1 = gen_lowpart (vmode, d->op1);
38905 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38908 return true;
38911 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38912 in a single instruction. */
38914 static bool
38915 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38917 unsigned i, nelt = d->nelt;
38918 unsigned char perm2[MAX_VECT_LEN];
38920 /* Check plain VEC_SELECT first, because AVX has instructions that could
38921 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38922 input where SEL+CONCAT may not. */
38923 if (d->one_operand_p)
38925 int mask = nelt - 1;
38926 bool identity_perm = true;
38927 bool broadcast_perm = true;
38929 for (i = 0; i < nelt; i++)
38931 perm2[i] = d->perm[i] & mask;
38932 if (perm2[i] != i)
38933 identity_perm = false;
38934 if (perm2[i])
38935 broadcast_perm = false;
38938 if (identity_perm)
38940 if (!d->testing_p)
38941 emit_move_insn (d->target, d->op0);
38942 return true;
38944 else if (broadcast_perm && TARGET_AVX2)
38946 /* Use vpbroadcast{b,w,d}. */
38947 rtx (*gen) (rtx, rtx) = NULL;
38948 switch (d->vmode)
38950 case V32QImode:
38951 gen = gen_avx2_pbroadcastv32qi_1;
38952 break;
38953 case V16HImode:
38954 gen = gen_avx2_pbroadcastv16hi_1;
38955 break;
38956 case V8SImode:
38957 gen = gen_avx2_pbroadcastv8si_1;
38958 break;
38959 case V16QImode:
38960 gen = gen_avx2_pbroadcastv16qi;
38961 break;
38962 case V8HImode:
38963 gen = gen_avx2_pbroadcastv8hi;
38964 break;
38965 case V8SFmode:
38966 gen = gen_avx2_vec_dupv8sf_1;
38967 break;
38968 /* For other modes prefer other shuffles this function creates. */
38969 default: break;
38971 if (gen != NULL)
38973 if (!d->testing_p)
38974 emit_insn (gen (d->target, d->op0));
38975 return true;
38979 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38980 return true;
38982 /* There are plenty of patterns in sse.md that are written for
38983 SEL+CONCAT and are not replicated for a single op. Perhaps
38984 that should be changed, to avoid the nastiness here. */
38986 /* Recognize interleave style patterns, which means incrementing
38987 every other permutation operand. */
38988 for (i = 0; i < nelt; i += 2)
38990 perm2[i] = d->perm[i] & mask;
38991 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38993 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38994 d->testing_p))
38995 return true;
38997 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38998 if (nelt >= 4)
39000 for (i = 0; i < nelt; i += 4)
39002 perm2[i + 0] = d->perm[i + 0] & mask;
39003 perm2[i + 1] = d->perm[i + 1] & mask;
39004 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39005 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39008 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39009 d->testing_p))
39010 return true;
39014 /* Finally, try the fully general two operand permute. */
39015 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39016 d->testing_p))
39017 return true;
39019 /* Recognize interleave style patterns with reversed operands. */
39020 if (!d->one_operand_p)
39022 for (i = 0; i < nelt; ++i)
39024 unsigned e = d->perm[i];
39025 if (e >= nelt)
39026 e -= nelt;
39027 else
39028 e += nelt;
39029 perm2[i] = e;
39032 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39033 d->testing_p))
39034 return true;
39037 /* Try the SSE4.1 blend variable merge instructions. */
39038 if (expand_vec_perm_blend (d))
39039 return true;
39041 /* Try one of the AVX vpermil variable permutations. */
39042 if (expand_vec_perm_vpermil (d))
39043 return true;
39045 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39046 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39047 if (expand_vec_perm_pshufb (d))
39048 return true;
39050 return false;
39053 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39054 in terms of a pair of pshuflw + pshufhw instructions. */
39056 static bool
39057 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39059 unsigned char perm2[MAX_VECT_LEN];
39060 unsigned i;
39061 bool ok;
39063 if (d->vmode != V8HImode || !d->one_operand_p)
39064 return false;
39066 /* The two permutations only operate in 64-bit lanes. */
39067 for (i = 0; i < 4; ++i)
39068 if (d->perm[i] >= 4)
39069 return false;
39070 for (i = 4; i < 8; ++i)
39071 if (d->perm[i] < 4)
39072 return false;
39074 if (d->testing_p)
39075 return true;
39077 /* Emit the pshuflw. */
39078 memcpy (perm2, d->perm, 4);
39079 for (i = 4; i < 8; ++i)
39080 perm2[i] = i;
39081 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39082 gcc_assert (ok);
39084 /* Emit the pshufhw. */
39085 memcpy (perm2 + 4, d->perm + 4, 4);
39086 for (i = 0; i < 4; ++i)
39087 perm2[i] = i;
39088 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39089 gcc_assert (ok);
39091 return true;
39094 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39095 the permutation using the SSSE3 palignr instruction. This succeeds
39096 when all of the elements in PERM fit within one vector and we merely
39097 need to shift them down so that a single vector permutation has a
39098 chance to succeed. */
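/* For example, a two-operand V16QImode permutation whose indices all
   lie in the range 5..20 can first be shifted down with a 5-byte
   palignr of op1:op0; the remaining indices then fit in 0..15 and the
   single-operand shuffle is finished with pshufb.  */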
39100 static bool
39101 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39103 unsigned i, nelt = d->nelt;
39104 unsigned min, max;
39105 bool in_order, ok;
39106 rtx shift;
39108 /* Even with AVX, palignr only operates on 128-bit vectors. */
39109 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39110 return false;
39112 min = nelt, max = 0;
39113 for (i = 0; i < nelt; ++i)
39115 unsigned e = d->perm[i];
39116 if (e < min)
39117 min = e;
39118 if (e > max)
39119 max = e;
39121 if (min == 0 || max - min >= nelt)
39122 return false;
39124 /* Given that we have SSSE3, we know we'll be able to implement the
39125 single operand permutation after the palignr with pshufb. */
39126 if (d->testing_p)
39127 return true;
39129 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39130 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39131 gen_lowpart (TImode, d->op1),
39132 gen_lowpart (TImode, d->op0), shift));
39134 d->op0 = d->op1 = d->target;
39135 d->one_operand_p = true;
39137 in_order = true;
39138 for (i = 0; i < nelt; ++i)
39140 unsigned e = d->perm[i] - min;
39141 if (e != i)
39142 in_order = false;
39143 d->perm[i] = e;
39146 /* Test for the degenerate case where the alignment by itself
39147 produces the desired permutation. */
39148 if (in_order)
39149 return true;
39151 ok = expand_vec_perm_1 (d);
39152 gcc_assert (ok);
39154 return ok;
39157 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39159 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39160 a two vector permutation into a single vector permutation by using
39161 an interleave operation to merge the vectors. */
39163 static bool
39164 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39166 struct expand_vec_perm_d dremap, dfinal;
39167 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39168 unsigned HOST_WIDE_INT contents;
39169 unsigned char remap[2 * MAX_VECT_LEN];
39170 rtx seq;
39171 bool ok, same_halves = false;
39173 if (GET_MODE_SIZE (d->vmode) == 16)
39175 if (d->one_operand_p)
39176 return false;
39178 else if (GET_MODE_SIZE (d->vmode) == 32)
39180 if (!TARGET_AVX)
39181 return false;
39182 /* For 32-byte modes allow even d->one_operand_p.
39183 The lack of cross-lane shuffling in some instructions
39184 might prevent a single insn shuffle. */
39185 dfinal = *d;
39186 dfinal.testing_p = true;
39187 /* If expand_vec_perm_interleave3 can expand this into
39188 a 3 insn sequence, give up and let it be expanded as
39189 a 3 insn sequence. While that is one insn longer,
39190 it doesn't need a memory operand, and in the common
39191 case where the interleave low and interleave high permutations
39192 with the same operands are adjacent, the pair needs only 4 insns
39193 in total after CSE. */
39194 if (expand_vec_perm_interleave3 (&dfinal))
39195 return false;
39197 else
39198 return false;
39200 /* Examine from whence the elements come. */
39201 contents = 0;
39202 for (i = 0; i < nelt; ++i)
39203 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39205 memset (remap, 0xff, sizeof (remap));
39206 dremap = *d;
39208 if (GET_MODE_SIZE (d->vmode) == 16)
39210 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39212 /* Split the two input vectors into 4 halves. */
39213 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39214 h2 = h1 << nelt2;
39215 h3 = h2 << nelt2;
39216 h4 = h3 << nelt2;
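/* H1 and H2 cover the low and high half of op0, H3 and H4 the low
   and high half of op1 in the concatenated index space, so
   CONTENTS & Hn shows which halves the permutation actually reads.  */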
39218 /* If the elements are all from the low halves, use interleave low;
39219 similarly for interleave high. If the elements are from mis-matched
39220 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39221 if ((contents & (h1 | h3)) == contents)
39223 /* punpckl* */
39224 for (i = 0; i < nelt2; ++i)
39226 remap[i] = i * 2;
39227 remap[i + nelt] = i * 2 + 1;
39228 dremap.perm[i * 2] = i;
39229 dremap.perm[i * 2 + 1] = i + nelt;
39231 if (!TARGET_SSE2 && d->vmode == V4SImode)
39232 dremap.vmode = V4SFmode;
39234 else if ((contents & (h2 | h4)) == contents)
39236 /* punpckh* */
39237 for (i = 0; i < nelt2; ++i)
39239 remap[i + nelt2] = i * 2;
39240 remap[i + nelt + nelt2] = i * 2 + 1;
39241 dremap.perm[i * 2] = i + nelt2;
39242 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39244 if (!TARGET_SSE2 && d->vmode == V4SImode)
39245 dremap.vmode = V4SFmode;
39247 else if ((contents & (h1 | h4)) == contents)
39249 /* shufps */
39250 for (i = 0; i < nelt2; ++i)
39252 remap[i] = i;
39253 remap[i + nelt + nelt2] = i + nelt2;
39254 dremap.perm[i] = i;
39255 dremap.perm[i + nelt2] = i + nelt + nelt2;
39257 if (nelt != 4)
39259 /* shufpd */
39260 dremap.vmode = V2DImode;
39261 dremap.nelt = 2;
39262 dremap.perm[0] = 0;
39263 dremap.perm[1] = 3;
39266 else if ((contents & (h2 | h3)) == contents)
39268 /* shufps */
39269 for (i = 0; i < nelt2; ++i)
39271 remap[i + nelt2] = i;
39272 remap[i + nelt] = i + nelt2;
39273 dremap.perm[i] = i + nelt2;
39274 dremap.perm[i + nelt2] = i + nelt;
39276 if (nelt != 4)
39278 /* shufpd */
39279 dremap.vmode = V2DImode;
39280 dremap.nelt = 2;
39281 dremap.perm[0] = 1;
39282 dremap.perm[1] = 2;
39285 else
39286 return false;
39288 else
39290 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39291 unsigned HOST_WIDE_INT q[8];
39292 unsigned int nonzero_halves[4];
39294 /* Split the two input vectors into 8 quarters. */
39295 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39296 for (i = 1; i < 8; ++i)
39297 q[i] = q[0] << (nelt4 * i);
39298 for (i = 0; i < 4; ++i)
39299 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39301 nonzero_halves[nzcnt] = i;
39302 ++nzcnt;
39305 if (nzcnt == 1)
39307 gcc_assert (d->one_operand_p);
39308 nonzero_halves[1] = nonzero_halves[0];
39309 same_halves = true;
39311 else if (d->one_operand_p)
39313 gcc_assert (nonzero_halves[0] == 0);
39314 gcc_assert (nonzero_halves[1] == 1);
39317 if (nzcnt <= 2)
39319 if (d->perm[0] / nelt2 == nonzero_halves[1])
39321 /* Attempt to increase the likelihood that dfinal
39322 shuffle will be intra-lane. */
39323 char tmph = nonzero_halves[0];
39324 nonzero_halves[0] = nonzero_halves[1];
39325 nonzero_halves[1] = tmph;
39328 /* vperm2f128 or vperm2i128. */
39329 for (i = 0; i < nelt2; ++i)
39331 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39332 remap[i + nonzero_halves[0] * nelt2] = i;
39333 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39334 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39337 if (d->vmode != V8SFmode
39338 && d->vmode != V4DFmode
39339 && d->vmode != V8SImode)
39341 dremap.vmode = V8SImode;
39342 dremap.nelt = 8;
39343 for (i = 0; i < 4; ++i)
39345 dremap.perm[i] = i + nonzero_halves[0] * 4;
39346 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39350 else if (d->one_operand_p)
39351 return false;
39352 else if (TARGET_AVX2
39353 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39355 /* vpunpckl* */
39356 for (i = 0; i < nelt4; ++i)
39358 remap[i] = i * 2;
39359 remap[i + nelt] = i * 2 + 1;
39360 remap[i + nelt2] = i * 2 + nelt2;
39361 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39362 dremap.perm[i * 2] = i;
39363 dremap.perm[i * 2 + 1] = i + nelt;
39364 dremap.perm[i * 2 + nelt2] = i + nelt2;
39365 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39368 else if (TARGET_AVX2
39369 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39371 /* vpunpckh* */
39372 for (i = 0; i < nelt4; ++i)
39374 remap[i + nelt4] = i * 2;
39375 remap[i + nelt + nelt4] = i * 2 + 1;
39376 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39377 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39378 dremap.perm[i * 2] = i + nelt4;
39379 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39380 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39381 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39384 else
39385 return false;
39388 /* Use the remapping array set up above to move the elements from their
39389 swizzled locations into their final destinations. */
39390 dfinal = *d;
39391 for (i = 0; i < nelt; ++i)
39393 unsigned e = remap[d->perm[i]];
39394 gcc_assert (e < nelt);
39395 /* If same_halves is true, both halves of the remapped vector are the
39396 same. Avoid cross-lane accesses if possible. */
39397 if (same_halves && i >= nelt2)
39399 gcc_assert (e < nelt2);
39400 dfinal.perm[i] = e + nelt2;
39402 else
39403 dfinal.perm[i] = e;
39405 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39406 dfinal.op1 = dfinal.op0;
39407 dfinal.one_operand_p = true;
39408 dremap.target = dfinal.op0;
39410 /* Test if the final remap can be done with a single insn. For V4SFmode or
39411 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39412 start_sequence ();
39413 ok = expand_vec_perm_1 (&dfinal);
39414 seq = get_insns ();
39415 end_sequence ();
39417 if (!ok)
39418 return false;
39420 if (d->testing_p)
39421 return true;
39423 if (dremap.vmode != dfinal.vmode)
39425 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39426 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39427 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39430 ok = expand_vec_perm_1 (&dremap);
39431 gcc_assert (ok);
39433 emit_insn (seq);
39434 return true;
39437 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39438 a single vector cross-lane permutation into vpermq followed
39439 by any of the single insn permutations. */
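/* This applies when each 128-bit half of the result reads from at
   most two of the four 64-bit quarters of the input: vpermq first
   moves those quarters into the proper halves, after which the
   remaining shuffle is intra-lane.  */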
39441 static bool
39442 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39444 struct expand_vec_perm_d dremap, dfinal;
39445 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39446 unsigned contents[2];
39447 bool ok;
39449 if (!(TARGET_AVX2
39450 && (d->vmode == V32QImode || d->vmode == V16HImode)
39451 && d->one_operand_p))
39452 return false;
39454 contents[0] = 0;
39455 contents[1] = 0;
39456 for (i = 0; i < nelt2; ++i)
39458 contents[0] |= 1u << (d->perm[i] / nelt4);
39459 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39462 for (i = 0; i < 2; ++i)
39464 unsigned int cnt = 0;
39465 for (j = 0; j < 4; ++j)
39466 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39467 return false;
39470 if (d->testing_p)
39471 return true;
39473 dremap = *d;
39474 dremap.vmode = V4DImode;
39475 dremap.nelt = 4;
39476 dremap.target = gen_reg_rtx (V4DImode);
39477 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39478 dremap.op1 = dremap.op0;
39479 dremap.one_operand_p = true;
39480 for (i = 0; i < 2; ++i)
39482 unsigned int cnt = 0;
39483 for (j = 0; j < 4; ++j)
39484 if ((contents[i] & (1u << j)) != 0)
39485 dremap.perm[2 * i + cnt++] = j;
39486 for (; cnt < 2; ++cnt)
39487 dremap.perm[2 * i + cnt] = 0;
39490 dfinal = *d;
39491 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39492 dfinal.op1 = dfinal.op0;
39493 dfinal.one_operand_p = true;
39494 for (i = 0, j = 0; i < nelt; ++i)
39496 if (i == nelt2)
39497 j = 2;
39498 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39499 if ((d->perm[i] / nelt4) == dremap.perm[j])
39501 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39502 dfinal.perm[i] |= nelt4;
39503 else
39504 gcc_unreachable ();
39507 ok = expand_vec_perm_1 (&dremap);
39508 gcc_assert (ok);
39510 ok = expand_vec_perm_1 (&dfinal);
39511 gcc_assert (ok);
39513 return true;
39516 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39517 a vector permutation using two instructions, vperm2f128 resp.
39518 vperm2i128 followed by any single in-lane permutation. */
39520 static bool
39521 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39523 struct expand_vec_perm_d dfirst, dsecond;
39524 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39525 bool ok;
39527 if (!TARGET_AVX
39528 || GET_MODE_SIZE (d->vmode) != 32
39529 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39530 return false;
39532 dsecond = *d;
39533 dsecond.one_operand_p = false;
39534 dsecond.testing_p = true;
39536 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39537 immediate. For perm < 16 the second permutation uses
39538 d->op0 as first operand, for perm >= 16 it uses d->op1
39539 as first operand. The second operand is the result of
39540 vperm2[fi]128. */
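/* Bits [1:0] of PERM select the 128-bit source lane for the low half
   of the vperm2[fi]128 result and bits [3:2] the lane for the high
   half (0 = op0 low, 1 = op0 high, 2 = op1 low, 3 = op1 high).  */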
39541 for (perm = 0; perm < 32; perm++)
39543 /* Ignore permutations which do not move anything cross-lane. */
39544 if (perm < 16)
39546 /* The second shuffle for e.g. V4DFmode has
39547 0123 and ABCD operands.
39548 Ignore AB23, as 23 is already in the second lane
39549 of the first operand. */
39550 if ((perm & 0xc) == (1 << 2)) continue;
39551 /* And 01CD, as 01 is in the first lane of the first
39552 operand. */
39553 if ((perm & 3) == 0) continue;
39554 /* And 4567, as then the vperm2[fi]128 doesn't change
39555 anything on the original 4567 second operand. */
39556 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39558 else
39560 /* The second shuffle for e.g. V4DFmode has
39561 4567 and ABCD operands.
39562 Ignore AB67, as 67 is already in the second lane
39563 of the first operand. */
39564 if ((perm & 0xc) == (3 << 2)) continue;
39565 /* And 45CD, as 45 is in the first lane of the first
39566 operand. */
39567 if ((perm & 3) == 2) continue;
39568 /* And 0123, as then the vperm2[fi]128 doesn't change
39569 anything on the original 0123 first operand. */
39570 if ((perm & 0xf) == (1 << 2)) continue;
39573 for (i = 0; i < nelt; i++)
39575 j = d->perm[i] / nelt2;
39576 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39577 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39578 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39579 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39580 else
39581 break;
39584 if (i == nelt)
39586 start_sequence ();
39587 ok = expand_vec_perm_1 (&dsecond);
39588 end_sequence ();
39590 else
39591 ok = false;
39593 if (ok)
39595 if (d->testing_p)
39596 return true;
39598 /* Found a usable second shuffle. dfirst will be
39599 vperm2f128 on d->op0 and d->op1. */
39600 dsecond.testing_p = false;
39601 dfirst = *d;
39602 dfirst.target = gen_reg_rtx (d->vmode);
39603 for (i = 0; i < nelt; i++)
39604 dfirst.perm[i] = (i & (nelt2 - 1))
39605 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39607 ok = expand_vec_perm_1 (&dfirst);
39608 gcc_assert (ok);
39610 /* And dsecond is some single insn shuffle, taking
39611 d->op0 and result of vperm2f128 (if perm < 16) or
39612 d->op1 and result of vperm2f128 (otherwise). */
39613 dsecond.op1 = dfirst.target;
39614 if (perm >= 16)
39615 dsecond.op0 = dfirst.op1;
39617 ok = expand_vec_perm_1 (&dsecond);
39618 gcc_assert (ok);
39620 return true;
39623 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39624 if (d->one_operand_p)
39625 return false;
39628 return false;
39631 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39632 a two vector permutation using 2 intra-lane interleave insns
39633 and cross-lane shuffle for 32-byte vectors. */
39635 static bool
39636 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39638 unsigned i, nelt;
39639 rtx (*gen) (rtx, rtx, rtx);
39641 if (d->one_operand_p)
39642 return false;
39643 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39645 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39647 else
39648 return false;
39650 nelt = d->nelt;
39651 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39652 return false;
39653 for (i = 0; i < nelt; i += 2)
39654 if (d->perm[i] != d->perm[0] + i / 2
39655 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39656 return false;
39658 if (d->testing_p)
39659 return true;
39661 switch (d->vmode)
39663 case V32QImode:
39664 if (d->perm[0])
39665 gen = gen_vec_interleave_highv32qi;
39666 else
39667 gen = gen_vec_interleave_lowv32qi;
39668 break;
39669 case V16HImode:
39670 if (d->perm[0])
39671 gen = gen_vec_interleave_highv16hi;
39672 else
39673 gen = gen_vec_interleave_lowv16hi;
39674 break;
39675 case V8SImode:
39676 if (d->perm[0])
39677 gen = gen_vec_interleave_highv8si;
39678 else
39679 gen = gen_vec_interleave_lowv8si;
39680 break;
39681 case V4DImode:
39682 if (d->perm[0])
39683 gen = gen_vec_interleave_highv4di;
39684 else
39685 gen = gen_vec_interleave_lowv4di;
39686 break;
39687 case V8SFmode:
39688 if (d->perm[0])
39689 gen = gen_vec_interleave_highv8sf;
39690 else
39691 gen = gen_vec_interleave_lowv8sf;
39692 break;
39693 case V4DFmode:
39694 if (d->perm[0])
39695 gen = gen_vec_interleave_highv4df;
39696 else
39697 gen = gen_vec_interleave_lowv4df;
39698 break;
39699 default:
39700 gcc_unreachable ();
39703 emit_insn (gen (d->target, d->op0, d->op1));
39704 return true;
39707 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39708 a single vector permutation using a single intra-lane vector
39709 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39710 the non-swapped and swapped vectors together. */
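/* MSK records, for each element, whether it must come from the
   lane-swapped copy (bit set) or from the in-lane shuffle (bit
   clear); it becomes the vblendps/vblendpd immediate.  */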
39712 static bool
39713 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39715 struct expand_vec_perm_d dfirst, dsecond;
39716 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39717 rtx seq;
39718 bool ok;
39719 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39721 if (!TARGET_AVX
39722 || TARGET_AVX2
39723 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39724 || !d->one_operand_p)
39725 return false;
39727 dfirst = *d;
39728 for (i = 0; i < nelt; i++)
39729 dfirst.perm[i] = 0xff;
39730 for (i = 0, msk = 0; i < nelt; i++)
39732 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39733 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39734 return false;
39735 dfirst.perm[j] = d->perm[i];
39736 if (j != i)
39737 msk |= (1 << i);
39739 for (i = 0; i < nelt; i++)
39740 if (dfirst.perm[i] == 0xff)
39741 dfirst.perm[i] = i;
39743 if (!d->testing_p)
39744 dfirst.target = gen_reg_rtx (dfirst.vmode);
39746 start_sequence ();
39747 ok = expand_vec_perm_1 (&dfirst);
39748 seq = get_insns ();
39749 end_sequence ();
39751 if (!ok)
39752 return false;
39754 if (d->testing_p)
39755 return true;
39757 emit_insn (seq);
39759 dsecond = *d;
39760 dsecond.op0 = dfirst.target;
39761 dsecond.op1 = dfirst.target;
39762 dsecond.one_operand_p = true;
39763 dsecond.target = gen_reg_rtx (dsecond.vmode);
39764 for (i = 0; i < nelt; i++)
39765 dsecond.perm[i] = i ^ nelt2;
39767 ok = expand_vec_perm_1 (&dsecond);
39768 gcc_assert (ok);
39770 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39771 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39772 return true;
39775 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39776 permutation using two vperm2f128, followed by a vshufpd insn blending
39777 the two vectors together. */
39779 static bool
39780 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39782 struct expand_vec_perm_d dfirst, dsecond, dthird;
39783 bool ok;
39785 if (!TARGET_AVX || (d->vmode != V4DFmode))
39786 return false;
39788 if (d->testing_p)
39789 return true;
39791 dfirst = *d;
39792 dsecond = *d;
39793 dthird = *d;
39795 dfirst.perm[0] = (d->perm[0] & ~1);
39796 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39797 dfirst.perm[2] = (d->perm[2] & ~1);
39798 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39799 dsecond.perm[0] = (d->perm[1] & ~1);
39800 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39801 dsecond.perm[2] = (d->perm[3] & ~1);
39802 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39803 dthird.perm[0] = (d->perm[0] % 2);
39804 dthird.perm[1] = (d->perm[1] % 2) + 4;
39805 dthird.perm[2] = (d->perm[2] % 2) + 2;
39806 dthird.perm[3] = (d->perm[3] % 2) + 6;
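/* dfirst gathers the aligned element pairs containing the values
   wanted at positions 0 and 2, dsecond those wanted at positions 1
   and 3; dthird then selects the even or odd element of each pair,
   which is an in-lane vshufpd of the two intermediate results.  */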
39808 dfirst.target = gen_reg_rtx (dfirst.vmode);
39809 dsecond.target = gen_reg_rtx (dsecond.vmode);
39810 dthird.op0 = dfirst.target;
39811 dthird.op1 = dsecond.target;
39812 dthird.one_operand_p = false;
39814 canonicalize_perm (&dfirst);
39815 canonicalize_perm (&dsecond);
39817 ok = expand_vec_perm_1 (&dfirst)
39818 && expand_vec_perm_1 (&dsecond)
39819 && expand_vec_perm_1 (&dthird);
39821 gcc_assert (ok);
39823 return true;
39826 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39827 permutation with two pshufb insns and an ior. We should have already
39828 failed all two instruction sequences. */
39830 static bool
39831 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39833 rtx rperm[2][16], vperm, l, h, op, m128;
39834 unsigned int i, nelt, eltsz;
39836 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39837 return false;
39838 gcc_assert (!d->one_operand_p);
39840 nelt = d->nelt;
39841 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39843 /* Generate two permutation masks. If the required element is within
39844 the given vector it is shuffled into the proper lane. If the required
39845 element is in the other vector, force a zero into the lane by setting
39846 bit 7 in the permutation mask. */
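/* pshufb writes zero to any destination byte whose control byte has
   bit 7 set, so the two partial results can be combined with a plain
   por.  */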
39847 m128 = GEN_INT (-128);
39848 for (i = 0; i < nelt; ++i)
39850 unsigned j, e = d->perm[i];
39851 unsigned which = (e >= nelt);
39852 if (e >= nelt)
39853 e -= nelt;
39855 for (j = 0; j < eltsz; ++j)
39857 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39858 rperm[1-which][i*eltsz + j] = m128;
39862 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39863 vperm = force_reg (V16QImode, vperm);
39865 l = gen_reg_rtx (V16QImode);
39866 op = gen_lowpart (V16QImode, d->op0);
39867 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39869 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39870 vperm = force_reg (V16QImode, vperm);
39872 h = gen_reg_rtx (V16QImode);
39873 op = gen_lowpart (V16QImode, d->op1);
39874 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39876 op = gen_lowpart (V16QImode, d->target);
39877 emit_insn (gen_iorv16qi3 (op, l, h));
39879 return true;
39882 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
39883 with two vpshufb insns, vpermq and vpor. We should have already failed
39884 all two or three instruction sequences. */
39886 static bool
39887 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39889 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39890 unsigned int i, nelt, eltsz;
39892 if (!TARGET_AVX2
39893 || !d->one_operand_p
39894 || (d->vmode != V32QImode && d->vmode != V16HImode))
39895 return false;
39897 if (d->testing_p)
39898 return true;
39900 nelt = d->nelt;
39901 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39903 /* Generate two permutation masks. If the required element is within
39904 the same lane, it is shuffled in. If the required element is from the
39905 other lane, force a zero by setting bit 7 in the permutation mask.
39906 In the other mask an element is non-negative if it is requested
39907 from the other lane, but it is also moved to the other lane,
39908 so that the result of vpshufb can have the two V2TImode halves
39909 swapped. */
39910 m128 = GEN_INT (-128);
39911 for (i = 0; i < nelt; ++i)
39913 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39914 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39916 for (j = 0; j < eltsz; ++j)
39918 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39919 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39923 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39924 vperm = force_reg (V32QImode, vperm);
39926 h = gen_reg_rtx (V32QImode);
39927 op = gen_lowpart (V32QImode, d->op0);
39928 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39931 /* Swap the 128-bit lanes of h into hp. */
39931 hp = gen_reg_rtx (V4DImode);
39932 op = gen_lowpart (V4DImode, h);
39933 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39934 const1_rtx));
39936 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39937 vperm = force_reg (V32QImode, vperm);
39939 l = gen_reg_rtx (V32QImode);
39940 op = gen_lowpart (V32QImode, d->op0);
39941 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39943 op = gen_lowpart (V32QImode, d->target);
39944 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39946 return true;
39949 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39950 and extract-odd permutations of two V32QImode and V16QImode operand
39951 with two vpshufb insns, vpor and vpermq. We should have already
39952 failed all two or three instruction sequences. */
39954 static bool
39955 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39957 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39958 unsigned int i, nelt, eltsz;
39960 if (!TARGET_AVX2
39961 || d->one_operand_p
39962 || (d->vmode != V32QImode && d->vmode != V16HImode))
39963 return false;
39965 for (i = 0; i < d->nelt; ++i)
39966 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39967 return false;
39969 if (d->testing_p)
39970 return true;
39972 nelt = d->nelt;
39973 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39975 /* Generate two permutation masks. In the first permutation mask
39976 the first quarter will contain indexes for the first half
39977 of op0, the second quarter will contain bit 7 set, the third quarter
39978 will contain indexes for the second half of op0 and the
39979 last quarter bit 7 set. In the second permutation mask
39980 the first quarter will contain bit 7 set, the second quarter
39981 indexes for the first half of op1, the third quarter bit 7 set
39982 and the last quarter indexes for the second half of op1.
39983 I.e. the first mask e.g. for V32QImode extract even will be:
39984 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39985 (all values masked with 0xf except for -128) and second mask
39986 for extract even will be
39987 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39988 m128 = GEN_INT (-128);
39989 for (i = 0; i < nelt; ++i)
39991 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39992 unsigned which = d->perm[i] >= nelt;
39993 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39995 for (j = 0; j < eltsz; ++j)
39997 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39998 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40002 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40003 vperm = force_reg (V32QImode, vperm);
40005 l = gen_reg_rtx (V32QImode);
40006 op = gen_lowpart (V32QImode, d->op0);
40007 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40009 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40010 vperm = force_reg (V32QImode, vperm);
40012 h = gen_reg_rtx (V32QImode);
40013 op = gen_lowpart (V32QImode, d->op1);
40014 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40016 ior = gen_reg_rtx (V32QImode);
40017 emit_insn (gen_iorv32qi3 (ior, l, h));
40019 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40020 op = gen_lowpart (V4DImode, d->target);
40021 ior = gen_lowpart (V4DImode, ior);
40022 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40023 const1_rtx, GEN_INT (3)));
40025 return true;
40028 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40029 and extract-odd permutations. */
40031 static bool
40032 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40034 rtx t1, t2, t3;
40036 switch (d->vmode)
40038 case V4DFmode:
40039 t1 = gen_reg_rtx (V4DFmode);
40040 t2 = gen_reg_rtx (V4DFmode);
40042 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40043 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40044 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40046 /* Now an unpck[lh]pd will produce the result required. */
40047 if (odd)
40048 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40049 else
40050 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40051 emit_insn (t3);
40052 break;
40054 case V8SFmode:
40056 int mask = odd ? 0xdd : 0x88;
40058 t1 = gen_reg_rtx (V8SFmode);
40059 t2 = gen_reg_rtx (V8SFmode);
40060 t3 = gen_reg_rtx (V8SFmode);
40062 /* Shuffle within the 128-bit lanes to produce:
40063 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40064 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40065 GEN_INT (mask)));
40067 /* Shuffle the lanes around to produce:
40068 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40069 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40070 GEN_INT (0x3)));
40072 /* Shuffle within the 128-bit lanes to produce:
40073 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40074 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40076 /* Shuffle within the 128-bit lanes to produce:
40077 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40078 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40080 /* Shuffle the lanes around to produce:
40081 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40082 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40083 GEN_INT (0x20)));
40085 break;
40087 case V2DFmode:
40088 case V4SFmode:
40089 case V2DImode:
40090 case V4SImode:
40091 /* These are always directly implementable by expand_vec_perm_1. */
40092 gcc_unreachable ();
40094 case V8HImode:
40095 if (TARGET_SSSE3)
40096 return expand_vec_perm_pshufb2 (d);
40097 else
40099 /* We need 2*log2(N)-1 operations to achieve odd/even
40100 with interleave. */
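/* For V8HImode this is 2*3-1 = 5 insns: two rounds of paired
   interleaves followed by a final interleave that selects either the
   even or the odd elements.  */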
40101 t1 = gen_reg_rtx (V8HImode);
40102 t2 = gen_reg_rtx (V8HImode);
40103 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40104 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40105 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40106 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40107 if (odd)
40108 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40109 else
40110 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40111 emit_insn (t3);
40113 break;
40115 case V16QImode:
40116 if (TARGET_SSSE3)
40117 return expand_vec_perm_pshufb2 (d);
40118 else
40120 t1 = gen_reg_rtx (V16QImode);
40121 t2 = gen_reg_rtx (V16QImode);
40122 t3 = gen_reg_rtx (V16QImode);
40123 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40124 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40125 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40126 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40127 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40128 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40129 if (odd)
40130 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40131 else
40132 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40133 emit_insn (t3);
40135 break;
40137 case V16HImode:
40138 case V32QImode:
40139 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40141 case V4DImode:
40142 if (!TARGET_AVX2)
40144 struct expand_vec_perm_d d_copy = *d;
40145 d_copy.vmode = V4DFmode;
40146 d_copy.target = gen_lowpart (V4DFmode, d->target);
40147 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40148 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40149 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40152 t1 = gen_reg_rtx (V4DImode);
40153 t2 = gen_reg_rtx (V4DImode);
40155 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40156 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40157 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40159 /* Now a vpunpck[lh]qdq will produce the result required. */
40160 if (odd)
40161 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40162 else
40163 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40164 emit_insn (t3);
40165 break;
40167 case V8SImode:
40168 if (!TARGET_AVX2)
40170 struct expand_vec_perm_d d_copy = *d;
40171 d_copy.vmode = V8SFmode;
40172 d_copy.target = gen_lowpart (V8SFmode, d->target);
40173 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40174 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40175 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40178 t1 = gen_reg_rtx (V8SImode);
40179 t2 = gen_reg_rtx (V8SImode);
40181 /* Shuffle the lanes around into
40182 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40183 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40184 gen_lowpart (V4DImode, d->op0),
40185 gen_lowpart (V4DImode, d->op1),
40186 GEN_INT (0x20)));
40187 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40188 gen_lowpart (V4DImode, d->op0),
40189 gen_lowpart (V4DImode, d->op1),
40190 GEN_INT (0x31)));
40192 /* Swap the 2nd and 3rd position in each lane into
40193 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40194 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40195 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40196 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40197 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40199 /* Now a vpunpck[lh]qdq will produce
40200 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40201 if (odd)
40202 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40203 gen_lowpart (V4DImode, t1),
40204 gen_lowpart (V4DImode, t2));
40205 else
40206 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40207 gen_lowpart (V4DImode, t1),
40208 gen_lowpart (V4DImode, t2));
40209 emit_insn (t3);
40210 break;
40212 default:
40213 gcc_unreachable ();
40216 return true;
40219 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40220 extract-even and extract-odd permutations. */
40222 static bool
40223 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40225 unsigned i, odd, nelt = d->nelt;
40227 odd = d->perm[0];
40228 if (odd != 0 && odd != 1)
40229 return false;
40231 for (i = 1; i < nelt; ++i)
40232 if (d->perm[i] != 2 * i + odd)
40233 return false;
40235 return expand_vec_perm_even_odd_1 (d, odd);
40238 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40239 permutations. We assume that expand_vec_perm_1 has already failed. */
40241 static bool
40242 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40244 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40245 enum machine_mode vmode = d->vmode;
40246 unsigned char perm2[4];
40247 rtx op0 = d->op0;
40248 bool ok;
40250 switch (vmode)
40252 case V4DFmode:
40253 case V8SFmode:
40254 /* These are special-cased in sse.md so that we can optionally
40255 use the vbroadcast instruction. They expand to two insns
40256 if the input happens to be in a register. */
40257 gcc_unreachable ();
40259 case V2DFmode:
40260 case V2DImode:
40261 case V4SFmode:
40262 case V4SImode:
40263 /* These are always implementable using standard shuffle patterns. */
40264 gcc_unreachable ();
40266 case V8HImode:
40267 case V16QImode:
40268 /* These can be implemented via interleave. We save one insn by
40269 stopping once we have promoted to V4SImode and then use pshufd. */
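/* E.g. to broadcast element 5 of a V8HImode vector, interleaving the
   high halves of the operand with itself gives words
   { 4 4 5 5 6 6 7 7 }; viewed as V4SImode, a pshufd replicating
   dword 1 completes the broadcast.  */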
40272 rtx dest;
40273 rtx (*gen) (rtx, rtx, rtx)
40274 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40275 : gen_vec_interleave_lowv8hi;
40277 if (elt >= nelt2)
40279 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40280 : gen_vec_interleave_highv8hi;
40281 elt -= nelt2;
40283 nelt2 /= 2;
40285 dest = gen_reg_rtx (vmode);
40286 emit_insn (gen (dest, op0, op0));
40287 vmode = get_mode_wider_vector (vmode);
40288 op0 = gen_lowpart (vmode, dest);
40290 while (vmode != V4SImode);
40292 memset (perm2, elt, 4);
40293 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40294 d->testing_p);
40295 gcc_assert (ok);
40296 return true;
40298 case V32QImode:
40299 case V16HImode:
40300 case V8SImode:
40301 case V4DImode:
40302 /* For AVX2 broadcasts of the first element vpbroadcast* or
40303 vpermq should be used by expand_vec_perm_1. */
40304 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40305 return false;
40307 default:
40308 gcc_unreachable ();
40312 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40313 broadcast permutations. */
40315 static bool
40316 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40318 unsigned i, elt, nelt = d->nelt;
40320 if (!d->one_operand_p)
40321 return false;
40323 elt = d->perm[0];
40324 for (i = 1; i < nelt; ++i)
40325 if (d->perm[i] != elt)
40326 return false;
40328 return expand_vec_perm_broadcast_1 (d);
40331 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
40332 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40333 all the shorter instruction sequences. */
40335 static bool
40336 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40338 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40339 unsigned int i, nelt, eltsz;
40340 bool used[4];
40342 if (!TARGET_AVX2
40343 || d->one_operand_p
40344 || (d->vmode != V32QImode && d->vmode != V16HImode))
40345 return false;
40347 if (d->testing_p)
40348 return true;
40350 nelt = d->nelt;
40351 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40353 /* Generate 4 permutation masks. If the required element is within
40354 the same lane, it is shuffled in. If the required element is from the
40355 other lane, force a zero by setting bit 7 in the permutation mask.
40356 In the other mask an element is non-negative if it is requested
40357 from the other lane, but it is also moved to the other lane,
40358 so that the result of vpshufb can have the two V2TImode halves
40359 swapped. */
40360 m128 = GEN_INT (-128);
40361 for (i = 0; i < 32; ++i)
40363 rperm[0][i] = m128;
40364 rperm[1][i] = m128;
40365 rperm[2][i] = m128;
40366 rperm[3][i] = m128;
40368 used[0] = false;
40369 used[1] = false;
40370 used[2] = false;
40371 used[3] = false;
40372 for (i = 0; i < nelt; ++i)
40374 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40375 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40376 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40378 for (j = 0; j < eltsz; ++j)
40379 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40380 used[which] = true;
40383 for (i = 0; i < 2; ++i)
40385 if (!used[2 * i + 1])
40387 h[i] = NULL_RTX;
40388 continue;
40390 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40391 gen_rtvec_v (32, rperm[2 * i + 1]));
40392 vperm = force_reg (V32QImode, vperm);
40393 h[i] = gen_reg_rtx (V32QImode);
40394 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40395 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40399 /* Swap the 128-bit lanes of h[X]. */
40399 for (i = 0; i < 2; ++i)
40401 if (h[i] == NULL_RTX)
40402 continue;
40403 op = gen_reg_rtx (V4DImode);
40404 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40405 const2_rtx, GEN_INT (3), const0_rtx,
40406 const1_rtx));
40407 h[i] = gen_lowpart (V32QImode, op);
40410 for (i = 0; i < 2; ++i)
40412 if (!used[2 * i])
40414 l[i] = NULL_RTX;
40415 continue;
40417 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40418 vperm = force_reg (V32QImode, vperm);
40419 l[i] = gen_reg_rtx (V32QImode);
40420 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40421 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40424 for (i = 0; i < 2; ++i)
40426 if (h[i] && l[i])
40428 op = gen_reg_rtx (V32QImode);
40429 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40430 l[i] = op;
40432 else if (h[i])
40433 l[i] = h[i];
40436 gcc_assert (l[0] && l[1]);
40437 op = gen_lowpart (V32QImode, d->target);
40438 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40439 return true;
40442 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40443 With all of the interface bits taken care of, perform the expansion
40444 in D and return true on success. */
40446 static bool
40447 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40449 /* Try a single instruction expansion. */
40450 if (expand_vec_perm_1 (d))
40451 return true;
40453 /* Try sequences of two instructions. */
40455 if (expand_vec_perm_pshuflw_pshufhw (d))
40456 return true;
40458 if (expand_vec_perm_palignr (d))
40459 return true;
40461 if (expand_vec_perm_interleave2 (d))
40462 return true;
40464 if (expand_vec_perm_broadcast (d))
40465 return true;
40467 if (expand_vec_perm_vpermq_perm_1 (d))
40468 return true;
40470 if (expand_vec_perm_vperm2f128 (d))
40471 return true;
40473 /* Try sequences of three instructions. */
40475 if (expand_vec_perm_2vperm2f128_vshuf (d))
40476 return true;
40478 if (expand_vec_perm_pshufb2 (d))
40479 return true;
40481 if (expand_vec_perm_interleave3 (d))
40482 return true;
40484 if (expand_vec_perm_vperm2f128_vblend (d))
40485 return true;
40487 /* Try sequences of four instructions. */
40489 if (expand_vec_perm_vpshufb2_vpermq (d))
40490 return true;
40492 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40493 return true;
40495 /* ??? Look for narrow permutations whose element orderings would
40496 allow the promotion to a wider mode. */
40498 /* ??? Look for sequences of interleave or a wider permute that place
40499 the data into the correct lanes for a half-vector shuffle like
40500 pshuf[lh]w or vpermilps. */
40502 /* ??? Look for sequences of interleave that produce the desired results.
40503 The combinatorics of punpck[lh] get pretty ugly... */
40505 if (expand_vec_perm_even_odd (d))
40506 return true;
40508 /* Even longer sequences. */
40509 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40510 return true;
40512 return false;
40515 /* If a permutation only uses one operand, make it clear. Returns true
40516 if the permutation references both operands. */
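/* E.g. a V4SImode selector { 4, 5, 6, 7 } references only the second
   operand, so it is folded to { 0, 1, 2, 3 } with op0 replaced by
   op1.  */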
40518 static bool
40519 canonicalize_perm (struct expand_vec_perm_d *d)
40521 int i, which, nelt = d->nelt;
40523 for (i = which = 0; i < nelt; ++i)
40524 which |= (d->perm[i] < nelt ? 1 : 2);
40526 d->one_operand_p = true;
40527 switch (which)
40529 default:
40530 gcc_unreachable();
40532 case 3:
40533 if (!rtx_equal_p (d->op0, d->op1))
40535 d->one_operand_p = false;
40536 break;
40538 /* The elements of PERM do not suggest that only the first operand
40539 is used, but both operands are identical. Allow easier matching
40540 of the permutation by folding the permutation into the single
40541 input vector. */
40542 /* FALLTHRU */
40544 case 2:
40545 for (i = 0; i < nelt; ++i)
40546 d->perm[i] &= nelt - 1;
40547 d->op0 = d->op1;
40548 break;
40550 case 1:
40551 d->op1 = d->op0;
40552 break;
40555 return (which == 3);
40558 bool
40559 ix86_expand_vec_perm_const (rtx operands[4])
40561 struct expand_vec_perm_d d;
40562 unsigned char perm[MAX_VECT_LEN];
40563 int i, nelt;
40564 bool two_args;
40565 rtx sel;
40567 d.target = operands[0];
40568 d.op0 = operands[1];
40569 d.op1 = operands[2];
40570 sel = operands[3];
40572 d.vmode = GET_MODE (d.target);
40573 gcc_assert (VECTOR_MODE_P (d.vmode));
40574 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40575 d.testing_p = false;
40577 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40578 gcc_assert (XVECLEN (sel, 0) == nelt);
40579 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40581 for (i = 0; i < nelt; ++i)
40583 rtx e = XVECEXP (sel, 0, i);
40584 int ei = INTVAL (e) & (2 * nelt - 1);
40585 d.perm[i] = ei;
40586 perm[i] = ei;
40589 two_args = canonicalize_perm (&d);
40591 if (ix86_expand_vec_perm_const_1 (&d))
40592 return true;
40594 /* If the selector says both arguments are needed, but the operands are the
40595 same, the above tried to expand with one_operand_p and flattened selector.
40596 If that didn't work, retry without one_operand_p; we succeeded with that
40597 during testing. */
40598 if (two_args && d.one_operand_p)
40600 d.one_operand_p = false;
40601 memcpy (d.perm, perm, sizeof (perm));
40602 return ix86_expand_vec_perm_const_1 (&d);
40605 return false;
40608 /* Implement targetm.vectorize.vec_perm_const_ok. */
40610 static bool
40611 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40612 const unsigned char *sel)
40614 struct expand_vec_perm_d d;
40615 unsigned int i, nelt, which;
40616 bool ret;
40618 d.vmode = vmode;
40619 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40620 d.testing_p = true;
40622 /* Given sufficient ISA support we can just return true here
40623 for selected vector modes. */
40624 if (GET_MODE_SIZE (d.vmode) == 16)
40626 /* All implementable with a single vpperm insn. */
40627 if (TARGET_XOP)
40628 return true;
40629 /* All implementable with 2 pshufb + 1 ior. */
40630 if (TARGET_SSSE3)
40631 return true;
40632 /* All implementable with shufpd or unpck[lh]pd. */
40633 if (d.nelt == 2)
40634 return true;
40637 /* Extract the values from the vector CST into the permutation
40638 array in D. */
40639 memcpy (d.perm, sel, nelt);
40640 for (i = which = 0; i < nelt; ++i)
40642 unsigned char e = d.perm[i];
40643 gcc_assert (e < 2 * nelt);
40644 which |= (e < nelt ? 1 : 2);
40647 /* For all elements from second vector, fold the elements to first. */
40648 if (which == 2)
40649 for (i = 0; i < nelt; ++i)
40650 d.perm[i] -= nelt;
40652 /* Check whether the mask can be applied to the vector type. */
40653 d.one_operand_p = (which != 3);
40655 /* Implementable with shufps or pshufd. */
40656 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40657 return true;
40659 /* Otherwise we have to go through the motions and see if we can
40660 figure out how to generate the requested permutation. */
40661 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40662 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40663 if (!d.one_operand_p)
40664 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40666 start_sequence ();
40667 ret = ix86_expand_vec_perm_const_1 (&d);
40668 end_sequence ();
40670 return ret;
40673 void
40674 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40676 struct expand_vec_perm_d d;
40677 unsigned i, nelt;
40679 d.target = targ;
40680 d.op0 = op0;
40681 d.op1 = op1;
40682 d.vmode = GET_MODE (targ);
40683 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40684 d.one_operand_p = false;
40685 d.testing_p = false;
40687 for (i = 0; i < nelt; ++i)
40688 d.perm[i] = i * 2 + odd;
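/* For example, with nelt == 4 this builds the selector { 0, 2, 4, 6 }
   when ODD is 0 and { 1, 3, 5, 7 } when ODD is 1, indexing into the
   op0:op1 concatenation.  */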
40690 /* We'll either be able to implement the permutation directly... */
40691 if (expand_vec_perm_1 (&d))
40692 return;
40694 /* ... or we use the special-case patterns. */
40695 expand_vec_perm_even_odd_1 (&d, odd);
40698 static void
40699 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40701 struct expand_vec_perm_d d;
40702 unsigned i, nelt, base;
40703 bool ok;
40705 d.target = targ;
40706 d.op0 = op0;
40707 d.op1 = op1;
40708 d.vmode = GET_MODE (targ);
40709 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40710 d.one_operand_p = false;
40711 d.testing_p = false;
40713 base = high_p ? nelt / 2 : 0;
40714 for (i = 0; i < nelt / 2; ++i)
40716 d.perm[i * 2] = i + base;
40717 d.perm[i * 2 + 1] = i + base + nelt;
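/* For example, with nelt == 4 this builds { 0, 4, 1, 5 } for the low
   interleave and { 2, 6, 3, 7 } for the high interleave, pairing each
   element of op0 with the corresponding element of op1.  */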
40720 /* Note that for AVX this isn't one instruction. */
40721 ok = ix86_expand_vec_perm_const_1 (&d);
40722 gcc_assert (ok);
40726 /* Expand a vector operation CODE for a V*QImode in terms of the
40727 same operation on V*HImode. */
40729 void
40730 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40732 enum machine_mode qimode = GET_MODE (dest);
40733 enum machine_mode himode;
40734 rtx (*gen_il) (rtx, rtx, rtx);
40735 rtx (*gen_ih) (rtx, rtx, rtx);
40736 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40737 struct expand_vec_perm_d d;
40738 bool ok, full_interleave;
40739 bool uns_p = false;
40740 int i;
40742 switch (qimode)
40744 case V16QImode:
40745 himode = V8HImode;
40746 gen_il = gen_vec_interleave_lowv16qi;
40747 gen_ih = gen_vec_interleave_highv16qi;
40748 break;
40749 case V32QImode:
40750 himode = V16HImode;
40751 gen_il = gen_avx2_interleave_lowv32qi;
40752 gen_ih = gen_avx2_interleave_highv32qi;
40753 break;
40754 default:
40755 gcc_unreachable ();
40758 op2_l = op2_h = op2;
40759 switch (code)
40761 case MULT:
40762 /* Unpack data such that we've got a source byte in each low byte of
40763 each word. We don't care what goes into the high byte of each word.
40764 Rather than trying to get zero in there, it is most convenient to let
40765 it be a copy of the low byte. */
40766 op2_l = gen_reg_rtx (qimode);
40767 op2_h = gen_reg_rtx (qimode);
40768 emit_insn (gen_il (op2_l, op2, op2));
40769 emit_insn (gen_ih (op2_h, op2, op2));
40770 /* FALLTHRU */
40772 op1_l = gen_reg_rtx (qimode);
40773 op1_h = gen_reg_rtx (qimode);
40774 emit_insn (gen_il (op1_l, op1, op1));
40775 emit_insn (gen_ih (op1_h, op1, op1));
40776 full_interleave = qimode == V16QImode;
40777 break;
40779 case ASHIFT:
40780 case LSHIFTRT:
40781 uns_p = true;
40782 /* FALLTHRU */
40783 case ASHIFTRT:
40784 op1_l = gen_reg_rtx (himode);
40785 op1_h = gen_reg_rtx (himode);
40786 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40787 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40788 full_interleave = true;
40789 break;
40790 default:
40791 gcc_unreachable ();
40794 /* Perform the operation. */
40795 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40796 1, OPTAB_DIRECT);
40797 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40798 1, OPTAB_DIRECT);
40799 gcc_assert (res_l && res_h);
40801 /* Merge the data back into the right place. */
40802 d.target = dest;
40803 d.op0 = gen_lowpart (qimode, res_l);
40804 d.op1 = gen_lowpart (qimode, res_h);
40805 d.vmode = qimode;
40806 d.nelt = GET_MODE_NUNITS (qimode);
40807 d.one_operand_p = false;
40808 d.testing_p = false;
40810 if (full_interleave)
40812 /* For SSE2, we used a full interleave, so the desired
40813 results are in the even elements. */
40814 for (i = 0; i < 32; ++i)
40815 d.perm[i] = i * 2;
40817 else
40819 /* For AVX, the interleave used above was not cross-lane.  So the
40820 extraction takes the even elements, but with the second and third quarters
40821 swapped.  Happily, that is even one insn shorter than a plain even extraction. */
40822 for (i = 0; i < 32; ++i)
40823 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
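/* That is, the result quarters are assembled from: the even bytes of
   the low lane of res_l, the even bytes of the low lane of res_h,
   the even bytes of the high lane of res_l, and the even bytes of the
   high lane of res_h, in that order.  */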
40826 ok = ix86_expand_vec_perm_const_1 (&d);
40827 gcc_assert (ok);
40829 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40830 gen_rtx_fmt_ee (code, qimode, op1, op2));
40833 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
40834 if OP is a CONST_VECTOR with all odd elements equal to their
40835 preceding element. */
40837 static bool
40838 const_vector_equal_evenodd_p (rtx op)
40840 enum machine_mode mode = GET_MODE (op);
40841 int i, nunits = GET_MODE_NUNITS (mode);
40842 if (GET_CODE (op) != CONST_VECTOR
40843 || nunits != CONST_VECTOR_NUNITS (op))
40844 return false;
40845 for (i = 0; i < nunits; i += 2)
40846 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
40847 return false;
40848 return true;
40851 void
40852 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40853 bool uns_p, bool odd_p)
40855 enum machine_mode mode = GET_MODE (op1);
40856 enum machine_mode wmode = GET_MODE (dest);
40857 rtx x;
40858 rtx orig_op1 = op1, orig_op2 = op2;
40860 if (!nonimmediate_operand (op1, mode))
40861 op1 = force_reg (mode, op1);
40862 if (!nonimmediate_operand (op2, mode))
40863 op2 = force_reg (mode, op2);
40865 /* We only play even/odd games with vectors of SImode. */
40866 gcc_assert (mode == V4SImode || mode == V8SImode);
40868 /* If we're looking for the odd results, shift those members down to
40869 the even slots. For some CPUs this is faster than a PSHUFD. */
40870 if (odd_p)
40872 /* For XOP use vpmacsdqh, but only for smult, as it is only
40873 signed. */
40874 if (TARGET_XOP && mode == V4SImode && !uns_p)
40876 x = force_reg (wmode, CONST0_RTX (wmode));
40877 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40878 return;
40881 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40882 if (!const_vector_equal_evenodd_p (orig_op1))
40883 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40884 x, NULL, 1, OPTAB_DIRECT);
40885 if (!const_vector_equal_evenodd_p (orig_op2))
40886 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40887 x, NULL, 1, OPTAB_DIRECT);
40888 op1 = gen_lowpart (mode, op1);
40889 op2 = gen_lowpart (mode, op2);
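/* After the logical shift each odd SImode element sits in the even
   slot of its 64-bit lane, so the even-multiply patterns below compute
   the odd products.  The shift is skipped for a constant operand whose
   even and odd elements are already equal.  */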
40892 if (mode == V8SImode)
40894 if (uns_p)
40895 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40896 else
40897 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40899 else if (uns_p)
40900 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40901 else if (TARGET_SSE4_1)
40902 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40903 else
40905 rtx s1, s2, t0, t1, t2;
40907 /* The easiest way to implement this without PMULDQ is to go through
40908 the motions as if we are performing a full 64-bit multiply, except
40909 that we need to do less shuffling of the elements. */
40911 /* Compute the sign-extension, aka highparts, of the two operands. */
40912 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40913 op1, pc_rtx, pc_rtx);
40914 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40915 op2, pc_rtx, pc_rtx);
40917 /* Multiply LO(A) * HI(B), and vice-versa. */
40918 t1 = gen_reg_rtx (wmode);
40919 t2 = gen_reg_rtx (wmode);
40920 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40921 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40923 /* Multiply LO(A) * LO(B). */
40924 t0 = gen_reg_rtx (wmode);
40925 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40927 /* Combine and shift the highparts into place. */
40928 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40929 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40930 1, OPTAB_DIRECT);
40932 /* Combine high and low parts. */
40933 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40934 return;
40936 emit_insn (x);
40939 void
40940 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40941 bool uns_p, bool high_p)
40943 enum machine_mode wmode = GET_MODE (dest);
40944 enum machine_mode mode = GET_MODE (op1);
40945 rtx t1, t2, t3, t4, mask;
40947 switch (mode)
40949 case V4SImode:
40950 t1 = gen_reg_rtx (mode);
40951 t2 = gen_reg_rtx (mode);
40952 if (TARGET_XOP && !uns_p)
40954 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40955 shuffle the elements once so that all elements are in the right
40956 place for immediate use: { A C B D }. */
40957 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40958 const1_rtx, GEN_INT (3)));
40959 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40960 const1_rtx, GEN_INT (3)));
40962 else
40964 /* Put the elements into place for the multiply. */
40965 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40966 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40967 high_p = false;
40969 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40970 break;
40972 case V8SImode:
40973 /* Shuffle the elements between the lanes. After this we
40974 have { A B E F | C D G H } for each operand. */
40975 t1 = gen_reg_rtx (V4DImode);
40976 t2 = gen_reg_rtx (V4DImode);
40977 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40978 const0_rtx, const2_rtx,
40979 const1_rtx, GEN_INT (3)));
40980 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40981 const0_rtx, const2_rtx,
40982 const1_rtx, GEN_INT (3)));
40984 /* Shuffle the elements within the lanes. After this we
40985 have { A A B B | C C D D } or { E E F F | G G H H }. */
40986 t3 = gen_reg_rtx (V8SImode);
40987 t4 = gen_reg_rtx (V8SImode);
40988 mask = GEN_INT (high_p
40989 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40990 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
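/* The pshufd immediate packs four 2-bit element selectors, so this is
   { 2, 2, 3, 3 } (0xfa) for the high half and { 0, 0, 1, 1 } (0x50)
   for the low half, duplicating each dword within its lane.  */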
40991 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40992 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40994 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40995 break;
40997 case V8HImode:
40998 case V16HImode:
40999 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41000 uns_p, OPTAB_DIRECT);
41001 t2 = expand_binop (mode,
41002 uns_p ? umul_highpart_optab : smul_highpart_optab,
41003 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41004 gcc_assert (t1 && t2);
41006 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41007 break;
41009 case V16QImode:
41010 case V32QImode:
41011 t1 = gen_reg_rtx (wmode);
41012 t2 = gen_reg_rtx (wmode);
41013 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41014 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41016 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41017 break;
41019 default:
41020 gcc_unreachable ();
41024 void
41025 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41027 rtx res_1, res_2;
41029 res_1 = gen_reg_rtx (V4SImode);
41030 res_2 = gen_reg_rtx (V4SImode);
41031 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41032 op1, op2, true, false);
41033 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41034 op1, op2, true, true);
41036 /* Move the results in element 2 down to element 1; we don't care
41037 what goes in elements 2 and 3. Then we can merge the parts
41038 back together with an interleave.
41040 Note that two other sequences were tried:
41041 (1) Use interleaves at the start instead of psrldq, which allows
41042 us to use a single shufps to merge things back at the end.
41043 (2) Use shufps here to combine the two vectors, then pshufd to
41044 put the elements in the correct order.
41045 In both cases the cost of the reformatting stall was too high
41046 and the overall sequence slower. */
41048 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41049 const0_rtx, const0_rtx));
41050 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41051 const0_rtx, const0_rtx));
41052 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41054 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41057 void
41058 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41060 enum machine_mode mode = GET_MODE (op0);
41061 rtx t1, t2, t3, t4, t5, t6;
41063 if (TARGET_XOP && mode == V2DImode)
41065 /* op1: A,B,C,D, op2: E,F,G,H */
41066 op1 = gen_lowpart (V4SImode, op1);
41067 op2 = gen_lowpart (V4SImode, op2);
41069 t1 = gen_reg_rtx (V4SImode);
41070 t2 = gen_reg_rtx (V4SImode);
41071 t3 = gen_reg_rtx (V2DImode);
41072 t4 = gen_reg_rtx (V2DImode);
41074 /* t1: B,A,D,C */
41075 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41076 GEN_INT (1),
41077 GEN_INT (0),
41078 GEN_INT (3),
41079 GEN_INT (2)));
41081 /* t2: (B*E),(A*F),(D*G),(C*H) */
41082 emit_insn (gen_mulv4si3 (t2, t1, op2));
41084 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41085 emit_insn (gen_xop_phadddq (t3, t2));
41087 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41088 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41090 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41091 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41093 else
41095 enum machine_mode nmode;
41096 rtx (*umul) (rtx, rtx, rtx);
41098 if (mode == V2DImode)
41100 umul = gen_vec_widen_umult_even_v4si;
41101 nmode = V4SImode;
41103 else if (mode == V4DImode)
41105 umul = gen_vec_widen_umult_even_v8si;
41106 nmode = V8SImode;
41108 else
41109 gcc_unreachable ();
41112 /* Multiply low parts. */
41113 t1 = gen_reg_rtx (mode);
41114 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41116 /* Shift input vectors right 32 bits so we can multiply high parts. */
41117 t6 = GEN_INT (32);
41118 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41119 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41121 /* Multiply high parts by low parts. */
41122 t4 = gen_reg_rtx (mode);
41123 t5 = gen_reg_rtx (mode);
41124 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41125 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41127 /* Combine and shift the highparts back. */
41128 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41129 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41131 /* Combine high and low parts. */
41132 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41135 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41136 gen_rtx_MULT (mode, op1, op2));
41139 /* Expand an insert into a vector register through pinsr insn.
41140 Return true if successful. */
41142 bool
41143 ix86_expand_pinsr (rtx *operands)
41145 rtx dst = operands[0];
41146 rtx src = operands[3];
41148 unsigned int size = INTVAL (operands[1]);
41149 unsigned int pos = INTVAL (operands[2]);
41151 if (GET_CODE (dst) == SUBREG)
41153 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41154 dst = SUBREG_REG (dst);
41157 if (GET_CODE (src) == SUBREG)
41158 src = SUBREG_REG (src);
41160 switch (GET_MODE (dst))
41162 case V16QImode:
41163 case V8HImode:
41164 case V4SImode:
41165 case V2DImode:
41167 enum machine_mode srcmode, dstmode;
41168 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41170 srcmode = mode_for_size (size, MODE_INT, 0);
41172 switch (srcmode)
41174 case QImode:
41175 if (!TARGET_SSE4_1)
41176 return false;
41177 dstmode = V16QImode;
41178 pinsr = gen_sse4_1_pinsrb;
41179 break;
41181 case HImode:
41182 if (!TARGET_SSE2)
41183 return false;
41184 dstmode = V8HImode;
41185 pinsr = gen_sse2_pinsrw;
41186 break;
41188 case SImode:
41189 if (!TARGET_SSE4_1)
41190 return false;
41191 dstmode = V4SImode;
41192 pinsr = gen_sse4_1_pinsrd;
41193 break;
41195 case DImode:
41196 gcc_assert (TARGET_64BIT);
41197 if (!TARGET_SSE4_1)
41198 return false;
41199 dstmode = V2DImode;
41200 pinsr = gen_sse4_1_pinsrq;
41201 break;
41203 default:
41204 return false;
41207 dst = gen_lowpart (dstmode, dst);
41208 src = gen_lowpart (srcmode, src);
41210 pos /= size;
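/* The pinsr expanders here take the insertion point as a single-bit
   vec_merge mask rather than an index, hence the immediate 1 << pos
   below.  For example, inserting a HImode value at bit offset 32 of a
   V8HImode destination gives pos == 2 and mask 0x4.  */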
41212 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41213 return true;
41216 default:
41217 return false;
41221 /* This function returns the calling-ABI-specific va_list type node.
41222 It returns the FNDECL-specific va_list type. */
41224 static tree
41225 ix86_fn_abi_va_list (tree fndecl)
41227 if (!TARGET_64BIT)
41228 return va_list_type_node;
41229 gcc_assert (fndecl != NULL_TREE);
41231 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41232 return ms_va_list_type_node;
41233 else
41234 return sysv_va_list_type_node;
41237 /* Returns the canonical va_list type specified by TYPE. If there
41238 is no valid TYPE provided, it returns NULL_TREE. */
41240 static tree
41241 ix86_canonical_va_list_type (tree type)
41243 tree wtype, htype;
41245 /* Resolve references and pointers to va_list type. */
41246 if (TREE_CODE (type) == MEM_REF)
41247 type = TREE_TYPE (type);
41248 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41249 type = TREE_TYPE (type);
41250 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41251 type = TREE_TYPE (type);
41253 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41255 wtype = va_list_type_node;
41256 gcc_assert (wtype != NULL_TREE);
41257 htype = type;
41258 if (TREE_CODE (wtype) == ARRAY_TYPE)
41260 /* If va_list is an array type, the argument may have decayed
41261 to a pointer type, e.g. by being passed to another function.
41262 In that case, unwrap both types so that we can compare the
41263 underlying records. */
41264 if (TREE_CODE (htype) == ARRAY_TYPE
41265 || POINTER_TYPE_P (htype))
41267 wtype = TREE_TYPE (wtype);
41268 htype = TREE_TYPE (htype);
41271 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41272 return va_list_type_node;
41273 wtype = sysv_va_list_type_node;
41274 gcc_assert (wtype != NULL_TREE);
41275 htype = type;
41276 if (TREE_CODE (wtype) == ARRAY_TYPE)
41278 /* If va_list is an array type, the argument may have decayed
41279 to a pointer type, e.g. by being passed to another function.
41280 In that case, unwrap both types so that we can compare the
41281 underlying records. */
41282 if (TREE_CODE (htype) == ARRAY_TYPE
41283 || POINTER_TYPE_P (htype))
41285 wtype = TREE_TYPE (wtype);
41286 htype = TREE_TYPE (htype);
41289 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41290 return sysv_va_list_type_node;
41291 wtype = ms_va_list_type_node;
41292 gcc_assert (wtype != NULL_TREE);
41293 htype = type;
41294 if (TREE_CODE (wtype) == ARRAY_TYPE)
41296 /* If va_list is an array type, the argument may have decayed
41297 to a pointer type, e.g. by being passed to another function.
41298 In that case, unwrap both types so that we can compare the
41299 underlying records. */
41300 if (TREE_CODE (htype) == ARRAY_TYPE
41301 || POINTER_TYPE_P (htype))
41303 wtype = TREE_TYPE (wtype);
41304 htype = TREE_TYPE (htype);
41307 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41308 return ms_va_list_type_node;
41309 return NULL_TREE;
41311 return std_canonical_va_list_type (type);
41314 /* Iterate through the target-specific builtin types for va_list.
41315 IDX denotes the iterator, *PTREE is set to the result type of
41316 the va_list builtin, and *PNAME to its internal type.
41317 Returns zero if there is no element for this index, otherwise
41318 IDX should be increased upon the next call.
41319 Note, do not iterate a base builtin's name like __builtin_va_list.
41320 Used from c_common_nodes_and_builtins. */
41322 static int
41323 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41325 if (TARGET_64BIT)
41327 switch (idx)
41329 default:
41330 break;
41332 case 0:
41333 *ptree = ms_va_list_type_node;
41334 *pname = "__builtin_ms_va_list";
41335 return 1;
41337 case 1:
41338 *ptree = sysv_va_list_type_node;
41339 *pname = "__builtin_sysv_va_list";
41340 return 1;
41344 return 0;
41347 #undef TARGET_SCHED_DISPATCH
41348 #define TARGET_SCHED_DISPATCH has_dispatch
41349 #undef TARGET_SCHED_DISPATCH_DO
41350 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41351 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41352 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41353 #undef TARGET_SCHED_REORDER
41354 #define TARGET_SCHED_REORDER ix86_sched_reorder
41355 #undef TARGET_SCHED_ADJUST_PRIORITY
41356 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41357 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41358 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41360 /* The size of the dispatch window is the total number of bytes of
41361 object code allowed in a window. */
41362 #define DISPATCH_WINDOW_SIZE 16
41364 /* Number of dispatch windows considered for scheduling. */
41365 #define MAX_DISPATCH_WINDOWS 3
41367 /* Maximum number of instructions in a window. */
41368 #define MAX_INSN 4
41370 /* Maximum number of immediate operands in a window. */
41371 #define MAX_IMM 4
41373 /* Maximum number of immediate bits allowed in a window. */
41374 #define MAX_IMM_SIZE 128
41376 /* Maximum number of 32 bit immediates allowed in a window. */
41377 #define MAX_IMM_32 4
41379 /* Maximum number of 64 bit immediates allowed in a window. */
41380 #define MAX_IMM_64 2
41382 /* Maximum total of loads or prefetches allowed in a window. */
41383 #define MAX_LOAD 2
41385 /* Maximum total of stores allowed in a window. */
41386 #define MAX_STORE 1
41388 #undef BIG
41389 #define BIG 100
41392 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41393 enum dispatch_group {
41394 disp_no_group = 0,
41395 disp_load,
41396 disp_store,
41397 disp_load_store,
41398 disp_prefetch,
41399 disp_imm,
41400 disp_imm_32,
41401 disp_imm_64,
41402 disp_branch,
41403 disp_cmp,
41404 disp_jcc,
41405 disp_last
41408 /* Number of allowable groups in a dispatch window. It is an array
41409 indexed by the dispatch_group enum. 100 is used as a big number
41410 because the number of these kinds of operations has no effect on
41411 the dispatch window, but we need them in the table for other
41412 reasons. */
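/* In dispatch_group order, this allows per window: 2 loads, 1 store,
   1 load/store, 2 prefetches, 4 immediates, 4 32-bit immediates,
   2 64-bit immediates, 1 branch, and an effectively unlimited (BIG)
   number of compare and jcc insns.  */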
41413 static unsigned int num_allowable_groups[disp_last] = {
41414 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41417 char group_name[disp_last + 1][16] = {
41418 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41419 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41420 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41423 /* Instruction path. */
41424 enum insn_path {
41425 no_path = 0,
41426 path_single, /* Single micro op. */
41427 path_double, /* Double micro op. */
41428 path_multi, /* Instructions with more than 2 micro ops. */
41429 last_path
41432 /* sched_insn_info defines a window to the instructions scheduled in
41433 the basic block. It contains a pointer to the insn_info table and
41434 the instruction scheduled.
41436 Windows are allocated for each basic block and are linked
41437 together. */
41438 typedef struct sched_insn_info_s {
41439 rtx insn;
41440 enum dispatch_group group;
41441 enum insn_path path;
41442 int byte_len;
41443 int imm_bytes;
41444 } sched_insn_info;
41446 /* Linked list of dispatch windows. This is a two way list of
41447 dispatch windows of a basic block. It contains information about
41448 the number of uops in the window and the total number of
41449 instructions and of bytes in the object code for this dispatch
41450 window. */
41451 typedef struct dispatch_windows_s {
41452 int num_insn; /* Number of insn in the window. */
41453 int num_uops; /* Number of uops in the window. */
41454 int window_size; /* Number of bytes in the window. */
41455 int window_num; /* Window number, either 0 or 1. */
41456 int num_imm; /* Number of immediates in an insn. */
41457 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41458 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41459 int imm_size; /* Total immediates in the window. */
41460 int num_loads; /* Total memory loads in the window. */
41461 int num_stores; /* Total memory stores in the window. */
41462 int violation; /* Violation exists in window. */
41463 sched_insn_info *window; /* Pointer to the window. */
41464 struct dispatch_windows_s *next;
41465 struct dispatch_windows_s *prev;
41466 } dispatch_windows;
41468 /* Immediate values used in an insn. */
41469 typedef struct imm_info_s
41471 int imm;
41472 int imm32;
41473 int imm64;
41474 } imm_info;
41476 static dispatch_windows *dispatch_window_list;
41477 static dispatch_windows *dispatch_window_list1;
41479 /* Get dispatch group of insn. */
41481 static enum dispatch_group
41482 get_mem_group (rtx insn)
41484 enum attr_memory memory;
41486 if (INSN_CODE (insn) < 0)
41487 return disp_no_group;
41488 memory = get_attr_memory (insn);
41489 if (memory == MEMORY_STORE)
41490 return disp_store;
41492 if (memory == MEMORY_LOAD)
41493 return disp_load;
41495 if (memory == MEMORY_BOTH)
41496 return disp_load_store;
41498 return disp_no_group;
41501 /* Return true if insn is a compare instruction. */
41503 static bool
41504 is_cmp (rtx insn)
41506 enum attr_type type;
41508 type = get_attr_type (insn);
41509 return (type == TYPE_TEST
41510 || type == TYPE_ICMP
41511 || type == TYPE_FCMP
41512 || GET_CODE (PATTERN (insn)) == COMPARE);
41515 /* Return true if a dispatch violation was encountered. */
41517 static bool
41518 dispatch_violation (void)
41520 if (dispatch_window_list->next)
41521 return dispatch_window_list->next->violation;
41522 return dispatch_window_list->violation;
41525 /* Return true if insn is a branch instruction. */
41527 static bool
41528 is_branch (rtx insn)
41530 return (CALL_P (insn) || JUMP_P (insn));
41533 /* Return true if insn is a prefetch instruction. */
41535 static bool
41536 is_prefetch (rtx insn)
41538 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41541 /* This function initializes a dispatch window and the list container holding a
41542 pointer to the window. */
41544 static void
41545 init_window (int window_num)
41547 int i;
41548 dispatch_windows *new_list;
41550 if (window_num == 0)
41551 new_list = dispatch_window_list;
41552 else
41553 new_list = dispatch_window_list1;
41555 new_list->num_insn = 0;
41556 new_list->num_uops = 0;
41557 new_list->window_size = 0;
41558 new_list->next = NULL;
41559 new_list->prev = NULL;
41560 new_list->window_num = window_num;
41561 new_list->num_imm = 0;
41562 new_list->num_imm_32 = 0;
41563 new_list->num_imm_64 = 0;
41564 new_list->imm_size = 0;
41565 new_list->num_loads = 0;
41566 new_list->num_stores = 0;
41567 new_list->violation = false;
41569 for (i = 0; i < MAX_INSN; i++)
41571 new_list->window[i].insn = NULL;
41572 new_list->window[i].group = disp_no_group;
41573 new_list->window[i].path = no_path;
41574 new_list->window[i].byte_len = 0;
41575 new_list->window[i].imm_bytes = 0;
41577 return;
41580 /* This function allocates and initializes a dispatch window and the
41581 list container holding a pointer to the window. */
41583 static dispatch_windows *
41584 allocate_window (void)
41586 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41587 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41589 return new_list;
41592 /* This routine initializes the dispatch scheduling information. It
41593 initiates building dispatch scheduler tables and constructs the
41594 first dispatch window. */
41596 static void
41597 init_dispatch_sched (void)
41599 /* Allocate a dispatch list and a window. */
41600 dispatch_window_list = allocate_window ();
41601 dispatch_window_list1 = allocate_window ();
41602 init_window (0);
41603 init_window (1);
41606 /* This function returns true if a branch is detected. End of a basic block
41607 does not have to be a branch, but here we assume only branches end a
41608 window. */
41610 static bool
41611 is_end_basic_block (enum dispatch_group group)
41613 return group == disp_branch;
41616 /* This function is called when the end of a window processing is reached. */
41618 static void
41619 process_end_window (void)
41621 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41622 if (dispatch_window_list->next)
41624 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41625 gcc_assert (dispatch_window_list->window_size
41626 + dispatch_window_list1->window_size <= 48);
41627 init_window (1);
41629 init_window (0);
41632 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41633 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41634 for 48 bytes of instructions. Note that these windows are not dispatch
41635 windows whose sizes are DISPATCH_WINDOW_SIZE. */
41637 static dispatch_windows *
41638 allocate_next_window (int window_num)
41640 if (window_num == 0)
41642 if (dispatch_window_list->next)
41643 init_window (1);
41644 init_window (0);
41645 return dispatch_window_list;
41648 dispatch_window_list->next = dispatch_window_list1;
41649 dispatch_window_list1->prev = dispatch_window_list;
41651 return dispatch_window_list1;
41654 /* Increment the number of immediate operands of an instruction. */
41656 static int
41657 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41659 if (*in_rtx == 0)
41660 return 0;
41662 switch (GET_CODE (*in_rtx))
41664 case CONST:
41665 case SYMBOL_REF:
41666 case CONST_INT:
41667 (imm_values->imm)++;
41668 if (x86_64_immediate_operand (*in_rtx, SImode))
41669 (imm_values->imm32)++;
41670 else
41671 (imm_values->imm64)++;
41672 break;
41674 case CONST_DOUBLE:
41675 (imm_values->imm)++;
41676 (imm_values->imm64)++;
41677 break;
41679 case CODE_LABEL:
41680 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41682 (imm_values->imm)++;
41683 (imm_values->imm32)++;
41685 break;
41687 default:
41688 break;
41691 return 0;
41694 /* Compute number of immediate operands of an instruction. */
41696 static void
41697 find_constant (rtx in_rtx, imm_info *imm_values)
41699 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41700 (rtx_function) find_constant_1, (void *) imm_values);
41703 /* Return total size of immediate operands of an instruction along with number
41704 of corresponding immediate operands. It initializes its parameters to zero
41705 before calling FIND_CONSTANT.
41706 INSN is the input instruction. IMM is the total of immediates.
41707 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41708 bit immediates. */
41710 static int
41711 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41713 imm_info imm_values = {0, 0, 0};
41715 find_constant (insn, &imm_values);
41716 *imm = imm_values.imm;
41717 *imm32 = imm_values.imm32;
41718 *imm64 = imm_values.imm64;
41719 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
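/* For example, an insn with one 32-bit and one 64-bit immediate sets
   *IMM to 2 and returns 4 + 8 == 12 bytes of immediate data.  */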
41722 /* This function indicates whether an instruction has any immediate
41723 operands. */
41725 static bool
41726 has_immediate (rtx insn)
41728 int num_imm_operand;
41729 int num_imm32_operand;
41730 int num_imm64_operand;
41732 if (insn)
41733 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41734 &num_imm64_operand);
41735 return false;
41738 /* Return single or double path for instructions. */
41740 static enum insn_path
41741 get_insn_path (rtx insn)
41743 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41745 if ((int)path == 0)
41746 return path_single;
41748 if ((int)path == 1)
41749 return path_double;
41751 return path_multi;
41754 /* Return insn dispatch group. */
41756 static enum dispatch_group
41757 get_insn_group (rtx insn)
41759 enum dispatch_group group = get_mem_group (insn);
41760 if (group)
41761 return group;
41763 if (is_branch (insn))
41764 return disp_branch;
41766 if (is_cmp (insn))
41767 return disp_cmp;
41769 if (has_immediate (insn))
41770 return disp_imm;
41772 if (is_prefetch (insn))
41773 return disp_prefetch;
41775 return disp_no_group;
41778 /* Count number of GROUP restricted instructions in a dispatch
41779 window WINDOW_LIST. */
41781 static int
41782 count_num_restricted (rtx insn, dispatch_windows *window_list)
41784 enum dispatch_group group = get_insn_group (insn);
41785 int imm_size;
41786 int num_imm_operand;
41787 int num_imm32_operand;
41788 int num_imm64_operand;
41790 if (group == disp_no_group)
41791 return 0;
41793 if (group == disp_imm)
41795 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41796 &num_imm64_operand);
41797 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41798 || num_imm_operand + window_list->num_imm > MAX_IMM
41799 || (num_imm32_operand > 0
41800 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41801 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41802 || (num_imm64_operand > 0
41803 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41804 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41805 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41806 && num_imm64_operand > 0
41807 && ((window_list->num_imm_64 > 0
41808 && window_list->num_insn >= 2)
41809 || window_list->num_insn >= 3)))
41810 return BIG;
41812 return 1;
41815 if ((group == disp_load_store
41816 && (window_list->num_loads >= MAX_LOAD
41817 || window_list->num_stores >= MAX_STORE))
41818 || ((group == disp_load
41819 || group == disp_prefetch)
41820 && window_list->num_loads >= MAX_LOAD)
41821 || (group == disp_store
41822 && window_list->num_stores >= MAX_STORE))
41823 return BIG;
41825 return 1;
41828 /* This function returns true if insn satisfies dispatch rules on the
41829 last window scheduled. */
41831 static bool
41832 fits_dispatch_window (rtx insn)
41834 dispatch_windows *window_list = dispatch_window_list;
41835 dispatch_windows *window_list_next = dispatch_window_list->next;
41836 unsigned int num_restrict;
41837 enum dispatch_group group = get_insn_group (insn);
41838 enum insn_path path = get_insn_path (insn);
41839 int sum;
41841 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41842 instructions should be given the lowest priority in the
41843 scheduling process in the Haifa scheduler to make sure they will be
41844 scheduled in the same dispatch window as the reference to them. */
41845 if (group == disp_jcc || group == disp_cmp)
41846 return false;
41848 /* Check nonrestricted. */
41849 if (group == disp_no_group || group == disp_branch)
41850 return true;
41852 /* Get last dispatch window. */
41853 if (window_list_next)
41854 window_list = window_list_next;
41856 if (window_list->window_num == 1)
41858 sum = window_list->prev->window_size + window_list->window_size;
41860 if (sum == 32
41861 || (min_insn_size (insn) + sum) >= 48)
41862 /* Window 1 is full. Go for next window. */
41863 return true;
41866 num_restrict = count_num_restricted (insn, window_list);
41868 if (num_restrict > num_allowable_groups[group])
41869 return false;
41871 /* See if it fits in the first window. */
41872 if (window_list->window_num == 0)
41874 /* The first window should have only single- and double-path
41875 uops. */
41876 if (path == path_double
41877 && (window_list->num_uops + 2) > MAX_INSN)
41878 return false;
41879 else if (path != path_single)
41880 return false;
41882 return true;
41885 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41886 dispatch window WINDOW_LIST. */
41888 static void
41889 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41891 int byte_len = min_insn_size (insn);
41892 int num_insn = window_list->num_insn;
41893 int imm_size;
41894 sched_insn_info *window = window_list->window;
41895 enum dispatch_group group = get_insn_group (insn);
41896 enum insn_path path = get_insn_path (insn);
41897 int num_imm_operand;
41898 int num_imm32_operand;
41899 int num_imm64_operand;
41901 if (!window_list->violation && group != disp_cmp
41902 && !fits_dispatch_window (insn))
41903 window_list->violation = true;
41905 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41906 &num_imm64_operand);
41908 /* Initialize window with new instruction. */
41909 window[num_insn].insn = insn;
41910 window[num_insn].byte_len = byte_len;
41911 window[num_insn].group = group;
41912 window[num_insn].path = path;
41913 window[num_insn].imm_bytes = imm_size;
41915 window_list->window_size += byte_len;
41916 window_list->num_insn = num_insn + 1;
41917 window_list->num_uops = window_list->num_uops + num_uops;
41918 window_list->imm_size += imm_size;
41919 window_list->num_imm += num_imm_operand;
41920 window_list->num_imm_32 += num_imm32_operand;
41921 window_list->num_imm_64 += num_imm64_operand;
41923 if (group == disp_store)
41924 window_list->num_stores += 1;
41925 else if (group == disp_load
41926 || group == disp_prefetch)
41927 window_list->num_loads += 1;
41928 else if (group == disp_load_store)
41930 window_list->num_stores += 1;
41931 window_list->num_loads += 1;
41935 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41936 If the total bytes of instructions or the number of instructions in
41937 the window exceeds the allowable limit, it allocates a new window. */
41939 static void
41940 add_to_dispatch_window (rtx insn)
41942 int byte_len;
41943 dispatch_windows *window_list;
41944 dispatch_windows *next_list;
41945 dispatch_windows *window0_list;
41946 enum insn_path path;
41947 enum dispatch_group insn_group;
41948 bool insn_fits;
41949 int num_insn;
41950 int num_uops;
41951 int window_num;
41952 int insn_num_uops;
41953 int sum;
41955 if (INSN_CODE (insn) < 0)
41956 return;
41958 byte_len = min_insn_size (insn);
41959 window_list = dispatch_window_list;
41960 next_list = window_list->next;
41961 path = get_insn_path (insn);
41962 insn_group = get_insn_group (insn);
41964 /* Get the last dispatch window. */
41965 if (next_list)
41966 window_list = dispatch_window_list->next;
41968 if (path == path_single)
41969 insn_num_uops = 1;
41970 else if (path == path_double)
41971 insn_num_uops = 2;
41972 else
41973 insn_num_uops = (int) path;
41975 /* If the current window is full, get a new window.
41976 Window number zero is full if MAX_INSN uops are scheduled in it.
41977 Window number one is full if window zero's bytes plus window
41978 one's bytes total 32, or if adding the bytes of the new
41979 instruction makes the total reach 48 or more, or if it already has
41980 MAX_INSN instructions in it. */
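/* For example, if window zero holds 20 bytes and window one holds 12,
   their sum is 32 and window one is full; with a sum of 40, any insn
   of 8 or more bytes would likewise close it.  */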
41981 num_insn = window_list->num_insn;
41982 num_uops = window_list->num_uops;
41983 window_num = window_list->window_num;
41984 insn_fits = fits_dispatch_window (insn);
41986 if (num_insn >= MAX_INSN
41987 || num_uops + insn_num_uops > MAX_INSN
41988 || !(insn_fits))
41990 window_num = ~window_num & 1;
41991 window_list = allocate_next_window (window_num);
41994 if (window_num == 0)
41996 add_insn_window (insn, window_list, insn_num_uops);
41997 if (window_list->num_insn >= MAX_INSN
41998 && insn_group == disp_branch)
42000 process_end_window ();
42001 return;
42004 else if (window_num == 1)
42006 window0_list = window_list->prev;
42007 sum = window0_list->window_size + window_list->window_size;
42008 if (sum == 32
42009 || (byte_len + sum) >= 48)
42011 process_end_window ();
42012 window_list = dispatch_window_list;
42015 add_insn_window (insn, window_list, insn_num_uops);
42017 else
42018 gcc_unreachable ();
42020 if (is_end_basic_block (insn_group))
42022 /* End of basic block is reached; do end-of-basic-block processing. */
42023 process_end_window ();
42024 return;
42028 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42030 DEBUG_FUNCTION static void
42031 debug_dispatch_window_file (FILE *file, int window_num)
42033 dispatch_windows *list;
42034 int i;
42036 if (window_num == 0)
42037 list = dispatch_window_list;
42038 else
42039 list = dispatch_window_list1;
42041 fprintf (file, "Window #%d:\n", list->window_num);
42042 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42043 list->num_insn, list->num_uops, list->window_size);
42044 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42045 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42047 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42048 list->num_stores);
42049 fprintf (file, " insn info:\n");
42051 for (i = 0; i < MAX_INSN; i++)
42053 if (!list->window[i].insn)
42054 break;
42055 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42056 i, group_name[list->window[i].group],
42057 i, (void *)list->window[i].insn,
42058 i, list->window[i].path,
42059 i, list->window[i].byte_len,
42060 i, list->window[i].imm_bytes);
42064 /* Print to stdout a dispatch window. */
42066 DEBUG_FUNCTION void
42067 debug_dispatch_window (int window_num)
42069 debug_dispatch_window_file (stdout, window_num);
42072 /* Print INSN dispatch information to FILE. */
42074 DEBUG_FUNCTION static void
42075 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42077 int byte_len;
42078 enum insn_path path;
42079 enum dispatch_group group;
42080 int imm_size;
42081 int num_imm_operand;
42082 int num_imm32_operand;
42083 int num_imm64_operand;
42085 if (INSN_CODE (insn) < 0)
42086 return;
42088 byte_len = min_insn_size (insn);
42089 path = get_insn_path (insn);
42090 group = get_insn_group (insn);
42091 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42092 &num_imm64_operand);
42094 fprintf (file, " insn info:\n");
42095 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42096 group_name[group], path, byte_len);
42097 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42098 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42101 /* Print to stdout the status of the ready list with respect to
42102 dispatch windows. */
42104 DEBUG_FUNCTION void
42105 debug_ready_dispatch (void)
42107 int i;
42108 int no_ready = number_in_ready ();
42110 fprintf (stdout, "Number of ready: %d\n", no_ready);
42112 for (i = 0; i < no_ready; i++)
42113 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42116 /* This routine is the driver of the dispatch scheduler. */
42118 static void
42119 do_dispatch (rtx insn, int mode)
42121 if (mode == DISPATCH_INIT)
42122 init_dispatch_sched ();
42123 else if (mode == ADD_TO_DISPATCH_WINDOW)
42124 add_to_dispatch_window (insn);
42127 /* Return TRUE if Dispatch Scheduling is supported. */
42129 static bool
42130 has_dispatch (rtx insn, int action)
42132 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42133 && flag_dispatch_scheduler)
42134 switch (action)
42136 default:
42137 return false;
42139 case IS_DISPATCH_ON:
42140 return true;
42141 break;
42143 case IS_CMP:
42144 return is_cmp (insn);
42146 case DISPATCH_VIOLATION:
42147 return dispatch_violation ();
42149 case FITS_DISPATCH_WINDOW:
42150 return fits_dispatch_window (insn);
42153 return false;
42156 /* Implementation of the reassociation_width target hook used by
42157 the reassoc phase to identify the parallelism level in a reassociated
42158 tree. The statement's tree_code is passed in OPC. The arguments'
42159 type is passed in MODE.
42161 Currently parallel reassociation is enabled for Atom
42162 processors only and we set reassociation width to be 2
42163 because Atom may issue up to 2 instructions per cycle.
42165 Return value should be fixed if parallel reassociation is
42166 enabled for other processors. */
42168 static int
42169 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42170 enum machine_mode mode)
42172 int res = 1;
42174 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42175 res = 2;
42176 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42177 res = 2;
42179 return res;
42182 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42183 place emms and femms instructions. */
42185 static enum machine_mode
42186 ix86_preferred_simd_mode (enum machine_mode mode)
42188 if (!TARGET_SSE)
42189 return word_mode;
42191 switch (mode)
42193 case QImode:
42194 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42195 case HImode:
42196 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42197 case SImode:
42198 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42199 case DImode:
42200 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42202 case SFmode:
42203 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42204 return V8SFmode;
42205 else
42206 return V4SFmode;
42208 case DFmode:
42209 if (!TARGET_VECTORIZE_DOUBLE)
42210 return word_mode;
42211 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42212 return V4DFmode;
42213 else if (TARGET_SSE2)
42214 return V2DFmode;
42215 /* FALLTHRU */
42217 default:
42218 return word_mode;
42222 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42223 vectors. */
42225 static unsigned int
42226 ix86_autovectorize_vector_sizes (void)
42228 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
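/* The return value is a bitmask of vector sizes in bytes, so 32 | 16
   tells the vectorizer it may try both 32-byte and 16-byte vectors,
   while 0 means only the preferred SIMD mode's size is used.  */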
42233 /* Return the class of registers which could be used for a pseudo of MODE
42234 and of class RCLASS for spilling instead of memory. Return NO_REGS
42235 if it is not possible or not profitable. */
42236 static reg_class_t
42237 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42239 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42240 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42241 && INTEGER_CLASS_P (rclass))
42242 return SSE_REGS;
42243 return NO_REGS;
42246 /* Implement targetm.vectorize.init_cost. */
42248 static void *
42249 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42251 unsigned *cost = XNEWVEC (unsigned, 3);
42252 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42253 return cost;
42256 /* Implement targetm.vectorize.add_stmt_cost. */
42258 static unsigned
42259 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42260 struct _stmt_vec_info *stmt_info, int misalign,
42261 enum vect_cost_model_location where)
42263 unsigned *cost = (unsigned *) data;
42264 unsigned retval = 0;
42266 if (flag_vect_cost_model)
42268 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42269 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42271 /* Statements in an inner loop relative to the loop being
42272 vectorized are weighted more heavily. The value here is
42273 arbitrary and could potentially be improved with analysis. */
42274 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42275 count *= 50; /* FIXME. */
42277 retval = (unsigned) (count * stmt_cost);
42278 cost[where] += retval;
42281 return retval;
42284 /* Implement targetm.vectorize.finish_cost. */
42286 static void
42287 ix86_finish_cost (void *data, unsigned *prologue_cost,
42288 unsigned *body_cost, unsigned *epilogue_cost)
42290 unsigned *cost = (unsigned *) data;
42291 *prologue_cost = cost[vect_prologue];
42292 *body_cost = cost[vect_body];
42293 *epilogue_cost = cost[vect_epilogue];
42296 /* Implement targetm.vectorize.destroy_cost_data. */
42298 static void
42299 ix86_destroy_cost_data (void *data)
42301 free (data);
42304 /* Validate target specific memory model bits in VAL. */
42306 static unsigned HOST_WIDE_INT
42307 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42309 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42310 bool strong;
42312 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42313 |MEMMODEL_MASK)
42314 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42316 warning (OPT_Winvalid_memory_model,
42317 "Unknown architecture specific memory model");
42318 return MEMMODEL_SEQ_CST;
42320 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42321 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42323 warning (OPT_Winvalid_memory_model,
42324 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42325 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42327 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42329 warning (OPT_Winvalid_memory_model,
42330 "HLE_RELEASE not used with RELEASE or stronger memory model");
42331 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42333 return val;
42336 /* Initialize the GCC target structure. */
42337 #undef TARGET_RETURN_IN_MEMORY
42338 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42340 #undef TARGET_LEGITIMIZE_ADDRESS
42341 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42343 #undef TARGET_ATTRIBUTE_TABLE
42344 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42345 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42346 # undef TARGET_MERGE_DECL_ATTRIBUTES
42347 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42348 #endif
42350 #undef TARGET_COMP_TYPE_ATTRIBUTES
42351 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42353 #undef TARGET_INIT_BUILTINS
42354 #define TARGET_INIT_BUILTINS ix86_init_builtins
42355 #undef TARGET_BUILTIN_DECL
42356 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42357 #undef TARGET_EXPAND_BUILTIN
42358 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42360 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42361 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42362 ix86_builtin_vectorized_function
42364 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42365 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42367 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42368 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42370 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42371 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42373 #undef TARGET_BUILTIN_RECIPROCAL
42374 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42376 #undef TARGET_ASM_FUNCTION_EPILOGUE
42377 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42379 #undef TARGET_ENCODE_SECTION_INFO
42380 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42381 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42382 #else
42383 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42384 #endif
42386 #undef TARGET_ASM_OPEN_PAREN
42387 #define TARGET_ASM_OPEN_PAREN ""
42388 #undef TARGET_ASM_CLOSE_PAREN
42389 #define TARGET_ASM_CLOSE_PAREN ""
42391 #undef TARGET_ASM_BYTE_OP
42392 #define TARGET_ASM_BYTE_OP ASM_BYTE
42394 #undef TARGET_ASM_ALIGNED_HI_OP
42395 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42396 #undef TARGET_ASM_ALIGNED_SI_OP
42397 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42398 #ifdef ASM_QUAD
42399 #undef TARGET_ASM_ALIGNED_DI_OP
42400 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42401 #endif
42403 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42404 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42406 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42407 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42409 #undef TARGET_ASM_UNALIGNED_HI_OP
42410 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42411 #undef TARGET_ASM_UNALIGNED_SI_OP
42412 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42413 #undef TARGET_ASM_UNALIGNED_DI_OP
42414 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42416 #undef TARGET_PRINT_OPERAND
42417 #define TARGET_PRINT_OPERAND ix86_print_operand
42418 #undef TARGET_PRINT_OPERAND_ADDRESS
42419 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42420 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42421 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42422 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42423 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42425 #undef TARGET_SCHED_INIT_GLOBAL
42426 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42427 #undef TARGET_SCHED_ADJUST_COST
42428 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42429 #undef TARGET_SCHED_ISSUE_RATE
42430 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42431 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42432 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42433 ia32_multipass_dfa_lookahead
42435 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42436 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42438 #undef TARGET_MEMMODEL_CHECK
42439 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42441 #ifdef HAVE_AS_TLS
42442 #undef TARGET_HAVE_TLS
42443 #define TARGET_HAVE_TLS true
42444 #endif
42445 #undef TARGET_CANNOT_FORCE_CONST_MEM
42446 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42447 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42448 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42450 #undef TARGET_DELEGITIMIZE_ADDRESS
42451 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42453 #undef TARGET_MS_BITFIELD_LAYOUT_P
42454 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42456 #if TARGET_MACHO
42457 #undef TARGET_BINDS_LOCAL_P
42458 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42459 #endif
42460 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42461 #undef TARGET_BINDS_LOCAL_P
42462 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42463 #endif
42465 #undef TARGET_ASM_OUTPUT_MI_THUNK
42466 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42467 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42468 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42470 #undef TARGET_ASM_FILE_START
42471 #define TARGET_ASM_FILE_START x86_file_start
42473 #undef TARGET_OPTION_OVERRIDE
42474 #define TARGET_OPTION_OVERRIDE ix86_option_override
42476 #undef TARGET_REGISTER_MOVE_COST
42477 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42478 #undef TARGET_MEMORY_MOVE_COST
42479 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42480 #undef TARGET_RTX_COSTS
42481 #define TARGET_RTX_COSTS ix86_rtx_costs
42482 #undef TARGET_ADDRESS_COST
42483 #define TARGET_ADDRESS_COST ix86_address_cost
42485 #undef TARGET_FIXED_CONDITION_CODE_REGS
42486 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42487 #undef TARGET_CC_MODES_COMPATIBLE
42488 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42490 #undef TARGET_MACHINE_DEPENDENT_REORG
42491 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42493 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42494 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42496 #undef TARGET_BUILD_BUILTIN_VA_LIST
42497 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42499 #undef TARGET_FOLD_BUILTIN
42500 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42502 #undef TARGET_COMPARE_VERSION_PRIORITY
42503 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42505 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42506 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42507 ix86_generate_version_dispatcher_body
42509 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42510 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42511 ix86_get_function_versions_dispatcher
42513 #undef TARGET_ENUM_VA_LIST_P
42514 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42516 #undef TARGET_FN_ABI_VA_LIST
42517 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42519 #undef TARGET_CANONICAL_VA_LIST_TYPE
42520 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42522 #undef TARGET_EXPAND_BUILTIN_VA_START
42523 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42525 #undef TARGET_MD_ASM_CLOBBERS
42526 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42528 #undef TARGET_PROMOTE_PROTOTYPES
42529 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42530 #undef TARGET_STRUCT_VALUE_RTX
42531 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42532 #undef TARGET_SETUP_INCOMING_VARARGS
42533 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42534 #undef TARGET_MUST_PASS_IN_STACK
42535 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42536 #undef TARGET_FUNCTION_ARG_ADVANCE
42537 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42538 #undef TARGET_FUNCTION_ARG
42539 #define TARGET_FUNCTION_ARG ix86_function_arg
42540 #undef TARGET_FUNCTION_ARG_BOUNDARY
42541 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42542 #undef TARGET_PASS_BY_REFERENCE
42543 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42544 #undef TARGET_INTERNAL_ARG_POINTER
42545 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42546 #undef TARGET_UPDATE_STACK_BOUNDARY
42547 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42548 #undef TARGET_GET_DRAP_RTX
42549 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42550 #undef TARGET_STRICT_ARGUMENT_NAMING
42551 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42552 #undef TARGET_STATIC_CHAIN
42553 #define TARGET_STATIC_CHAIN ix86_static_chain
42554 #undef TARGET_TRAMPOLINE_INIT
42555 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42556 #undef TARGET_RETURN_POPS_ARGS
42557 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42559 #undef TARGET_LEGITIMATE_COMBINED_INSN
42560 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42562 #undef TARGET_ASAN_SHADOW_OFFSET
42563 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42565 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42566 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42568 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42569 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42571 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42572 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42574 #undef TARGET_C_MODE_FOR_SUFFIX
42575 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42577 #ifdef HAVE_AS_TLS
42578 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42579 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42580 #endif
42582 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42583 #undef TARGET_INSERT_ATTRIBUTES
42584 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42585 #endif
42587 #undef TARGET_MANGLE_TYPE
42588 #define TARGET_MANGLE_TYPE ix86_mangle_type
42590 #if !TARGET_MACHO
42591 #undef TARGET_STACK_PROTECT_FAIL
42592 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42593 #endif
42595 #undef TARGET_FUNCTION_VALUE
42596 #define TARGET_FUNCTION_VALUE ix86_function_value
42598 #undef TARGET_FUNCTION_VALUE_REGNO_P
42599 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42601 #undef TARGET_PROMOTE_FUNCTION_MODE
42602 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42604 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42605 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42607 #undef TARGET_INSTANTIATE_DECLS
42608 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42610 #undef TARGET_SECONDARY_RELOAD
42611 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42613 #undef TARGET_CLASS_MAX_NREGS
42614 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42616 #undef TARGET_PREFERRED_RELOAD_CLASS
42617 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42618 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42619 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42620 #undef TARGET_CLASS_LIKELY_SPILLED_P
42621 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42623 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42624 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42625 ix86_builtin_vectorization_cost
42626 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42627 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42628 ix86_vectorize_vec_perm_const_ok
42629 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42630 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42631 ix86_preferred_simd_mode
42632 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42633 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42634 ix86_autovectorize_vector_sizes
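/* Vectorizer cost-model hooks: init_cost allocates the per-loop cost data,
   add_stmt_cost accumulates the cost of each candidate statement,
   finish_cost reports the totals, and destroy_cost_data releases the
   data again.  */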
42635 #undef TARGET_VECTORIZE_INIT_COST
42636 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42637 #undef TARGET_VECTORIZE_ADD_STMT_COST
42638 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42639 #undef TARGET_VECTORIZE_FINISH_COST
42640 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42641 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42642 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42644 #undef TARGET_SET_CURRENT_FUNCTION
42645 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42647 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42648 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
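/* The option save/restore/print hooks back __attribute__ ((target ("...")))
   and #pragma GCC target, which switch target-specific options on a
   per-function basis.  */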
42650 #undef TARGET_OPTION_SAVE
42651 #define TARGET_OPTION_SAVE ix86_function_specific_save
42653 #undef TARGET_OPTION_RESTORE
42654 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42656 #undef TARGET_OPTION_PRINT
42657 #define TARGET_OPTION_PRINT ix86_function_specific_print
42659 #undef TARGET_OPTION_FUNCTION_VERSIONS
42660 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42662 #undef TARGET_CAN_INLINE_P
42663 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42665 #undef TARGET_EXPAND_TO_RTL_HOOK
42666 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42668 #undef TARGET_LEGITIMATE_ADDRESS_P
42669 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
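/* Returning hook_bool_void_true here selects the LRA register allocator
   in place of classic reload for this target.  */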
42671 #undef TARGET_LRA_P
42672 #define TARGET_LRA_P hook_bool_void_true
42674 #undef TARGET_REGISTER_PRIORITY
42675 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42677 #undef TARGET_LEGITIMATE_CONSTANT_P
42678 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42680 #undef TARGET_FRAME_POINTER_REQUIRED
42681 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42683 #undef TARGET_CAN_ELIMINATE
42684 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42686 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42687 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42689 #undef TARGET_ASM_CODE_END
42690 #define TARGET_ASM_CODE_END ix86_code_end
42692 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42693 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42695 #if TARGET_MACHO
42696 #undef TARGET_INIT_LIBFUNCS
42697 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42698 #endif
42700 #undef TARGET_SPILL_CLASS
42701 #define TARGET_SPILL_CLASS ix86_spill_class
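/* TARGET_INITIALIZER, defined in target-def.h, expands to an initializer
   for struct gcc_target built from the TARGET_* macros above; hooks not
   overridden here keep their documented defaults.  */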
42703 struct gcc_target targetm = TARGET_INITIALIZER;
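/* gt-i386.h is generated by gengtype and provides the garbage-collector
   root tables for the GTY-marked data in this file; it is included at the
   end so that it follows all such declarations.  */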
42705 #include "gt-i386.h"