Implement -mmemcpy-strategy= and -mmemset-strategy= options
[official-gcc.git] / gcc / config / i386 / i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
65 #include "context.h"
66 #include "pass_manager.h"
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
70 static rtx legitimize_pe_coff_symbol (rtx, bool);
72 #ifndef CHECK_STACK_LIMIT
73 #define CHECK_STACK_LIMIT (-1)
74 #endif
76 /* Return index of given mode in mult and division cost tables. */
77 #define MODE_INDEX(mode) \
78 ((mode) == QImode ? 0 \
79 : (mode) == HImode ? 1 \
80 : (mode) == SImode ? 2 \
81 : (mode) == DImode ? 3 \
82 : 4)
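/* Editorial note (illustrative, not part of the original file): MODE_INDEX is
   the column selector for the per-mode arrays in struct processor_costs
   (mult_init[] and divide[] in i386.h), so a lookup such as

     cost->mult_init[MODE_INDEX (SImode)]

   picks the SImode entry (index 2), while any mode other than QI/HI/SI/DI
   falls into the trailing "other" column (index 4).  */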
84 /* Processor costs (relative to an add) */
85 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
86 #define COSTS_N_BYTES(N) ((N) * 2)
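/* Editorial note (illustrative, not part of the original file): with
   COSTS_N_INSNS (N) expanding to (N) * 4 and an add assumed to be 2 bytes,
   the size-tuned table below scores an add as COSTS_N_BYTES (2) == 4, the
   same value COSTS_N_INSNS (1) gives an add in the speed-tuned tables, which
   keeps the byte-based and instruction-based cost scales comparable.  */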
88 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
90 static stringop_algs ix86_size_memcpy[2] = {
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
92 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
93 static stringop_algs ix86_size_memset[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
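/* Editorial note (illustrative, not part of the original file): in each
   stringop_algs pair, element [0] is the 32-bit tuning and element [1] the
   64-bit tuning.  The leading algorithm is used when the block size is not
   known at compile time, and each {max, alg, noalign} entry means "use ALG
   for blocks of up to MAX bytes", with max == -1 terminating the list.
   These per-CPU defaults are what the new -mmemcpy-strategy= and
   -mmemset-strategy= options are meant to override from the command line,
   using a comma-separated list of alg:max_size:dest_align triplets, roughly
   along the lines of (hypothetical values; see the GCC manual for the exact
   algorithm names)

     gcc -O2 -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign foo.c
*/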
97 const
98 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
99 COSTS_N_BYTES (2), /* cost of an add instruction */
100 COSTS_N_BYTES (3), /* cost of a lea instruction */
101 COSTS_N_BYTES (2), /* variable shift costs */
102 COSTS_N_BYTES (3), /* constant shift costs */
103 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
104 COSTS_N_BYTES (3), /* HI */
105 COSTS_N_BYTES (3), /* SI */
106 COSTS_N_BYTES (3), /* DI */
107 COSTS_N_BYTES (5)}, /* other */
108 0, /* cost of multiply per each bit set */
109 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 COSTS_N_BYTES (3), /* cost of movsx */
115 COSTS_N_BYTES (3), /* cost of movzx */
116 0, /* "large" insn */
117 2, /* MOVE_RATIO */
118 2, /* cost for loading QImode using movzbl */
119 {2, 2, 2}, /* cost of loading integer registers
120 in QImode, HImode and SImode.
121 Relative to reg-reg move (2). */
122 {2, 2, 2}, /* cost of storing integer registers */
123 2, /* cost of reg,reg fld/fst */
124 {2, 2, 2}, /* cost of loading fp registers
125 in SFmode, DFmode and XFmode */
126 {2, 2, 2}, /* cost of storing fp registers
127 in SFmode, DFmode and XFmode */
128 3, /* cost of moving MMX register */
129 {3, 3}, /* cost of loading MMX registers
130 in SImode and DImode */
131 {3, 3}, /* cost of storing MMX registers
132 in SImode and DImode */
133 3, /* cost of moving SSE register */
134 {3, 3, 3}, /* cost of loading SSE registers
135 in SImode, DImode and TImode */
136 {3, 3, 3}, /* cost of storing SSE registers
137 in SImode, DImode and TImode */
138 3, /* MMX or SSE register to integer */
139 0, /* size of l1 cache */
140 0, /* size of l2 cache */
141 0, /* size of prefetch block */
142 0, /* number of parallel prefetches */
143 2, /* Branch cost */
144 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
145 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
146 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
147 COSTS_N_BYTES (2), /* cost of FABS instruction. */
148 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
149 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
150 ix86_size_memcpy,
151 ix86_size_memset,
152 1, /* scalar_stmt_cost. */
153 1, /* scalar load_cost. */
154 1, /* scalar_store_cost. */
155 1, /* vec_stmt_cost. */
156 1, /* vec_to_scalar_cost. */
157 1, /* scalar_to_vec_cost. */
158 1, /* vec_align_load_cost. */
159 1, /* vec_unalign_load_cost. */
160 1, /* vec_store_cost. */
161 1, /* cond_taken_branch_cost. */
162 1, /* cond_not_taken_branch_cost. */
163 };
165 /* Processor costs (relative to an add) */
166 static stringop_algs i386_memcpy[2] = {
167 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
168 DUMMY_STRINGOP_ALGS};
169 static stringop_algs i386_memset[2] = {
170 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
171 DUMMY_STRINGOP_ALGS};
173 static const
174 struct processor_costs i386_cost = { /* 386 specific costs */
175 COSTS_N_INSNS (1), /* cost of an add instruction */
176 COSTS_N_INSNS (1), /* cost of a lea instruction */
177 COSTS_N_INSNS (3), /* variable shift costs */
178 COSTS_N_INSNS (2), /* constant shift costs */
179 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
180 COSTS_N_INSNS (6), /* HI */
181 COSTS_N_INSNS (6), /* SI */
182 COSTS_N_INSNS (6), /* DI */
183 COSTS_N_INSNS (6)}, /* other */
184 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
185 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
186 COSTS_N_INSNS (23), /* HI */
187 COSTS_N_INSNS (23), /* SI */
188 COSTS_N_INSNS (23), /* DI */
189 COSTS_N_INSNS (23)}, /* other */
190 COSTS_N_INSNS (3), /* cost of movsx */
191 COSTS_N_INSNS (2), /* cost of movzx */
192 15, /* "large" insn */
193 3, /* MOVE_RATIO */
194 4, /* cost for loading QImode using movzbl */
195 {2, 4, 2}, /* cost of loading integer registers
196 in QImode, HImode and SImode.
197 Relative to reg-reg move (2). */
198 {2, 4, 2}, /* cost of storing integer registers */
199 2, /* cost of reg,reg fld/fst */
200 {8, 8, 8}, /* cost of loading fp registers
201 in SFmode, DFmode and XFmode */
202 {8, 8, 8}, /* cost of storing fp registers
203 in SFmode, DFmode and XFmode */
204 2, /* cost of moving MMX register */
205 {4, 8}, /* cost of loading MMX registers
206 in SImode and DImode */
207 {4, 8}, /* cost of storing MMX registers
208 in SImode and DImode */
209 2, /* cost of moving SSE register */
210 {4, 8, 16}, /* cost of loading SSE registers
211 in SImode, DImode and TImode */
212 {4, 8, 16}, /* cost of storing SSE registers
213 in SImode, DImode and TImode */
214 3, /* MMX or SSE register to integer */
215 0, /* size of l1 cache */
216 0, /* size of l2 cache */
217 0, /* size of prefetch block */
218 0, /* number of parallel prefetches */
219 1, /* Branch cost */
220 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
221 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
222 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
223 COSTS_N_INSNS (22), /* cost of FABS instruction. */
224 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
225 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
226 i386_memcpy,
227 i386_memset,
228 1, /* scalar_stmt_cost. */
229 1, /* scalar load_cost. */
230 1, /* scalar_store_cost. */
231 1, /* vec_stmt_cost. */
232 1, /* vec_to_scalar_cost. */
233 1, /* scalar_to_vec_cost. */
234 1, /* vec_align_load_cost. */
235 2, /* vec_unalign_load_cost. */
236 1, /* vec_store_cost. */
237 3, /* cond_taken_branch_cost. */
238 1, /* cond_not_taken_branch_cost. */
239 };
241 static stringop_algs i486_memcpy[2] = {
242 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
243 DUMMY_STRINGOP_ALGS};
244 static stringop_algs i486_memset[2] = {
245 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
246 DUMMY_STRINGOP_ALGS};
248 static const
249 struct processor_costs i486_cost = { /* 486 specific costs */
250 COSTS_N_INSNS (1), /* cost of an add instruction */
251 COSTS_N_INSNS (1), /* cost of a lea instruction */
252 COSTS_N_INSNS (3), /* variable shift costs */
253 COSTS_N_INSNS (2), /* constant shift costs */
254 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
255 COSTS_N_INSNS (12), /* HI */
256 COSTS_N_INSNS (12), /* SI */
257 COSTS_N_INSNS (12), /* DI */
258 COSTS_N_INSNS (12)}, /* other */
259 1, /* cost of multiply per each bit set */
260 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
261 COSTS_N_INSNS (40), /* HI */
262 COSTS_N_INSNS (40), /* SI */
263 COSTS_N_INSNS (40), /* DI */
264 COSTS_N_INSNS (40)}, /* other */
265 COSTS_N_INSNS (3), /* cost of movsx */
266 COSTS_N_INSNS (2), /* cost of movzx */
267 15, /* "large" insn */
268 3, /* MOVE_RATIO */
269 4, /* cost for loading QImode using movzbl */
270 {2, 4, 2}, /* cost of loading integer registers
271 in QImode, HImode and SImode.
272 Relative to reg-reg move (2). */
273 {2, 4, 2}, /* cost of storing integer registers */
274 2, /* cost of reg,reg fld/fst */
275 {8, 8, 8}, /* cost of loading fp registers
276 in SFmode, DFmode and XFmode */
277 {8, 8, 8}, /* cost of storing fp registers
278 in SFmode, DFmode and XFmode */
279 2, /* cost of moving MMX register */
280 {4, 8}, /* cost of loading MMX registers
281 in SImode and DImode */
282 {4, 8}, /* cost of storing MMX registers
283 in SImode and DImode */
284 2, /* cost of moving SSE register */
285 {4, 8, 16}, /* cost of loading SSE registers
286 in SImode, DImode and TImode */
287 {4, 8, 16}, /* cost of storing SSE registers
288 in SImode, DImode and TImode */
289 3, /* MMX or SSE register to integer */
290 4, /* size of l1 cache. 486 has 8kB cache
291 shared for code and data, so 4kB is
292 not really precise. */
293 4, /* size of l2 cache */
294 0, /* size of prefetch block */
295 0, /* number of parallel prefetches */
296 1, /* Branch cost */
297 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
298 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
299 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
300 COSTS_N_INSNS (3), /* cost of FABS instruction. */
301 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
302 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
303 i486_memcpy,
304 i486_memset,
305 1, /* scalar_stmt_cost. */
306 1, /* scalar load_cost. */
307 1, /* scalar_store_cost. */
308 1, /* vec_stmt_cost. */
309 1, /* vec_to_scalar_cost. */
310 1, /* scalar_to_vec_cost. */
311 1, /* vec_align_load_cost. */
312 2, /* vec_unalign_load_cost. */
313 1, /* vec_store_cost. */
314 3, /* cond_taken_branch_cost. */
315 1, /* cond_not_taken_branch_cost. */
316 };
318 static stringop_algs pentium_memcpy[2] = {
319 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
320 DUMMY_STRINGOP_ALGS};
321 static stringop_algs pentium_memset[2] = {
322 {libcall, {{-1, rep_prefix_4_byte, false}}},
323 DUMMY_STRINGOP_ALGS};
325 static const
326 struct processor_costs pentium_cost = {
327 COSTS_N_INSNS (1), /* cost of an add instruction */
328 COSTS_N_INSNS (1), /* cost of a lea instruction */
329 COSTS_N_INSNS (4), /* variable shift costs */
330 COSTS_N_INSNS (1), /* constant shift costs */
331 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
332 COSTS_N_INSNS (11), /* HI */
333 COSTS_N_INSNS (11), /* SI */
334 COSTS_N_INSNS (11), /* DI */
335 COSTS_N_INSNS (11)}, /* other */
336 0, /* cost of multiply per each bit set */
337 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
338 COSTS_N_INSNS (25), /* HI */
339 COSTS_N_INSNS (25), /* SI */
340 COSTS_N_INSNS (25), /* DI */
341 COSTS_N_INSNS (25)}, /* other */
342 COSTS_N_INSNS (3), /* cost of movsx */
343 COSTS_N_INSNS (2), /* cost of movzx */
344 8, /* "large" insn */
345 6, /* MOVE_RATIO */
346 6, /* cost for loading QImode using movzbl */
347 {2, 4, 2}, /* cost of loading integer registers
348 in QImode, HImode and SImode.
349 Relative to reg-reg move (2). */
350 {2, 4, 2}, /* cost of storing integer registers */
351 2, /* cost of reg,reg fld/fst */
352 {2, 2, 6}, /* cost of loading fp registers
353 in SFmode, DFmode and XFmode */
354 {4, 4, 6}, /* cost of storing fp registers
355 in SFmode, DFmode and XFmode */
356 8, /* cost of moving MMX register */
357 {8, 8}, /* cost of loading MMX registers
358 in SImode and DImode */
359 {8, 8}, /* cost of storing MMX registers
360 in SImode and DImode */
361 2, /* cost of moving SSE register */
362 {4, 8, 16}, /* cost of loading SSE registers
363 in SImode, DImode and TImode */
364 {4, 8, 16}, /* cost of storing SSE registers
365 in SImode, DImode and TImode */
366 3, /* MMX or SSE register to integer */
367 8, /* size of l1 cache. */
368 8, /* size of l2 cache */
369 0, /* size of prefetch block */
370 0, /* number of parallel prefetches */
371 2, /* Branch cost */
372 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
373 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
374 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
375 COSTS_N_INSNS (1), /* cost of FABS instruction. */
376 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
377 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
378 pentium_memcpy,
379 pentium_memset,
380 1, /* scalar_stmt_cost. */
381 1, /* scalar load_cost. */
382 1, /* scalar_store_cost. */
383 1, /* vec_stmt_cost. */
384 1, /* vec_to_scalar_cost. */
385 1, /* scalar_to_vec_cost. */
386 1, /* vec_align_load_cost. */
387 2, /* vec_unalign_load_cost. */
388 1, /* vec_store_cost. */
389 3, /* cond_taken_branch_cost. */
390 1, /* cond_not_taken_branch_cost. */
391 };
393 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
394 (we ensure the alignment). For small blocks an inline loop is still a
395 noticeable win; for bigger blocks either rep movsl or rep movsb is the
396 way to go. Rep movsb apparently has a more expensive startup time in the
397 CPU, but after 4K the difference is down in the noise. */
398 static stringop_algs pentiumpro_memcpy[2] = {
399 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
400 {8192, rep_prefix_4_byte, false},
401 {-1, rep_prefix_1_byte, false}}},
402 DUMMY_STRINGOP_ALGS};
403 static stringop_algs pentiumpro_memset[2] = {
404 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
405 {8192, rep_prefix_4_byte, false},
406 {-1, libcall, false}}},
407 DUMMY_STRINGOP_ALGS};
408 static const
409 struct processor_costs pentiumpro_cost = {
410 COSTS_N_INSNS (1), /* cost of an add instruction */
411 COSTS_N_INSNS (1), /* cost of a lea instruction */
412 COSTS_N_INSNS (1), /* variable shift costs */
413 COSTS_N_INSNS (1), /* constant shift costs */
414 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
415 COSTS_N_INSNS (4), /* HI */
416 COSTS_N_INSNS (4), /* SI */
417 COSTS_N_INSNS (4), /* DI */
418 COSTS_N_INSNS (4)}, /* other */
419 0, /* cost of multiply per each bit set */
420 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
421 COSTS_N_INSNS (17), /* HI */
422 COSTS_N_INSNS (17), /* SI */
423 COSTS_N_INSNS (17), /* DI */
424 COSTS_N_INSNS (17)}, /* other */
425 COSTS_N_INSNS (1), /* cost of movsx */
426 COSTS_N_INSNS (1), /* cost of movzx */
427 8, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 4, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 2, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of storing fp registers
438 in SFmode, DFmode and XFmode */
439 2, /* cost of moving MMX register */
440 {2, 2}, /* cost of loading MMX registers
441 in SImode and DImode */
442 {2, 2}, /* cost of storing MMX registers
443 in SImode and DImode */
444 2, /* cost of moving SSE register */
445 {2, 2, 8}, /* cost of loading SSE registers
446 in SImode, DImode and TImode */
447 {2, 2, 8}, /* cost of storing SSE registers
448 in SImode, DImode and TImode */
449 3, /* MMX or SSE register to integer */
450 8, /* size of l1 cache. */
451 256, /* size of l2 cache */
452 32, /* size of prefetch block */
453 6, /* number of parallel prefetches */
454 2, /* Branch cost */
455 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
456 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
457 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
458 COSTS_N_INSNS (2), /* cost of FABS instruction. */
459 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
460 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
461 pentiumpro_memcpy,
462 pentiumpro_memset,
463 1, /* scalar_stmt_cost. */
464 1, /* scalar load_cost. */
465 1, /* scalar_store_cost. */
466 1, /* vec_stmt_cost. */
467 1, /* vec_to_scalar_cost. */
468 1, /* scalar_to_vec_cost. */
469 1, /* vec_align_load_cost. */
470 2, /* vec_unalign_load_cost. */
471 1, /* vec_store_cost. */
472 3, /* cond_taken_branch_cost. */
473 1, /* cond_not_taken_branch_cost. */
474 };
476 static stringop_algs geode_memcpy[2] = {
477 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
478 DUMMY_STRINGOP_ALGS};
479 static stringop_algs geode_memset[2] = {
480 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static const
483 struct processor_costs geode_cost = {
484 COSTS_N_INSNS (1), /* cost of an add instruction */
485 COSTS_N_INSNS (1), /* cost of a lea instruction */
486 COSTS_N_INSNS (2), /* variable shift costs */
487 COSTS_N_INSNS (1), /* constant shift costs */
488 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
489 COSTS_N_INSNS (4), /* HI */
490 COSTS_N_INSNS (7), /* SI */
491 COSTS_N_INSNS (7), /* DI */
492 COSTS_N_INSNS (7)}, /* other */
493 0, /* cost of multiply per each bit set */
494 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
495 COSTS_N_INSNS (23), /* HI */
496 COSTS_N_INSNS (39), /* SI */
497 COSTS_N_INSNS (39), /* DI */
498 COSTS_N_INSNS (39)}, /* other */
499 COSTS_N_INSNS (1), /* cost of movsx */
500 COSTS_N_INSNS (1), /* cost of movzx */
501 8, /* "large" insn */
502 4, /* MOVE_RATIO */
503 1, /* cost for loading QImode using movzbl */
504 {1, 1, 1}, /* cost of loading integer registers
505 in QImode, HImode and SImode.
506 Relative to reg-reg move (2). */
507 {1, 1, 1}, /* cost of storing integer registers */
508 1, /* cost of reg,reg fld/fst */
509 {1, 1, 1}, /* cost of loading fp registers
510 in SFmode, DFmode and XFmode */
511 {4, 6, 6}, /* cost of storing fp registers
512 in SFmode, DFmode and XFmode */
514 1, /* cost of moving MMX register */
515 {1, 1}, /* cost of loading MMX registers
516 in SImode and DImode */
517 {1, 1}, /* cost of storing MMX registers
518 in SImode and DImode */
519 1, /* cost of moving SSE register */
520 {1, 1, 1}, /* cost of loading SSE registers
521 in SImode, DImode and TImode */
522 {1, 1, 1}, /* cost of storing SSE registers
523 in SImode, DImode and TImode */
524 1, /* MMX or SSE register to integer */
525 64, /* size of l1 cache. */
526 128, /* size of l2 cache. */
527 32, /* size of prefetch block */
528 1, /* number of parallel prefetches */
529 1, /* Branch cost */
530 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
531 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
532 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
533 COSTS_N_INSNS (1), /* cost of FABS instruction. */
534 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
535 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
536 geode_memcpy,
537 geode_memset,
538 1, /* scalar_stmt_cost. */
539 1, /* scalar load_cost. */
540 1, /* scalar_store_cost. */
541 1, /* vec_stmt_cost. */
542 1, /* vec_to_scalar_cost. */
543 1, /* scalar_to_vec_cost. */
544 1, /* vec_align_load_cost. */
545 2, /* vec_unalign_load_cost. */
546 1, /* vec_store_cost. */
547 3, /* cond_taken_branch_cost. */
548 1, /* cond_not_taken_branch_cost. */
549 };
551 static stringop_algs k6_memcpy[2] = {
552 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
553 DUMMY_STRINGOP_ALGS};
554 static stringop_algs k6_memset[2] = {
555 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
556 DUMMY_STRINGOP_ALGS};
557 static const
558 struct processor_costs k6_cost = {
559 COSTS_N_INSNS (1), /* cost of an add instruction */
560 COSTS_N_INSNS (2), /* cost of a lea instruction */
561 COSTS_N_INSNS (1), /* variable shift costs */
562 COSTS_N_INSNS (1), /* constant shift costs */
563 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
564 COSTS_N_INSNS (3), /* HI */
565 COSTS_N_INSNS (3), /* SI */
566 COSTS_N_INSNS (3), /* DI */
567 COSTS_N_INSNS (3)}, /* other */
568 0, /* cost of multiply per each bit set */
569 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
570 COSTS_N_INSNS (18), /* HI */
571 COSTS_N_INSNS (18), /* SI */
572 COSTS_N_INSNS (18), /* DI */
573 COSTS_N_INSNS (18)}, /* other */
574 COSTS_N_INSNS (2), /* cost of movsx */
575 COSTS_N_INSNS (2), /* cost of movzx */
576 8, /* "large" insn */
577 4, /* MOVE_RATIO */
578 3, /* cost for loading QImode using movzbl */
579 {4, 5, 4}, /* cost of loading integer registers
580 in QImode, HImode and SImode.
581 Relative to reg-reg move (2). */
582 {2, 3, 2}, /* cost of storing integer registers */
583 4, /* cost of reg,reg fld/fst */
584 {6, 6, 6}, /* cost of loading fp registers
585 in SFmode, DFmode and XFmode */
586 {4, 4, 4}, /* cost of storing fp registers
587 in SFmode, DFmode and XFmode */
588 2, /* cost of moving MMX register */
589 {2, 2}, /* cost of loading MMX registers
590 in SImode and DImode */
591 {2, 2}, /* cost of storing MMX registers
592 in SImode and DImode */
593 2, /* cost of moving SSE register */
594 {2, 2, 8}, /* cost of loading SSE registers
595 in SImode, DImode and TImode */
596 {2, 2, 8}, /* cost of storing SSE registers
597 in SImode, DImode and TImode */
598 6, /* MMX or SSE register to integer */
599 32, /* size of l1 cache. */
600 32, /* size of l2 cache. Some models
601 have integrated l2 cache, but
602 optimizing for k6 is not important
603 enough to worry about that. */
604 32, /* size of prefetch block */
605 1, /* number of parallel prefetches */
606 1, /* Branch cost */
607 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
608 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
609 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
612 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
613 k6_memcpy,
614 k6_memset,
615 1, /* scalar_stmt_cost. */
616 1, /* scalar load_cost. */
617 1, /* scalar_store_cost. */
618 1, /* vec_stmt_cost. */
619 1, /* vec_to_scalar_cost. */
620 1, /* scalar_to_vec_cost. */
621 1, /* vec_align_load_cost. */
622 2, /* vec_unalign_load_cost. */
623 1, /* vec_store_cost. */
624 3, /* cond_taken_branch_cost. */
625 1, /* cond_not_taken_branch_cost. */
626 };
628 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
629 than K8 does. Alignment becomes important after 8 bytes for memcpy and
630 128 bytes for memset. */
631 static stringop_algs athlon_memcpy[2] = {
632 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
633 DUMMY_STRINGOP_ALGS};
634 static stringop_algs athlon_memset[2] = {
635 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
636 DUMMY_STRINGOP_ALGS};
637 static const
638 struct processor_costs athlon_cost = {
639 COSTS_N_INSNS (1), /* cost of an add instruction */
640 COSTS_N_INSNS (2), /* cost of a lea instruction */
641 COSTS_N_INSNS (1), /* variable shift costs */
642 COSTS_N_INSNS (1), /* constant shift costs */
643 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
644 COSTS_N_INSNS (5), /* HI */
645 COSTS_N_INSNS (5), /* SI */
646 COSTS_N_INSNS (5), /* DI */
647 COSTS_N_INSNS (5)}, /* other */
648 0, /* cost of multiply per each bit set */
649 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
650 COSTS_N_INSNS (26), /* HI */
651 COSTS_N_INSNS (42), /* SI */
652 COSTS_N_INSNS (74), /* DI */
653 COSTS_N_INSNS (74)}, /* other */
654 COSTS_N_INSNS (1), /* cost of movsx */
655 COSTS_N_INSNS (1), /* cost of movzx */
656 8, /* "large" insn */
657 9, /* MOVE_RATIO */
658 4, /* cost for loading QImode using movzbl */
659 {3, 4, 3}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {3, 4, 3}, /* cost of storing integer registers */
663 4, /* cost of reg,reg fld/fst */
664 {4, 4, 12}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {6, 6, 8}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {4, 4}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {4, 4}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, /* cost of moving SSE register */
674 {4, 4, 6}, /* cost of loading SSE registers
675 in SImode, DImode and TImode */
676 {4, 4, 5}, /* cost of storing SSE registers
677 in SImode, DImode and TImode */
678 5, /* MMX or SSE register to integer */
679 64, /* size of l1 cache. */
680 256, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 6, /* number of parallel prefetches */
683 5, /* Branch cost */
684 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
685 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
686 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
687 COSTS_N_INSNS (2), /* cost of FABS instruction. */
688 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
689 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
690 athlon_memcpy,
691 athlon_memset,
692 1, /* scalar_stmt_cost. */
693 1, /* scalar load_cost. */
694 1, /* scalar_store_cost. */
695 1, /* vec_stmt_cost. */
696 1, /* vec_to_scalar_cost. */
697 1, /* scalar_to_vec_cost. */
698 1, /* vec_align_load_cost. */
699 2, /* vec_unalign_load_cost. */
700 1, /* vec_store_cost. */
701 3, /* cond_taken_branch_cost. */
702 1, /* cond_not_taken_branch_cost. */
703 };
705 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
706 small blocks it is better to use a loop. For large blocks, a libcall can
707 do non-temporal accesses and beat the inline code considerably. */
708 static stringop_algs k8_memcpy[2] = {
709 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
710 {-1, rep_prefix_4_byte, false}}},
711 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
712 {-1, libcall, false}}}};
713 static stringop_algs k8_memset[2] = {
714 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
715 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
716 {libcall, {{48, unrolled_loop, false},
717 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
718 static const
719 struct processor_costs k8_cost = {
720 COSTS_N_INSNS (1), /* cost of an add instruction */
721 COSTS_N_INSNS (2), /* cost of a lea instruction */
722 COSTS_N_INSNS (1), /* variable shift costs */
723 COSTS_N_INSNS (1), /* constant shift costs */
724 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
725 COSTS_N_INSNS (4), /* HI */
726 COSTS_N_INSNS (3), /* SI */
727 COSTS_N_INSNS (4), /* DI */
728 COSTS_N_INSNS (5)}, /* other */
729 0, /* cost of multiply per each bit set */
730 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
731 COSTS_N_INSNS (26), /* HI */
732 COSTS_N_INSNS (42), /* SI */
733 COSTS_N_INSNS (74), /* DI */
734 COSTS_N_INSNS (74)}, /* other */
735 COSTS_N_INSNS (1), /* cost of movsx */
736 COSTS_N_INSNS (1), /* cost of movzx */
737 8, /* "large" insn */
738 9, /* MOVE_RATIO */
739 4, /* cost for loading QImode using movzbl */
740 {3, 4, 3}, /* cost of loading integer registers
741 in QImode, HImode and SImode.
742 Relative to reg-reg move (2). */
743 {3, 4, 3}, /* cost of storing integer registers */
744 4, /* cost of reg,reg fld/fst */
745 {4, 4, 12}, /* cost of loading fp registers
746 in SFmode, DFmode and XFmode */
747 {6, 6, 8}, /* cost of storing fp registers
748 in SFmode, DFmode and XFmode */
749 2, /* cost of moving MMX register */
750 {3, 3}, /* cost of loading MMX registers
751 in SImode and DImode */
752 {4, 4}, /* cost of storing MMX registers
753 in SImode and DImode */
754 2, /* cost of moving SSE register */
755 {4, 3, 6}, /* cost of loading SSE registers
756 in SImode, DImode and TImode */
757 {4, 4, 5}, /* cost of storing SSE registers
758 in SImode, DImode and TImode */
759 5, /* MMX or SSE register to integer */
760 64, /* size of l1 cache. */
761 512, /* size of l2 cache. */
762 64, /* size of prefetch block */
763 /* New AMD processors never drop prefetches; if they cannot be performed
764 immediately, they are queued. We set number of simultaneous prefetches
765 to a large constant to reflect this (it probably is not a good idea not
766 to limit number of prefetches at all, as their execution also takes some
767 time). */
768 100, /* number of parallel prefetches */
769 3, /* Branch cost */
770 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
771 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
772 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
773 COSTS_N_INSNS (2), /* cost of FABS instruction. */
774 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
775 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
777 k8_memcpy,
778 k8_memset,
779 4, /* scalar_stmt_cost. */
780 2, /* scalar load_cost. */
781 2, /* scalar_store_cost. */
782 5, /* vec_stmt_cost. */
783 0, /* vec_to_scalar_cost. */
784 2, /* scalar_to_vec_cost. */
785 2, /* vec_align_load_cost. */
786 3, /* vec_unalign_load_cost. */
787 3, /* vec_store_cost. */
788 3, /* cond_taken_branch_cost. */
789 2, /* cond_not_taken_branch_cost. */
790 };
792 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
793 very small blocks it is better to use a loop. For large blocks, a libcall can
794 do non-temporal accesses and beat the inline code considerably. */
795 static stringop_algs amdfam10_memcpy[2] = {
796 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
797 {-1, rep_prefix_4_byte, false}}},
798 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
800 static stringop_algs amdfam10_memset[2] = {
801 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
802 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
803 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 struct processor_costs amdfam10_cost = {
806 COSTS_N_INSNS (1), /* cost of an add instruction */
807 COSTS_N_INSNS (2), /* cost of a lea instruction */
808 COSTS_N_INSNS (1), /* variable shift costs */
809 COSTS_N_INSNS (1), /* constant shift costs */
810 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
811 COSTS_N_INSNS (4), /* HI */
812 COSTS_N_INSNS (3), /* SI */
813 COSTS_N_INSNS (4), /* DI */
814 COSTS_N_INSNS (5)}, /* other */
815 0, /* cost of multiply per each bit set */
816 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
817 COSTS_N_INSNS (35), /* HI */
818 COSTS_N_INSNS (51), /* SI */
819 COSTS_N_INSNS (83), /* DI */
820 COSTS_N_INSNS (83)}, /* other */
821 COSTS_N_INSNS (1), /* cost of movsx */
822 COSTS_N_INSNS (1), /* cost of movzx */
823 8, /* "large" insn */
824 9, /* MOVE_RATIO */
825 4, /* cost for loading QImode using movzbl */
826 {3, 4, 3}, /* cost of loading integer registers
827 in QImode, HImode and SImode.
828 Relative to reg-reg move (2). */
829 {3, 4, 3}, /* cost of storing integer registers */
830 4, /* cost of reg,reg fld/fst */
831 {4, 4, 12}, /* cost of loading fp registers
832 in SFmode, DFmode and XFmode */
833 {6, 6, 8}, /* cost of storing fp registers
834 in SFmode, DFmode and XFmode */
835 2, /* cost of moving MMX register */
836 {3, 3}, /* cost of loading MMX registers
837 in SImode and DImode */
838 {4, 4}, /* cost of storing MMX registers
839 in SImode and DImode */
840 2, /* cost of moving SSE register */
841 {4, 4, 3}, /* cost of loading SSE registers
842 in SImode, DImode and TImode */
843 {4, 4, 5}, /* cost of storing SSE registers
844 in SImode, DImode and TImode */
845 3, /* MMX or SSE register to integer */
846 /* On K8:
847 MOVD reg64, xmmreg Double FSTORE 4
848 MOVD reg32, xmmreg Double FSTORE 4
849 On AMDFAM10:
850 MOVD reg64, xmmreg Double FADD 3
851 1/1 1/1
852 MOVD reg32, xmmreg Double FADD 3
853 1/1 1/1 */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set number of simultaneous prefetches
859 to a large constant to reflect this (it probably is not a good idea not
860 to limit number of prefetches at all, as their execution also takes some
861 time). */
862 100, /* number of parallel prefetches */
863 2, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
871 amdfam10_memcpy,
872 amdfam10_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 6, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 2, /* vec_unalign_load_cost. */
881 2, /* vec_store_cost. */
882 2, /* cond_taken_branch_cost. */
883 1, /* cond_not_taken_branch_cost. */
884 };
886 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
887 very small blocks it is better to use a loop. For large blocks, a libcall
888 can do non-temporal accesses and beat the inline code considerably. */
889 static stringop_algs bdver1_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs bdver1_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
900 const struct processor_costs bdver1_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (1), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (4), /* SI */
908 COSTS_N_INSNS (6), /* DI */
909 COSTS_N_INSNS (6)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {5, 5, 4}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {4, 4, 4}, /* cost of storing integer registers */
925 2, /* cost of reg,reg fld/fst */
926 {5, 5, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {4, 4, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {4, 4}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 4}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 4}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 2, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 16, /* size of l1 cache. */
950 2048, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set number of simultaneous prefetches
954 to a large constant to reflect this (it probably is not a good idea not
955 to limit number of prefetches at all, as their execution also takes some
956 time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
966 bdver1_memcpy,
967 bdver1_memset,
968 6, /* scalar_stmt_cost. */
969 4, /* scalar load_cost. */
970 4, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 4, /* vec_align_load_cost. */
975 4, /* vec_unalign_load_cost. */
976 4, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
979 };
981 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
982 very small blocks it is better to use a loop. For large blocks, a libcall
983 can do non-temporal accesses and beat the inline code considerably. */
985 static stringop_algs bdver2_memcpy[2] = {
986 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
987 {-1, rep_prefix_4_byte, false}}},
988 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990 static stringop_algs bdver2_memset[2] = {
991 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
992 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
993 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
994 {-1, libcall, false}}}};
996 const struct processor_costs bdver2_cost = {
997 COSTS_N_INSNS (1), /* cost of an add instruction */
998 COSTS_N_INSNS (1), /* cost of a lea instruction */
999 COSTS_N_INSNS (1), /* variable shift costs */
1000 COSTS_N_INSNS (1), /* constant shift costs */
1001 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1002 COSTS_N_INSNS (4), /* HI */
1003 COSTS_N_INSNS (4), /* SI */
1004 COSTS_N_INSNS (6), /* DI */
1005 COSTS_N_INSNS (6)}, /* other */
1006 0, /* cost of multiply per each bit set */
1007 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1008 COSTS_N_INSNS (35), /* HI */
1009 COSTS_N_INSNS (51), /* SI */
1010 COSTS_N_INSNS (83), /* DI */
1011 COSTS_N_INSNS (83)}, /* other */
1012 COSTS_N_INSNS (1), /* cost of movsx */
1013 COSTS_N_INSNS (1), /* cost of movzx */
1014 8, /* "large" insn */
1015 9, /* MOVE_RATIO */
1016 4, /* cost for loading QImode using movzbl */
1017 {5, 5, 4}, /* cost of loading integer registers
1018 in QImode, HImode and SImode.
1019 Relative to reg-reg move (2). */
1020 {4, 4, 4}, /* cost of storing integer registers */
1021 2, /* cost of reg,reg fld/fst */
1022 {5, 5, 12}, /* cost of loading fp registers
1023 in SFmode, DFmode and XFmode */
1024 {4, 4, 8}, /* cost of storing fp registers
1025 in SFmode, DFmode and XFmode */
1026 2, /* cost of moving MMX register */
1027 {4, 4}, /* cost of loading MMX registers
1028 in SImode and DImode */
1029 {4, 4}, /* cost of storing MMX registers
1030 in SImode and DImode */
1031 2, /* cost of moving SSE register */
1032 {4, 4, 4}, /* cost of loading SSE registers
1033 in SImode, DImode and TImode */
1034 {4, 4, 4}, /* cost of storing SSE registers
1035 in SImode, DImode and TImode */
1036 2, /* MMX or SSE register to integer */
1037 /* On K8:
1038 MOVD reg64, xmmreg Double FSTORE 4
1039 MOVD reg32, xmmreg Double FSTORE 4
1040 On AMDFAM10:
1041 MOVD reg64, xmmreg Double FADD 3
1042 1/1 1/1
1043 MOVD reg32, xmmreg Double FADD 3
1044 1/1 1/1 */
1045 16, /* size of l1 cache. */
1046 2048, /* size of l2 cache. */
1047 64, /* size of prefetch block */
1048 /* New AMD processors never drop prefetches; if they cannot be performed
1049 immediately, they are queued. We set number of simultaneous prefetches
1050 to a large constant to reflect this (it probably is not a good idea not
1051 to limit number of prefetches at all, as their execution also takes some
1052 time). */
1053 100, /* number of parallel prefetches */
1054 2, /* Branch cost */
1055 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1056 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1057 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1058 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1059 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1060 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1062 bdver2_memcpy,
1063 bdver2_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 2, /* cond_taken_branch_cost. */
1074 1, /* cond_not_taken_branch_cost. */
1075 };
1078 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1079 very small blocks it is better to use a loop. For large blocks, a libcall
1080 can do non-temporal accesses and beat the inline code considerably. */
1081 static stringop_algs bdver3_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086 static stringop_algs bdver3_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091 struct processor_costs bdver3_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 16, /* size of l1 cache. */
1133 2048, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 2, /* Branch cost */
1142 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1149 bdver3_memcpy,
1150 bdver3_memset,
1151 6, /* scalar_stmt_cost. */
1152 4, /* scalar load_cost. */
1153 4, /* scalar_store_cost. */
1154 6, /* vec_stmt_cost. */
1155 0, /* vec_to_scalar_cost. */
1156 2, /* scalar_to_vec_cost. */
1157 4, /* vec_align_load_cost. */
1158 4, /* vec_unalign_load_cost. */
1159 4, /* vec_store_cost. */
1160 2, /* cond_taken_branch_cost. */
1161 1, /* cond_not_taken_branch_cost. */
1162 };
1164 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1165 very small blocks it is better to use a loop. For large blocks, a libcall can
1166 do non-temporal accesses and beat the inline code considerably. */
1167 static stringop_algs btver1_memcpy[2] = {
1168 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1169 {-1, rep_prefix_4_byte, false}}},
1170 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
1172 static stringop_algs btver1_memset[2] = {
1173 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1174 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1175 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 const struct processor_costs btver1_cost = {
1178 COSTS_N_INSNS (1), /* cost of an add instruction */
1179 COSTS_N_INSNS (2), /* cost of a lea instruction */
1180 COSTS_N_INSNS (1), /* variable shift costs */
1181 COSTS_N_INSNS (1), /* constant shift costs */
1182 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1183 COSTS_N_INSNS (4), /* HI */
1184 COSTS_N_INSNS (3), /* SI */
1185 COSTS_N_INSNS (4), /* DI */
1186 COSTS_N_INSNS (5)}, /* other */
1187 0, /* cost of multiply per each bit set */
1188 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1189 COSTS_N_INSNS (35), /* HI */
1190 COSTS_N_INSNS (51), /* SI */
1191 COSTS_N_INSNS (83), /* DI */
1192 COSTS_N_INSNS (83)}, /* other */
1193 COSTS_N_INSNS (1), /* cost of movsx */
1194 COSTS_N_INSNS (1), /* cost of movzx */
1195 8, /* "large" insn */
1196 9, /* MOVE_RATIO */
1197 4, /* cost for loading QImode using movzbl */
1198 {3, 4, 3}, /* cost of loading integer registers
1199 in QImode, HImode and SImode.
1200 Relative to reg-reg move (2). */
1201 {3, 4, 3}, /* cost of storing integer registers */
1202 4, /* cost of reg,reg fld/fst */
1203 {4, 4, 12}, /* cost of loading fp registers
1204 in SFmode, DFmode and XFmode */
1205 {6, 6, 8}, /* cost of storing fp registers
1206 in SFmode, DFmode and XFmode */
1207 2, /* cost of moving MMX register */
1208 {3, 3}, /* cost of loading MMX registers
1209 in SImode and DImode */
1210 {4, 4}, /* cost of storing MMX registers
1211 in SImode and DImode */
1212 2, /* cost of moving SSE register */
1213 {4, 4, 3}, /* cost of loading SSE registers
1214 in SImode, DImode and TImode */
1215 {4, 4, 5}, /* cost of storing SSE registers
1216 in SImode, DImode and TImode */
1217 3, /* MMX or SSE register to integer */
1218 /* On K8:
1219 MOVD reg64, xmmreg Double FSTORE 4
1220 MOVD reg32, xmmreg Double FSTORE 4
1221 On AMDFAM10:
1222 MOVD reg64, xmmreg Double FADD 3
1223 1/1 1/1
1224 MOVD reg32, xmmreg Double FADD 3
1225 1/1 1/1 */
1226 32, /* size of l1 cache. */
1227 512, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 100, /* number of parallel prefetches */
1230 2, /* Branch cost */
1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1238 btver1_memcpy,
1239 btver1_memset,
1240 4, /* scalar_stmt_cost. */
1241 2, /* scalar load_cost. */
1242 2, /* scalar_store_cost. */
1243 6, /* vec_stmt_cost. */
1244 0, /* vec_to_scalar_cost. */
1245 2, /* scalar_to_vec_cost. */
1246 2, /* vec_align_load_cost. */
1247 2, /* vec_unalign_load_cost. */
1248 2, /* vec_store_cost. */
1249 2, /* cond_taken_branch_cost. */
1250 1, /* cond_not_taken_branch_cost. */
1251 };
1253 static stringop_algs btver2_memcpy[2] = {
1254 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1255 {-1, rep_prefix_4_byte, false}}},
1256 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1257 {-1, libcall, false}}}};
1258 static stringop_algs btver2_memset[2] = {
1259 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1260 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1261 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 const struct processor_costs btver2_cost = {
1264 COSTS_N_INSNS (1), /* cost of an add instruction */
1265 COSTS_N_INSNS (2), /* cost of a lea instruction */
1266 COSTS_N_INSNS (1), /* variable shift costs */
1267 COSTS_N_INSNS (1), /* constant shift costs */
1268 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1269 COSTS_N_INSNS (4), /* HI */
1270 COSTS_N_INSNS (3), /* SI */
1271 COSTS_N_INSNS (4), /* DI */
1272 COSTS_N_INSNS (5)}, /* other */
1273 0, /* cost of multiply per each bit set */
1274 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1275 COSTS_N_INSNS (35), /* HI */
1276 COSTS_N_INSNS (51), /* SI */
1277 COSTS_N_INSNS (83), /* DI */
1278 COSTS_N_INSNS (83)}, /* other */
1279 COSTS_N_INSNS (1), /* cost of movsx */
1280 COSTS_N_INSNS (1), /* cost of movzx */
1281 8, /* "large" insn */
1282 9, /* MOVE_RATIO */
1283 4, /* cost for loading QImode using movzbl */
1284 {3, 4, 3}, /* cost of loading integer registers
1285 in QImode, HImode and SImode.
1286 Relative to reg-reg move (2). */
1287 {3, 4, 3}, /* cost of storing integer registers */
1288 4, /* cost of reg,reg fld/fst */
1289 {4, 4, 12}, /* cost of loading fp registers
1290 in SFmode, DFmode and XFmode */
1291 {6, 6, 8}, /* cost of storing fp registers
1292 in SFmode, DFmode and XFmode */
1293 2, /* cost of moving MMX register */
1294 {3, 3}, /* cost of loading MMX registers
1295 in SImode and DImode */
1296 {4, 4}, /* cost of storing MMX registers
1297 in SImode and DImode */
1298 2, /* cost of moving SSE register */
1299 {4, 4, 3}, /* cost of loading SSE registers
1300 in SImode, DImode and TImode */
1301 {4, 4, 5}, /* cost of storing SSE registers
1302 in SImode, DImode and TImode */
1303 3, /* MMX or SSE register to integer */
1304 /* On K8:
1305 MOVD reg64, xmmreg Double FSTORE 4
1306 MOVD reg32, xmmreg Double FSTORE 4
1307 On AMDFAM10:
1308 MOVD reg64, xmmreg Double FADD 3
1309 1/1 1/1
1310 MOVD reg32, xmmreg Double FADD 3
1311 1/1 1/1 */
1312 32, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 100, /* number of parallel prefetches */
1316 2, /* Branch cost */
1317 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1318 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1319 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1320 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1321 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1322 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1323 btver2_memcpy,
1324 btver2_memset,
1325 4, /* scalar_stmt_cost. */
1326 2, /* scalar load_cost. */
1327 2, /* scalar_store_cost. */
1328 6, /* vec_stmt_cost. */
1329 0, /* vec_to_scalar_cost. */
1330 2, /* scalar_to_vec_cost. */
1331 2, /* vec_align_load_cost. */
1332 2, /* vec_unalign_load_cost. */
1333 2, /* vec_store_cost. */
1334 2, /* cond_taken_branch_cost. */
1335 1, /* cond_not_taken_branch_cost. */
1336 };
1338 static stringop_algs pentium4_memcpy[2] = {
1339 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1340 DUMMY_STRINGOP_ALGS};
1341 static stringop_algs pentium4_memset[2] = {
1342 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1343 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1344 DUMMY_STRINGOP_ALGS};
1346 static const
1347 struct processor_costs pentium4_cost = {
1348 COSTS_N_INSNS (1), /* cost of an add instruction */
1349 COSTS_N_INSNS (3), /* cost of a lea instruction */
1350 COSTS_N_INSNS (4), /* variable shift costs */
1351 COSTS_N_INSNS (4), /* constant shift costs */
1352 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1353 COSTS_N_INSNS (15), /* HI */
1354 COSTS_N_INSNS (15), /* SI */
1355 COSTS_N_INSNS (15), /* DI */
1356 COSTS_N_INSNS (15)}, /* other */
1357 0, /* cost of multiply per each bit set */
1358 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1359 COSTS_N_INSNS (56), /* HI */
1360 COSTS_N_INSNS (56), /* SI */
1361 COSTS_N_INSNS (56), /* DI */
1362 COSTS_N_INSNS (56)}, /* other */
1363 COSTS_N_INSNS (1), /* cost of movsx */
1364 COSTS_N_INSNS (1), /* cost of movzx */
1365 16, /* "large" insn */
1366 6, /* MOVE_RATIO */
1367 2, /* cost for loading QImode using movzbl */
1368 {4, 5, 4}, /* cost of loading integer registers
1369 in QImode, HImode and SImode.
1370 Relative to reg-reg move (2). */
1371 {2, 3, 2}, /* cost of storing integer registers */
1372 2, /* cost of reg,reg fld/fst */
1373 {2, 2, 6}, /* cost of loading fp registers
1374 in SFmode, DFmode and XFmode */
1375 {4, 4, 6}, /* cost of storing fp registers
1376 in SFmode, DFmode and XFmode */
1377 2, /* cost of moving MMX register */
1378 {2, 2}, /* cost of loading MMX registers
1379 in SImode and DImode */
1380 {2, 2}, /* cost of storing MMX registers
1381 in SImode and DImode */
1382 12, /* cost of moving SSE register */
1383 {12, 12, 12}, /* cost of loading SSE registers
1384 in SImode, DImode and TImode */
1385 {2, 2, 8}, /* cost of storing SSE registers
1386 in SImode, DImode and TImode */
1387 10, /* MMX or SSE register to integer */
1388 8, /* size of l1 cache. */
1389 256, /* size of l2 cache. */
1390 64, /* size of prefetch block */
1391 6, /* number of parallel prefetches */
1392 2, /* Branch cost */
1393 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1394 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1395 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1396 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1397 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1398 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1399 pentium4_memcpy,
1400 pentium4_memset,
1401 1, /* scalar_stmt_cost. */
1402 1, /* scalar load_cost. */
1403 1, /* scalar_store_cost. */
1404 1, /* vec_stmt_cost. */
1405 1, /* vec_to_scalar_cost. */
1406 1, /* scalar_to_vec_cost. */
1407 1, /* vec_align_load_cost. */
1408 2, /* vec_unalign_load_cost. */
1409 1, /* vec_store_cost. */
1410 3, /* cond_taken_branch_cost. */
1411 1, /* cond_not_taken_branch_cost. */
1412 };
1414 static stringop_algs nocona_memcpy[2] = {
1415 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1416 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1417 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1419 static stringop_algs nocona_memset[2] = {
1420 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1421 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1422 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1423 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1425 static const
1426 struct processor_costs nocona_cost = {
1427 COSTS_N_INSNS (1), /* cost of an add instruction */
1428 COSTS_N_INSNS (1), /* cost of a lea instruction */
1429 COSTS_N_INSNS (1), /* variable shift costs */
1430 COSTS_N_INSNS (1), /* constant shift costs */
1431 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1432 COSTS_N_INSNS (10), /* HI */
1433 COSTS_N_INSNS (10), /* SI */
1434 COSTS_N_INSNS (10), /* DI */
1435 COSTS_N_INSNS (10)}, /* other */
1436 0, /* cost of multiply per each bit set */
1437 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1438 COSTS_N_INSNS (66), /* HI */
1439 COSTS_N_INSNS (66), /* SI */
1440 COSTS_N_INSNS (66), /* DI */
1441 COSTS_N_INSNS (66)}, /* other */
1442 COSTS_N_INSNS (1), /* cost of movsx */
1443 COSTS_N_INSNS (1), /* cost of movzx */
1444 16, /* "large" insn */
1445 17, /* MOVE_RATIO */
1446 4, /* cost for loading QImode using movzbl */
1447 {4, 4, 4}, /* cost of loading integer registers
1448 in QImode, HImode and SImode.
1449 Relative to reg-reg move (2). */
1450 {4, 4, 4}, /* cost of storing integer registers */
1451 3, /* cost of reg,reg fld/fst */
1452 {12, 12, 12}, /* cost of loading fp registers
1453 in SFmode, DFmode and XFmode */
1454 {4, 4, 4}, /* cost of storing fp registers
1455 in SFmode, DFmode and XFmode */
1456 6, /* cost of moving MMX register */
1457 {12, 12}, /* cost of loading MMX registers
1458 in SImode and DImode */
1459 {12, 12}, /* cost of storing MMX registers
1460 in SImode and DImode */
1461 6, /* cost of moving SSE register */
1462 {12, 12, 12}, /* cost of loading SSE registers
1463 in SImode, DImode and TImode */
1464 {12, 12, 12}, /* cost of storing SSE registers
1465 in SImode, DImode and TImode */
1466 8, /* MMX or SSE register to integer */
1467 8, /* size of l1 cache. */
1468 1024, /* size of l2 cache. */
1469 128, /* size of prefetch block */
1470 8, /* number of parallel prefetches */
1471 1, /* Branch cost */
1472 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1473 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1474 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1475 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1476 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1477 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1478 nocona_memcpy,
1479 nocona_memset,
1480 1, /* scalar_stmt_cost. */
1481 1, /* scalar load_cost. */
1482 1, /* scalar_store_cost. */
1483 1, /* vec_stmt_cost. */
1484 1, /* vec_to_scalar_cost. */
1485 1, /* scalar_to_vec_cost. */
1486 1, /* vec_align_load_cost. */
1487 2, /* vec_unalign_load_cost. */
1488 1, /* vec_store_cost. */
1489 3, /* cond_taken_branch_cost. */
1490 1, /* cond_not_taken_branch_cost. */
1493 static stringop_algs atom_memcpy[2] = {
1494 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1495 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1496 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1497 static stringop_algs atom_memset[2] = {
1498 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1499 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1500 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1501 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1502 static const
1503 struct processor_costs atom_cost = {
1504 COSTS_N_INSNS (1), /* cost of an add instruction */
1505 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1506 COSTS_N_INSNS (1), /* variable shift costs */
1507 COSTS_N_INSNS (1), /* constant shift costs */
1508 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1509 COSTS_N_INSNS (4), /* HI */
1510 COSTS_N_INSNS (3), /* SI */
1511 COSTS_N_INSNS (4), /* DI */
1512 COSTS_N_INSNS (2)}, /* other */
1513 0, /* cost of multiply per each bit set */
1514 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1515 COSTS_N_INSNS (26), /* HI */
1516 COSTS_N_INSNS (42), /* SI */
1517 COSTS_N_INSNS (74), /* DI */
1518 COSTS_N_INSNS (74)}, /* other */
1519 COSTS_N_INSNS (1), /* cost of movsx */
1520 COSTS_N_INSNS (1), /* cost of movzx */
1521 8, /* "large" insn */
1522 17, /* MOVE_RATIO */
1523 4, /* cost for loading QImode using movzbl */
1524 {4, 4, 4}, /* cost of loading integer registers
1525 in QImode, HImode and SImode.
1526 Relative to reg-reg move (2). */
1527 {4, 4, 4}, /* cost of storing integer registers */
1528 4, /* cost of reg,reg fld/fst */
1529 {12, 12, 12}, /* cost of loading fp registers
1530 in SFmode, DFmode and XFmode */
1531 {6, 6, 8}, /* cost of storing fp registers
1532 in SFmode, DFmode and XFmode */
1533 2, /* cost of moving MMX register */
1534 {8, 8}, /* cost of loading MMX registers
1535 in SImode and DImode */
1536 {8, 8}, /* cost of storing MMX registers
1537 in SImode and DImode */
1538 2, /* cost of moving SSE register */
1539 {8, 8, 8}, /* cost of loading SSE registers
1540 in SImode, DImode and TImode */
1541 {8, 8, 8}, /* cost of storing SSE registers
1542 in SImode, DImode and TImode */
1543 5, /* MMX or SSE register to integer */
1544 32, /* size of l1 cache. */
1545 256, /* size of l2 cache. */
1546 64, /* size of prefetch block */
1547 6, /* number of parallel prefetches */
1548 3, /* Branch cost */
1549 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1551 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1552 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1553 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1554 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1555 atom_memcpy,
1556 atom_memset,
1557 1, /* scalar_stmt_cost. */
1558 1, /* scalar load_cost. */
1559 1, /* scalar_store_cost. */
1560 1, /* vec_stmt_cost. */
1561 1, /* vec_to_scalar_cost. */
1562 1, /* scalar_to_vec_cost. */
1563 1, /* vec_align_load_cost. */
1564 2, /* vec_unalign_load_cost. */
1565 1, /* vec_store_cost. */
1566 3, /* cond_taken_branch_cost. */
1567 1, /* cond_not_taken_branch_cost. */
1570 static stringop_algs slm_memcpy[2] = {
1571 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1572 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1573 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1574 static stringop_algs slm_memset[2] = {
1575 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1576 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1577 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1578 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1579 static const
1580 struct processor_costs slm_cost = {
1581 COSTS_N_INSNS (1), /* cost of an add instruction */
1582 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1583 COSTS_N_INSNS (1), /* variable shift costs */
1584 COSTS_N_INSNS (1), /* constant shift costs */
1585 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1586 COSTS_N_INSNS (4), /* HI */
1587 COSTS_N_INSNS (3), /* SI */
1588 COSTS_N_INSNS (4), /* DI */
1589 COSTS_N_INSNS (2)}, /* other */
1590 0, /* cost of multiply per each bit set */
1591 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1592 COSTS_N_INSNS (26), /* HI */
1593 COSTS_N_INSNS (42), /* SI */
1594 COSTS_N_INSNS (74), /* DI */
1595 COSTS_N_INSNS (74)}, /* other */
1596 COSTS_N_INSNS (1), /* cost of movsx */
1597 COSTS_N_INSNS (1), /* cost of movzx */
1598 8, /* "large" insn */
1599 17, /* MOVE_RATIO */
1600 4, /* cost for loading QImode using movzbl */
1601 {4, 4, 4}, /* cost of loading integer registers
1602 in QImode, HImode and SImode.
1603 Relative to reg-reg move (2). */
1604 {4, 4, 4}, /* cost of storing integer registers */
1605 4, /* cost of reg,reg fld/fst */
1606 {12, 12, 12}, /* cost of loading fp registers
1607 in SFmode, DFmode and XFmode */
1608 {6, 6, 8}, /* cost of storing fp registers
1609 in SFmode, DFmode and XFmode */
1610 2, /* cost of moving MMX register */
1611 {8, 8}, /* cost of loading MMX registers
1612 in SImode and DImode */
1613 {8, 8}, /* cost of storing MMX registers
1614 in SImode and DImode */
1615 2, /* cost of moving SSE register */
1616 {8, 8, 8}, /* cost of loading SSE registers
1617 in SImode, DImode and TImode */
1618 {8, 8, 8}, /* cost of storing SSE registers
1619 in SImode, DImode and TImode */
1620 5, /* MMX or SSE register to integer */
1621 32, /* size of l1 cache. */
1622 256, /* size of l2 cache. */
1623 64, /* size of prefetch block */
1624 6, /* number of parallel prefetches */
1625 3, /* Branch cost */
1626 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1627 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1628 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1629 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1630 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1631 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1632 slm_memcpy,
1633 slm_memset,
1634 1, /* scalar_stmt_cost. */
1635 1, /* scalar load_cost. */
1636 1, /* scalar_store_cost. */
1637 1, /* vec_stmt_cost. */
1638 1, /* vec_to_scalar_cost. */
1639 1, /* scalar_to_vec_cost. */
1640 1, /* vec_align_load_cost. */
1641 2, /* vec_unalign_load_cost. */
1642 1, /* vec_store_cost. */
1643 3, /* cond_taken_branch_cost. */
1644 1, /* cond_not_taken_branch_cost. */
1647 /* Generic64 should produce code tuned for Nocona and K8. */
1649 static stringop_algs generic64_memcpy[2] = {
1650 DUMMY_STRINGOP_ALGS,
1651 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1652 {-1, libcall, false}}}};
1653 static stringop_algs generic64_memset[2] = {
1654 DUMMY_STRINGOP_ALGS,
1655 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1656 {-1, libcall, false}}}};
1657 static const
1658 struct processor_costs generic64_cost = {
1659 COSTS_N_INSNS (1), /* cost of an add instruction */
 1660   /* On all chips taken into consideration, lea is 2 cycles or more.  With
 1661      this cost, however, our current implementation of synth_mult results in
 1662      the use of unnecessary temporary registers, causing regressions on several
1663 SPECfp benchmarks. */
1664 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1665 COSTS_N_INSNS (1), /* variable shift costs */
1666 COSTS_N_INSNS (1), /* constant shift costs */
1667 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1668 COSTS_N_INSNS (4), /* HI */
1669 COSTS_N_INSNS (3), /* SI */
1670 COSTS_N_INSNS (4), /* DI */
1671 COSTS_N_INSNS (2)}, /* other */
1672 0, /* cost of multiply per each bit set */
1673 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1674 COSTS_N_INSNS (26), /* HI */
1675 COSTS_N_INSNS (42), /* SI */
1676 COSTS_N_INSNS (74), /* DI */
1677 COSTS_N_INSNS (74)}, /* other */
1678 COSTS_N_INSNS (1), /* cost of movsx */
1679 COSTS_N_INSNS (1), /* cost of movzx */
1680 8, /* "large" insn */
1681 17, /* MOVE_RATIO */
1682 4, /* cost for loading QImode using movzbl */
1683 {4, 4, 4}, /* cost of loading integer registers
1684 in QImode, HImode and SImode.
1685 Relative to reg-reg move (2). */
1686 {4, 4, 4}, /* cost of storing integer registers */
1687 4, /* cost of reg,reg fld/fst */
1688 {12, 12, 12}, /* cost of loading fp registers
1689 in SFmode, DFmode and XFmode */
1690 {6, 6, 8}, /* cost of storing fp registers
1691 in SFmode, DFmode and XFmode */
1692 2, /* cost of moving MMX register */
1693 {8, 8}, /* cost of loading MMX registers
1694 in SImode and DImode */
1695 {8, 8}, /* cost of storing MMX registers
1696 in SImode and DImode */
1697 2, /* cost of moving SSE register */
1698 {8, 8, 8}, /* cost of loading SSE registers
1699 in SImode, DImode and TImode */
1700 {8, 8, 8}, /* cost of storing SSE registers
1701 in SImode, DImode and TImode */
1702 5, /* MMX or SSE register to integer */
1703 32, /* size of l1 cache. */
1704 512, /* size of l2 cache. */
1705 64, /* size of prefetch block */
1706 6, /* number of parallel prefetches */
 1707   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
 1708      value is increased to the perhaps more appropriate value of 5.  */
1709 3, /* Branch cost */
1710 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1711 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1712 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1713 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1714 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1715 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1716 generic64_memcpy,
1717 generic64_memset,
1718 1, /* scalar_stmt_cost. */
1719 1, /* scalar load_cost. */
1720 1, /* scalar_store_cost. */
1721 1, /* vec_stmt_cost. */
1722 1, /* vec_to_scalar_cost. */
1723 1, /* scalar_to_vec_cost. */
1724 1, /* vec_align_load_cost. */
1725 2, /* vec_unalign_load_cost. */
1726 1, /* vec_store_cost. */
1727 3, /* cond_taken_branch_cost. */
1728 1, /* cond_not_taken_branch_cost. */
 1731 /* core_cost should produce code tuned for the Core family of CPUs.  */
1732 static stringop_algs core_memcpy[2] = {
1733 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1734 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1735 {-1, libcall, false}}}};
1736 static stringop_algs core_memset[2] = {
1737 {libcall, {{6, loop_1_byte, true},
1738 {24, loop, true},
1739 {8192, rep_prefix_4_byte, true},
1740 {-1, libcall, false}}},
1741 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1742 {-1, libcall, false}}}};
1744 static const
1745 struct processor_costs core_cost = {
1746 COSTS_N_INSNS (1), /* cost of an add instruction */
 1747   /* On all chips taken into consideration, lea is 2 cycles or more.  With
 1748      this cost, however, our current implementation of synth_mult results in
 1749      the use of unnecessary temporary registers, causing regressions on several
1750 SPECfp benchmarks. */
1751 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1752 COSTS_N_INSNS (1), /* variable shift costs */
1753 COSTS_N_INSNS (1), /* constant shift costs */
1754 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1755 COSTS_N_INSNS (4), /* HI */
1756 COSTS_N_INSNS (3), /* SI */
1757 COSTS_N_INSNS (4), /* DI */
1758 COSTS_N_INSNS (2)}, /* other */
1759 0, /* cost of multiply per each bit set */
1760 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1761 COSTS_N_INSNS (26), /* HI */
1762 COSTS_N_INSNS (42), /* SI */
1763 COSTS_N_INSNS (74), /* DI */
1764 COSTS_N_INSNS (74)}, /* other */
1765 COSTS_N_INSNS (1), /* cost of movsx */
1766 COSTS_N_INSNS (1), /* cost of movzx */
1767 8, /* "large" insn */
1768 17, /* MOVE_RATIO */
1769 4, /* cost for loading QImode using movzbl */
1770 {4, 4, 4}, /* cost of loading integer registers
1771 in QImode, HImode and SImode.
1772 Relative to reg-reg move (2). */
1773 {4, 4, 4}, /* cost of storing integer registers */
1774 4, /* cost of reg,reg fld/fst */
1775 {12, 12, 12}, /* cost of loading fp registers
1776 in SFmode, DFmode and XFmode */
1777 {6, 6, 8}, /* cost of storing fp registers
1778 in SFmode, DFmode and XFmode */
1779 2, /* cost of moving MMX register */
1780 {8, 8}, /* cost of loading MMX registers
1781 in SImode and DImode */
1782 {8, 8}, /* cost of storing MMX registers
1783 in SImode and DImode */
1784 2, /* cost of moving SSE register */
1785 {8, 8, 8}, /* cost of loading SSE registers
1786 in SImode, DImode and TImode */
1787 {8, 8, 8}, /* cost of storing SSE registers
1788 in SImode, DImode and TImode */
1789 5, /* MMX or SSE register to integer */
1790 64, /* size of l1 cache. */
1791 512, /* size of l2 cache. */
1792 64, /* size of prefetch block */
1793 6, /* number of parallel prefetches */
1794 /* FIXME perhaps more appropriate value is 5. */
1795 3, /* Branch cost */
1796 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1797 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1798 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1799 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1800 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1801 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1802 core_memcpy,
1803 core_memset,
1804 1, /* scalar_stmt_cost. */
1805 1, /* scalar load_cost. */
1806 1, /* scalar_store_cost. */
1807 1, /* vec_stmt_cost. */
1808 1, /* vec_to_scalar_cost. */
1809 1, /* scalar_to_vec_cost. */
1810 1, /* vec_align_load_cost. */
1811 2, /* vec_unalign_load_cost. */
1812 1, /* vec_store_cost. */
1813 3, /* cond_taken_branch_cost. */
1814 1, /* cond_not_taken_branch_cost. */
1817 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1818 Athlon and K8. */
1819 static stringop_algs generic32_memcpy[2] = {
1820 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1821 {-1, libcall, false}}},
1822 DUMMY_STRINGOP_ALGS};
1823 static stringop_algs generic32_memset[2] = {
1824 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1825 {-1, libcall, false}}},
1826 DUMMY_STRINGOP_ALGS};
1827 static const
1828 struct processor_costs generic32_cost = {
1829 COSTS_N_INSNS (1), /* cost of an add instruction */
1830 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1831 COSTS_N_INSNS (1), /* variable shift costs */
1832 COSTS_N_INSNS (1), /* constant shift costs */
1833 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1834 COSTS_N_INSNS (4), /* HI */
1835 COSTS_N_INSNS (3), /* SI */
1836 COSTS_N_INSNS (4), /* DI */
1837 COSTS_N_INSNS (2)}, /* other */
1838 0, /* cost of multiply per each bit set */
1839 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1840 COSTS_N_INSNS (26), /* HI */
1841 COSTS_N_INSNS (42), /* SI */
1842 COSTS_N_INSNS (74), /* DI */
1843 COSTS_N_INSNS (74)}, /* other */
1844 COSTS_N_INSNS (1), /* cost of movsx */
1845 COSTS_N_INSNS (1), /* cost of movzx */
1846 8, /* "large" insn */
1847 17, /* MOVE_RATIO */
1848 4, /* cost for loading QImode using movzbl */
1849 {4, 4, 4}, /* cost of loading integer registers
1850 in QImode, HImode and SImode.
1851 Relative to reg-reg move (2). */
1852 {4, 4, 4}, /* cost of storing integer registers */
1853 4, /* cost of reg,reg fld/fst */
1854 {12, 12, 12}, /* cost of loading fp registers
1855 in SFmode, DFmode and XFmode */
1856 {6, 6, 8}, /* cost of storing fp registers
1857 in SFmode, DFmode and XFmode */
1858 2, /* cost of moving MMX register */
1859 {8, 8}, /* cost of loading MMX registers
1860 in SImode and DImode */
1861 {8, 8}, /* cost of storing MMX registers
1862 in SImode and DImode */
1863 2, /* cost of moving SSE register */
1864 {8, 8, 8}, /* cost of loading SSE registers
1865 in SImode, DImode and TImode */
1866 {8, 8, 8}, /* cost of storing SSE registers
1867 in SImode, DImode and TImode */
1868 5, /* MMX or SSE register to integer */
1869 32, /* size of l1 cache. */
1870 256, /* size of l2 cache. */
1871 64, /* size of prefetch block */
1872 6, /* number of parallel prefetches */
1873 3, /* Branch cost */
1874 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1875 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1876 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1877 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1878 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1879 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1880 generic32_memcpy,
1881 generic32_memset,
1882 1, /* scalar_stmt_cost. */
1883 1, /* scalar load_cost. */
1884 1, /* scalar_store_cost. */
1885 1, /* vec_stmt_cost. */
1886 1, /* vec_to_scalar_cost. */
1887 1, /* scalar_to_vec_cost. */
1888 1, /* vec_align_load_cost. */
1889 2, /* vec_unalign_load_cost. */
1890 1, /* vec_store_cost. */
1891 3, /* cond_taken_branch_cost. */
1892 1, /* cond_not_taken_branch_cost. */
1895 /* Set by -mtune. */
1896 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1898 /* Set by -mtune or -Os. */
1899 const struct processor_costs *ix86_cost = &pentium_cost;
1901 /* Processor feature/optimization bitmasks. */
1902 #define m_386 (1<<PROCESSOR_I386)
1903 #define m_486 (1<<PROCESSOR_I486)
1904 #define m_PENT (1<<PROCESSOR_PENTIUM)
1905 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1906 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1907 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1908 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1909 #define m_CORE2 (1<<PROCESSOR_CORE2)
1910 #define m_COREI7 (1<<PROCESSOR_COREI7)
1911 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1912 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1913 #define m_ATOM (1<<PROCESSOR_ATOM)
1914 #define m_SLM (1<<PROCESSOR_SLM)
1916 #define m_GEODE (1<<PROCESSOR_GEODE)
1917 #define m_K6 (1<<PROCESSOR_K6)
1918 #define m_K6_GEODE (m_K6 | m_GEODE)
1919 #define m_K8 (1<<PROCESSOR_K8)
1920 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1921 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1922 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1923 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1924 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1925 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1926 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1927 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1928 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1929 #define m_BTVER (m_BTVER1 | m_BTVER2)
1930 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1932 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1933 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
 1935 /* Generic instruction choice should be a common subset of the supported CPUs
1936 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1937 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
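/* A minimal sketch of how these m_* processor masks are consumed (the real
   code lives later in ix86_option_override_internal; the exact statements
   may differ slightly):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   i.e. each entry of initial_ix86_tune_features below is a bitmask over
   processor numbers, and a tuning flag is enabled when the bit for the CPU
   selected by -mtune is set.  */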
1939 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1940 #undef DEF_TUNE
1941 #define DEF_TUNE(tune, name) name,
1942 #include "x86-tune.def"
1943 #undef DEF_TUNE
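/* The name array above is generated from x86-tune.def via the DEF_TUNE
   macro; given the two-argument macro used here, entries there are
   presumably of the form (illustrative only; the exact identifiers and
   strings come from x86-tune.def itself):

     DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave")
     DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory")

   so the expansion here collects just the human-readable name strings.  */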
1946 /* Feature tests against the various tunings. */
1947 unsigned char ix86_tune_features[X86_TUNE_LAST];
1949 /* Feature tests against the various tunings used to create ix86_tune_features
1950 based on the processor mask. */
1951 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1952 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
 1953      negatively, so enabling it for Generic64 seems like a good code size
 1954      tradeoff.  We can't enable it for 32bit generic because it does not
 1955      work well with PPro based chips.  */
1956 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1958 /* X86_TUNE_PUSH_MEMORY */
1959 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1961 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1962 m_486 | m_PENT,
1964 /* X86_TUNE_UNROLL_STRLEN */
1965 m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1967 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
 1968      on simulation results.  But after P4 was made, no performance benefit
1969 was observed with branch hints. It also increases the code size.
1970 As a result, icc never generates branch hints. */
1973 /* X86_TUNE_DOUBLE_WITH_ADD */
1974 ~m_386,
1976 /* X86_TUNE_USE_SAHF */
1977 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1979 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1980 partial dependencies. */
1981 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1983 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
 1984      register stalls on the Generic32 compilation setting as well.  However,
 1985      in the current implementation the partial register stalls are not eliminated
1986 very well - they can be introduced via subregs synthesized by combine
1987 and can happen in caller/callee saving sequences. Because this option
1988 pays back little on PPro based chips and is in conflict with partial reg
1989 dependencies used by Athlon/P4 based chips, it is better to leave it off
1990 for generic32 for now. */
1991 m_PPRO,
1993 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1994 m_CORE_ALL | m_GENERIC,
1996 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 1997      on 16-bit immediate moves into memory on Core2 and Corei7.  */
1998 m_CORE_ALL | m_GENERIC,
2000 /* X86_TUNE_USE_HIMODE_FIOP */
2001 m_386 | m_486 | m_K6_GEODE,
2003 /* X86_TUNE_USE_SIMODE_FIOP */
2004 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC),
2006 /* X86_TUNE_USE_MOV0 */
2007 m_K6,
2009 /* X86_TUNE_USE_CLTD */
2010 ~(m_PENT | m_ATOM | m_SLM | m_K6),
2012 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2013 m_PENT4,
2015 /* X86_TUNE_SPLIT_LONG_MOVES */
2016 m_PPRO,
2018 /* X86_TUNE_READ_MODIFY_WRITE */
2019 ~m_PENT,
2021 /* X86_TUNE_READ_MODIFY */
2022 ~(m_PENT | m_PPRO),
2024 /* X86_TUNE_PROMOTE_QIMODE */
2025 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2027 /* X86_TUNE_FAST_PREFIX */
2028 ~(m_386 | m_486 | m_PENT),
2030 /* X86_TUNE_SINGLE_STRINGOP */
2031 m_386 | m_P4_NOCONA,
2033 /* X86_TUNE_QIMODE_MATH */
2036 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2037 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2038 might be considered for Generic32 if our scheme for avoiding partial
2039 stalls was more effective. */
2040 ~m_PPRO,
2042 /* X86_TUNE_PROMOTE_QI_REGS */
2045 /* X86_TUNE_PROMOTE_HI_REGS */
2046 m_PPRO,
2048 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2049 over esp addition. */
2050 m_386 | m_486 | m_PENT | m_PPRO,
2052 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2053 over esp addition. */
2054 m_PENT,
2056 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2057 over esp subtraction. */
2058 m_386 | m_486 | m_PENT | m_K6_GEODE,
2060 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2061 over esp subtraction. */
2062 m_PENT | m_K6_GEODE,
2064 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2065 for DFmode copies */
2066 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2068 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2069 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2071 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
 2072      conflict here between PPro/Pentium4 based chips that treat 128bit
 2073      SSE registers as single units and K8 based chips that divide SSE
 2074      registers into two 64bit halves.  This knob promotes all store destinations
 2075      to be 128bit to allow register renaming on 128bit SSE units, but usually
 2076      results in one extra microop on 64bit SSE units.  Experimental results
 2077      show that disabling this option on P4 brings a SPECfp regression of over
 2078      20%, while enabling it on K8 brings roughly a 2.4% regression that can be partly
2079 masked by careful scheduling of moves. */
2080 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2082 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2083 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM,
2085 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2086 m_COREI7 | m_BDVER | m_SLM,
2088 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2089 m_BDVER ,
2091 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2092 are resolved on SSE register parts instead of whole registers, so we may
 2093      maintain just the lower part of scalar values in the proper format, leaving the
2094 upper part undefined. */
2095 m_ATHLON_K8,
2097 /* X86_TUNE_SSE_TYPELESS_STORES */
2098 m_AMD_MULTIPLE,
2100 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2101 m_PPRO | m_P4_NOCONA,
2103 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2104 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2106 /* X86_TUNE_PROLOGUE_USING_MOVE */
2107 m_PPRO | m_ATHLON_K8,
2109 /* X86_TUNE_EPILOGUE_USING_MOVE */
2110 m_PPRO | m_ATHLON_K8,
2112 /* X86_TUNE_SHIFT1 */
2113 ~m_486,
2115 /* X86_TUNE_USE_FFREEP */
2116 m_AMD_MULTIPLE,
2118 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC */
2119 ~(m_AMD_MULTIPLE | m_GENERIC),
2121 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC */
2122 ~m_ATHLON_K8,
2124 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2125 ~(m_AMDFAM10 | m_BDVER ),
2127 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2128 than 4 branch instructions in the 16 byte window. */
2129 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2131 /* X86_TUNE_SCHEDULE */
2132 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2134 /* X86_TUNE_USE_BT */
2135 m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2137 /* X86_TUNE_USE_INCDEC */
2138 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC),
2140 /* X86_TUNE_PAD_RETURNS */
2141 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
2143 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */
2144 m_ATOM,
2146 /* X86_TUNE_EXT_80387_CONSTANTS */
2147 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2149 /* X86_TUNE_AVOID_VECTOR_DECODE */
2150 m_CORE_ALL | m_K8 | m_GENERIC64,
 2152   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2153 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2154 ~(m_386 | m_486),
2156 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2157 vector path on AMD machines. */
2158 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2160 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2161 machines. */
2162 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2164 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2165 than a MOV. */
2166 m_PENT,
2168 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2169 but one byte longer. */
2170 m_PENT,
 2172   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2173 operand that cannot be represented using a modRM byte. The XOR
2174 replacement is long decoded, so this split helps here as well. */
2175 m_K6,
2177 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2178 from FP to FP. */
2179 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
2181 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2182 from integer to FP. */
2183 m_AMDFAM10,
2185 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2186 with a subsequent conditional jump instruction into a single
2187 compare-and-branch uop. */
2188 m_BDVER,
2190 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2191 will impact LEA instruction selection. */
2192 m_ATOM | m_SLM,
2194 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2195 instructions. */
2196 ~m_ATOM,
2198 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2199 at -O3. For the moment, the prefetching seems badly tuned for Intel
2200 chips. */
2201 m_K6_GEODE | m_AMD_MULTIPLE,
2203 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2204 the auto-vectorizer. */
2205 m_BDVER | m_BTVER2,
2207 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2208 during reassociation of integer computation. */
2209 m_ATOM,
2211 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2212 during reassociation of fp computation. */
2213 m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2,
2215 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2216 regs instead of memory. */
2217 m_CORE_ALL,
2219 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2220 a conditional move. */
2221 m_ATOM,
2223 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
2224 fp converts to destination register. */
2225 m_SLM
2229 /* Feature tests against the various architecture variations. */
2230 unsigned char ix86_arch_features[X86_ARCH_LAST];
2232 /* Feature tests against the various architecture variations, used to create
2233 ix86_arch_features based on the processor mask. */
2234 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2235 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2236 ~(m_386 | m_486 | m_PENT | m_K6),
2238 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2239 ~m_386,
2241 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2242 ~(m_386 | m_486),
2244 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2245 ~m_386,
2247 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2248 ~m_386,
2251 static const unsigned int x86_accumulate_outgoing_args
2252 = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2254 static const unsigned int x86_arch_always_fancy_math_387
2255 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
2257 static const unsigned int x86_avx256_split_unaligned_load
2258 = m_COREI7 | m_GENERIC;
2260 static const unsigned int x86_avx256_split_unaligned_store
2261 = m_COREI7 | m_BDVER | m_GENERIC;
2263 /* In case the average insn count for single function invocation is
2264 lower than this constant, emit fast (but longer) prologue and
2265 epilogue code. */
2266 #define FAST_PROLOGUE_INSN_COUNT 20
2268 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2269 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2270 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2271 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2273 /* Array of the smallest class containing reg number REGNO, indexed by
2274 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2276 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2278 /* ax, dx, cx, bx */
2279 AREG, DREG, CREG, BREG,
2280 /* si, di, bp, sp */
2281 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2282 /* FP registers */
2283 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2284 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2285 /* arg pointer */
2286 NON_Q_REGS,
2287 /* flags, fpsr, fpcr, frame */
2288 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2289 /* SSE registers */
2290 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2291 SSE_REGS, SSE_REGS,
2292 /* MMX registers */
2293 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2294 MMX_REGS, MMX_REGS,
2295 /* REX registers */
2296 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2297 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2298 /* SSE REX registers */
2299 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2300 SSE_REGS, SSE_REGS,
2303 /* The "default" register map used in 32bit mode. */
2305 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2307 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2308 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2309 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2310 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2311 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2312 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2313 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2316 /* The "default" register map used in 64bit mode. */
2318 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2320 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2321 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2322 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2323 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2324 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2325 8,9,10,11,12,13,14,15, /* extended integer registers */
2326 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2329 /* Define the register numbers to be used in Dwarf debugging information.
2330 The SVR4 reference port C compiler uses the following register numbers
2331 in its Dwarf output code:
2332 0 for %eax (gcc regno = 0)
2333 1 for %ecx (gcc regno = 2)
2334 2 for %edx (gcc regno = 1)
2335 3 for %ebx (gcc regno = 3)
2336 4 for %esp (gcc regno = 7)
2337 5 for %ebp (gcc regno = 6)
2338 6 for %esi (gcc regno = 4)
2339 7 for %edi (gcc regno = 5)
2340 The following three DWARF register numbers are never generated by
2341 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2342 believes these numbers have these meanings.
2343 8 for %eip (no gcc equivalent)
2344 9 for %eflags (gcc regno = 17)
2345 10 for %trapno (no gcc equivalent)
2346 It is not at all clear how we should number the FP stack registers
2347 for the x86 architecture. If the version of SDB on x86/svr4 were
2348 a bit less brain dead with respect to floating-point then we would
2349 have a precedent to follow with respect to DWARF register numbers
2350 for x86 FP registers, but the SDB on x86/svr4 is so completely
2351 broken with respect to FP registers that it is hardly worth thinking
2352 of it as something to strive for compatibility with.
2353 The version of x86/svr4 SDB I have at the moment does (partially)
2354 seem to believe that DWARF register number 11 is associated with
2355 the x86 register %st(0), but that's about all. Higher DWARF
2356 register numbers don't seem to be associated with anything in
2357 particular, and even for DWARF regno 11, SDB only seems to under-
2358 stand that it should say that a variable lives in %st(0) (when
2359 asked via an `=' command) if we said it was in DWARF regno 11,
2360 but SDB still prints garbage when asked for the value of the
2361 variable in question (via a `/' command).
2362 (Also note that the labels SDB prints for various FP stack regs
2363 when doing an `x' command are all wrong.)
2364 Note that these problems generally don't affect the native SVR4
2365 C compiler because it doesn't allow the use of -O with -g and
2366 because when it is *not* optimizing, it allocates a memory
2367 location for each floating-point variable, and the memory
2368 location is what gets described in the DWARF AT_location
2369 attribute for the variable in question.
2370 Regardless of the severe mental illness of the x86/svr4 SDB, we
2371 do something sensible here and we use the following DWARF
2372 register numbers. Note that these are all stack-top-relative
2373 numbers.
2374 11 for %st(0) (gcc regno = 8)
2375 12 for %st(1) (gcc regno = 9)
2376 13 for %st(2) (gcc regno = 10)
2377 14 for %st(3) (gcc regno = 11)
2378 15 for %st(4) (gcc regno = 12)
2379 16 for %st(5) (gcc regno = 13)
2380 17 for %st(6) (gcc regno = 14)
2381 18 for %st(7) (gcc regno = 15)
2383 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2385 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2386 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2387 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2388 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2389 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2390 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2391 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2394 /* Define parameter passing and return registers. */
2396 static int const x86_64_int_parameter_registers[6] =
2398 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2401 static int const x86_64_ms_abi_int_parameter_registers[4] =
2403 CX_REG, DX_REG, R8_REG, R9_REG
2406 static int const x86_64_int_return_registers[4] =
2408 AX_REG, DX_REG, DI_REG, SI_REG
2411 /* Additional registers that are clobbered by SYSV calls. */
2413 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2415 SI_REG, DI_REG,
2416 XMM6_REG, XMM7_REG,
2417 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2418 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2421 /* Define the structure for the machine field in struct function. */
2423 struct GTY(()) stack_local_entry {
2424 unsigned short mode;
2425 unsigned short n;
2426 rtx rtl;
2427 struct stack_local_entry *next;
2430 /* Structure describing stack frame layout.
2431 Stack grows downward:
2433 [arguments]
2434 <- ARG_POINTER
2435 saved pc
2437 saved static chain if ix86_static_chain_on_stack
2439 saved frame pointer if frame_pointer_needed
2440 <- HARD_FRAME_POINTER
2441 [saved regs]
2442 <- regs_save_offset
2443 [padding0]
2445 [saved SSE regs]
2446 <- sse_regs_save_offset
2447 [padding1] |
2448 | <- FRAME_POINTER
2449 [va_arg registers] |
2451 [frame] |
2453 [padding2] | = to_allocate
2454 <- STACK_POINTER
2456 struct ix86_frame
2458 int nsseregs;
2459 int nregs;
2460 int va_arg_size;
2461 int red_zone_size;
2462 int outgoing_arguments_size;
2464 /* The offsets relative to ARG_POINTER. */
2465 HOST_WIDE_INT frame_pointer_offset;
2466 HOST_WIDE_INT hard_frame_pointer_offset;
2467 HOST_WIDE_INT stack_pointer_offset;
2468 HOST_WIDE_INT hfp_save_offset;
2469 HOST_WIDE_INT reg_save_offset;
2470 HOST_WIDE_INT sse_reg_save_offset;
2472 /* When save_regs_using_mov is set, emit prologue using
2473 move instead of push instructions. */
2474 bool save_regs_using_mov;
2477 /* Which cpu are we scheduling for. */
2478 enum attr_cpu ix86_schedule;
2480 /* Which cpu are we optimizing for. */
2481 enum processor_type ix86_tune;
2483 /* Which instruction set architecture to use. */
2484 enum processor_type ix86_arch;
2486 /* True if processor has SSE prefetch instruction. */
2487 unsigned char x86_prefetch_sse;
2489 /* -mstackrealign option */
2490 static const char ix86_force_align_arg_pointer_string[]
2491 = "force_align_arg_pointer";
2493 static rtx (*ix86_gen_leave) (void);
2494 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2495 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2496 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2497 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2498 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2499 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2500 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2501 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2502 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2503 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2504 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2506 /* Preferred alignment for stack boundary in bits. */
2507 unsigned int ix86_preferred_stack_boundary;
2509 /* Alignment for incoming stack boundary in bits specified at
2510 command line. */
2511 static unsigned int ix86_user_incoming_stack_boundary;
2513 /* Default alignment for incoming stack boundary in bits. */
2514 static unsigned int ix86_default_incoming_stack_boundary;
2516 /* Alignment for incoming stack boundary in bits. */
2517 unsigned int ix86_incoming_stack_boundary;
2519 /* Calling abi specific va_list type nodes. */
2520 static GTY(()) tree sysv_va_list_type_node;
2521 static GTY(()) tree ms_va_list_type_node;
2523 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2524 char internal_label_prefix[16];
2525 int internal_label_prefix_len;
2527 /* Fence to use after loop using movnt. */
2528 tree x86_mfence;
 2530 /* Register class used for passing a given 64bit part of the argument.
 2531    These represent classes as documented by the psABI, with the exception
 2532    of the SSESF and SSEDF classes, which are basically the SSE class, except
 2533    that gcc will use SF or DFmode moves instead of DImode to avoid reformatting
 2535    penalties.  Similarly we play games with INTEGERSI_CLASS to use cheaper
 2536    SImode moves whenever possible (the upper half does contain padding).  */
2537 enum x86_64_reg_class
2539 X86_64_NO_CLASS,
2540 X86_64_INTEGER_CLASS,
2541 X86_64_INTEGERSI_CLASS,
2542 X86_64_SSE_CLASS,
2543 X86_64_SSESF_CLASS,
2544 X86_64_SSEDF_CLASS,
2545 X86_64_SSEUP_CLASS,
2546 X86_64_X87_CLASS,
2547 X86_64_X87UP_CLASS,
2548 X86_64_COMPLEX_X87_CLASS,
2549 X86_64_MEMORY_CLASS
2552 #define MAX_CLASSES 4
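/* An illustrative example of this classification (assuming the standard
   x86-64 psABI rules; the classifying code itself appears much later in
   this file): a value of type

     struct { double d; int i; }

   occupies two eightbytes and would classify as
   { X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS }, so when passed by value
   the double travels in an SSE register and the int in a general register,
   while a sufficiently large or unaligned aggregate instead gets
   X86_64_MEMORY_CLASS and is passed in memory.  */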
2554 /* Table of constants used by fldpi, fldln2, etc.... */
2555 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2556 static bool ext_80387_constants_init = 0;
2559 static struct machine_function * ix86_init_machine_status (void);
2560 static rtx ix86_function_value (const_tree, const_tree, bool);
2561 static bool ix86_function_value_regno_p (const unsigned int);
2562 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2563 const_tree);
2564 static rtx ix86_static_chain (const_tree, bool);
2565 static int ix86_function_regparm (const_tree, const_tree);
2566 static void ix86_compute_frame_layout (struct ix86_frame *);
2567 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2568 rtx, rtx, int);
2569 static void ix86_add_new_builtins (HOST_WIDE_INT);
2570 static tree ix86_canonical_va_list_type (tree);
2571 static void predict_jump (int);
2572 static unsigned int split_stack_prologue_scratch_regno (void);
2573 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2575 enum ix86_function_specific_strings
2577 IX86_FUNCTION_SPECIFIC_ARCH,
2578 IX86_FUNCTION_SPECIFIC_TUNE,
2579 IX86_FUNCTION_SPECIFIC_MAX
2582 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2583 const char *, enum fpmath_unit, bool);
2584 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2585 static void ix86_function_specific_save (struct cl_target_option *);
2586 static void ix86_function_specific_restore (struct cl_target_option *);
2587 static void ix86_function_specific_print (FILE *, int,
2588 struct cl_target_option *);
2589 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2590 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2591 struct gcc_options *);
2592 static bool ix86_can_inline_p (tree, tree);
2593 static void ix86_set_current_function (tree);
2594 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2596 static enum calling_abi ix86_function_abi (const_tree);
2599 #ifndef SUBTARGET32_DEFAULT_CPU
2600 #define SUBTARGET32_DEFAULT_CPU "i386"
2601 #endif
2603 /* Whether -mtune= or -march= were specified */
2604 static int ix86_tune_defaulted;
2605 static int ix86_arch_specified;
2607 /* Vectorization library interface and handlers. */
2608 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2610 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2611 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2613 /* Processor target table, indexed by processor number */
2614 struct ptt
2616 const struct processor_costs *cost; /* Processor costs */
2617 const int align_loop; /* Default alignments. */
2618 const int align_loop_max_skip;
2619 const int align_jump;
2620 const int align_jump_max_skip;
2621 const int align_func;
2624 static const struct ptt processor_target_table[PROCESSOR_max] =
2626 {&i386_cost, 4, 3, 4, 3, 4},
2627 {&i486_cost, 16, 15, 16, 15, 16},
2628 {&pentium_cost, 16, 7, 16, 7, 16},
2629 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2630 {&geode_cost, 0, 0, 0, 0, 0},
2631 {&k6_cost, 32, 7, 32, 7, 32},
2632 {&athlon_cost, 16, 7, 16, 7, 16},
2633 {&pentium4_cost, 0, 0, 0, 0, 0},
2634 {&k8_cost, 16, 7, 16, 7, 16},
2635 {&nocona_cost, 0, 0, 0, 0, 0},
2636 /* Core 2 */
2637 {&core_cost, 16, 10, 16, 10, 16},
2638 /* Core i7 */
2639 {&core_cost, 16, 10, 16, 10, 16},
2640 /* Core avx2 */
2641 {&core_cost, 16, 10, 16, 10, 16},
2642 {&generic32_cost, 16, 7, 16, 7, 16},
2643 {&generic64_cost, 16, 10, 16, 10, 16},
2644 {&amdfam10_cost, 32, 24, 32, 7, 32},
2645 {&bdver1_cost, 16, 10, 16, 7, 11},
2646 {&bdver2_cost, 16, 10, 16, 7, 11},
2647 {&bdver3_cost, 16, 10, 16, 7, 11},
2648 {&btver1_cost, 16, 10, 16, 7, 11},
2649 {&btver2_cost, 16, 10, 16, 7, 11},
2650 {&atom_cost, 16, 15, 16, 7, 16},
2651 {&slm_cost, 16, 15, 16, 7, 16}
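/* A rough sketch of how this table is consulted (the actual code, with its
   guards against explicit user overrides, is in ix86_option_override_internal
   below):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       align_loops = processor_target_table[ix86_tune].align_loop;
     if (align_functions == 0)
       align_functions = processor_target_table[ix86_tune].align_func;

   i.e. the per-processor cost tables and default alignments defined above are
   selected by the -mtune processor number.  */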
2654 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2656 "generic",
2657 "i386",
2658 "i486",
2659 "pentium",
2660 "pentium-mmx",
2661 "pentiumpro",
2662 "pentium2",
2663 "pentium3",
2664 "pentium4",
2665 "pentium-m",
2666 "prescott",
2667 "nocona",
2668 "core2",
2669 "corei7",
2670 "core-avx2",
2671 "atom",
2672 "slm",
2673 "geode",
2674 "k6",
2675 "k6-2",
2676 "k6-3",
2677 "athlon",
2678 "athlon-4",
2679 "k8",
2680 "amdfam10",
2681 "bdver1",
2682 "bdver2",
2683 "bdver3",
2684 "btver1",
2685 "btver2"
2688 static bool
2689 gate_insert_vzeroupper (void)
2691 return TARGET_AVX && TARGET_VZEROUPPER;
2694 static unsigned int
2695 rest_of_handle_insert_vzeroupper (void)
2697 int i;
2699 /* vzeroupper instructions are inserted immediately after reload to
2700 account for possible spills from 256bit registers. The pass
 2701      account for possible spills from 256bit registers.  The pass
 2702      reuses the mode switching infrastructure by re-running the mode insertion
2703 for (i = 0; i < MAX_386_ENTITIES; i++)
2704 ix86_optimize_mode_switching[i] = 0;
2706 ix86_optimize_mode_switching[AVX_U128] = 1;
2708 /* Call optimize_mode_switching. */
2709 g->get_passes ()->execute_pass_mode_switching ();
2710 return 0;
2713 namespace {
2715 const pass_data pass_data_insert_vzeroupper =
2717 RTL_PASS, /* type */
2718 "vzeroupper", /* name */
2719 OPTGROUP_NONE, /* optinfo_flags */
2720 true, /* has_gate */
2721 true, /* has_execute */
2722 TV_NONE, /* tv_id */
2723 0, /* properties_required */
2724 0, /* properties_provided */
2725 0, /* properties_destroyed */
2726 0, /* todo_flags_start */
2727 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2730 class pass_insert_vzeroupper : public rtl_opt_pass
2732 public:
2733 pass_insert_vzeroupper(gcc::context *ctxt)
2734 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2737 /* opt_pass methods: */
2738 bool gate () { return gate_insert_vzeroupper (); }
2739 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2741 }; // class pass_insert_vzeroupper
2743 } // anon namespace
2745 rtl_opt_pass *
2746 make_pass_insert_vzeroupper (gcc::context *ctxt)
2748 return new pass_insert_vzeroupper (ctxt);
2751 /* Return true if a red-zone is in use. */
2753 static inline bool
2754 ix86_using_red_zone (void)
2756 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2759 /* Return a string that documents the current -m options. The caller is
2760 responsible for freeing the string. */
2762 static char *
2763 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2764 const char *tune, enum fpmath_unit fpmath,
2765 bool add_nl_p)
2767 struct ix86_target_opts
2769 const char *option; /* option string */
2770 HOST_WIDE_INT mask; /* isa mask options */
 2773   /* This table is ordered so that options like -msse4.2, which imply
 2774      the preceding options, are matched first.  */
2775 static struct ix86_target_opts isa_opts[] =
2777 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2778 { "-mfma", OPTION_MASK_ISA_FMA },
2779 { "-mxop", OPTION_MASK_ISA_XOP },
2780 { "-mlwp", OPTION_MASK_ISA_LWP },
2781 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2782 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2783 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2784 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2785 { "-msse3", OPTION_MASK_ISA_SSE3 },
2786 { "-msse2", OPTION_MASK_ISA_SSE2 },
2787 { "-msse", OPTION_MASK_ISA_SSE },
2788 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2789 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2790 { "-mmmx", OPTION_MASK_ISA_MMX },
2791 { "-mabm", OPTION_MASK_ISA_ABM },
2792 { "-mbmi", OPTION_MASK_ISA_BMI },
2793 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2794 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2795 { "-mhle", OPTION_MASK_ISA_HLE },
2796 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2797 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2798 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2799 { "-madx", OPTION_MASK_ISA_ADX },
2800 { "-mtbm", OPTION_MASK_ISA_TBM },
2801 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2802 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2803 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2804 { "-maes", OPTION_MASK_ISA_AES },
2805 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2806 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2807 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2808 { "-mf16c", OPTION_MASK_ISA_F16C },
2809 { "-mrtm", OPTION_MASK_ISA_RTM },
2810 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2811 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2814 /* Flag options. */
2815 static struct ix86_target_opts flag_opts[] =
2817 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2818 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2819 { "-m80387", MASK_80387 },
2820 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2821 { "-malign-double", MASK_ALIGN_DOUBLE },
2822 { "-mcld", MASK_CLD },
2823 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2824 { "-mieee-fp", MASK_IEEE_FP },
2825 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2826 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2827 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2828 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2829 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2830 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2831 { "-mno-red-zone", MASK_NO_RED_ZONE },
2832 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2833 { "-mrecip", MASK_RECIP },
2834 { "-mrtd", MASK_RTD },
2835 { "-msseregparm", MASK_SSEREGPARM },
2836 { "-mstack-arg-probe", MASK_STACK_PROBE },
2837 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2838 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2839 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2840 { "-mvzeroupper", MASK_VZEROUPPER },
2841 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2842 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2843 { "-mprefer-avx128", MASK_PREFER_AVX128},
2846 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2848 char isa_other[40];
2849 char target_other[40];
2850 unsigned num = 0;
2851 unsigned i, j;
2852 char *ret;
2853 char *ptr;
2854 size_t len;
2855 size_t line_len;
2856 size_t sep_len;
2857 const char *abi;
2859 memset (opts, '\0', sizeof (opts));
2861 /* Add -march= option. */
2862 if (arch)
2864 opts[num][0] = "-march=";
2865 opts[num++][1] = arch;
2868 /* Add -mtune= option. */
2869 if (tune)
2871 opts[num][0] = "-mtune=";
2872 opts[num++][1] = tune;
2875 /* Add -m32/-m64/-mx32. */
2876 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2878 if ((isa & OPTION_MASK_ABI_64) != 0)
2879 abi = "-m64";
2880 else
2881 abi = "-mx32";
2882 isa &= ~ (OPTION_MASK_ISA_64BIT
2883 | OPTION_MASK_ABI_64
2884 | OPTION_MASK_ABI_X32);
2886 else
2887 abi = "-m32";
2888 opts[num++][0] = abi;
2890 /* Pick out the options in isa options. */
2891 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2893 if ((isa & isa_opts[i].mask) != 0)
2895 opts[num++][0] = isa_opts[i].option;
2896 isa &= ~ isa_opts[i].mask;
2900 if (isa && add_nl_p)
2902 opts[num++][0] = isa_other;
2903 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2904 isa);
2907 /* Add flag options. */
2908 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2910 if ((flags & flag_opts[i].mask) != 0)
2912 opts[num++][0] = flag_opts[i].option;
2913 flags &= ~ flag_opts[i].mask;
2917 if (flags && add_nl_p)
2919 opts[num++][0] = target_other;
2920 sprintf (target_other, "(other flags: %#x)", flags);
2923 /* Add -fpmath= option. */
2924 if (fpmath)
2926 opts[num][0] = "-mfpmath=";
2927 switch ((int) fpmath)
2929 case FPMATH_387:
2930 opts[num++][1] = "387";
2931 break;
2933 case FPMATH_SSE:
2934 opts[num++][1] = "sse";
2935 break;
2937 case FPMATH_387 | FPMATH_SSE:
2938 opts[num++][1] = "sse+387";
2939 break;
2941 default:
2942 gcc_unreachable ();
2946 /* Any options? */
2947 if (num == 0)
2948 return NULL;
2950 gcc_assert (num < ARRAY_SIZE (opts));
2952 /* Size the string. */
2953 len = 0;
2954 sep_len = (add_nl_p) ? 3 : 1;
2955 for (i = 0; i < num; i++)
2957 len += sep_len;
2958 for (j = 0; j < 2; j++)
2959 if (opts[i][j])
2960 len += strlen (opts[i][j]);
2963 /* Build the string. */
2964 ret = ptr = (char *) xmalloc (len);
2965 line_len = 0;
2967 for (i = 0; i < num; i++)
2969 size_t len2[2];
2971 for (j = 0; j < 2; j++)
2972 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2974 if (i != 0)
2976 *ptr++ = ' ';
2977 line_len++;
2979 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2981 *ptr++ = '\\';
2982 *ptr++ = '\n';
2983 line_len = 0;
2987 for (j = 0; j < 2; j++)
2988 if (opts[i][j])
2990 memcpy (ptr, opts[i][j], len2[j]);
2991 ptr += len2[j];
2992 line_len += len2[j];
2996 *ptr = '\0';
2997 gcc_assert (ret + len >= ptr);
2999 return ret;
 3002 /* Return true if profiling code should be emitted before the
 3003    prologue, and false otherwise.
 3004    Note: for x86 with "hotfix" it is sorried (i.e. rejected with sorry ()).  */
3005 static bool
3006 ix86_profile_before_prologue (void)
3008 return flag_fentry != 0;
3011 /* Function that is callable from the debugger to print the current
3012 options. */
3013 void
3014 ix86_debug_options (void)
3016 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
3017 ix86_arch_string, ix86_tune_string,
3018 ix86_fpmath, true);
3020 if (opts)
3022 fprintf (stderr, "%s\n\n", opts);
3023 free (opts);
3025 else
3026 fputs ("<no options>\n\n", stderr);
3028 return;
3031 static const char *stringop_alg_names[] = {
3032 #define DEF_ENUM
3033 #define DEF_ALG(alg, name) #name,
3034 #include "stringop.def"
3035 #undef DEF_ENUM
3036 #undef DEF_ALG
3039 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3040 The string is of the following form (or comma separated list of it):
3042 strategy_alg:max_size:[align|noalign]
3044 where the full size range for the strategy is either [0, max_size] or
3045 [min_size, max_size], in which min_size is the max_size + 1 of the
3046 preceding range. The last size range must have max_size == -1.
3048 Examples:
3051 -mmemcpy-strategy=libcall:-1:noalign
3053 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3057 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3059 This is to tell the compiler to use the following strategy for memset
3060 1) when the expected size is between [1, 16], use rep_8byte strategy;
3061 2) when the size is between [17, 2048], use vector_loop;
3062 3) when the size is > 2048, use libcall. */
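/* For instance, the memset example above is parsed by
   ix86_parse_stringop_strategy_string below into three input_ranges entries:
     [0, 16]     rep_8byte    noalign
     [17, 2048]  vector_loop  align
     [2049, -1]  libcall      noalign
   whose max/alg/noalign fields then overwrite the default cost table. */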
3064 struct stringop_size_range
3066 int min;
3067 int max;
3068 stringop_alg alg;
3069 bool noalign;
3072 static void
3073 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3075 const struct stringop_algs *default_algs;
3076 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3077 char *curr_range_str, *next_range_str;
3078 int i = 0, n = 0;
3080 if (is_memset)
3081 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3082 else
3083 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3085 curr_range_str = strategy_str;
3089 int mins = 0, maxs;
3090 stringop_alg alg;
3091 char alg_name[128];
3092 char align[16];
3093 next_range_str = strchr (curr_range_str, ',');
3094 if (next_range_str)
3095 *next_range_str++ = '\0';
3097 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3098 alg_name, &maxs, align))
3100 error ("wrong arg %s to option %s", curr_range_str,
3101 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3102 return;
3105 if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
3107 error ("size ranges of option %s should be increasing",
3108 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3109 return;
3112 for (i = 0; i < last_alg; i++)
3114 if (!strcmp (alg_name, stringop_alg_names[i]))
3116 alg = (stringop_alg) i;
3117 break;
3121 if (i == last_alg)
3123 error ("wrong stringop strategy name %s specified for option %s",
3124 alg_name,
3125 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3126 return;
3129 input_ranges[n].min = mins;
3130 input_ranges[n].max = maxs;
3131 input_ranges[n].alg = alg;
3132 if (!strcmp (align, "align"))
3133 input_ranges[n].noalign = false;
3134 else if (!strcmp (align, "noalign"))
3135 input_ranges[n].noalign = true;
3136 else
3138 error ("unknown alignment %s specified for option %s",
3139 align, is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3140 return;
3142 n++;
3143 curr_range_str = next_range_str;
3145 while (curr_range_str);
3147 if (input_ranges[n - 1].max != -1)
3149 error ("the max value for the last size range should be -1"
3150 " for option %s",
3151 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3152 return;
3155 if (n > MAX_STRINGOP_ALGS)
3157 error ("too many size ranges specified in option %s",
3158 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
3159 return;
3162 /* Now override the default algs array. */
3163 for (i = 0; i < n; i++)
3165 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3166 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3167 = input_ranges[i].alg;
3168 *const_cast<int *>(&default_algs->size[i].noalign)
3169 = input_ranges[i].noalign;
3174 /* Override various settings based on options. If MAIN_ARGS_P, the
3175 options are from the command line, otherwise they are from
3176 attributes. */
3178 static void
3179 ix86_option_override_internal (bool main_args_p)
3181 int i;
3182 unsigned int ix86_arch_mask, ix86_tune_mask;
3183 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3184 const char *prefix;
3185 const char *suffix;
3186 const char *sw;
3188 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3189 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3190 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3191 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3192 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3193 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3194 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3195 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3196 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3197 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3198 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3199 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3200 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3201 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3202 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3203 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3204 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3205 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3206 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3207 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3208 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3209 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3210 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3211 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3212 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3213 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3214 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3215 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3216 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3217 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3218 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3219 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3220 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3221 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3222 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3223 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3224 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3225 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3226 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3227 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3229 /* If this reaches 64, we need to widen the flags field of struct pta below. */
3231 static struct pta
3233 const char *const name; /* processor name or nickname. */
3234 const enum processor_type processor;
3235 const enum attr_cpu schedule;
3236 const unsigned HOST_WIDE_INT flags;
3238 const processor_alias_table[] =
3240 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3241 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3242 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3243 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3244 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3245 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3246 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3247 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3248 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3249 PTA_MMX | PTA_SSE | PTA_FXSR},
3250 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3251 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3252 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3253 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3254 PTA_MMX | PTA_SSE | PTA_FXSR},
3255 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3256 PTA_MMX | PTA_SSE | PTA_FXSR},
3257 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3258 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3259 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3260 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3261 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3262 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3263 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3264 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3265 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3266 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3267 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3268 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3269 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3270 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3271 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3272 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3273 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3274 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3275 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3276 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3277 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3278 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3279 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3280 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3281 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3282 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3283 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3284 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3285 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3286 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3287 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3288 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3289 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3290 | PTA_XSAVEOPT},
3291 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3292 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3293 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3294 {"slm", PROCESSOR_SLM, CPU_SLM,
3295 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3296 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3297 | PTA_FXSR},
3298 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3299 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3300 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3301 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3302 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3303 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3304 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3305 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3306 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3307 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3308 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3309 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3310 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3311 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3312 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3313 {"x86-64", PROCESSOR_K8, CPU_K8,
3314 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3315 {"k8", PROCESSOR_K8, CPU_K8,
3316 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3317 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3318 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3319 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3320 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3321 {"opteron", PROCESSOR_K8, CPU_K8,
3322 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3323 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3324 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3325 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3326 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3327 {"athlon64", PROCESSOR_K8, CPU_K8,
3328 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3329 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3330 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3331 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3332 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3333 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3334 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3335 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3336 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3337 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3338 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3339 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3340 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3341 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3342 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3343 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3344 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3345 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3346 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3347 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3348 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3349 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3350 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3351 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3352 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3353 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3354 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3355 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3356 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3357 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3358 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3359 | PTA_XSAVEOPT | PTA_FSGSBASE},
3360 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3361 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3362 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3363 | PTA_FXSR | PTA_XSAVE},
3364 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3365 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3366 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3367 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3368 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3369 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3371 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3372 PTA_HLE /* flags are only used for -march switch. */ },
3373 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3374 PTA_64BIT
3375 | PTA_HLE /* flags are only used for -march switch. */ },
3378 /* -mrecip options. */
3379 static struct
3381 const char *string; /* option name */
3382 unsigned int mask; /* mask bits to set */
3384 const recip_options[] =
3386 { "all", RECIP_MASK_ALL },
3387 { "none", RECIP_MASK_NONE },
3388 { "div", RECIP_MASK_DIV },
3389 { "sqrt", RECIP_MASK_SQRT },
3390 { "vec-div", RECIP_MASK_VEC_DIV },
3391 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3394 int const pta_size = ARRAY_SIZE (processor_alias_table);
3396 /* Set up prefix/suffix so the error messages refer to either the command
3397 line argument, or the attribute(target). */
3398 if (main_args_p)
3400 prefix = "-m";
3401 suffix = "";
3402 sw = "switch";
3404 else
3406 prefix = "option(\"";
3407 suffix = "\")";
3408 sw = "attribute";
3411 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3412 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3413 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3414 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3415 #ifdef TARGET_BI_ARCH
3416 else
3418 #if TARGET_BI_ARCH == 1
3419 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3420 is on and OPTION_MASK_ABI_X32 is off. We turn off
3421 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3422 -mx32. */
3423 if (TARGET_X32)
3424 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3425 #else
3426 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3427 on and OPTION_MASK_ABI_64 is off. We turn off
3428 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3429 -m64. */
3430 if (TARGET_LP64)
3431 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3432 #endif
3434 #endif
3436 if (TARGET_X32)
3438 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3439 OPTION_MASK_ABI_64 for TARGET_X32. */
3440 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3441 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3443 else if (TARGET_LP64)
3445 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3446 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3447 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3448 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3451 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3452 SUBTARGET_OVERRIDE_OPTIONS;
3453 #endif
3455 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3456 SUBSUBTARGET_OVERRIDE_OPTIONS;
3457 #endif
3459 /* -fPIC is the default for x86_64. */
3460 if (TARGET_MACHO && TARGET_64BIT)
3461 flag_pic = 2;
3463 /* Need to check -mtune=generic first. */
3464 if (ix86_tune_string)
3466 if (!strcmp (ix86_tune_string, "generic")
3467 || !strcmp (ix86_tune_string, "i686")
3468 /* As special support for cross compilers we read -mtune=native
3469 as -mtune=generic. With native compilers we won't see the
3470 -mtune=native, as it was changed by the driver. */
3471 || !strcmp (ix86_tune_string, "native"))
3473 if (TARGET_64BIT)
3474 ix86_tune_string = "generic64";
3475 else
3476 ix86_tune_string = "generic32";
3478 /* If this call is for setting the option attribute, allow the
3479 generic32/generic64 that was previously set. */
3480 else if (!main_args_p
3481 && (!strcmp (ix86_tune_string, "generic32")
3482 || !strcmp (ix86_tune_string, "generic64")))
3484 else if (!strncmp (ix86_tune_string, "generic", 7))
3485 error ("bad value (%s) for %stune=%s %s",
3486 ix86_tune_string, prefix, suffix, sw);
3487 else if (!strcmp (ix86_tune_string, "x86-64"))
3488 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3489 "%stune=k8%s or %stune=generic%s instead as appropriate",
3490 prefix, suffix, prefix, suffix, prefix, suffix);
3492 else
3494 if (ix86_arch_string)
3495 ix86_tune_string = ix86_arch_string;
3496 if (!ix86_tune_string)
3498 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3499 ix86_tune_defaulted = 1;
3502 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3503 need to use a sensible tune option. */
3504 if (!strcmp (ix86_tune_string, "generic")
3505 || !strcmp (ix86_tune_string, "x86-64")
3506 || !strcmp (ix86_tune_string, "i686"))
3508 if (TARGET_64BIT)
3509 ix86_tune_string = "generic64";
3510 else
3511 ix86_tune_string = "generic32";
3515 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3517 /* rep; movq isn't available in 32-bit code. */
3518 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3519 ix86_stringop_alg = no_stringop;
3522 if (!ix86_arch_string)
3523 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3524 else
3525 ix86_arch_specified = 1;
3527 if (global_options_set.x_ix86_pmode)
3529 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3530 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3531 error ("address mode %qs not supported in the %s bit mode",
3532 TARGET_64BIT ? "short" : "long",
3533 TARGET_64BIT ? "64" : "32");
3535 else
3536 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3538 if (!global_options_set.x_ix86_abi)
3539 ix86_abi = DEFAULT_ABI;
3541 if (global_options_set.x_ix86_cmodel)
3543 switch (ix86_cmodel)
3545 case CM_SMALL:
3546 case CM_SMALL_PIC:
3547 if (flag_pic)
3548 ix86_cmodel = CM_SMALL_PIC;
3549 if (!TARGET_64BIT)
3550 error ("code model %qs not supported in the %s bit mode",
3551 "small", "32");
3552 break;
3554 case CM_MEDIUM:
3555 case CM_MEDIUM_PIC:
3556 if (flag_pic)
3557 ix86_cmodel = CM_MEDIUM_PIC;
3558 if (!TARGET_64BIT)
3559 error ("code model %qs not supported in the %s bit mode",
3560 "medium", "32");
3561 else if (TARGET_X32)
3562 error ("code model %qs not supported in x32 mode",
3563 "medium");
3564 break;
3566 case CM_LARGE:
3567 case CM_LARGE_PIC:
3568 if (flag_pic)
3569 ix86_cmodel = CM_LARGE_PIC;
3570 if (!TARGET_64BIT)
3571 error ("code model %qs not supported in the %s bit mode",
3572 "large", "32");
3573 else if (TARGET_X32)
3574 error ("code model %qs not supported in x32 mode",
3575 "large");
3576 break;
3578 case CM_32:
3579 if (flag_pic)
3580 error ("code model %s does not support PIC mode", "32");
3581 if (TARGET_64BIT)
3582 error ("code model %qs not supported in the %s bit mode",
3583 "32", "64");
3584 break;
3586 case CM_KERNEL:
3587 if (flag_pic)
3589 error ("code model %s does not support PIC mode", "kernel");
3590 ix86_cmodel = CM_32;
3592 if (!TARGET_64BIT)
3593 error ("code model %qs not supported in the %s bit mode",
3594 "kernel", "32");
3595 break;
3597 default:
3598 gcc_unreachable ();
3601 else
3603 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3604 use of rip-relative addressing. This eliminates fixups that
3605 would otherwise be needed if this object is to be placed in a
3606 DLL, and is essentially just as efficient as direct addressing. */
3607 if (TARGET_64BIT && (TARGET_RDOS || TARGET_PECOFF))
3608 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3609 else if (TARGET_64BIT)
3610 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3611 else
3612 ix86_cmodel = CM_32;
3614 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3616 error ("-masm=intel not supported in this configuration");
3617 ix86_asm_dialect = ASM_ATT;
3619 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3620 sorry ("%i-bit mode not compiled in",
3621 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3623 for (i = 0; i < pta_size; i++)
3624 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3626 ix86_schedule = processor_alias_table[i].schedule;
3627 ix86_arch = processor_alias_table[i].processor;
3628 /* Default cpu tuning to the architecture. */
3629 ix86_tune = ix86_arch;
3631 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3632 error ("CPU you selected does not support x86-64 "
3633 "instruction set");
3635 if (processor_alias_table[i].flags & PTA_MMX
3636 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3637 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3638 if (processor_alias_table[i].flags & PTA_3DNOW
3639 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3640 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3641 if (processor_alias_table[i].flags & PTA_3DNOW_A
3642 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3643 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3644 if (processor_alias_table[i].flags & PTA_SSE
3645 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3646 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3647 if (processor_alias_table[i].flags & PTA_SSE2
3648 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3649 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3650 if (processor_alias_table[i].flags & PTA_SSE3
3651 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3652 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3653 if (processor_alias_table[i].flags & PTA_SSSE3
3654 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3655 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3656 if (processor_alias_table[i].flags & PTA_SSE4_1
3657 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3658 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3659 if (processor_alias_table[i].flags & PTA_SSE4_2
3660 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3661 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3662 if (processor_alias_table[i].flags & PTA_AVX
3663 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3664 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3665 if (processor_alias_table[i].flags & PTA_AVX2
3666 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3667 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3668 if (processor_alias_table[i].flags & PTA_FMA
3669 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3670 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3671 if (processor_alias_table[i].flags & PTA_SSE4A
3672 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3673 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3674 if (processor_alias_table[i].flags & PTA_FMA4
3675 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3676 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3677 if (processor_alias_table[i].flags & PTA_XOP
3678 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3679 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3680 if (processor_alias_table[i].flags & PTA_LWP
3681 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3682 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3683 if (processor_alias_table[i].flags & PTA_ABM
3684 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3685 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3686 if (processor_alias_table[i].flags & PTA_BMI
3687 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3688 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3689 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3690 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3691 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3692 if (processor_alias_table[i].flags & PTA_TBM
3693 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3694 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3695 if (processor_alias_table[i].flags & PTA_BMI2
3696 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3697 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3698 if (processor_alias_table[i].flags & PTA_CX16
3699 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3700 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3701 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3702 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3703 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3704 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3705 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3706 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3707 if (processor_alias_table[i].flags & PTA_MOVBE
3708 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3709 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3710 if (processor_alias_table[i].flags & PTA_AES
3711 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3712 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3713 if (processor_alias_table[i].flags & PTA_PCLMUL
3714 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3715 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3716 if (processor_alias_table[i].flags & PTA_FSGSBASE
3717 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3718 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3719 if (processor_alias_table[i].flags & PTA_RDRND
3720 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3721 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3722 if (processor_alias_table[i].flags & PTA_F16C
3723 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3724 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3725 if (processor_alias_table[i].flags & PTA_RTM
3726 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3727 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3728 if (processor_alias_table[i].flags & PTA_HLE
3729 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3730 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3731 if (processor_alias_table[i].flags & PTA_PRFCHW
3732 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3733 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3734 if (processor_alias_table[i].flags & PTA_RDSEED
3735 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3736 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3737 if (processor_alias_table[i].flags & PTA_ADX
3738 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3739 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3740 if (processor_alias_table[i].flags & PTA_FXSR
3741 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3742 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3743 if (processor_alias_table[i].flags & PTA_XSAVE
3744 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3745 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3746 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3747 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3748 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3749 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3750 x86_prefetch_sse = true;
3752 break;
3755 if (!strcmp (ix86_arch_string, "generic"))
3756 error ("generic CPU can be used only for %stune=%s %s",
3757 prefix, suffix, sw);
3758 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3759 error ("bad value (%s) for %sarch=%s %s",
3760 ix86_arch_string, prefix, suffix, sw);
3762 ix86_arch_mask = 1u << ix86_arch;
3763 for (i = 0; i < X86_ARCH_LAST; ++i)
3764 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3766 for (i = 0; i < pta_size; i++)
3767 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3769 ix86_schedule = processor_alias_table[i].schedule;
3770 ix86_tune = processor_alias_table[i].processor;
3771 if (TARGET_64BIT)
3773 if (!(processor_alias_table[i].flags & PTA_64BIT))
3775 if (ix86_tune_defaulted)
3777 ix86_tune_string = "x86-64";
3778 for (i = 0; i < pta_size; i++)
3779 if (! strcmp (ix86_tune_string,
3780 processor_alias_table[i].name))
3781 break;
3782 ix86_schedule = processor_alias_table[i].schedule;
3783 ix86_tune = processor_alias_table[i].processor;
3785 else
3786 error ("CPU you selected does not support x86-64 "
3787 "instruction set");
3790 else
3792 /* Adjust tuning when compiling for 32-bit ABI. */
3793 switch (ix86_tune)
3795 case PROCESSOR_GENERIC64:
3796 ix86_tune = PROCESSOR_GENERIC32;
3797 ix86_schedule = CPU_PENTIUMPRO;
3798 break;
3800 default:
3801 break;
3804 /* Intel CPUs have always interpreted SSE prefetch instructions as
3805 NOPs; so, we can enable SSE prefetch instructions even when
3806 -mtune (rather than -march) points us to a processor that has them.
3807 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3808 higher processors. */
3809 if (TARGET_CMOV
3810 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3811 x86_prefetch_sse = true;
3812 break;
3815 if (ix86_tune_specified && i == pta_size)
3816 error ("bad value (%s) for %stune=%s %s",
3817 ix86_tune_string, prefix, suffix, sw);
3819 ix86_tune_mask = 1u << ix86_tune;
3820 for (i = 0; i < X86_TUNE_LAST; ++i)
3821 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3823 if (ix86_tune_ctrl_string)
3825 /* Parse the -mtune-ctrl string, a comma-separated list of the form
3826 [^]tune_name1,[^]tune_name2,... where a leading '^' clears the named feature. */
3827 char *next_feature_string = NULL;
3828 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3829 char *orig = curr_feature_string;
3830 do {
3831 bool clear = false;
3833 next_feature_string = strchr (curr_feature_string, ',');
3834 if (next_feature_string)
3835 *next_feature_string++ = '\0';
3836 if (*curr_feature_string == '^')
3838 curr_feature_string++;
3839 clear = true;
3841 for (i = 0; i < X86_TUNE_LAST; i++)
3843 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3845 ix86_tune_features[i] = !clear;
3846 break;
3849 if (i == X86_TUNE_LAST)
3850 warning (0, "unknown parameter to option -mtune-ctrl: %s",
3851 clear ? curr_feature_string - 1 : curr_feature_string);
3852 curr_feature_string = next_feature_string;
3853 } while (curr_feature_string);
3854 free (orig);
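/* Illustrative use, assuming "use_leave" is one of the names in
   ix86_tune_feature_names (from x86-tune.def): -mtune-ctrl=^use_leave
   clears X86_TUNE_USE_LEAVE regardless of what the selected -mtune
   processor would otherwise imply. */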
3857 #ifndef USE_IX86_FRAME_POINTER
3858 #define USE_IX86_FRAME_POINTER 0
3859 #endif
3861 #ifndef USE_X86_64_FRAME_POINTER
3862 #define USE_X86_64_FRAME_POINTER 0
3863 #endif
3865 /* Set the default values for switches whose default depends on TARGET_64BIT
3866 in case they weren't overwritten by command line options. */
3867 if (TARGET_64BIT)
3869 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3870 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3871 if (flag_asynchronous_unwind_tables == 2)
3872 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3873 if (flag_pcc_struct_return == 2)
3874 flag_pcc_struct_return = 0;
3876 else
3878 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3879 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3880 if (flag_asynchronous_unwind_tables == 2)
3881 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3882 if (flag_pcc_struct_return == 2)
3883 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3886 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3887 if (optimize_size)
3888 ix86_cost = &ix86_size_cost;
3889 else
3890 ix86_cost = ix86_tune_cost;
3892 /* Arrange to set up i386_stack_locals for all functions. */
3893 init_machine_status = ix86_init_machine_status;
3895 /* Validate -mregparm= value. */
3896 if (global_options_set.x_ix86_regparm)
3898 if (TARGET_64BIT)
3899 warning (0, "-mregparm is ignored in 64-bit mode");
3900 if (ix86_regparm > REGPARM_MAX)
3902 error ("-mregparm=%d is not between 0 and %d",
3903 ix86_regparm, REGPARM_MAX);
3904 ix86_regparm = 0;
3907 if (TARGET_64BIT)
3908 ix86_regparm = REGPARM_MAX;
3910 /* Default align_* from the processor table. */
3911 if (align_loops == 0)
3913 align_loops = processor_target_table[ix86_tune].align_loop;
3914 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3916 if (align_jumps == 0)
3918 align_jumps = processor_target_table[ix86_tune].align_jump;
3919 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3921 if (align_functions == 0)
3923 align_functions = processor_target_table[ix86_tune].align_func;
3926 /* Provide default for -mbranch-cost= value. */
3927 if (!global_options_set.x_ix86_branch_cost)
3928 ix86_branch_cost = ix86_cost->branch_cost;
3930 if (TARGET_64BIT)
3932 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3934 /* Enable by default the SSE and MMX builtins. Do allow the user to
3935 explicitly disable any of these. In particular, disabling SSE and
3936 MMX for kernel code is extremely useful. */
3937 if (!ix86_arch_specified)
3938 ix86_isa_flags
3939 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3940 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3942 if (TARGET_RTD)
3943 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3945 else
3947 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3949 if (!ix86_arch_specified)
3950 ix86_isa_flags
3951 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3953 /* The i386 ABI does not specify a red zone. It can still make sense to use
3954 one when the programmer takes care to keep the stack from being clobbered. */
3955 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3956 target_flags |= MASK_NO_RED_ZONE;
3959 /* Keep nonleaf frame pointers. */
3960 if (flag_omit_frame_pointer)
3961 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3962 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3963 flag_omit_frame_pointer = 1;
3965 /* If we're doing fast math, we don't care about comparison order
3966 wrt NaNs. This lets us use a shorter comparison sequence. */
3967 if (flag_finite_math_only)
3968 target_flags &= ~MASK_IEEE_FP;
3970 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3971 since the insns won't need emulation. */
3972 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3973 target_flags &= ~MASK_NO_FANCY_MATH_387;
3975 /* Likewise, if the target doesn't have a 387, or we've specified
3976 software floating point, don't use 387 inline intrinsics. */
3977 if (!TARGET_80387)
3978 target_flags |= MASK_NO_FANCY_MATH_387;
3980 /* Turn on MMX builtins for -msse. */
3981 if (TARGET_SSE)
3982 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3984 /* Enable SSE prefetch. */
3985 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3986 x86_prefetch_sse = true;
3988 /* Enable prefetch{,w} instructions for -m3dnow. */
3989 if (TARGET_3DNOW)
3990 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3992 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3993 if (TARGET_SSE4_2 || TARGET_ABM)
3994 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3996 /* Enable lzcnt instruction for -mabm. */
3997 if (TARGET_ABM)
3998 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
4000 /* Validate -mpreferred-stack-boundary= value or default it to
4001 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4002 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4003 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
4005 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
4006 int max = (TARGET_SEH ? 4 : 12);
4008 if (ix86_preferred_stack_boundary_arg < min
4009 || ix86_preferred_stack_boundary_arg > max)
4011 if (min == max)
4012 error ("-mpreferred-stack-boundary is not supported "
4013 "for this target");
4014 else
4015 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4016 ix86_preferred_stack_boundary_arg, min, max);
4018 else
4019 ix86_preferred_stack_boundary
4020 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
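/* For example, -mpreferred-stack-boundary=4 gives (1 << 4) * BITS_PER_UNIT
   = 128 bits, i.e. the 16-byte alignment required by the 64-bit ABI. */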
4023 /* Set the default value for -mstackrealign. */
4024 if (ix86_force_align_arg_pointer == -1)
4025 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4027 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4029 /* Validate -mincoming-stack-boundary= value or default it to
4030 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4031 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4032 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
4034 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
4035 || ix86_incoming_stack_boundary_arg > 12)
4036 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4037 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
4038 else
4040 ix86_user_incoming_stack_boundary
4041 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4042 ix86_incoming_stack_boundary
4043 = ix86_user_incoming_stack_boundary;
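/* E.g. -mincoming-stack-boundary=2 asserts that the stack is only guaranteed
   to be (1 << 2) * 8 = 32 bits (4 bytes) aligned on function entry, the
   minimum accepted for 32-bit code. */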
4047 /* Accept -msseregparm only if at least SSE support is enabled. */
4048 if (TARGET_SSEREGPARM
4049 && ! TARGET_SSE)
4050 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
4052 if (global_options_set.x_ix86_fpmath)
4054 if (ix86_fpmath & FPMATH_SSE)
4056 if (!TARGET_SSE)
4058 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4059 ix86_fpmath = FPMATH_387;
4061 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
4063 warning (0, "387 instruction set disabled, using SSE arithmetics");
4064 ix86_fpmath = FPMATH_SSE;
4068 else
4069 ix86_fpmath = TARGET_FPMATH_DEFAULT;
4071 /* If the i387 is disabled, then do not return values in it. */
4072 if (!TARGET_80387)
4073 target_flags &= ~MASK_FLOAT_RETURNS;
4075 /* Use external vectorized library in vectorizing intrinsics. */
4076 if (global_options_set.x_ix86_veclibabi_type)
4077 switch (ix86_veclibabi_type)
4079 case ix86_veclibabi_type_svml:
4080 ix86_veclib_handler = ix86_veclibabi_svml;
4081 break;
4083 case ix86_veclibabi_type_acml:
4084 ix86_veclib_handler = ix86_veclibabi_acml;
4085 break;
4087 default:
4088 gcc_unreachable ();
4091 if ((!USE_IX86_FRAME_POINTER
4092 || (x86_accumulate_outgoing_args & ix86_tune_mask))
4093 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
4094 && !optimize_size)
4095 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4097 /* ??? Unwind info is not correct around the CFG unless either a frame
4098 pointer is present or M_A_O_A is set. Fixing this requires rewriting
4099 unwind info generation to be aware of the CFG and propagating states
4100 around edges. */
4101 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
4102 || flag_exceptions || flag_non_call_exceptions)
4103 && flag_omit_frame_pointer
4104 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4106 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
4107 warning (0, "unwind tables currently require either a frame pointer "
4108 "or %saccumulate-outgoing-args%s for correctness",
4109 prefix, suffix);
4110 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4113 /* If stack probes are required, the space used for large function
4114 arguments on the stack must also be probed, so enable
4115 -maccumulate-outgoing-args so this happens in the prologue. */
4116 if (TARGET_STACK_PROBE
4117 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4119 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
4120 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
4121 "for correctness", prefix, suffix);
4122 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4125 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4127 char *p;
4128 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4129 p = strchr (internal_label_prefix, 'X');
4130 internal_label_prefix_len = p - internal_label_prefix;
4131 *p = '\0';
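/* Illustrative: on a typical ELF target ASM_GENERATE_INTERNAL_LABEL builds
   something like "*.LX0" here, so internal_label_prefix becomes "*.L" and
   internal_label_prefix_len is 3. */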
4134 /* When scheduling description is not available, disable scheduler pass
4135 so it won't slow down the compilation and make x87 code slower. */
4136 if (!TARGET_SCHEDULE)
4137 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
4139 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4140 ix86_tune_cost->simultaneous_prefetches,
4141 global_options.x_param_values,
4142 global_options_set.x_param_values);
4143 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4144 ix86_tune_cost->prefetch_block,
4145 global_options.x_param_values,
4146 global_options_set.x_param_values);
4147 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4148 ix86_tune_cost->l1_cache_size,
4149 global_options.x_param_values,
4150 global_options_set.x_param_values);
4151 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4152 ix86_tune_cost->l2_cache_size,
4153 global_options.x_param_values,
4154 global_options_set.x_param_values);
4156 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4157 if (flag_prefetch_loop_arrays < 0
4158 && HAVE_prefetch
4159 && (optimize >= 3 || flag_profile_use)
4160 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4161 flag_prefetch_loop_arrays = 1;
4163 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4164 can be optimized to ap = __builtin_next_arg (0). */
4165 if (!TARGET_64BIT && !flag_split_stack)
4166 targetm.expand_builtin_va_start = NULL;
4168 if (TARGET_64BIT)
4170 ix86_gen_leave = gen_leave_rex64;
4171 if (Pmode == DImode)
4173 ix86_gen_monitor = gen_sse3_monitor64_di;
4174 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4175 ix86_gen_tls_local_dynamic_base_64
4176 = gen_tls_local_dynamic_base_64_di;
4178 else
4180 ix86_gen_monitor = gen_sse3_monitor64_si;
4181 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4182 ix86_gen_tls_local_dynamic_base_64
4183 = gen_tls_local_dynamic_base_64_si;
4186 else
4188 ix86_gen_leave = gen_leave;
4189 ix86_gen_monitor = gen_sse3_monitor;
4192 if (Pmode == DImode)
4194 ix86_gen_add3 = gen_adddi3;
4195 ix86_gen_sub3 = gen_subdi3;
4196 ix86_gen_sub3_carry = gen_subdi3_carry;
4197 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4198 ix86_gen_andsp = gen_anddi3;
4199 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4200 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4201 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4203 else
4205 ix86_gen_add3 = gen_addsi3;
4206 ix86_gen_sub3 = gen_subsi3;
4207 ix86_gen_sub3_carry = gen_subsi3_carry;
4208 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4209 ix86_gen_andsp = gen_andsi3;
4210 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4211 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4212 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4215 #ifdef USE_IX86_CLD
4216 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4217 if (!TARGET_64BIT)
4218 target_flags |= MASK_CLD & ~target_flags_explicit;
4219 #endif
4221 if (!TARGET_64BIT && flag_pic)
4223 if (flag_fentry > 0)
4224 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4225 "with -fpic");
4226 flag_fentry = 0;
4228 else if (TARGET_SEH)
4230 if (flag_fentry == 0)
4231 sorry ("-mno-fentry isn%'t compatible with SEH");
4232 flag_fentry = 1;
4234 else if (flag_fentry < 0)
4236 #if defined(PROFILE_BEFORE_PROLOGUE)
4237 flag_fentry = 1;
4238 #else
4239 flag_fentry = 0;
4240 #endif
4243 /* When not optimizing for size, enable the vzeroupper optimization for
4244 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
4245 AVX unaligned loads/stores. */
4246 if (!optimize_size)
4248 if (flag_expensive_optimizations
4249 && !(target_flags_explicit & MASK_VZEROUPPER))
4250 target_flags |= MASK_VZEROUPPER;
4251 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4252 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4253 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4254 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4255 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4256 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4257 /* Enable 128-bit AVX instruction generation
4258 for the auto-vectorizer. */
4259 if (TARGET_AVX128_OPTIMAL
4260 && !(target_flags_explicit & MASK_PREFER_AVX128))
4261 target_flags |= MASK_PREFER_AVX128;
4264 if (ix86_recip_name)
4266 char *p = ASTRDUP (ix86_recip_name);
4267 char *q;
4268 unsigned int mask, i;
4269 bool invert;
4271 while ((q = strtok (p, ",")) != NULL)
4273 p = NULL;
4274 if (*q == '!')
4276 invert = true;
4277 q++;
4279 else
4280 invert = false;
4282 if (!strcmp (q, "default"))
4283 mask = RECIP_MASK_ALL;
4284 else
4286 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4287 if (!strcmp (q, recip_options[i].string))
4289 mask = recip_options[i].mask;
4290 break;
4293 if (i == ARRAY_SIZE (recip_options))
4295 error ("unknown option for -mrecip=%s", q);
4296 invert = false;
4297 mask = RECIP_MASK_NONE;
4301 recip_mask_explicit |= mask;
4302 if (invert)
4303 recip_mask &= ~mask;
4304 else
4305 recip_mask |= mask;
4309 if (TARGET_RECIP)
4310 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4311 else if (target_flags_explicit & MASK_RECIP)
4312 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
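/* Example: -mrecip=all,!sqrt first sets every RECIP_MASK_* bit and then
   clears RECIP_MASK_SQRT, so reciprocal approximations are used for
   everything except scalar square roots. */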
4314 /* Default long double to 64-bit for Bionic. */
4315 if (TARGET_HAS_BIONIC
4316 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4317 target_flags |= MASK_LONG_DOUBLE_64;
4319 /* Save the initial options in case the user does function specific
4320 options. */
4321 if (main_args_p)
4322 target_option_default_node = target_option_current_node
4323 = build_target_option_node ();
4325 /* Handle stack protector */
4326 if (!global_options_set.x_ix86_stack_protector_guard)
4327 ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4329 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4330 if (ix86_tune_memcpy_strategy)
4332 char *str = xstrdup (ix86_tune_memcpy_strategy);
4333 ix86_parse_stringop_strategy_string (str, false);
4334 free (str);
4337 if (ix86_tune_memset_strategy)
4339 char *str = xstrdup (ix86_tune_memset_strategy);
4340 ix86_parse_stringop_strategy_string (str, true);
4341 free (str);
4345 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4347 static void
4348 ix86_option_override (void)
4350 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4351 static struct register_pass_info insert_vzeroupper_info
4352 = { pass_insert_vzeroupper, "reload",
4353 1, PASS_POS_INSERT_AFTER
4356 ix86_option_override_internal (true);
4359 /* This needs to be done at start up. It's convenient to do it here. */
4360 register_pass (&insert_vzeroupper_info);
4363 /* Update register usage after having seen the compiler flags. */
4365 static void
4366 ix86_conditional_register_usage (void)
4368 int i, c_mask;
4369 unsigned int j;
4371 /* The PIC register, if it exists, is fixed. */
4372 j = PIC_OFFSET_TABLE_REGNUM;
4373 if (j != INVALID_REGNUM)
4374 fixed_regs[j] = call_used_regs[j] = 1;
4376 /* For 32-bit targets, squash the REX registers. */
4377 if (! TARGET_64BIT)
4379 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4380 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4381 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4382 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4385 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4386 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4387 : TARGET_64BIT ? (1 << 2)
4388 : (1 << 1));
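/* Values greater than 1 in the CALL_USED_REGISTERS initializer are per-ABI
   bit masks: bit 1 covers 32-bit, bit 2 the 64-bit SysV ABI, bit 3 the
   64-bit MS ABI. For instance an entry of 6 (bits 1|2) marks the register
   call-used except under the MS ABI; the loop below resolves such entries
   against c_mask. */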
4390 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4392 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4394 /* Set/reset conditionally defined registers from
4395 CALL_USED_REGISTERS initializer. */
4396 if (call_used_regs[i] > 1)
4397 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4399 /* Calculate registers of CLOBBERED_REGS register set
4400 as call used registers from GENERAL_REGS register set. */
4401 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4402 && call_used_regs[i])
4403 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4406 /* If MMX is disabled, squash the registers. */
4407 if (! TARGET_MMX)
4408 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4409 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4410 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4412 /* If SSE is disabled, squash the registers. */
4413 if (! TARGET_SSE)
4414 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4415 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4416 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4418 /* If the FPU is disabled, squash the registers. */
4419 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4420 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4421 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4422 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4426 /* Save the current options */
4428 static void
4429 ix86_function_specific_save (struct cl_target_option *ptr)
4431 ptr->arch = ix86_arch;
4432 ptr->schedule = ix86_schedule;
4433 ptr->tune = ix86_tune;
4434 ptr->branch_cost = ix86_branch_cost;
4435 ptr->tune_defaulted = ix86_tune_defaulted;
4436 ptr->arch_specified = ix86_arch_specified;
4437 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4438 ptr->ix86_target_flags_explicit = target_flags_explicit;
4439 ptr->x_recip_mask_explicit = recip_mask_explicit;
4441 /* The fields are char but the variables are not; make sure the
4442 values fit in the fields. */
4443 gcc_assert (ptr->arch == ix86_arch);
4444 gcc_assert (ptr->schedule == ix86_schedule);
4445 gcc_assert (ptr->tune == ix86_tune);
4446 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4449 /* Restore the current options */
4451 static void
4452 ix86_function_specific_restore (struct cl_target_option *ptr)
4454 enum processor_type old_tune = ix86_tune;
4455 enum processor_type old_arch = ix86_arch;
4456 unsigned int ix86_arch_mask, ix86_tune_mask;
4457 int i;
4459 ix86_arch = (enum processor_type) ptr->arch;
4460 ix86_schedule = (enum attr_cpu) ptr->schedule;
4461 ix86_tune = (enum processor_type) ptr->tune;
4462 ix86_branch_cost = ptr->branch_cost;
4463 ix86_tune_defaulted = ptr->tune_defaulted;
4464 ix86_arch_specified = ptr->arch_specified;
4465 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4466 target_flags_explicit = ptr->ix86_target_flags_explicit;
4467 recip_mask_explicit = ptr->x_recip_mask_explicit;
4469 /* Recreate the arch feature tests if the arch changed */
4470 if (old_arch != ix86_arch)
4472 ix86_arch_mask = 1u << ix86_arch;
4473 for (i = 0; i < X86_ARCH_LAST; ++i)
4474 ix86_arch_features[i]
4475 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4478 /* Recreate the tune optimization tests */
4479 if (old_tune != ix86_tune)
4481 ix86_tune_mask = 1u << ix86_tune;
4482 for (i = 0; i < X86_TUNE_LAST; ++i)
4483 ix86_tune_features[i]
4484 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4488 /* Print the current options */
4490 static void
4491 ix86_function_specific_print (FILE *file, int indent,
4492 struct cl_target_option *ptr)
4494 char *target_string
4495 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4496 NULL, NULL, ptr->x_ix86_fpmath, false);
4498 fprintf (file, "%*sarch = %d (%s)\n",
4499 indent, "",
4500 ptr->arch,
4501 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4502 ? cpu_names[ptr->arch]
4503 : "<unknown>"));
4505 fprintf (file, "%*stune = %d (%s)\n",
4506 indent, "",
4507 ptr->tune,
4508 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4509 ? cpu_names[ptr->tune]
4510 : "<unknown>"));
4512 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4514 if (target_string)
4516 fprintf (file, "%*s%s\n", indent, "", target_string);
4517 free (target_string);
4522 /* Inner function to process the attribute((target(...))), take an argument and
4523 set the current options from the argument. If we have a list, recursively go
4524 over the list. */
4526 static bool
4527 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4528 struct gcc_options *enum_opts_set)
4530 char *next_optstr;
4531 bool ret = true;
4533 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4534 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4535 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4536 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4537 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4539 enum ix86_opt_type
4541 ix86_opt_unknown,
4542 ix86_opt_yes,
4543 ix86_opt_no,
4544 ix86_opt_str,
4545 ix86_opt_enum,
4546 ix86_opt_isa
4549 static const struct
4551 const char *string;
4552 size_t len;
4553 enum ix86_opt_type type;
4554 int opt;
4555 int mask;
4556 } attrs[] = {
4557 /* isa options */
4558 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4559 IX86_ATTR_ISA ("abm", OPT_mabm),
4560 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4561 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4562 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4563 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4564 IX86_ATTR_ISA ("aes", OPT_maes),
4565 IX86_ATTR_ISA ("avx", OPT_mavx),
4566 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4567 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4568 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4569 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4570 IX86_ATTR_ISA ("sse", OPT_msse),
4571 IX86_ATTR_ISA ("sse2", OPT_msse2),
4572 IX86_ATTR_ISA ("sse3", OPT_msse3),
4573 IX86_ATTR_ISA ("sse4", OPT_msse4),
4574 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4575 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4576 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4577 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4578 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4579 IX86_ATTR_ISA ("fma", OPT_mfma),
4580 IX86_ATTR_ISA ("xop", OPT_mxop),
4581 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4582 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4583 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4584 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4585 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4586 IX86_ATTR_ISA ("hle", OPT_mhle),
4587 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4588 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4589 IX86_ATTR_ISA ("adx", OPT_madx),
4590 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4591 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4592 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4594 /* enum options */
4595 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4597 /* string options */
4598 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4599 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4601 /* flag options */
4602 IX86_ATTR_YES ("cld",
4603 OPT_mcld,
4604 MASK_CLD),
4606 IX86_ATTR_NO ("fancy-math-387",
4607 OPT_mfancy_math_387,
4608 MASK_NO_FANCY_MATH_387),
4610 IX86_ATTR_YES ("ieee-fp",
4611 OPT_mieee_fp,
4612 MASK_IEEE_FP),
4614 IX86_ATTR_YES ("inline-all-stringops",
4615 OPT_minline_all_stringops,
4616 MASK_INLINE_ALL_STRINGOPS),
4618 IX86_ATTR_YES ("inline-stringops-dynamically",
4619 OPT_minline_stringops_dynamically,
4620 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4622 IX86_ATTR_NO ("align-stringops",
4623 OPT_mno_align_stringops,
4624 MASK_NO_ALIGN_STRINGOPS),
4626 IX86_ATTR_YES ("recip",
4627 OPT_mrecip,
4628 MASK_RECIP),
4632 /* If this is a list, recurse to get the options. */
4633 if (TREE_CODE (args) == TREE_LIST)
4635 bool ret = true;
4637 for (; args; args = TREE_CHAIN (args))
4638 if (TREE_VALUE (args)
4639 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4640 p_strings, enum_opts_set))
4641 ret = false;
4643 return ret;
4646 else if (TREE_CODE (args) != STRING_CST)
4648 error ("attribute %<target%> argument not a string");
4649 return false;
4652 /* Handle multiple arguments separated by commas. */
4653 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4655 while (next_optstr && *next_optstr != '\0')
4657 char *p = next_optstr;
4658 char *orig_p = p;
4659 char *comma = strchr (next_optstr, ',');
4660 const char *opt_string;
4661 size_t len, opt_len;
4662 int opt;
4663 bool opt_set_p;
4664 char ch;
4665 unsigned i;
4666 enum ix86_opt_type type = ix86_opt_unknown;
4667 int mask = 0;
4669 if (comma)
4671 *comma = '\0';
4672 len = comma - next_optstr;
4673 next_optstr = comma + 1;
4675 else
4677 len = strlen (p);
4678 next_optstr = NULL;
4681 /* Recognize no-xxx. */
4682 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4684 opt_set_p = false;
4685 p += 3;
4686 len -= 3;
4688 else
4689 opt_set_p = true;
4691 /* Find the option. */
4692 ch = *p;
4693 opt = N_OPTS;
4694 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4696 type = attrs[i].type;
4697 opt_len = attrs[i].len;
4698 if (ch == attrs[i].string[0]
4699 && ((type != ix86_opt_str && type != ix86_opt_enum)
4700 ? len == opt_len
4701 : len > opt_len)
4702 && memcmp (p, attrs[i].string, opt_len) == 0)
4704 opt = attrs[i].opt;
4705 mask = attrs[i].mask;
4706 opt_string = attrs[i].string;
4707 break;
4711 /* Process the option. */
4712 if (opt == N_OPTS)
4714 error ("attribute(target(\"%s\")) is unknown", orig_p);
4715 ret = false;
4718 else if (type == ix86_opt_isa)
4720 struct cl_decoded_option decoded;
4722 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4723 ix86_handle_option (&global_options, &global_options_set,
4724 &decoded, input_location);
4727 else if (type == ix86_opt_yes || type == ix86_opt_no)
4729 if (type == ix86_opt_no)
4730 opt_set_p = !opt_set_p;
4732 if (opt_set_p)
4733 target_flags |= mask;
4734 else
4735 target_flags &= ~mask;
4738 else if (type == ix86_opt_str)
4740 if (p_strings[opt])
4742 error ("option(\"%s\") was already specified", opt_string);
4743 ret = false;
4745 else
4746 p_strings[opt] = xstrdup (p + opt_len);
4749 else if (type == ix86_opt_enum)
4751 bool arg_ok;
4752 int value;
4754 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4755 if (arg_ok)
4756 set_option (&global_options, enum_opts_set, opt, value,
4757 p + opt_len, DK_UNSPECIFIED, input_location,
4758 global_dc);
4759 else
4761 error ("attribute(target(\"%s\")) is unknown", orig_p);
4762 ret = false;
4766 else
4767 gcc_unreachable ();
4770 return ret;
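/* Illustrative example (not part of the implementation): the parser above
   accepts comma-separated items such as

     __attribute__((target("no-fancy-math-387,arch=core2,fpmath=sse")))
     void foo (void);

   where a "no-" prefix clears the corresponding flag option, "arch=" and
   "tune=" are the string options and "fpmath=" is the enum option from the
   attrs[] table.  */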
4773 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4775 tree
4776 ix86_valid_target_attribute_tree (tree args)
4778 const char *orig_arch_string = ix86_arch_string;
4779 const char *orig_tune_string = ix86_tune_string;
4780 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4781 int orig_tune_defaulted = ix86_tune_defaulted;
4782 int orig_arch_specified = ix86_arch_specified;
4783 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4784 tree t = NULL_TREE;
4785 int i;
4786 struct cl_target_option *def
4787 = TREE_TARGET_OPTION (target_option_default_node);
4788 struct gcc_options enum_opts_set;
4790 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4792 /* Process each of the options on the chain. */
4793 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4794 &enum_opts_set))
4795 return error_mark_node;
4797 /* If the changed options are different from the default, rerun
4798 ix86_option_override_internal, and then save the options away.
4799 The string options are attribute options, and will be undone
4800 when we copy the save structure. */
4801 if (ix86_isa_flags != def->x_ix86_isa_flags
4802 || target_flags != def->x_target_flags
4803 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4804 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4805 || enum_opts_set.x_ix86_fpmath)
4807 /* If we are using the default tune= or arch=, undo the string assigned,
4808 and use the default. */
4809 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4810 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4811 else if (!orig_arch_specified)
4812 ix86_arch_string = NULL;
4814 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4815 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4816 else if (orig_tune_defaulted)
4817 ix86_tune_string = NULL;
4819 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4820 if (enum_opts_set.x_ix86_fpmath)
4821 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4822 else if (!TARGET_64BIT && TARGET_SSE)
4824 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4825 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4828 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4829 ix86_option_override_internal (false);
4831 /* Add any builtin functions with the new ISA, if any. */
4832 ix86_add_new_builtins (ix86_isa_flags);
4834 /* Save the current options unless we are validating options for
4835 #pragma. */
4836 t = build_target_option_node ();
4838 ix86_arch_string = orig_arch_string;
4839 ix86_tune_string = orig_tune_string;
4840 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4842 /* Free up memory allocated to hold the strings */
4843 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4844 free (option_strings[i]);
4847 return t;
4850 /* Hook to validate attribute((target("string"))). */
4852 static bool
4853 ix86_valid_target_attribute_p (tree fndecl,
4854 tree ARG_UNUSED (name),
4855 tree args,
4856 int ARG_UNUSED (flags))
4858 struct cl_target_option cur_target;
4859 bool ret = true;
4861 /* attribute((target("default"))) does nothing, beyond
4862 affecting multi-versioning. */
4863 if (TREE_VALUE (args)
4864 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4865 && TREE_CHAIN (args) == NULL_TREE
4866 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4867 return true;
4869 tree old_optimize = build_optimization_node ();
4870 tree new_target, new_optimize;
4871 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4873 /* If the function changed the optimization levels as well as setting target
4874 options, start with the optimizations specified. */
4875 if (func_optimize && func_optimize != old_optimize)
4876 cl_optimization_restore (&global_options,
4877 TREE_OPTIMIZATION (func_optimize));
4879 /* The target attributes may also change some optimization flags, so update
4880 the optimization options if necessary. */
4881 cl_target_option_save (&cur_target, &global_options);
4882 new_target = ix86_valid_target_attribute_tree (args);
4883 new_optimize = build_optimization_node ();
4885 if (new_target == error_mark_node)
4886 ret = false;
4888 else if (fndecl && new_target)
4890 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4892 if (old_optimize != new_optimize)
4893 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4896 cl_target_option_restore (&global_options, &cur_target);
4898 if (old_optimize != new_optimize)
4899 cl_optimization_restore (&global_options,
4900 TREE_OPTIMIZATION (old_optimize));
4902 return ret;
4906 /* Hook to determine if one function can safely inline another. */
4908 static bool
4909 ix86_can_inline_p (tree caller, tree callee)
4911 bool ret = false;
4912 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4913 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4915 /* If callee has no option attributes, then it is ok to inline. */
4916 if (!callee_tree)
4917 ret = true;
4919 /* If the caller has no option attributes but the callee does, then it is
4920 not ok to inline. */
4921 else if (!caller_tree)
4922 ret = false;
4924 else
4926 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4927 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4929 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4930 function can inline an SSE2 function but an SSE2 function can't inline
4931 an SSE4 function. */
4932 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4933 != callee_opts->x_ix86_isa_flags)
4934 ret = false;
4936 /* See if we have the same non-isa options. */
4937 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4938 ret = false;
4940 /* See if arch, tune, etc. are the same. */
4941 else if (caller_opts->arch != callee_opts->arch)
4942 ret = false;
4944 else if (caller_opts->tune != callee_opts->tune)
4945 ret = false;
4947 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4948 ret = false;
4950 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4951 ret = false;
4953 else
4954 ret = true;
4957 return ret;
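/* Illustrative example (hypothetical declarations): with

     __attribute__((target("sse2")))   int callee (void);
     __attribute__((target("sse4.2"))) int caller (void);

   caller may inline callee, because callee's ISA flags are a subset of
   caller's; inlining in the opposite direction is rejected above.  */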
4961 /* Remember the last target of ix86_set_current_function. */
4962 static GTY(()) tree ix86_previous_fndecl;
4964 /* Invalidate ix86_previous_fndecl cache. */
4965 void
4966 ix86_reset_previous_fndecl (void)
4968 ix86_previous_fndecl = NULL_TREE;
4971 /* Establish appropriate back-end context for processing the function
4972 FNDECL. The argument might be NULL to indicate processing at top
4973 level, outside of any function scope. */
4974 static void
4975 ix86_set_current_function (tree fndecl)
4977 /* Only change the context if the function changes. This hook is called
4978 several times in the course of compiling a function, and we don't want to
4979 slow things down too much or call target_reinit when it isn't safe. */
4980 if (fndecl && fndecl != ix86_previous_fndecl)
4982 tree old_tree = (ix86_previous_fndecl
4983 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4984 : NULL_TREE);
4986 tree new_tree = (fndecl
4987 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4988 : NULL_TREE);
4990 ix86_previous_fndecl = fndecl;
4991 if (old_tree == new_tree)
4994 else if (new_tree)
4996 cl_target_option_restore (&global_options,
4997 TREE_TARGET_OPTION (new_tree));
4998 target_reinit ();
5001 else if (old_tree)
5003 struct cl_target_option *def
5004 = TREE_TARGET_OPTION (target_option_current_node);
5006 cl_target_option_restore (&global_options, def);
5007 target_reinit ();
5013 /* Return true if this goes in large data/bss. */
5015 static bool
5016 ix86_in_large_data_p (tree exp)
5018 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5019 return false;
5021 /* Functions are never large data. */
5022 if (TREE_CODE (exp) == FUNCTION_DECL)
5023 return false;
5025 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
5027 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
5028 if (strcmp (section, ".ldata") == 0
5029 || strcmp (section, ".lbss") == 0)
5030 return true;
5031 return false;
5033 else
5035 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5037 /* If this is an incomplete type with size 0, then we can't put it
5038 in data because it might be too big when completed. */
5039 if (!size || size > ix86_section_threshold)
5040 return true;
5043 return false;
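/* Illustrative example: with -mcmodel=medium a large object such as
   "static char big_buf[1 << 20];" exceeds ix86_section_threshold
   (-mlarge-data-threshold) and is treated as large data, ending up in
   .ldata/.lbss via the section hooks below, while small objects stay in
   the normal .data/.bss sections.  */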
5046 /* Switch to the appropriate section for output of DECL.
5047 DECL is either a `VAR_DECL' node or a constant of some sort.
5048 RELOC indicates whether forming the initial value of DECL requires
5049 link-time relocations. */
5051 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
5052 ATTRIBUTE_UNUSED;
5054 static section *
5055 x86_64_elf_select_section (tree decl, int reloc,
5056 unsigned HOST_WIDE_INT align)
5058 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5059 && ix86_in_large_data_p (decl))
5061 const char *sname = NULL;
5062 unsigned int flags = SECTION_WRITE;
5063 switch (categorize_decl_for_section (decl, reloc))
5065 case SECCAT_DATA:
5066 sname = ".ldata";
5067 break;
5068 case SECCAT_DATA_REL:
5069 sname = ".ldata.rel";
5070 break;
5071 case SECCAT_DATA_REL_LOCAL:
5072 sname = ".ldata.rel.local";
5073 break;
5074 case SECCAT_DATA_REL_RO:
5075 sname = ".ldata.rel.ro";
5076 break;
5077 case SECCAT_DATA_REL_RO_LOCAL:
5078 sname = ".ldata.rel.ro.local";
5079 break;
5080 case SECCAT_BSS:
5081 sname = ".lbss";
5082 flags |= SECTION_BSS;
5083 break;
5084 case SECCAT_RODATA:
5085 case SECCAT_RODATA_MERGE_STR:
5086 case SECCAT_RODATA_MERGE_STR_INIT:
5087 case SECCAT_RODATA_MERGE_CONST:
5088 sname = ".lrodata";
5089 flags = 0;
5090 break;
5091 case SECCAT_SRODATA:
5092 case SECCAT_SDATA:
5093 case SECCAT_SBSS:
5094 gcc_unreachable ();
5095 case SECCAT_TEXT:
5096 case SECCAT_TDATA:
5097 case SECCAT_TBSS:
5098 /* We don't split these for the medium model. Place them into
5099 default sections and hope for the best. */
5100 break;
5102 if (sname)
5104 /* We might get called with string constants, but get_named_section
5105 doesn't like them as they are not DECLs. Also, we need to set
5106 flags in that case. */
5107 if (!DECL_P (decl))
5108 return get_section (sname, flags, NULL);
5109 return get_named_section (decl, sname, reloc);
5112 return default_elf_select_section (decl, reloc, align);
5115 /* Build up a unique section name, expressed as a
5116 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
5117 RELOC indicates whether the initial value of EXP requires
5118 link-time relocations. */
5120 static void ATTRIBUTE_UNUSED
5121 x86_64_elf_unique_section (tree decl, int reloc)
5123 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5124 && ix86_in_large_data_p (decl))
5126 const char *prefix = NULL;
5127 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
5128 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
5130 switch (categorize_decl_for_section (decl, reloc))
5132 case SECCAT_DATA:
5133 case SECCAT_DATA_REL:
5134 case SECCAT_DATA_REL_LOCAL:
5135 case SECCAT_DATA_REL_RO:
5136 case SECCAT_DATA_REL_RO_LOCAL:
5137 prefix = one_only ? ".ld" : ".ldata";
5138 break;
5139 case SECCAT_BSS:
5140 prefix = one_only ? ".lb" : ".lbss";
5141 break;
5142 case SECCAT_RODATA:
5143 case SECCAT_RODATA_MERGE_STR:
5144 case SECCAT_RODATA_MERGE_STR_INIT:
5145 case SECCAT_RODATA_MERGE_CONST:
5146 prefix = one_only ? ".lr" : ".lrodata";
5147 break;
5148 case SECCAT_SRODATA:
5149 case SECCAT_SDATA:
5150 case SECCAT_SBSS:
5151 gcc_unreachable ();
5152 case SECCAT_TEXT:
5153 case SECCAT_TDATA:
5154 case SECCAT_TBSS:
5155 /* We don't split these for the medium model. Place them into
5156 default sections and hope for the best. */
5157 break;
5159 if (prefix)
5161 const char *name, *linkonce;
5162 char *string;
5164 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
5165 name = targetm.strip_name_encoding (name);
5167 /* If we're using one_only, then there needs to be a .gnu.linkonce
5168 prefix to the section name. */
5169 linkonce = one_only ? ".gnu.linkonce" : "";
5171 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
5173 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
5174 return;
5177 default_unique_section (decl, reloc);
5180 #ifdef COMMON_ASM_OP
5181 /* This says how to output assembler code to declare an
5182 uninitialized external linkage data object.
5184 For medium model x86-64 we need to use the .largecomm directive for
5185 large objects. */
5186 void
5187 x86_elf_aligned_common (FILE *file,
5188 const char *name, unsigned HOST_WIDE_INT size,
5189 int align)
5191 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5192 && size > (unsigned int)ix86_section_threshold)
5193 fputs (".largecomm\t", file);
5194 else
5195 fputs (COMMON_ASM_OP, file);
5196 assemble_name (file, name);
5197 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
5198 size, align / BITS_PER_UNIT);
5200 #endif
5202 /* Utility function for targets to use in implementing
5203 ASM_OUTPUT_ALIGNED_BSS. */
5205 void
5206 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
5207 const char *name, unsigned HOST_WIDE_INT size,
5208 int align)
5210 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5211 && size > (unsigned int)ix86_section_threshold)
5212 switch_to_section (get_named_section (decl, ".lbss", 0));
5213 else
5214 switch_to_section (bss_section);
5215 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
5216 #ifdef ASM_DECLARE_OBJECT_NAME
5217 last_assemble_variable_decl = decl;
5218 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5219 #else
5220 /* The standard thing is just to output a label for the object. */
5221 ASM_OUTPUT_LABEL (file, name);
5222 #endif /* ASM_DECLARE_OBJECT_NAME */
5223 ASM_OUTPUT_SKIP (file, size ? size : 1);
5226 /* Decide whether we must probe the stack before any space allocation
5227 on this target. It's essentially TARGET_STACK_PROBE except when
5228 -fstack-check causes the stack to be already probed differently. */
5230 bool
5231 ix86_target_stack_probe (void)
5233 /* Do not probe the stack twice if static stack checking is enabled. */
5234 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5235 return false;
5237 return TARGET_STACK_PROBE;
5240 /* Decide whether we can make a sibling call to a function. DECL is the
5241 declaration of the function being targeted by the call and EXP is the
5242 CALL_EXPR representing the call. */
5244 static bool
5245 ix86_function_ok_for_sibcall (tree decl, tree exp)
5247 tree type, decl_or_type;
5248 rtx a, b;
5250 /* If we are generating position-independent code, we cannot sibcall
5251 optimize any indirect call, or a direct call to a global function,
5252 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5253 if (!TARGET_MACHO
5254 && !TARGET_64BIT
5255 && flag_pic
5256 && (!decl || !targetm.binds_local_p (decl)))
5257 return false;
5259 /* If we need to align the outgoing stack, then sibcalling would
5260 unalign the stack, which may break the called function. */
5261 if (ix86_minimum_incoming_stack_boundary (true)
5262 < PREFERRED_STACK_BOUNDARY)
5263 return false;
5265 if (decl)
5267 decl_or_type = decl;
5268 type = TREE_TYPE (decl);
5270 else
5272 /* We're looking at the CALL_EXPR, we need the type of the function. */
5273 type = CALL_EXPR_FN (exp); /* pointer expression */
5274 type = TREE_TYPE (type); /* pointer type */
5275 type = TREE_TYPE (type); /* function type */
5276 decl_or_type = type;
5279 /* Check that the return value locations are the same. Like
5280 if we are returning floats on the 80387 register stack, we cannot
5281 make a sibcall from a function that doesn't return a float to a
5282 function that does or, conversely, from a function that does return
5283 a float to a function that doesn't; the necessary stack adjustment
5284 would not be executed. This is also the place we notice
5285 differences in the return value ABI. Note that it is ok for one
5286 of the functions to have void return type as long as the return
5287 value of the other is passed in a register. */
5288 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5289 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5290 cfun->decl, false);
5291 if (STACK_REG_P (a) || STACK_REG_P (b))
5293 if (!rtx_equal_p (a, b))
5294 return false;
5296 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5298 else if (!rtx_equal_p (a, b))
5299 return false;
5301 if (TARGET_64BIT)
5303 /* The SYSV ABI has more call-clobbered registers;
5304 disallow sibcalls from MS to SYSV. */
5305 if (cfun->machine->call_abi == MS_ABI
5306 && ix86_function_type_abi (type) == SYSV_ABI)
5307 return false;
5309 else
5311 /* If this call is indirect, we'll need to be able to use a
5312 call-clobbered register for the address of the target function.
5313 Make sure that all such registers are not used for passing
5314 parameters. Note that DLLIMPORT functions are indirect. */
5315 if (!decl
5316 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5318 if (ix86_function_regparm (type, NULL) >= 3)
5320 /* ??? Need to count the actual number of registers to be used,
5321 not the possible number of registers. Fix later. */
5322 return false;
5327 /* Otherwise okay. That also includes certain types of indirect calls. */
5328 return true;
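/* Illustrative example (-m32, 387 float returns): a function returning int
   must not sibcall a function returning double, since the result would be
   left on the FP register stack and the required adjustment would never be
   executed; an int-returning tail call to another int-returning function is
   fine, as both results live in %eax.  */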
5331 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5332 and "sseregparm" calling convention attributes;
5333 arguments as in struct attribute_spec.handler. */
5335 static tree
5336 ix86_handle_cconv_attribute (tree *node, tree name,
5337 tree args,
5338 int flags ATTRIBUTE_UNUSED,
5339 bool *no_add_attrs)
5341 if (TREE_CODE (*node) != FUNCTION_TYPE
5342 && TREE_CODE (*node) != METHOD_TYPE
5343 && TREE_CODE (*node) != FIELD_DECL
5344 && TREE_CODE (*node) != TYPE_DECL)
5346 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5347 name);
5348 *no_add_attrs = true;
5349 return NULL_TREE;
5352 /* Can combine regparm with all attributes but fastcall, and thiscall. */
5353 if (is_attribute_p ("regparm", name))
5355 tree cst;
5357 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5359 error ("fastcall and regparm attributes are not compatible");
5362 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5364 error ("regparm and thiscall attributes are not compatible");
5367 cst = TREE_VALUE (args);
5368 if (TREE_CODE (cst) != INTEGER_CST)
5370 warning (OPT_Wattributes,
5371 "%qE attribute requires an integer constant argument",
5372 name);
5373 *no_add_attrs = true;
5375 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5377 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5378 name, REGPARM_MAX);
5379 *no_add_attrs = true;
5382 return NULL_TREE;
5385 if (TARGET_64BIT)
5387 /* Do not warn when emulating the MS ABI. */
5388 if ((TREE_CODE (*node) != FUNCTION_TYPE
5389 && TREE_CODE (*node) != METHOD_TYPE)
5390 || ix86_function_type_abi (*node) != MS_ABI)
5391 warning (OPT_Wattributes, "%qE attribute ignored",
5392 name);
5393 *no_add_attrs = true;
5394 return NULL_TREE;
5397 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5398 if (is_attribute_p ("fastcall", name))
5400 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5402 error ("fastcall and cdecl attributes are not compatible");
5404 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5406 error ("fastcall and stdcall attributes are not compatible");
5408 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5410 error ("fastcall and regparm attributes are not compatible");
5412 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5414 error ("fastcall and thiscall attributes are not compatible");
5418 /* Can combine stdcall with fastcall (redundant), regparm and
5419 sseregparm. */
5420 else if (is_attribute_p ("stdcall", name))
5422 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5424 error ("stdcall and cdecl attributes are not compatible");
5426 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5428 error ("stdcall and fastcall attributes are not compatible");
5430 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5432 error ("stdcall and thiscall attributes are not compatible");
5436 /* Can combine cdecl with regparm and sseregparm. */
5437 else if (is_attribute_p ("cdecl", name))
5439 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5441 error ("stdcall and cdecl attributes are not compatible");
5443 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5445 error ("fastcall and cdecl attributes are not compatible");
5447 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5449 error ("cdecl and thiscall attributes are not compatible");
5452 else if (is_attribute_p ("thiscall", name))
5454 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5455 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5456 name);
5457 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5459 error ("stdcall and thiscall attributes are not compatible");
5461 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5463 error ("fastcall and thiscall attributes are not compatible");
5465 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5467 error ("cdecl and thiscall attributes are not compatible");
5471 /* Can combine sseregparm with all attributes. */
5473 return NULL_TREE;
5476 /* The transactional memory builtins are implicitly regparm or fastcall
5477 depending on the ABI. Override the generic do-nothing attribute that
5478 these builtins were declared with, and replace it with one of the two
5479 attributes that we expect elsewhere. */
5481 static tree
5482 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5483 tree args ATTRIBUTE_UNUSED,
5484 int flags ATTRIBUTE_UNUSED,
5485 bool *no_add_attrs)
5487 tree alt;
5489 /* In no case do we want to add the placeholder attribute. */
5490 *no_add_attrs = true;
5492 /* The 64-bit ABI is unchanged for transactional memory. */
5493 if (TARGET_64BIT)
5494 return NULL_TREE;
5496 /* ??? Is there a better way to validate 32-bit Windows? We have
5497 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5498 if (CHECK_STACK_LIMIT > 0)
5499 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5500 else
5502 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5503 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5505 decl_attributes (node, alt, flags);
5507 return NULL_TREE;
5510 /* This function determines from TYPE the calling-convention. */
5512 unsigned int
5513 ix86_get_callcvt (const_tree type)
5515 unsigned int ret = 0;
5516 bool is_stdarg;
5517 tree attrs;
5519 if (TARGET_64BIT)
5520 return IX86_CALLCVT_CDECL;
5522 attrs = TYPE_ATTRIBUTES (type);
5523 if (attrs != NULL_TREE)
5525 if (lookup_attribute ("cdecl", attrs))
5526 ret |= IX86_CALLCVT_CDECL;
5527 else if (lookup_attribute ("stdcall", attrs))
5528 ret |= IX86_CALLCVT_STDCALL;
5529 else if (lookup_attribute ("fastcall", attrs))
5530 ret |= IX86_CALLCVT_FASTCALL;
5531 else if (lookup_attribute ("thiscall", attrs))
5532 ret |= IX86_CALLCVT_THISCALL;
5534 /* Regparm isn't allowed for thiscall and fastcall. */
5535 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5537 if (lookup_attribute ("regparm", attrs))
5538 ret |= IX86_CALLCVT_REGPARM;
5539 if (lookup_attribute ("sseregparm", attrs))
5540 ret |= IX86_CALLCVT_SSEREGPARM;
6543 if (IX86_BASE_CALLCVT (ret) != 0)
5544 return ret;
5547 is_stdarg = stdarg_p (type);
5548 if (TARGET_RTD && !is_stdarg)
5549 return IX86_CALLCVT_STDCALL | ret;
5551 if (ret != 0
5552 || is_stdarg
5553 || TREE_CODE (type) != METHOD_TYPE
5554 || ix86_function_type_abi (type) != MS_ABI)
5555 return IX86_CALLCVT_CDECL | ret;
5557 return IX86_CALLCVT_THISCALL;
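/* Illustrative example: for a declaration such as

     int __attribute__((fastcall)) f (int a, int b);

   this function returns IX86_CALLCVT_FASTCALL, so the first two integer
   arguments are passed in %ecx and %edx (see ix86_function_regparm).  */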
5560 /* Return 0 if the attributes for two types are incompatible, 1 if they
5561 are compatible, and 2 if they are nearly compatible (which causes a
5562 warning to be generated). */
5564 static int
5565 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5567 unsigned int ccvt1, ccvt2;
5569 if (TREE_CODE (type1) != FUNCTION_TYPE
5570 && TREE_CODE (type1) != METHOD_TYPE)
5571 return 1;
5573 ccvt1 = ix86_get_callcvt (type1);
5574 ccvt2 = ix86_get_callcvt (type2);
5575 if (ccvt1 != ccvt2)
5576 return 0;
5577 if (ix86_function_regparm (type1, NULL)
5578 != ix86_function_regparm (type2, NULL))
5579 return 0;
5581 return 1;
5584 /* Return the regparm value for a function with the indicated TYPE and DECL.
5585 DECL may be NULL when calling function indirectly
5586 or considering a libcall. */
5588 static int
5589 ix86_function_regparm (const_tree type, const_tree decl)
5591 tree attr;
5592 int regparm;
5593 unsigned int ccvt;
5595 if (TARGET_64BIT)
5596 return (ix86_function_type_abi (type) == SYSV_ABI
5597 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5598 ccvt = ix86_get_callcvt (type);
5599 regparm = ix86_regparm;
5601 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5603 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5604 if (attr)
5606 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5607 return regparm;
5610 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5611 return 2;
5612 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5613 return 1;
5615 /* Use register calling convention for local functions when possible. */
5616 if (decl
5617 && TREE_CODE (decl) == FUNCTION_DECL
5618 && optimize
5619 && !(profile_flag && !flag_fentry))
5621 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5622 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5623 if (i && i->local && i->can_change_signature)
5625 int local_regparm, globals = 0, regno;
5627 /* Make sure no regparm register is taken by a
5628 fixed register variable. */
5629 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5630 if (fixed_regs[local_regparm])
5631 break;
5633 /* We don't want to use regparm(3) for nested functions as
5634 these use a static chain pointer in the third argument. */
5635 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5636 local_regparm = 2;
5638 /* In 32-bit mode save a register for the split stack. */
5639 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5640 local_regparm = 2;
5642 /* Each fixed register usage increases register pressure,
5643 so fewer registers should be used for argument passing.
5644 This functionality can be overridden by an explicit
5645 regparm value. */
5646 for (regno = AX_REG; regno <= DI_REG; regno++)
5647 if (fixed_regs[regno])
5648 globals++;
5650 local_regparm
5651 = globals < local_regparm ? local_regparm - globals : 0;
5653 if (local_regparm > regparm)
5654 regparm = local_regparm;
5658 return regparm;
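/* Illustrative example: at -m32 -O2 a purely local function such as

     static int add (int a, int b) { return a + b; }

   whose address never escapes may be given up to regparm(3)-style argument
   passing by the code above, even without an explicit regparm attribute.  */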
5661 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5662 DFmode (2) arguments in SSE registers for a function with the
5663 indicated TYPE and DECL. DECL may be NULL when calling function
5664 indirectly or considering a libcall. Otherwise return 0. */
5666 static int
5667 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5669 gcc_assert (!TARGET_64BIT);
5671 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5672 by the sseregparm attribute. */
5673 if (TARGET_SSEREGPARM
5674 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5676 if (!TARGET_SSE)
5678 if (warn)
5680 if (decl)
5681 error ("calling %qD with attribute sseregparm without "
5682 "SSE/SSE2 enabled", decl);
5683 else
5684 error ("calling %qT with attribute sseregparm without "
5685 "SSE/SSE2 enabled", type);
5687 return 0;
5690 return 2;
5693 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5694 (and DFmode for SSE2) arguments in SSE registers. */
5695 if (decl && TARGET_SSE_MATH && optimize
5696 && !(profile_flag && !flag_fentry))
5698 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5699 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5700 if (i && i->local && i->can_change_signature)
5701 return TARGET_SSE2 ? 2 : 1;
5704 return 0;
5707 /* Return true if EAX is live at the start of the function. Used by
5708 ix86_expand_prologue to determine if we need special help before
5709 calling allocate_stack_worker. */
5711 static bool
5712 ix86_eax_live_at_start_p (void)
5714 /* Cheat. Don't bother working forward from ix86_function_regparm
5715 to the function type to whether an actual argument is located in
5716 eax. Instead just look at cfg info, which is still close enough
5717 to correct at this point. This gives false positives for broken
5718 functions that might use uninitialized data that happens to be
5719 allocated in eax, but who cares? */
5720 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5723 static bool
5724 ix86_keep_aggregate_return_pointer (tree fntype)
5726 tree attr;
5728 if (!TARGET_64BIT)
5730 attr = lookup_attribute ("callee_pop_aggregate_return",
5731 TYPE_ATTRIBUTES (fntype));
5732 if (attr)
5733 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5735 /* For 32-bit MS-ABI the default is to keep aggregate
5736 return pointer. */
5737 if (ix86_function_type_abi (fntype) == MS_ABI)
5738 return true;
5740 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5743 /* Value is the number of bytes of arguments automatically
5744 popped when returning from a subroutine call.
5745 FUNDECL is the declaration node of the function (as a tree),
5746 FUNTYPE is the data type of the function (as a tree),
5747 or for a library call it is an identifier node for the subroutine name.
5748 SIZE is the number of bytes of arguments passed on the stack.
5750 On the 80386, the RTD insn may be used to pop them if the number
5751 of args is fixed, but if the number is variable then the caller
5752 must pop them all. RTD can't be used for library calls now
5753 because the library is compiled with the Unix compiler.
5754 Use of RTD is a selectable option, since it is incompatible with
5755 standard Unix calling sequences. If the option is not selected,
5756 the caller must always pop the args.
5758 The attribute stdcall is equivalent to RTD on a per module basis. */
5760 static int
5761 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5763 unsigned int ccvt;
5765 /* None of the 64-bit ABIs pop arguments. */
5766 if (TARGET_64BIT)
5767 return 0;
5769 ccvt = ix86_get_callcvt (funtype);
5771 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5772 | IX86_CALLCVT_THISCALL)) != 0
5773 && ! stdarg_p (funtype))
5774 return size;
5776 /* Lose any fake structure return argument if it is passed on the stack. */
5777 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5778 && !ix86_keep_aggregate_return_pointer (funtype))
5780 int nregs = ix86_function_regparm (funtype, fundecl);
5781 if (nregs == 0)
5782 return GET_MODE_SIZE (Pmode);
5785 return 0;
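/* Illustrative example: for

     int __attribute__((stdcall)) f (int a, int b);

   the callee pops its 8 bytes of arguments on return (a "ret $8"), so this
   hook returns SIZE; for a plain cdecl function it returns 0 and the caller
   pops the arguments.  */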
5788 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5790 static bool
5791 ix86_legitimate_combined_insn (rtx insn)
5793 /* Check operand constraints in case hard registers were propagated
5794 into insn pattern. This check prevents combine pass from
5795 generating insn patterns with invalid hard register operands.
5796 These invalid insns can eventually confuse reload to error out
5797 with a spill failure. See also PRs 46829 and 46843. */
5798 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5800 int i;
5802 extract_insn (insn);
5803 preprocess_constraints ();
5805 for (i = 0; i < recog_data.n_operands; i++)
5807 rtx op = recog_data.operand[i];
5808 enum machine_mode mode = GET_MODE (op);
5809 struct operand_alternative *op_alt;
5810 int offset = 0;
5811 bool win;
5812 int j;
5814 /* A unary operator may be accepted by the predicate, but it
5815 is irrelevant for matching constraints. */
5816 if (UNARY_P (op))
5817 op = XEXP (op, 0);
5819 if (GET_CODE (op) == SUBREG)
5821 if (REG_P (SUBREG_REG (op))
5822 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5823 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5824 GET_MODE (SUBREG_REG (op)),
5825 SUBREG_BYTE (op),
5826 GET_MODE (op));
5827 op = SUBREG_REG (op);
5830 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5831 continue;
5833 op_alt = recog_op_alt[i];
5835 /* Operand has no constraints, anything is OK. */
5836 win = !recog_data.n_alternatives;
5838 for (j = 0; j < recog_data.n_alternatives; j++)
5840 if (op_alt[j].anything_ok
5841 || (op_alt[j].matches != -1
5842 && operands_match_p
5843 (recog_data.operand[i],
5844 recog_data.operand[op_alt[j].matches]))
5845 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5847 win = true;
5848 break;
5852 if (!win)
5853 return false;
5857 return true;
5860 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5862 static unsigned HOST_WIDE_INT
5863 ix86_asan_shadow_offset (void)
5865 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5866 : HOST_WIDE_INT_C (0x7fff8000))
5867 : (HOST_WIDE_INT_1 << 29);
5870 /* Argument support functions. */
5872 /* Return true when register may be used to pass function parameters. */
5873 bool
5874 ix86_function_arg_regno_p (int regno)
5876 int i;
5877 const int *parm_regs;
5879 if (!TARGET_64BIT)
5881 if (TARGET_MACHO)
5882 return (regno < REGPARM_MAX
5883 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5884 else
5885 return (regno < REGPARM_MAX
5886 || (TARGET_MMX && MMX_REGNO_P (regno)
5887 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5888 || (TARGET_SSE && SSE_REGNO_P (regno)
5889 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5892 if (TARGET_MACHO)
5894 if (SSE_REGNO_P (regno) && TARGET_SSE)
5895 return true;
5897 else
5899 if (TARGET_SSE && SSE_REGNO_P (regno)
5900 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5901 return true;
5904 /* TODO: The function should depend on current function ABI but
5905 builtins.c would need updating then. Therefore we use the
5906 default ABI. */
5908 /* RAX is used as hidden argument to va_arg functions. */
5909 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5910 return true;
5912 if (ix86_abi == MS_ABI)
5913 parm_regs = x86_64_ms_abi_int_parameter_registers;
5914 else
5915 parm_regs = x86_64_int_parameter_registers;
5916 for (i = 0; i < (ix86_abi == MS_ABI
5917 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5918 if (regno == parm_regs[i])
5919 return true;
5920 return false;
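/* For reference: in 64-bit mode the integer parameter registers checked
   above are %rdi, %rsi, %rdx, %rcx, %r8 and %r9 for the SysV ABI
   (x86_64_int_parameter_registers) and %rcx, %rdx, %r8 and %r9 for the
   MS ABI (x86_64_ms_abi_int_parameter_registers).  */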
5923 /* Return if we do not know how to pass TYPE solely in registers. */
5925 static bool
5926 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5928 if (must_pass_in_stack_var_size_or_pad (mode, type))
5929 return true;
5931 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5932 The layout_type routine is crafty and tries to trick us into passing
5933 currently unsupported vector types on the stack by using TImode. */
5934 return (!TARGET_64BIT && mode == TImode
5935 && type && TREE_CODE (type) != VECTOR_TYPE);
5938 /* Return the size, in bytes, of the area reserved for arguments passed
5939 in registers for the function represented by FNDECL, depending on the
5940 ABI used. */
5941 int
5942 ix86_reg_parm_stack_space (const_tree fndecl)
5944 enum calling_abi call_abi = SYSV_ABI;
5945 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5946 call_abi = ix86_function_abi (fndecl);
5947 else
5948 call_abi = ix86_function_type_abi (fndecl);
5949 if (TARGET_64BIT && call_abi == MS_ABI)
5950 return 32;
5951 return 0;
5954 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5955 call abi used. */
5956 enum calling_abi
5957 ix86_function_type_abi (const_tree fntype)
5959 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5961 enum calling_abi abi = ix86_abi;
5962 if (abi == SYSV_ABI)
5964 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5965 abi = MS_ABI;
5967 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5968 abi = SYSV_ABI;
5969 return abi;
5971 return ix86_abi;
5974 static bool
5975 ix86_function_ms_hook_prologue (const_tree fn)
5977 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5979 if (decl_function_context (fn) != NULL_TREE)
5980 error_at (DECL_SOURCE_LOCATION (fn),
5981 "ms_hook_prologue is not compatible with nested function");
5982 else
5983 return true;
5985 return false;
5988 static enum calling_abi
5989 ix86_function_abi (const_tree fndecl)
5991 if (! fndecl)
5992 return ix86_abi;
5993 return ix86_function_type_abi (TREE_TYPE (fndecl));
5996 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5997 call abi used. */
5998 enum calling_abi
5999 ix86_cfun_abi (void)
6001 if (! cfun)
6002 return ix86_abi;
6003 return cfun->machine->call_abi;
6006 /* Write the extra assembler code needed to declare a function properly. */
6008 void
6009 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
6010 tree decl)
6012 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
6014 if (is_ms_hook)
6016 int i, filler_count = (TARGET_64BIT ? 32 : 16);
6017 unsigned int filler_cc = 0xcccccccc;
6019 for (i = 0; i < filler_count; i += 4)
6020 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
6023 #ifdef SUBTARGET_ASM_UNWIND_INIT
6024 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
6025 #endif
6027 ASM_OUTPUT_LABEL (asm_out_file, fname);
6029 /* Output magic byte marker, if hot-patch attribute is set. */
6030 if (is_ms_hook)
6032 if (TARGET_64BIT)
6034 /* leaq [%rsp + 0], %rsp */
6035 asm_fprintf (asm_out_file, ASM_BYTE
6036 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
6038 else
6040 /* movl.s %edi, %edi
6041 push %ebp
6042 movl.s %esp, %ebp */
6043 asm_fprintf (asm_out_file, ASM_BYTE
6044 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
6049 /* regclass.c */
6050 extern void init_regs (void);
6052 /* Implementation of the call ABI switching target hook. The call
6053 register sets specific to FNDECL are set up. See also
6054 ix86_conditional_register_usage for more details. */
6055 void
6056 ix86_call_abi_override (const_tree fndecl)
6058 if (fndecl == NULL_TREE)
6059 cfun->machine->call_abi = ix86_abi;
6060 else
6061 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
6064 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
6065 Avoid expensive re-initialization of init_regs each time we switch function
6066 context, since this is needed only during RTL expansion. */
6067 static void
6068 ix86_maybe_switch_abi (void)
6070 if (TARGET_64BIT &&
6071 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
6072 reinit_regs ();
6075 /* Initialize a variable CUM of type CUMULATIVE_ARGS
6076 for a call to a function whose data type is FNTYPE.
6077 For a library call, FNTYPE is 0. */
6079 void
6080 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
6081 tree fntype, /* tree ptr for function decl */
6082 rtx libname, /* SYMBOL_REF of library name or 0 */
6083 tree fndecl,
6084 int caller)
6086 struct cgraph_local_info *i;
6088 memset (cum, 0, sizeof (*cum));
6090 if (fndecl)
6092 i = cgraph_local_info (fndecl);
6093 cum->call_abi = ix86_function_abi (fndecl);
6095 else
6097 i = NULL;
6098 cum->call_abi = ix86_function_type_abi (fntype);
6101 cum->caller = caller;
6103 /* Set up the number of registers to use for passing arguments. */
6105 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
6106 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
6107 "or subtarget optimization implying it");
6108 cum->nregs = ix86_regparm;
6109 if (TARGET_64BIT)
6111 cum->nregs = (cum->call_abi == SYSV_ABI
6112 ? X86_64_REGPARM_MAX
6113 : X86_64_MS_REGPARM_MAX);
6115 if (TARGET_SSE)
6117 cum->sse_nregs = SSE_REGPARM_MAX;
6118 if (TARGET_64BIT)
6120 cum->sse_nregs = (cum->call_abi == SYSV_ABI
6121 ? X86_64_SSE_REGPARM_MAX
6122 : X86_64_MS_SSE_REGPARM_MAX);
6125 if (TARGET_MMX)
6126 cum->mmx_nregs = MMX_REGPARM_MAX;
6127 cum->warn_avx = true;
6128 cum->warn_sse = true;
6129 cum->warn_mmx = true;
6131 /* Because the type might mismatch between caller and callee, we need to
6132 use the actual type of the function for local calls.
6133 FIXME: cgraph_analyze can be told to actually record whether a function
6134 uses va_start, so for local functions maybe_vaarg can be made more
6135 aggressive, helping K&R code.
6136 FIXME: once the type system is fixed, we won't need this code anymore. */
6137 if (i && i->local && i->can_change_signature)
6138 fntype = TREE_TYPE (fndecl);
6139 cum->maybe_vaarg = (fntype
6140 ? (!prototype_p (fntype) || stdarg_p (fntype))
6141 : !libname);
6143 if (!TARGET_64BIT)
6145 /* If there are variable arguments, then we won't pass anything
6146 in registers in 32-bit mode. */
6147 if (stdarg_p (fntype))
6149 cum->nregs = 0;
6150 cum->sse_nregs = 0;
6151 cum->mmx_nregs = 0;
6152 cum->warn_avx = 0;
6153 cum->warn_sse = 0;
6154 cum->warn_mmx = 0;
6155 return;
6158 /* Use ecx and edx registers if function has fastcall attribute,
6159 else look for regparm information. */
6160 if (fntype)
6162 unsigned int ccvt = ix86_get_callcvt (fntype);
6163 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6165 cum->nregs = 1;
6166 cum->fastcall = 1; /* Same first register as in fastcall. */
6168 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6170 cum->nregs = 2;
6171 cum->fastcall = 1;
6173 else
6174 cum->nregs = ix86_function_regparm (fntype, fndecl);
6177 /* Set up the number of SSE registers used for passing SFmode
6178 and DFmode arguments. Warn for mismatching ABI. */
6179 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
6183 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
6184 But in the case of vector types, it is some vector mode.
6186 When we have only some of our vector isa extensions enabled, then there
6187 are some modes for which vector_mode_supported_p is false. For these
6188 modes, the generic vector support in gcc will choose some non-vector mode
6189 in order to implement the type. By computing the natural mode, we'll
6190 select the proper ABI location for the operand and not depend on whatever
6191 the middle-end decides to do with these vector types.
6193 The middle-end can't deal with vector types > 16 bytes. In this
6194 case, we return the original mode and warn about the ABI change if
6195 CUM isn't NULL. */
6197 static enum machine_mode
6198 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6200 enum machine_mode mode = TYPE_MODE (type);
6202 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6204 HOST_WIDE_INT size = int_size_in_bytes (type);
6205 if ((size == 8 || size == 16 || size == 32)
6206 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6207 && TYPE_VECTOR_SUBPARTS (type) > 1)
6209 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6211 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6212 mode = MIN_MODE_VECTOR_FLOAT;
6213 else
6214 mode = MIN_MODE_VECTOR_INT;
6216 /* Get the mode which has this inner mode and number of units. */
6217 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6218 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6219 && GET_MODE_INNER (mode) == innermode)
6221 if (size == 32 && !TARGET_AVX)
6223 static bool warnedavx;
6225 if (cum
6226 && !warnedavx
6227 && cum->warn_avx)
6229 warnedavx = true;
6230 warning (0, "AVX vector argument without AVX "
6231 "enabled changes the ABI");
6233 return TYPE_MODE (type);
6235 else if ((size == 8 || size == 16) && !TARGET_SSE)
6237 static bool warnedsse;
6239 if (cum
6240 && !warnedsse
6241 && cum->warn_sse)
6243 warnedsse = true;
6244 warning (0, "SSE vector argument without SSE "
6245 "enabled changes the ABI");
6247 return mode;
6249 else
6250 return mode;
6253 gcc_unreachable ();
6257 return mode;
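/* Illustrative example: a 32-byte vector argument, e.g. a
   "float __attribute__((vector_size (32)))" type, compiled without -mavx
   keeps its original TYPE_MODE and may trigger the ABI-change warning above;
   with -mavx it gets the natural V8SFmode and is passed in one %ymm
   register.  */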
6260 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6261 this may not agree with the mode that the type system has chosen for the
6262 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6263 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6265 static rtx
6266 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6267 unsigned int regno)
6269 rtx tmp;
6271 if (orig_mode != BLKmode)
6272 tmp = gen_rtx_REG (orig_mode, regno);
6273 else
6275 tmp = gen_rtx_REG (mode, regno);
6276 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6277 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6280 return tmp;
6283 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
6284 of this code is to classify each 8bytes of incoming argument by the register
6285 class and assign registers accordingly. */
6287 /* Return the union class of CLASS1 and CLASS2.
6288 See the x86-64 PS ABI for details. */
6290 static enum x86_64_reg_class
6291 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6293 /* Rule #1: If both classes are equal, this is the resulting class. */
6294 if (class1 == class2)
6295 return class1;
6297 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6298 the other class. */
6299 if (class1 == X86_64_NO_CLASS)
6300 return class2;
6301 if (class2 == X86_64_NO_CLASS)
6302 return class1;
6304 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6305 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6306 return X86_64_MEMORY_CLASS;
6308 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6309 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6310 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6311 return X86_64_INTEGERSI_CLASS;
6312 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6313 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6314 return X86_64_INTEGER_CLASS;
6316 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6317 MEMORY is used. */
6318 if (class1 == X86_64_X87_CLASS
6319 || class1 == X86_64_X87UP_CLASS
6320 || class1 == X86_64_COMPLEX_X87_CLASS
6321 || class2 == X86_64_X87_CLASS
6322 || class2 == X86_64_X87UP_CLASS
6323 || class2 == X86_64_COMPLEX_X87_CLASS)
6324 return X86_64_MEMORY_CLASS;
6326 /* Rule #6: Otherwise class SSE is used. */
6327 return X86_64_SSE_CLASS;
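/* Worked example (sketch): for "struct { int i; float f; }" both fields fall
   into the same 8-byte chunk; merging the INTEGERSI and SSE classes by the
   rules above yields X86_64_INTEGER_CLASS, so the struct is passed in one
   general-purpose register.  For "struct { float a; float b; }" both halves
   classify as SSE and the struct is passed in one %xmm register.  */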
6330 /* Classify the argument of type TYPE and mode MODE.
6331 CLASSES will be filled by the register class used to pass each word
6332 of the operand. The number of words is returned. In case the parameter
6333 should be passed in memory, 0 is returned. As a special case for zero
6334 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6336 BIT_OFFSET is used internally for handling records and specifies the
6337 offset in bits modulo 256 to avoid overflow cases.
6339 See the x86-64 PS ABI for details.
6342 static int
6343 classify_argument (enum machine_mode mode, const_tree type,
6344 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6346 HOST_WIDE_INT bytes =
6347 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6348 int words
6349 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6351 /* Variable sized entities are always passed/returned in memory. */
6352 if (bytes < 0)
6353 return 0;
6355 if (mode != VOIDmode
6356 && targetm.calls.must_pass_in_stack (mode, type))
6357 return 0;
6359 if (type && AGGREGATE_TYPE_P (type))
6361 int i;
6362 tree field;
6363 enum x86_64_reg_class subclasses[MAX_CLASSES];
6365 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6366 if (bytes > 32)
6367 return 0;
6369 for (i = 0; i < words; i++)
6370 classes[i] = X86_64_NO_CLASS;
6372 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6373 signal the memory class, so handle it as a special case. */
6374 if (!words)
6376 classes[0] = X86_64_NO_CLASS;
6377 return 1;
6380 /* Classify each field of record and merge classes. */
6381 switch (TREE_CODE (type))
6383 case RECORD_TYPE:
6384 /* And now merge the fields of structure. */
6385 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6387 if (TREE_CODE (field) == FIELD_DECL)
6389 int num;
6391 if (TREE_TYPE (field) == error_mark_node)
6392 continue;
6394 /* Bitfields are always classified as integer. Handle them
6395 early, since later code would consider them to be
6396 misaligned integers. */
6397 if (DECL_BIT_FIELD (field))
6399 for (i = (int_bit_position (field)
6400 + (bit_offset % 64)) / 8 / 8;
6401 i < ((int_bit_position (field) + (bit_offset % 64))
6402 + tree_low_cst (DECL_SIZE (field), 0)
6403 + 63) / 8 / 8; i++)
6404 classes[i] =
6405 merge_classes (X86_64_INTEGER_CLASS,
6406 classes[i]);
6408 else
6410 int pos;
6412 type = TREE_TYPE (field);
6414 /* Flexible array member is ignored. */
6415 if (TYPE_MODE (type) == BLKmode
6416 && TREE_CODE (type) == ARRAY_TYPE
6417 && TYPE_SIZE (type) == NULL_TREE
6418 && TYPE_DOMAIN (type) != NULL_TREE
6419 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6420 == NULL_TREE))
6422 static bool warned;
6424 if (!warned && warn_psabi)
6426 warned = true;
6427 inform (input_location,
6428 "the ABI of passing struct with"
6429 " a flexible array member has"
6430 " changed in GCC 4.4");
6432 continue;
6434 num = classify_argument (TYPE_MODE (type), type,
6435 subclasses,
6436 (int_bit_position (field)
6437 + bit_offset) % 256);
6438 if (!num)
6439 return 0;
6440 pos = (int_bit_position (field)
6441 + (bit_offset % 64)) / 8 / 8;
6442 for (i = 0; i < num && (i + pos) < words; i++)
6443 classes[i + pos] =
6444 merge_classes (subclasses[i], classes[i + pos]);
6448 break;
6450 case ARRAY_TYPE:
6451 /* Arrays are handled as small records. */
6453 int num;
6454 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6455 TREE_TYPE (type), subclasses, bit_offset);
6456 if (!num)
6457 return 0;
6459 /* The partial classes are now full classes. */
6460 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6461 subclasses[0] = X86_64_SSE_CLASS;
6462 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6463 && !((bit_offset % 64) == 0 && bytes == 4))
6464 subclasses[0] = X86_64_INTEGER_CLASS;
6466 for (i = 0; i < words; i++)
6467 classes[i] = subclasses[i % num];
6469 break;
6471 case UNION_TYPE:
6472 case QUAL_UNION_TYPE:
6473 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6475 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6477 if (TREE_CODE (field) == FIELD_DECL)
6479 int num;
6481 if (TREE_TYPE (field) == error_mark_node)
6482 continue;
6484 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6485 TREE_TYPE (field), subclasses,
6486 bit_offset);
6487 if (!num)
6488 return 0;
6489 for (i = 0; i < num; i++)
6490 classes[i] = merge_classes (subclasses[i], classes[i]);
6493 break;
6495 default:
6496 gcc_unreachable ();
6499 if (words > 2)
6501 /* When the size is > 16 bytes, if the first word isn't
6502 X86_64_SSE_CLASS or any of the remaining words isn't
6503 X86_64_SSEUP_CLASS, everything should be passed in
6504 memory. */
6505 if (classes[0] != X86_64_SSE_CLASS)
6506 return 0;
6508 for (i = 1; i < words; i++)
6509 if (classes[i] != X86_64_SSEUP_CLASS)
6510 return 0;
6513 /* Final merger cleanup. */
6514 for (i = 0; i < words; i++)
6516 /* If one class is MEMORY, everything should be passed in
6517 memory. */
6518 if (classes[i] == X86_64_MEMORY_CLASS)
6519 return 0;
6521 /* X86_64_SSEUP_CLASS should always be preceded by
6522 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6523 if (classes[i] == X86_64_SSEUP_CLASS
6524 && classes[i - 1] != X86_64_SSE_CLASS
6525 && classes[i - 1] != X86_64_SSEUP_CLASS)
6527 /* The first one should never be X86_64_SSEUP_CLASS. */
6528 gcc_assert (i != 0);
6529 classes[i] = X86_64_SSE_CLASS;
6532 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6533 everything should be passed in memory. */
6534 if (classes[i] == X86_64_X87UP_CLASS
6535 && (classes[i - 1] != X86_64_X87_CLASS))
6537 static bool warned;
6539 /* The first one should never be X86_64_X87UP_CLASS. */
6540 gcc_assert (i != 0);
6541 if (!warned && warn_psabi)
6543 warned = true;
6544 inform (input_location,
6545 "the ABI of passing union with long double"
6546 " has changed in GCC 4.4");
6548 return 0;
6551 return words;
6554 /* Compute alignment needed. We align all types to natural boundaries with
6555 exception of XFmode that is aligned to 64bits. */
6556 if (mode != VOIDmode && mode != BLKmode)
6558 int mode_alignment = GET_MODE_BITSIZE (mode);
6560 if (mode == XFmode)
6561 mode_alignment = 128;
6562 else if (mode == XCmode)
6563 mode_alignment = 256;
6564 if (COMPLEX_MODE_P (mode))
6565 mode_alignment /= 2;
6566 /* Misaligned fields are always returned in memory. */
6567 if (bit_offset % mode_alignment)
6568 return 0;
6571 /* for V1xx modes, just use the base mode */
6572 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6573 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6574 mode = GET_MODE_INNER (mode);
6576 /* Classification of atomic types. */
6577 switch (mode)
6579 case SDmode:
6580 case DDmode:
6581 classes[0] = X86_64_SSE_CLASS;
6582 return 1;
6583 case TDmode:
6584 classes[0] = X86_64_SSE_CLASS;
6585 classes[1] = X86_64_SSEUP_CLASS;
6586 return 2;
6587 case DImode:
6588 case SImode:
6589 case HImode:
6590 case QImode:
6591 case CSImode:
6592 case CHImode:
6593 case CQImode:
6595 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6597 if (size <= 32)
6599 classes[0] = X86_64_INTEGERSI_CLASS;
6600 return 1;
6602 else if (size <= 64)
6604 classes[0] = X86_64_INTEGER_CLASS;
6605 return 1;
6607 else if (size <= 64+32)
6609 classes[0] = X86_64_INTEGER_CLASS;
6610 classes[1] = X86_64_INTEGERSI_CLASS;
6611 return 2;
6613 else if (size <= 64+64)
6615 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6616 return 2;
6618 else
6619 gcc_unreachable ();
6621 case CDImode:
6622 case TImode:
6623 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6624 return 2;
6625 case COImode:
6626 case OImode:
6627 /* OImode shouldn't be used directly. */
6628 gcc_unreachable ();
6629 case CTImode:
6630 return 0;
6631 case SFmode:
6632 if (!(bit_offset % 64))
6633 classes[0] = X86_64_SSESF_CLASS;
6634 else
6635 classes[0] = X86_64_SSE_CLASS;
6636 return 1;
6637 case DFmode:
6638 classes[0] = X86_64_SSEDF_CLASS;
6639 return 1;
6640 case XFmode:
6641 classes[0] = X86_64_X87_CLASS;
6642 classes[1] = X86_64_X87UP_CLASS;
6643 return 2;
6644 case TFmode:
6645 classes[0] = X86_64_SSE_CLASS;
6646 classes[1] = X86_64_SSEUP_CLASS;
6647 return 2;
6648 case SCmode:
6649 classes[0] = X86_64_SSE_CLASS;
6650 if (!(bit_offset % 64))
6651 return 1;
6652 else
6654 static bool warned;
6656 if (!warned && warn_psabi)
6658 warned = true;
6659 inform (input_location,
6660 "the ABI of passing structure with complex float"
6661 " member has changed in GCC 4.4");
6663 classes[1] = X86_64_SSESF_CLASS;
6664 return 2;
6666 case DCmode:
6667 classes[0] = X86_64_SSEDF_CLASS;
6668 classes[1] = X86_64_SSEDF_CLASS;
6669 return 2;
6670 case XCmode:
6671 classes[0] = X86_64_COMPLEX_X87_CLASS;
6672 return 1;
6673 case TCmode:
6674 /* This mode is larger than 16 bytes. */
6675 return 0;
6676 case V8SFmode:
6677 case V8SImode:
6678 case V32QImode:
6679 case V16HImode:
6680 case V4DFmode:
6681 case V4DImode:
6682 classes[0] = X86_64_SSE_CLASS;
6683 classes[1] = X86_64_SSEUP_CLASS;
6684 classes[2] = X86_64_SSEUP_CLASS;
6685 classes[3] = X86_64_SSEUP_CLASS;
6686 return 4;
6687 case V4SFmode:
6688 case V4SImode:
6689 case V16QImode:
6690 case V8HImode:
6691 case V2DFmode:
6692 case V2DImode:
6693 classes[0] = X86_64_SSE_CLASS;
6694 classes[1] = X86_64_SSEUP_CLASS;
6695 return 2;
6696 case V1TImode:
6697 case V1DImode:
6698 case V2SFmode:
6699 case V2SImode:
6700 case V4HImode:
6701 case V8QImode:
6702 classes[0] = X86_64_SSE_CLASS;
6703 return 1;
6704 case BLKmode:
6705 case VOIDmode:
6706 return 0;
6707 default:
6708 gcc_assert (VECTOR_MODE_P (mode));
6710 if (bytes > 16)
6711 return 0;
6713 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6715 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6716 classes[0] = X86_64_INTEGERSI_CLASS;
6717 else
6718 classes[0] = X86_64_INTEGER_CLASS;
6719 classes[1] = X86_64_INTEGER_CLASS;
6720 return 1 + (bytes > 8);
6724 /* Examine the argument and set the number of registers required in each
6725 class. Return 0 iff the parameter should be passed in memory. */
6726 static int
6727 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6728 int *int_nregs, int *sse_nregs)
6730 enum x86_64_reg_class regclass[MAX_CLASSES];
6731 int n = classify_argument (mode, type, regclass, 0);
6733 *int_nregs = 0;
6734 *sse_nregs = 0;
6735 if (!n)
6736 return 0;
6737 for (n--; n >= 0; n--)
6738 switch (regclass[n])
6740 case X86_64_INTEGER_CLASS:
6741 case X86_64_INTEGERSI_CLASS:
6742 (*int_nregs)++;
6743 break;
6744 case X86_64_SSE_CLASS:
6745 case X86_64_SSESF_CLASS:
6746 case X86_64_SSEDF_CLASS:
6747 (*sse_nregs)++;
6748 break;
6749 case X86_64_NO_CLASS:
6750 case X86_64_SSEUP_CLASS:
6751 break;
6752 case X86_64_X87_CLASS:
6753 case X86_64_X87UP_CLASS:
6754 if (!in_return)
6755 return 0;
6756 break;
6757 case X86_64_COMPLEX_X87_CLASS:
6758 return in_return ? 2 : 0;
6759 case X86_64_MEMORY_CLASS:
6760 gcc_unreachable ();
6762 return 1;
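/* Editorial illustration (not part of the original source): under the SysV
   x86-64 classification above, a parameter declared as

     struct pair { double d; long l; };

   spans two eightbytes.  The first is classified X86_64_SSEDF_CLASS and the
   second X86_64_INTEGER_CLASS, so examine_argument reports one SSE and one
   integer register (*sse_nregs == 1, *int_nregs == 1) and, registers
   permitting, the struct is passed in registers rather than in memory.  */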
6765 /* Construct container for the argument used by GCC interface. See
6766 FUNCTION_ARG for the detailed description. */
6768 static rtx
6769 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6770 const_tree type, int in_return, int nintregs, int nsseregs,
6771 const int *intreg, int sse_regno)
6773 /* The following variables hold the static issued_error state. */
6774 static bool issued_sse_arg_error;
6775 static bool issued_sse_ret_error;
6776 static bool issued_x87_ret_error;
6778 enum machine_mode tmpmode;
6779 int bytes =
6780 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6781 enum x86_64_reg_class regclass[MAX_CLASSES];
6782 int n;
6783 int i;
6784 int nexps = 0;
6785 int needed_sseregs, needed_intregs;
6786 rtx exp[MAX_CLASSES];
6787 rtx ret;
6789 n = classify_argument (mode, type, regclass, 0);
6790 if (!n)
6791 return NULL;
6792 if (!examine_argument (mode, type, in_return, &needed_intregs,
6793 &needed_sseregs))
6794 return NULL;
6795 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6796 return NULL;
6798 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6799 some less clueful developer tries to use floating-point anyway. */
6800 if (needed_sseregs && !TARGET_SSE)
6802 if (in_return)
6804 if (!issued_sse_ret_error)
6806 error ("SSE register return with SSE disabled");
6807 issued_sse_ret_error = true;
6810 else if (!issued_sse_arg_error)
6812 error ("SSE register argument with SSE disabled");
6813 issued_sse_arg_error = true;
6815 return NULL;
6818 /* Likewise, error if the ABI requires us to return values in the
6819 x87 registers and the user specified -mno-80387. */
6820 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6821 for (i = 0; i < n; i++)
6822 if (regclass[i] == X86_64_X87_CLASS
6823 || regclass[i] == X86_64_X87UP_CLASS
6824 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6826 if (!issued_x87_ret_error)
6828 error ("x87 register return with x87 disabled");
6829 issued_x87_ret_error = true;
6831 return NULL;
6834 /* First construct simple cases. Avoid SCmode, since we want to use
6835 a single register to pass this type. */
6836 if (n == 1 && mode != SCmode)
6837 switch (regclass[0])
6839 case X86_64_INTEGER_CLASS:
6840 case X86_64_INTEGERSI_CLASS:
6841 return gen_rtx_REG (mode, intreg[0]);
6842 case X86_64_SSE_CLASS:
6843 case X86_64_SSESF_CLASS:
6844 case X86_64_SSEDF_CLASS:
6845 if (mode != BLKmode)
6846 return gen_reg_or_parallel (mode, orig_mode,
6847 SSE_REGNO (sse_regno));
6848 break;
6849 case X86_64_X87_CLASS:
6850 case X86_64_COMPLEX_X87_CLASS:
6851 return gen_rtx_REG (mode, FIRST_STACK_REG);
6852 case X86_64_NO_CLASS:
6853 /* Zero sized array, struct or class. */
6854 return NULL;
6855 default:
6856 gcc_unreachable ();
6858 if (n == 2
6859 && regclass[0] == X86_64_SSE_CLASS
6860 && regclass[1] == X86_64_SSEUP_CLASS
6861 && mode != BLKmode)
6862 return gen_reg_or_parallel (mode, orig_mode,
6863 SSE_REGNO (sse_regno));
6864 if (n == 4
6865 && regclass[0] == X86_64_SSE_CLASS
6866 && regclass[1] == X86_64_SSEUP_CLASS
6867 && regclass[2] == X86_64_SSEUP_CLASS
6868 && regclass[3] == X86_64_SSEUP_CLASS
6869 && mode != BLKmode)
6870 return gen_reg_or_parallel (mode, orig_mode,
6871 SSE_REGNO (sse_regno));
6872 if (n == 2
6873 && regclass[0] == X86_64_X87_CLASS
6874 && regclass[1] == X86_64_X87UP_CLASS)
6875 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6877 if (n == 2
6878 && regclass[0] == X86_64_INTEGER_CLASS
6879 && regclass[1] == X86_64_INTEGER_CLASS
6880 && (mode == CDImode || mode == TImode || mode == TFmode)
6881 && intreg[0] + 1 == intreg[1])
6882 return gen_rtx_REG (mode, intreg[0]);
6884 /* Otherwise figure out the entries of the PARALLEL. */
6885 for (i = 0; i < n; i++)
6887 int pos;
6889 switch (regclass[i])
6891 case X86_64_NO_CLASS:
6892 break;
6893 case X86_64_INTEGER_CLASS:
6894 case X86_64_INTEGERSI_CLASS:
6895 /* Merge TImodes on aligned occasions here too. */
6896 if (i * 8 + 8 > bytes)
6897 tmpmode
6898 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6899 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6900 tmpmode = SImode;
6901 else
6902 tmpmode = DImode;
6903 /* We've requested 24 bytes we
6904 don't have a mode for. Use DImode. */
6905 if (tmpmode == BLKmode)
6906 tmpmode = DImode;
6907 exp [nexps++]
6908 = gen_rtx_EXPR_LIST (VOIDmode,
6909 gen_rtx_REG (tmpmode, *intreg),
6910 GEN_INT (i*8));
6911 intreg++;
6912 break;
6913 case X86_64_SSESF_CLASS:
6914 exp [nexps++]
6915 = gen_rtx_EXPR_LIST (VOIDmode,
6916 gen_rtx_REG (SFmode,
6917 SSE_REGNO (sse_regno)),
6918 GEN_INT (i*8));
6919 sse_regno++;
6920 break;
6921 case X86_64_SSEDF_CLASS:
6922 exp [nexps++]
6923 = gen_rtx_EXPR_LIST (VOIDmode,
6924 gen_rtx_REG (DFmode,
6925 SSE_REGNO (sse_regno)),
6926 GEN_INT (i*8));
6927 sse_regno++;
6928 break;
6929 case X86_64_SSE_CLASS:
6930 pos = i;
6931 switch (n)
6933 case 1:
6934 tmpmode = DImode;
6935 break;
6936 case 2:
6937 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6939 tmpmode = TImode;
6940 i++;
6942 else
6943 tmpmode = DImode;
6944 break;
6945 case 4:
6946 gcc_assert (i == 0
6947 && regclass[1] == X86_64_SSEUP_CLASS
6948 && regclass[2] == X86_64_SSEUP_CLASS
6949 && regclass[3] == X86_64_SSEUP_CLASS);
6950 tmpmode = OImode;
6951 i += 3;
6952 break;
6953 default:
6954 gcc_unreachable ();
6956 exp [nexps++]
6957 = gen_rtx_EXPR_LIST (VOIDmode,
6958 gen_rtx_REG (tmpmode,
6959 SSE_REGNO (sse_regno)),
6960 GEN_INT (pos*8));
6961 sse_regno++;
6962 break;
6963 default:
6964 gcc_unreachable ();
6968 /* Empty aligned struct, union or class. */
6969 if (nexps == 0)
6970 return NULL;
6972 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6973 for (i = 0; i < nexps; i++)
6974 XVECEXP (ret, 0, i) = exp [i];
6975 return ret;
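/* Editorial sketch (not from the original source): for a
   struct { double d; long l; } argument classified as { SSEDF, INTEGER },
   and assuming it is the first register-passed argument of a SysV x86-64
   call so that %xmm0 and %rdi are the next free registers, the PARALLEL
   built above has the shape

     (parallel:BLK [(expr_list (reg:DF xmm0) (const_int 0))
                    (expr_list (reg:DI di)   (const_int 8))])

   i.e. bytes 0-7 travel in %xmm0 and bytes 8-15 in %rdi.  */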
6978 /* Update the data in CUM to advance over an argument of mode MODE
6979 and data type TYPE. (TYPE is null for libcalls where that information
6980 may not be available.) */
6982 static void
6983 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6984 const_tree type, HOST_WIDE_INT bytes,
6985 HOST_WIDE_INT words)
6987 switch (mode)
6989 default:
6990 break;
6992 case BLKmode:
6993 if (bytes < 0)
6994 break;
6995 /* FALLTHRU */
6997 case DImode:
6998 case SImode:
6999 case HImode:
7000 case QImode:
7001 cum->words += words;
7002 cum->nregs -= words;
7003 cum->regno += words;
7005 if (cum->nregs <= 0)
7007 cum->nregs = 0;
7008 cum->regno = 0;
7010 break;
7012 case OImode:
7013 /* OImode shouldn't be used directly. */
7014 gcc_unreachable ();
7016 case DFmode:
7017 if (cum->float_in_sse < 2)
7018 break;
7019 case SFmode:
7020 if (cum->float_in_sse < 1)
7021 break;
7022 /* FALLTHRU */
7024 case V8SFmode:
7025 case V8SImode:
7026 case V32QImode:
7027 case V16HImode:
7028 case V4DFmode:
7029 case V4DImode:
7030 case TImode:
7031 case V16QImode:
7032 case V8HImode:
7033 case V4SImode:
7034 case V2DImode:
7035 case V4SFmode:
7036 case V2DFmode:
7037 if (!type || !AGGREGATE_TYPE_P (type))
7039 cum->sse_words += words;
7040 cum->sse_nregs -= 1;
7041 cum->sse_regno += 1;
7042 if (cum->sse_nregs <= 0)
7044 cum->sse_nregs = 0;
7045 cum->sse_regno = 0;
7048 break;
7050 case V8QImode:
7051 case V4HImode:
7052 case V2SImode:
7053 case V2SFmode:
7054 case V1TImode:
7055 case V1DImode:
7056 if (!type || !AGGREGATE_TYPE_P (type))
7058 cum->mmx_words += words;
7059 cum->mmx_nregs -= 1;
7060 cum->mmx_regno += 1;
7061 if (cum->mmx_nregs <= 0)
7063 cum->mmx_nregs = 0;
7064 cum->mmx_regno = 0;
7067 break;
7071 static void
7072 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
7073 const_tree type, HOST_WIDE_INT words, bool named)
7075 int int_nregs, sse_nregs;
7077 /* Unnamed 256bit vector mode parameters are passed on stack. */
7078 if (!named && VALID_AVX256_REG_MODE (mode))
7079 return;
7081 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
7082 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
7084 cum->nregs -= int_nregs;
7085 cum->sse_nregs -= sse_nregs;
7086 cum->regno += int_nregs;
7087 cum->sse_regno += sse_nregs;
7089 else
7091 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
7092 cum->words = (cum->words + align - 1) & ~(align - 1);
7093 cum->words += words;
7097 static void
7098 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
7099 HOST_WIDE_INT words)
7101 /* Otherwise, this should be passed indirectly. */
7102 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
7104 cum->words += words;
7105 if (cum->nregs > 0)
7107 cum->nregs -= 1;
7108 cum->regno += 1;
7112 /* Update the data in CUM to advance over an argument of mode MODE and
7113 data type TYPE. (TYPE is null for libcalls where that information
7114 may not be available.) */
7116 static void
7117 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
7118 const_tree type, bool named)
7120 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7121 HOST_WIDE_INT bytes, words;
7123 if (mode == BLKmode)
7124 bytes = int_size_in_bytes (type);
7125 else
7126 bytes = GET_MODE_SIZE (mode);
7127 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7129 if (type)
7130 mode = type_natural_mode (type, NULL);
7132 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7133 function_arg_advance_ms_64 (cum, bytes, words);
7134 else if (TARGET_64BIT)
7135 function_arg_advance_64 (cum, mode, type, words, named);
7136 else
7137 function_arg_advance_32 (cum, mode, type, bytes, words);
7140 /* Define where to put the arguments to a function.
7141 Value is zero to push the argument on the stack,
7142 or a hard register in which to store the argument.
7144 MODE is the argument's machine mode.
7145 TYPE is the data type of the argument (as a tree).
7146 This is null for libcalls where that information may
7147 not be available.
7148 CUM is a variable of type CUMULATIVE_ARGS which gives info about
7149 the preceding args and about the function being called.
7150 NAMED is nonzero if this argument is a named parameter
7151 (otherwise it is an extra parameter matching an ellipsis). */
7153 static rtx
7154 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7155 enum machine_mode orig_mode, const_tree type,
7156 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
7158 static bool warnedsse, warnedmmx;
7160 /* Avoid the AL settings for the Unix64 ABI. */
7161 if (mode == VOIDmode)
7162 return constm1_rtx;
7164 switch (mode)
7166 default:
7167 break;
7169 case BLKmode:
7170 if (bytes < 0)
7171 break;
7172 /* FALLTHRU */
7173 case DImode:
7174 case SImode:
7175 case HImode:
7176 case QImode:
7177 if (words <= cum->nregs)
7179 int regno = cum->regno;
7181 /* Fastcall allocates the first two DWORD (SImode) or
7182 smaller arguments to ECX and EDX if it isn't an
7183 aggregate type. */
7184 if (cum->fastcall)
7186 if (mode == BLKmode
7187 || mode == DImode
7188 || (type && AGGREGATE_TYPE_P (type)))
7189 break;
7191 /* ECX, not EAX, is the first allocated register. */
7192 if (regno == AX_REG)
7193 regno = CX_REG;
7195 return gen_rtx_REG (mode, regno);
7197 break;
7199 case DFmode:
7200 if (cum->float_in_sse < 2)
7201 break;
7202 case SFmode:
7203 if (cum->float_in_sse < 1)
7204 break;
7205 /* FALLTHRU */
7206 case TImode:
7207 /* In 32bit, we pass TImode in xmm registers. */
7208 case V16QImode:
7209 case V8HImode:
7210 case V4SImode:
7211 case V2DImode:
7212 case V4SFmode:
7213 case V2DFmode:
7214 if (!type || !AGGREGATE_TYPE_P (type))
7216 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7218 warnedsse = true;
7219 warning (0, "SSE vector argument without SSE enabled "
7220 "changes the ABI");
7222 if (cum->sse_nregs)
7223 return gen_reg_or_parallel (mode, orig_mode,
7224 cum->sse_regno + FIRST_SSE_REG);
7226 break;
7228 case OImode:
7229 /* OImode shouldn't be used directly. */
7230 gcc_unreachable ();
7232 case V8SFmode:
7233 case V8SImode:
7234 case V32QImode:
7235 case V16HImode:
7236 case V4DFmode:
7237 case V4DImode:
7238 if (!type || !AGGREGATE_TYPE_P (type))
7240 if (cum->sse_nregs)
7241 return gen_reg_or_parallel (mode, orig_mode,
7242 cum->sse_regno + FIRST_SSE_REG);
7244 break;
7246 case V8QImode:
7247 case V4HImode:
7248 case V2SImode:
7249 case V2SFmode:
7250 case V1TImode:
7251 case V1DImode:
7252 if (!type || !AGGREGATE_TYPE_P (type))
7254 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7256 warnedmmx = true;
7257 warning (0, "MMX vector argument without MMX enabled "
7258 "changes the ABI");
7260 if (cum->mmx_nregs)
7261 return gen_reg_or_parallel (mode, orig_mode,
7262 cum->mmx_regno + FIRST_MMX_REG);
7264 break;
7267 return NULL_RTX;
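/* Editorial illustration of the fastcall handling above (assumed
   prototype, not from the original source):

     void __attribute__((fastcall)) f (int a, int b, int c);

   The first two SImode-or-smaller arguments go in %ecx and %edx; the third
   no longer fits in cum->nregs and is pushed on the stack.  Aggregates and
   DImode values never use the fastcall registers.  */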
7270 static rtx
7271 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7272 enum machine_mode orig_mode, const_tree type, bool named)
7274 /* Handle a hidden AL argument containing the number of registers
7275 for varargs x86-64 functions. */
7276 if (mode == VOIDmode)
7277 return GEN_INT (cum->maybe_vaarg
7278 ? (cum->sse_nregs < 0
7279 ? X86_64_SSE_REGPARM_MAX
7280 : cum->sse_regno)
7281 : -1);
7283 switch (mode)
7285 default:
7286 break;
7288 case V8SFmode:
7289 case V8SImode:
7290 case V32QImode:
7291 case V16HImode:
7292 case V4DFmode:
7293 case V4DImode:
7294 /* Unnamed 256bit vector mode parameters are passed on stack. */
7295 if (!named)
7296 return NULL;
7297 break;
7300 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7301 cum->sse_nregs,
7302 &x86_64_int_parameter_registers [cum->regno],
7303 cum->sse_regno);
7306 static rtx
7307 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7308 enum machine_mode orig_mode, bool named,
7309 HOST_WIDE_INT bytes)
7311 unsigned int regno;
7313 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7314 We use the value -2 to specify that the current function call is MSABI. */
7315 if (mode == VOIDmode)
7316 return GEN_INT (-2);
7318 /* If we've run out of registers, it goes on the stack. */
7319 if (cum->nregs == 0)
7320 return NULL_RTX;
7322 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7324 /* Only floating point modes are passed in anything but integer regs. */
7325 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7327 if (named)
7328 regno = cum->regno + FIRST_SSE_REG;
7329 else
7331 rtx t1, t2;
7333 /* Unnamed floating parameters are passed in both the
7334 SSE and integer registers. */
7335 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7336 t2 = gen_rtx_REG (mode, regno);
7337 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7338 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7339 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7342 /* Handle aggregate types passed in registers. */
7343 if (orig_mode == BLKmode)
7345 if (bytes > 0 && bytes <= 8)
7346 mode = (bytes > 4 ? DImode : SImode);
7347 if (mode == BLKmode)
7348 mode = DImode;
7351 return gen_reg_or_parallel (mode, orig_mode, regno);
7354 /* Return where to put the arguments to a function.
7355 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7357 MODE is the argument's machine mode. TYPE is the data type of the
7358 argument. It is null for libcalls where that information may not be
7359 available. CUM gives information about the preceding args and about
7360 the function being called. NAMED is nonzero if this argument is a
7361 named parameter (otherwise it is an extra parameter matching an
7362 ellipsis). */
7364 static rtx
7365 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7366 const_tree type, bool named)
7368 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7369 enum machine_mode mode = omode;
7370 HOST_WIDE_INT bytes, words;
7371 rtx arg;
7373 if (mode == BLKmode)
7374 bytes = int_size_in_bytes (type);
7375 else
7376 bytes = GET_MODE_SIZE (mode);
7377 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7379 /* To simplify the code below, represent vector types with a vector mode
7380 even if MMX/SSE are not active. */
7381 if (type && TREE_CODE (type) == VECTOR_TYPE)
7382 mode = type_natural_mode (type, cum);
7384 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7385 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7386 else if (TARGET_64BIT)
7387 arg = function_arg_64 (cum, mode, omode, type, named);
7388 else
7389 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7391 return arg;
7394 /* A C expression that indicates when an argument must be passed by
7395 reference. If nonzero for an argument, a copy of that argument is
7396 made in memory and a pointer to the argument is passed instead of
7397 the argument itself. The pointer is passed in whatever way is
7398 appropriate for passing a pointer to that type. */
7400 static bool
7401 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7402 enum machine_mode mode ATTRIBUTE_UNUSED,
7403 const_tree type, bool named ATTRIBUTE_UNUSED)
7405 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7407 /* See Windows x64 Software Convention. */
7408 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7410 int msize = (int) GET_MODE_SIZE (mode);
7411 if (type)
7413 /* Arrays are passed by reference. */
7414 if (TREE_CODE (type) == ARRAY_TYPE)
7415 return true;
7417 if (AGGREGATE_TYPE_P (type))
7419 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7420 are passed by reference. */
7421 msize = int_size_in_bytes (type);
7425 /* __m128 is passed by reference. */
7426 switch (msize) {
7427 case 1: case 2: case 4: case 8:
7428 break;
7429 default:
7430 return true;
7433 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7434 return 1;
7436 return 0;
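/* Editorial illustration of the Windows x64 rule above: an argument is
   passed by value only when its size is exactly 1, 2, 4 or 8 bytes.
   With the assumed declarations

     struct s3 { char c[3]; };      size 3  -> by reference
     struct s8 { long long x; };    size 8  -> by value
     __m128 v;                      size 16 -> by reference

   Arrays are always passed by reference, and under the 64-bit SysV ABI any
   type of variable size is passed by reference as well.  */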
7439 /* Return true when TYPE should be 128bit aligned for 32bit argument
7440 passing ABI. XXX: This function is obsolete and is only used for
7441 checking psABI compatibility with previous versions of GCC. */
7443 static bool
7444 ix86_compat_aligned_value_p (const_tree type)
7446 enum machine_mode mode = TYPE_MODE (type);
7447 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7448 || mode == TDmode
7449 || mode == TFmode
7450 || mode == TCmode)
7451 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7452 return true;
7453 if (TYPE_ALIGN (type) < 128)
7454 return false;
7456 if (AGGREGATE_TYPE_P (type))
7458 /* Walk the aggregates recursively. */
7459 switch (TREE_CODE (type))
7461 case RECORD_TYPE:
7462 case UNION_TYPE:
7463 case QUAL_UNION_TYPE:
7465 tree field;
7467 /* Walk all the structure fields. */
7468 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7470 if (TREE_CODE (field) == FIELD_DECL
7471 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7472 return true;
7474 break;
7477 case ARRAY_TYPE:
7478 /* Just for use if some language passes arrays by value. */
7479 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7480 return true;
7481 break;
7483 default:
7484 gcc_unreachable ();
7487 return false;
7490 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7491 XXX: This function is obsolete and is only used for checking psABI
7492 compatibility with previous versions of GCC. */
7494 static unsigned int
7495 ix86_compat_function_arg_boundary (enum machine_mode mode,
7496 const_tree type, unsigned int align)
7498 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7499 natural boundaries. */
7500 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7502 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7503 make an exception for SSE modes since these require 128bit
7504 alignment.
7506 The handling here differs from field_alignment. ICC aligns MMX
7507 arguments to 4 byte boundaries, while structure fields are aligned
7508 to 8 byte boundaries. */
7509 if (!type)
7511 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7512 align = PARM_BOUNDARY;
7514 else
7516 if (!ix86_compat_aligned_value_p (type))
7517 align = PARM_BOUNDARY;
7520 if (align > BIGGEST_ALIGNMENT)
7521 align = BIGGEST_ALIGNMENT;
7522 return align;
7525 /* Return true when TYPE should be 128bit aligned for 32bit argument
7526 passing ABI. */
7528 static bool
7529 ix86_contains_aligned_value_p (const_tree type)
7531 enum machine_mode mode = TYPE_MODE (type);
7533 if (mode == XFmode || mode == XCmode)
7534 return false;
7536 if (TYPE_ALIGN (type) < 128)
7537 return false;
7539 if (AGGREGATE_TYPE_P (type))
7541 /* Walk the aggregates recursively. */
7542 switch (TREE_CODE (type))
7544 case RECORD_TYPE:
7545 case UNION_TYPE:
7546 case QUAL_UNION_TYPE:
7548 tree field;
7550 /* Walk all the structure fields. */
7551 for (field = TYPE_FIELDS (type);
7552 field;
7553 field = DECL_CHAIN (field))
7555 if (TREE_CODE (field) == FIELD_DECL
7556 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7557 return true;
7559 break;
7562 case ARRAY_TYPE:
7563 /* Just for use if some language passes arrays by value. */
7564 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7565 return true;
7566 break;
7568 default:
7569 gcc_unreachable ();
7572 else
7573 return TYPE_ALIGN (type) >= 128;
7575 return false;
7578 /* Gives the alignment boundary, in bits, of an argument with the
7579 specified mode and type. */
7581 static unsigned int
7582 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7584 unsigned int align;
7585 if (type)
7587 /* Since the main variant of the type is used for the call, convert
7588 the type to its main variant. */
7589 type = TYPE_MAIN_VARIANT (type);
7590 align = TYPE_ALIGN (type);
7592 else
7593 align = GET_MODE_ALIGNMENT (mode);
7594 if (align < PARM_BOUNDARY)
7595 align = PARM_BOUNDARY;
7596 else
7598 static bool warned;
7599 unsigned int saved_align = align;
7601 if (!TARGET_64BIT)
7603 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7604 if (!type)
7606 if (mode == XFmode || mode == XCmode)
7607 align = PARM_BOUNDARY;
7609 else if (!ix86_contains_aligned_value_p (type))
7610 align = PARM_BOUNDARY;
7612 if (align < 128)
7613 align = PARM_BOUNDARY;
7616 if (warn_psabi
7617 && !warned
7618 && align != ix86_compat_function_arg_boundary (mode, type,
7619 saved_align))
7621 warned = true;
7622 inform (input_location,
7623 "The ABI for passing parameters with %d-byte"
7624 " alignment has changed in GCC 4.6",
7625 align / BITS_PER_UNIT);
7629 return align;
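/* Editorial example of the boundary computation above.  On ia32
   (PARM_BOUNDARY == 32) a double argument is aligned to 32 bits while an
   __m128 argument keeps its 128-bit alignment; on x86-64
   (PARM_BOUNDARY == 64) a double gets 64 bits and __m128 again 128.
   When the result differs from the pre-4.6 computation in
   ix86_compat_function_arg_boundary, the -Wpsabi note above fires.  */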
7632 /* Return true if N is a possible register number of function value. */
7634 static bool
7635 ix86_function_value_regno_p (const unsigned int regno)
7637 switch (regno)
7639 case AX_REG:
7640 return true;
7642 case FIRST_FLOAT_REG:
7643 /* TODO: The function should depend on current function ABI but
7644 builtins.c would need updating then. Therefore we use the
7645 default ABI. */
7646 if (TARGET_64BIT && ix86_abi == MS_ABI)
7647 return false;
7648 return TARGET_FLOAT_RETURNS_IN_80387;
7650 case FIRST_SSE_REG:
7651 return TARGET_SSE;
7653 case FIRST_MMX_REG:
7654 if (TARGET_MACHO || TARGET_64BIT)
7655 return false;
7656 return TARGET_MMX;
7659 return false;
7662 /* Define how to find the value returned by a function.
7663 VALTYPE is the data type of the value (as a tree).
7664 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7665 otherwise, FUNC is 0. */
7667 static rtx
7668 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7669 const_tree fntype, const_tree fn)
7671 unsigned int regno;
7673 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7674 we normally prevent this case when mmx is not available. However
7675 some ABIs may require the result to be returned like DImode. */
7676 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7677 regno = FIRST_MMX_REG;
7679 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7680 we prevent this case when sse is not available. However some ABIs
7681 may require the result to be returned like integer TImode. */
7682 else if (mode == TImode
7683 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7684 regno = FIRST_SSE_REG;
7686 /* 32-byte vector modes in %ymm0. */
7687 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7688 regno = FIRST_SSE_REG;
7690 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7691 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7692 regno = FIRST_FLOAT_REG;
7693 else
7694 /* Most things go in %eax. */
7695 regno = AX_REG;
7697 /* Override FP return register with %xmm0 for local functions when
7698 SSE math is enabled or for functions with sseregparm attribute. */
7699 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7701 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7702 if ((sse_level >= 1 && mode == SFmode)
7703 || (sse_level == 2 && mode == DFmode))
7704 regno = FIRST_SSE_REG;
7707 /* OImode shouldn't be used directly. */
7708 gcc_assert (mode != OImode);
7710 return gen_rtx_REG (orig_mode, regno);
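/* Editorial summary of the ia32 return-register choice above (assuming
   default options, with SSE enabled for the vector case):

     int    f (void);   returns in %eax
     double g (void);   returns in %st(0)   (x87 FP return)
     __m128 h (void);   returns in %xmm0

   For local functions using SSE math, or functions carrying the sseregparm
   attribute, SFmode/DFmode returns are redirected to %xmm0 instead of the
   x87 stack.  */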
7713 static rtx
7714 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7715 const_tree valtype)
7717 rtx ret;
7719 /* Handle libcalls, which don't provide a type node. */
7720 if (valtype == NULL)
7722 unsigned int regno;
7724 switch (mode)
7726 case SFmode:
7727 case SCmode:
7728 case DFmode:
7729 case DCmode:
7730 case TFmode:
7731 case SDmode:
7732 case DDmode:
7733 case TDmode:
7734 regno = FIRST_SSE_REG;
7735 break;
7736 case XFmode:
7737 case XCmode:
7738 regno = FIRST_FLOAT_REG;
7739 break;
7740 case TCmode:
7741 return NULL;
7742 default:
7743 regno = AX_REG;
7746 return gen_rtx_REG (mode, regno);
7748 else if (POINTER_TYPE_P (valtype))
7750 /* Pointers are always returned in word_mode. */
7751 mode = word_mode;
7754 ret = construct_container (mode, orig_mode, valtype, 1,
7755 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7756 x86_64_int_return_registers, 0);
7758 /* For zero-sized structures, construct_container returns NULL, but we
7759 need to keep the rest of the compiler happy by returning a meaningful value. */
7760 if (!ret)
7761 ret = gen_rtx_REG (orig_mode, AX_REG);
7763 return ret;
7766 static rtx
7767 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7768 const_tree valtype)
7770 unsigned int regno = AX_REG;
7772 if (TARGET_SSE)
7774 switch (GET_MODE_SIZE (mode))
7776 case 16:
7777 if (valtype != NULL_TREE
7778 && !VECTOR_INTEGER_TYPE_P (valtype)
7780 && !INTEGRAL_TYPE_P (valtype)
7781 && !VECTOR_FLOAT_TYPE_P (valtype))
7782 break;
7783 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7784 && !COMPLEX_MODE_P (mode))
7785 regno = FIRST_SSE_REG;
7786 break;
7787 case 8:
7788 case 4:
7789 if (mode == SFmode || mode == DFmode)
7790 regno = FIRST_SSE_REG;
7791 break;
7792 default:
7793 break;
7796 return gen_rtx_REG (orig_mode, regno);
7799 static rtx
7800 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7801 enum machine_mode orig_mode, enum machine_mode mode)
7803 const_tree fn, fntype;
7805 fn = NULL_TREE;
7806 if (fntype_or_decl && DECL_P (fntype_or_decl))
7807 fn = fntype_or_decl;
7808 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7810 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7811 return function_value_ms_64 (orig_mode, mode, valtype);
7812 else if (TARGET_64BIT)
7813 return function_value_64 (orig_mode, mode, valtype);
7814 else
7815 return function_value_32 (orig_mode, mode, fntype, fn);
7818 static rtx
7819 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7820 bool outgoing ATTRIBUTE_UNUSED)
7822 enum machine_mode mode, orig_mode;
7824 orig_mode = TYPE_MODE (valtype);
7825 mode = type_natural_mode (valtype, NULL);
7826 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7829 /* Pointer function arguments and return values are promoted to
7830 word_mode. */
7832 static enum machine_mode
7833 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7834 int *punsignedp, const_tree fntype,
7835 int for_return)
7837 if (type != NULL_TREE && POINTER_TYPE_P (type))
7839 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7840 return word_mode;
7842 return default_promote_function_mode (type, mode, punsignedp, fntype,
7843 for_return);
7846 /* Return true if a structure, union or array with MODE containing FIELD
7847 should be accessed using BLKmode. */
7849 static bool
7850 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7852 /* Union with XFmode must be in BLKmode. */
7853 return (mode == XFmode
7854 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7855 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7859 ix86_libcall_value (enum machine_mode mode)
7861 return ix86_function_value_1 (NULL, NULL, mode, mode);
7864 /* Return true iff type is returned in memory. */
7866 static bool ATTRIBUTE_UNUSED
7867 return_in_memory_32 (const_tree type, enum machine_mode mode)
7869 HOST_WIDE_INT size;
7871 if (mode == BLKmode)
7872 return true;
7874 size = int_size_in_bytes (type);
7876 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7877 return false;
7879 if (VECTOR_MODE_P (mode) || mode == TImode)
7881 /* User-created vectors small enough to fit in EAX. */
7882 if (size < 8)
7883 return false;
7885 /* MMX/3dNow values are returned in MM0,
7886 except when it doesn't exist or the ABI prescribes otherwise. */
7887 if (size == 8)
7888 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7890 /* SSE values are returned in XMM0, except when it doesn't exist. */
7891 if (size == 16)
7892 return !TARGET_SSE;
7894 /* AVX values are returned in YMM0, except when it doesn't exist. */
7895 if (size == 32)
7896 return !TARGET_AVX;
7899 if (mode == XFmode)
7900 return false;
7902 if (size > 12)
7903 return true;
7905 /* OImode shouldn't be used directly. */
7906 gcc_assert (mode != OImode);
7908 return false;
7911 static bool ATTRIBUTE_UNUSED
7912 return_in_memory_64 (const_tree type, enum machine_mode mode)
7914 int needed_intregs, needed_sseregs;
7915 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7918 static bool ATTRIBUTE_UNUSED
7919 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7921 HOST_WIDE_INT size = int_size_in_bytes (type);
7923 /* __m128 is returned in xmm0. */
7924 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7925 || VECTOR_FLOAT_TYPE_P (type))
7926 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7927 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7928 return false;
7930 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7931 return size != 1 && size != 2 && size != 4 && size != 8;
7934 static bool
7935 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7937 #ifdef SUBTARGET_RETURN_IN_MEMORY
7938 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7939 #else
7940 const enum machine_mode mode = type_natural_mode (type, NULL);
7942 if (TARGET_64BIT)
7944 if (ix86_function_type_abi (fntype) == MS_ABI)
7945 return return_in_memory_ms_64 (type, mode);
7946 else
7947 return return_in_memory_64 (type, mode);
7949 else
7950 return return_in_memory_32 (type, mode);
7951 #endif
7954 /* When returning SSE vector types, we have a choice of either
7955 (1) being abi incompatible with a -march switch, or
7956 (2) generating an error.
7957 Given no good solution, I think the safest thing is one warning.
7958 The user won't be able to use -Werror, but....
7960 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7961 called in response to actually generating a caller or callee that
7962 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7963 via aggregate_value_p for general type probing from tree-ssa. */
7965 static rtx
7966 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7968 static bool warnedsse, warnedmmx;
7970 if (!TARGET_64BIT && type)
7972 /* Look at the return type of the function, not the function type. */
7973 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7975 if (!TARGET_SSE && !warnedsse)
7977 if (mode == TImode
7978 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7980 warnedsse = true;
7981 warning (0, "SSE vector return without SSE enabled "
7982 "changes the ABI");
7986 if (!TARGET_MMX && !warnedmmx)
7988 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7990 warnedmmx = true;
7991 warning (0, "MMX vector return without MMX enabled "
7992 "changes the ABI");
7997 return NULL;
8001 /* Create the va_list data type. */
8003 /* Returns the calling convention specific va_list data type.
8004 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
8006 static tree
8007 ix86_build_builtin_va_list_abi (enum calling_abi abi)
8009 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
8011 /* For i386 we use a plain pointer to the argument area. */
8012 if (!TARGET_64BIT || abi == MS_ABI)
8013 return build_pointer_type (char_type_node);
8015 record = lang_hooks.types.make_type (RECORD_TYPE);
8016 type_decl = build_decl (BUILTINS_LOCATION,
8017 TYPE_DECL, get_identifier ("__va_list_tag"), record);
8019 f_gpr = build_decl (BUILTINS_LOCATION,
8020 FIELD_DECL, get_identifier ("gp_offset"),
8021 unsigned_type_node);
8022 f_fpr = build_decl (BUILTINS_LOCATION,
8023 FIELD_DECL, get_identifier ("fp_offset"),
8024 unsigned_type_node);
8025 f_ovf = build_decl (BUILTINS_LOCATION,
8026 FIELD_DECL, get_identifier ("overflow_arg_area"),
8027 ptr_type_node);
8028 f_sav = build_decl (BUILTINS_LOCATION,
8029 FIELD_DECL, get_identifier ("reg_save_area"),
8030 ptr_type_node);
8032 va_list_gpr_counter_field = f_gpr;
8033 va_list_fpr_counter_field = f_fpr;
8035 DECL_FIELD_CONTEXT (f_gpr) = record;
8036 DECL_FIELD_CONTEXT (f_fpr) = record;
8037 DECL_FIELD_CONTEXT (f_ovf) = record;
8038 DECL_FIELD_CONTEXT (f_sav) = record;
8040 TYPE_STUB_DECL (record) = type_decl;
8041 TYPE_NAME (record) = type_decl;
8042 TYPE_FIELDS (record) = f_gpr;
8043 DECL_CHAIN (f_gpr) = f_fpr;
8044 DECL_CHAIN (f_fpr) = f_ovf;
8045 DECL_CHAIN (f_ovf) = f_sav;
8047 layout_type (record);
8049 /* The correct type is an array type of one element. */
8050 return build_array_type (record, build_index_type (size_zero_node));
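/* Editorial note: for the 64-bit SysV ABI the record built above
   corresponds to the familiar C-level declaration

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   i.e. an array of one element, while for i386 and the MS ABI va_list is
   just a plain char pointer (handled by the early return above).  */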
8053 /* Set up the builtin va_list data type and for 64-bit the additional
8054 calling convention specific va_list data types. */
8056 static tree
8057 ix86_build_builtin_va_list (void)
8059 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
8061 /* Initialize abi specific va_list builtin types. */
8062 if (TARGET_64BIT)
8064 tree t;
8065 if (ix86_abi == MS_ABI)
8067 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
8068 if (TREE_CODE (t) != RECORD_TYPE)
8069 t = build_variant_type_copy (t);
8070 sysv_va_list_type_node = t;
8072 else
8074 t = ret;
8075 if (TREE_CODE (t) != RECORD_TYPE)
8076 t = build_variant_type_copy (t);
8077 sysv_va_list_type_node = t;
8079 if (ix86_abi != MS_ABI)
8081 t = ix86_build_builtin_va_list_abi (MS_ABI);
8082 if (TREE_CODE (t) != RECORD_TYPE)
8083 t = build_variant_type_copy (t);
8084 ms_va_list_type_node = t;
8086 else
8088 t = ret;
8089 if (TREE_CODE (t) != RECORD_TYPE)
8090 t = build_variant_type_copy (t);
8091 ms_va_list_type_node = t;
8095 return ret;
8098 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
8100 static void
8101 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
8103 rtx save_area, mem;
8104 alias_set_type set;
8105 int i, max;
8107 /* GPR size of varargs save area. */
8108 if (cfun->va_list_gpr_size)
8109 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
8110 else
8111 ix86_varargs_gpr_size = 0;
8113 /* FPR size of varargs save area. We don't need it if we don't pass
8114 anything in SSE registers. */
8115 if (TARGET_SSE && cfun->va_list_fpr_size)
8116 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
8117 else
8118 ix86_varargs_fpr_size = 0;
8120 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
8121 return;
8123 save_area = frame_pointer_rtx;
8124 set = get_varargs_alias_set ();
8126 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
8127 if (max > X86_64_REGPARM_MAX)
8128 max = X86_64_REGPARM_MAX;
8130 for (i = cum->regno; i < max; i++)
8132 mem = gen_rtx_MEM (word_mode,
8133 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
8134 MEM_NOTRAP_P (mem) = 1;
8135 set_mem_alias_set (mem, set);
8136 emit_move_insn (mem,
8137 gen_rtx_REG (word_mode,
8138 x86_64_int_parameter_registers[i]));
8141 if (ix86_varargs_fpr_size)
8143 enum machine_mode smode;
8144 rtx label, test;
8146 /* Now emit code to save SSE registers. The AX parameter contains the number
8147 of SSE parameter registers used to call this function, though all we
8148 actually check here is the zero/non-zero status. */
8150 label = gen_label_rtx ();
8151 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
8152 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
8153 label));
8155 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
8156 we used movdqa (i.e. TImode) instead? Perhaps even better would
8157 be if we could determine the real mode of the data, via a hook
8158 into pass_stdarg. Ignore all that for now. */
8159 smode = V4SFmode;
8160 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
8161 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
8163 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
8164 if (max > X86_64_SSE_REGPARM_MAX)
8165 max = X86_64_SSE_REGPARM_MAX;
8167 for (i = cum->sse_regno; i < max; ++i)
8169 mem = plus_constant (Pmode, save_area,
8170 i * 16 + ix86_varargs_gpr_size);
8171 mem = gen_rtx_MEM (smode, mem);
8172 MEM_NOTRAP_P (mem) = 1;
8173 set_mem_alias_set (mem, set);
8174 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
8176 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
8179 emit_label (label);
8183 static void
8184 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
8186 alias_set_type set = get_varargs_alias_set ();
8187 int i;
8189 /* Reset to zero, as a sysv va_arg may have been used
8190 before. */
8191 ix86_varargs_gpr_size = 0;
8192 ix86_varargs_fpr_size = 0;
8194 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8196 rtx reg, mem;
8198 mem = gen_rtx_MEM (Pmode,
8199 plus_constant (Pmode, virtual_incoming_args_rtx,
8200 i * UNITS_PER_WORD));
8201 MEM_NOTRAP_P (mem) = 1;
8202 set_mem_alias_set (mem, set);
8204 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8205 emit_move_insn (mem, reg);
8209 static void
8210 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8211 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8212 int no_rtl)
8214 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8215 CUMULATIVE_ARGS next_cum;
8216 tree fntype;
8218 /* This argument doesn't appear to be used anymore. Which is good,
8219 because the old code here didn't suppress rtl generation. */
8220 gcc_assert (!no_rtl);
8222 if (!TARGET_64BIT)
8223 return;
8225 fntype = TREE_TYPE (current_function_decl);
8227 /* For varargs, we do not want to skip the dummy va_dcl argument.
8228 For stdargs, we do want to skip the last named argument. */
8229 next_cum = *cum;
8230 if (stdarg_p (fntype))
8231 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8232 true);
8234 if (cum->call_abi == MS_ABI)
8235 setup_incoming_varargs_ms_64 (&next_cum);
8236 else
8237 setup_incoming_varargs_64 (&next_cum);
8240 /* Checks if TYPE is of kind va_list char *. */
8242 static bool
8243 is_va_list_char_pointer (tree type)
8245 tree canonic;
8247 /* For 32-bit it is always true. */
8248 if (!TARGET_64BIT)
8249 return true;
8250 canonic = ix86_canonical_va_list_type (type);
8251 return (canonic == ms_va_list_type_node
8252 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8255 /* Implement va_start. */
8257 static void
8258 ix86_va_start (tree valist, rtx nextarg)
8260 HOST_WIDE_INT words, n_gpr, n_fpr;
8261 tree f_gpr, f_fpr, f_ovf, f_sav;
8262 tree gpr, fpr, ovf, sav, t;
8263 tree type;
8264 rtx ovf_rtx;
8266 if (flag_split_stack
8267 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8269 unsigned int scratch_regno;
8271 /* When we are splitting the stack, we can't refer to the stack
8272 arguments using internal_arg_pointer, because they may be on
8273 the old stack. The split stack prologue will arrange to
8274 leave a pointer to the old stack arguments in a scratch
8275 register, which we here copy to a pseudo-register. The split
8276 stack prologue can't set the pseudo-register directly because
8277 it (the prologue) runs before any registers have been saved. */
8279 scratch_regno = split_stack_prologue_scratch_regno ();
8280 if (scratch_regno != INVALID_REGNUM)
8282 rtx reg, seq;
8284 reg = gen_reg_rtx (Pmode);
8285 cfun->machine->split_stack_varargs_pointer = reg;
8287 start_sequence ();
8288 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8289 seq = get_insns ();
8290 end_sequence ();
8292 push_topmost_sequence ();
8293 emit_insn_after (seq, entry_of_function ());
8294 pop_topmost_sequence ();
8298 /* Only 64bit target needs something special. */
8299 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8301 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8302 std_expand_builtin_va_start (valist, nextarg);
8303 else
8305 rtx va_r, next;
8307 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8308 next = expand_binop (ptr_mode, add_optab,
8309 cfun->machine->split_stack_varargs_pointer,
8310 crtl->args.arg_offset_rtx,
8311 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8312 convert_move (va_r, next, 0);
8314 return;
8317 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8318 f_fpr = DECL_CHAIN (f_gpr);
8319 f_ovf = DECL_CHAIN (f_fpr);
8320 f_sav = DECL_CHAIN (f_ovf);
8322 valist = build_simple_mem_ref (valist);
8323 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8324 /* The following should be folded into the MEM_REF offset. */
8325 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8326 f_gpr, NULL_TREE);
8327 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8328 f_fpr, NULL_TREE);
8329 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8330 f_ovf, NULL_TREE);
8331 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8332 f_sav, NULL_TREE);
8334 /* Count number of gp and fp argument registers used. */
8335 words = crtl->args.info.words;
8336 n_gpr = crtl->args.info.regno;
8337 n_fpr = crtl->args.info.sse_regno;
8339 if (cfun->va_list_gpr_size)
8341 type = TREE_TYPE (gpr);
8342 t = build2 (MODIFY_EXPR, type,
8343 gpr, build_int_cst (type, n_gpr * 8));
8344 TREE_SIDE_EFFECTS (t) = 1;
8345 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8348 if (TARGET_SSE && cfun->va_list_fpr_size)
8350 type = TREE_TYPE (fpr);
8351 t = build2 (MODIFY_EXPR, type, fpr,
8352 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8353 TREE_SIDE_EFFECTS (t) = 1;
8354 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8357 /* Find the overflow area. */
8358 type = TREE_TYPE (ovf);
8359 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8360 ovf_rtx = crtl->args.internal_arg_pointer;
8361 else
8362 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8363 t = make_tree (type, ovf_rtx);
8364 if (words != 0)
8365 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8366 t = build2 (MODIFY_EXPR, type, ovf, t);
8367 TREE_SIDE_EFFECTS (t) = 1;
8368 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8370 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8372 /* Find the register save area.
8373 The function prologue saves it right above the stack frame. */
8374 type = TREE_TYPE (sav);
8375 t = make_tree (type, frame_pointer_rtx);
8376 if (!ix86_varargs_gpr_size)
8377 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8378 t = build2 (MODIFY_EXPR, type, sav, t);
8379 TREE_SIDE_EFFECTS (t) = 1;
8380 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
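/* Editorial illustration of the va_start expansion above.  For an assumed
   SysV x86-64 function

     void f (int a, const char *fmt, ...);

   the two named arguments consume two GPRs, so va_start stores
   gp_offset = 2 * 8 = 16 and fp_offset = 8 * X86_64_REGPARM_MAX = 48,
   points overflow_arg_area just past any named stack arguments, and points
   reg_save_area at the block saved by the prologue.  */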
8384 /* Implement va_arg. */
8386 static tree
8387 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8388 gimple_seq *post_p)
8390 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8391 tree f_gpr, f_fpr, f_ovf, f_sav;
8392 tree gpr, fpr, ovf, sav, t;
8393 int size, rsize;
8394 tree lab_false, lab_over = NULL_TREE;
8395 tree addr, t2;
8396 rtx container;
8397 int indirect_p = 0;
8398 tree ptrtype;
8399 enum machine_mode nat_mode;
8400 unsigned int arg_boundary;
8402 /* Only 64bit target needs something special. */
8403 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8404 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8406 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8407 f_fpr = DECL_CHAIN (f_gpr);
8408 f_ovf = DECL_CHAIN (f_fpr);
8409 f_sav = DECL_CHAIN (f_ovf);
8411 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8412 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8413 valist = build_va_arg_indirect_ref (valist);
8414 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8415 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8416 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8418 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8419 if (indirect_p)
8420 type = build_pointer_type (type);
8421 size = int_size_in_bytes (type);
8422 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8424 nat_mode = type_natural_mode (type, NULL);
8425 switch (nat_mode)
8427 case V8SFmode:
8428 case V8SImode:
8429 case V32QImode:
8430 case V16HImode:
8431 case V4DFmode:
8432 case V4DImode:
8433 /* Unnamed 256bit vector mode parameters are passed on stack. */
8434 if (!TARGET_64BIT_MS_ABI)
8436 container = NULL;
8437 break;
8440 default:
8441 container = construct_container (nat_mode, TYPE_MODE (type),
8442 type, 0, X86_64_REGPARM_MAX,
8443 X86_64_SSE_REGPARM_MAX, intreg,
8445 break;
8448 /* Pull the value out of the saved registers. */
8450 addr = create_tmp_var (ptr_type_node, "addr");
8452 if (container)
8454 int needed_intregs, needed_sseregs;
8455 bool need_temp;
8456 tree int_addr, sse_addr;
8458 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8459 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8461 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8463 need_temp = (!REG_P (container)
8464 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8465 || TYPE_ALIGN (type) > 128));
8467 /* In case we are passing a structure, verify that it is a consecutive block
8468 in the register save area. If not, we need to do moves. */
8469 if (!need_temp && !REG_P (container))
8471 /* Verify that all registers are strictly consecutive. */
8472 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8474 int i;
8476 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8478 rtx slot = XVECEXP (container, 0, i);
8479 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8480 || INTVAL (XEXP (slot, 1)) != i * 16)
8481 need_temp = 1;
8484 else
8486 int i;
8488 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8490 rtx slot = XVECEXP (container, 0, i);
8491 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8492 || INTVAL (XEXP (slot, 1)) != i * 8)
8493 need_temp = 1;
8497 if (!need_temp)
8499 int_addr = addr;
8500 sse_addr = addr;
8502 else
8504 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8505 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8508 /* First ensure that we fit completely in registers. */
8509 if (needed_intregs)
8511 t = build_int_cst (TREE_TYPE (gpr),
8512 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8513 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8514 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8515 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8516 gimplify_and_add (t, pre_p);
8518 if (needed_sseregs)
8520 t = build_int_cst (TREE_TYPE (fpr),
8521 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8522 + X86_64_REGPARM_MAX * 8);
8523 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8524 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8525 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8526 gimplify_and_add (t, pre_p);
8529 /* Compute index to start of area used for integer regs. */
8530 if (needed_intregs)
8532 /* int_addr = gpr + sav; */
8533 t = fold_build_pointer_plus (sav, gpr);
8534 gimplify_assign (int_addr, t, pre_p);
8536 if (needed_sseregs)
8538 /* sse_addr = fpr + sav; */
8539 t = fold_build_pointer_plus (sav, fpr);
8540 gimplify_assign (sse_addr, t, pre_p);
8542 if (need_temp)
8544 int i, prev_size = 0;
8545 tree temp = create_tmp_var (type, "va_arg_tmp");
8547 /* addr = &temp; */
8548 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8549 gimplify_assign (addr, t, pre_p);
8551 for (i = 0; i < XVECLEN (container, 0); i++)
8553 rtx slot = XVECEXP (container, 0, i);
8554 rtx reg = XEXP (slot, 0);
8555 enum machine_mode mode = GET_MODE (reg);
8556 tree piece_type;
8557 tree addr_type;
8558 tree daddr_type;
8559 tree src_addr, src;
8560 int src_offset;
8561 tree dest_addr, dest;
8562 int cur_size = GET_MODE_SIZE (mode);
8564 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8565 prev_size = INTVAL (XEXP (slot, 1));
8566 if (prev_size + cur_size > size)
8568 cur_size = size - prev_size;
8569 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8570 if (mode == BLKmode)
8571 mode = QImode;
8573 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8574 if (mode == GET_MODE (reg))
8575 addr_type = build_pointer_type (piece_type);
8576 else
8577 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8578 true);
8579 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8580 true);
8582 if (SSE_REGNO_P (REGNO (reg)))
8584 src_addr = sse_addr;
8585 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8587 else
8589 src_addr = int_addr;
8590 src_offset = REGNO (reg) * 8;
8592 src_addr = fold_convert (addr_type, src_addr);
8593 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8595 dest_addr = fold_convert (daddr_type, addr);
8596 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8597 if (cur_size == GET_MODE_SIZE (mode))
8599 src = build_va_arg_indirect_ref (src_addr);
8600 dest = build_va_arg_indirect_ref (dest_addr);
8602 gimplify_assign (dest, src, pre_p);
8604 else
8606 tree copy
8607 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8608 3, dest_addr, src_addr,
8609 size_int (cur_size));
8610 gimplify_and_add (copy, pre_p);
8612 prev_size += cur_size;
8616 if (needed_intregs)
8618 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8619 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8620 gimplify_assign (gpr, t, pre_p);
8623 if (needed_sseregs)
8625 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8626 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8627 gimplify_assign (fpr, t, pre_p);
8630 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8632 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8635 /* ... otherwise out of the overflow area. */
8637 /* When we align a parameter on the stack for the caller, if its
8638 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8639 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
8640 here with the caller. */
8641 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8642 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8643 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8645 /* Care for on-stack alignment if needed. */
8646 if (arg_boundary <= 64 || size == 0)
8647 t = ovf;
8648 else
8650 HOST_WIDE_INT align = arg_boundary / 8;
8651 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8652 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8653 build_int_cst (TREE_TYPE (t), -align));
8656 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8657 gimplify_assign (addr, t, pre_p);
8659 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8660 gimplify_assign (unshare_expr (ovf), t, pre_p);
8662 if (container)
8663 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8665 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8666 addr = fold_convert (ptrtype, addr);
8668 if (indirect_p)
8669 addr = build_va_arg_indirect_ref (addr);
8670 return build_va_arg_indirect_ref (addr);
8673 /* Return true if OPNUM's MEM should be matched
8674 in movabs* patterns. */
8676 bool
8677 ix86_check_movabs (rtx insn, int opnum)
8679 rtx set, mem;
8681 set = PATTERN (insn);
8682 if (GET_CODE (set) == PARALLEL)
8683 set = XVECEXP (set, 0, 0);
8684 gcc_assert (GET_CODE (set) == SET);
8685 mem = XEXP (set, opnum);
8686 while (GET_CODE (mem) == SUBREG)
8687 mem = SUBREG_REG (mem);
8688 gcc_assert (MEM_P (mem));
8689 return volatile_ok || !MEM_VOLATILE_P (mem);
8692 /* Initialize the table of extra 80387 mathematical constants. */
8694 static void
8695 init_ext_80387_constants (void)
8697 static const char * cst[5] =
8699 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8700 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8701 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8702 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8703 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8705 int i;
8707 for (i = 0; i < 5; i++)
8709 real_from_string (&ext_80387_constants_table[i], cst[i]);
8710 /* Ensure each constant is rounded to XFmode precision. */
8711 real_convert (&ext_80387_constants_table[i],
8712 XFmode, &ext_80387_constants_table[i]);
8715 ext_80387_constants_init = 1;
8718 /* Return non-zero if the constant is something that
8719 can be loaded with a special instruction. */
8722 standard_80387_constant_p (rtx x)
8724 enum machine_mode mode = GET_MODE (x);
8726 REAL_VALUE_TYPE r;
8728 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8729 return -1;
8731 if (x == CONST0_RTX (mode))
8732 return 1;
8733 if (x == CONST1_RTX (mode))
8734 return 2;
8736 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8738 /* For XFmode constants, try to find a special 80387 instruction when
8739 optimizing for size or on those CPUs that benefit from them. */
8740 if (mode == XFmode
8741 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8743 int i;
8745 if (! ext_80387_constants_init)
8746 init_ext_80387_constants ();
8748 for (i = 0; i < 5; i++)
8749 if (real_identical (&r, &ext_80387_constants_table[i]))
8750 return i + 3;
8753 /* Load of the constant -0.0 or -1.0 will be split as
8754 fldz;fchs or fld1;fchs sequence. */
8755 if (real_isnegzero (&r))
8756 return 8;
8757 if (real_identical (&r, &dconstm1))
8758 return 9;
8760 return 0;
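/* Editorial summary of the return values above and the opcodes they map to
   in standard_80387_constant_opcode below:

     1  +0.0      fldz          6  log2(10)  fldl2t
     2  +1.0      fld1          7  pi        fldpi
     3  log10(2)  fldlg2        8  -0.0      split into fldz; fchs
     4  ln(2)     fldln2        9  -1.0      split into fld1; fchs
     5  log2(e)   fldl2e

   and -1 means the operand is not an 80387 constant load at all.  */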
8763 /* Return the opcode of the special instruction to be used to load
8764 the constant X. */
8766 const char *
8767 standard_80387_constant_opcode (rtx x)
8769 switch (standard_80387_constant_p (x))
8771 case 1:
8772 return "fldz";
8773 case 2:
8774 return "fld1";
8775 case 3:
8776 return "fldlg2";
8777 case 4:
8778 return "fldln2";
8779 case 5:
8780 return "fldl2e";
8781 case 6:
8782 return "fldl2t";
8783 case 7:
8784 return "fldpi";
8785 case 8:
8786 case 9:
8787 return "#";
8788 default:
8789 gcc_unreachable ();
8793 /* Return the CONST_DOUBLE representing the 80387 constant that is
8794 loaded by the specified special instruction. The argument IDX
8795 matches the return value from standard_80387_constant_p. */
8798 standard_80387_constant_rtx (int idx)
8800 int i;
8802 if (! ext_80387_constants_init)
8803 init_ext_80387_constants ();
8805 switch (idx)
8807 case 3:
8808 case 4:
8809 case 5:
8810 case 6:
8811 case 7:
8812 i = idx - 3;
8813 break;
8815 default:
8816 gcc_unreachable ();
8819 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8820 XFmode);
8823 /* Return 1 if X is all 0s and 2 if X is all 1s
8824 in a supported SSE/AVX vector mode. */
8827 standard_sse_constant_p (rtx x)
8829 enum machine_mode mode = GET_MODE (x);
8831 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8832 return 1;
8833 if (vector_all_ones_operand (x, mode))
8834 switch (mode)
8836 case V16QImode:
8837 case V8HImode:
8838 case V4SImode:
8839 case V2DImode:
8840 if (TARGET_SSE2)
8841 return 2;
8842 case V32QImode:
8843 case V16HImode:
8844 case V8SImode:
8845 case V4DImode:
8846 if (TARGET_AVX2)
8847 return 2;
8848 default:
8849 break;
8852 return 0;
8855 /* Return the opcode of the special instruction to be used to load
8856 the constant X. */
8858 const char *
8859 standard_sse_constant_opcode (rtx insn, rtx x)
8861 switch (standard_sse_constant_p (x))
8863 case 1:
8864 switch (get_attr_mode (insn))
8866 case MODE_TI:
8867 return "%vpxor\t%0, %d0";
8868 case MODE_V2DF:
8869 return "%vxorpd\t%0, %d0";
8870 case MODE_V4SF:
8871 return "%vxorps\t%0, %d0";
8873 case MODE_OI:
8874 return "vpxor\t%x0, %x0, %x0";
8875 case MODE_V4DF:
8876 return "vxorpd\t%x0, %x0, %x0";
8877 case MODE_V8SF:
8878 return "vxorps\t%x0, %x0, %x0";
8880 default:
8881 break;
8884 case 2:
8885 if (TARGET_AVX)
8886 return "vpcmpeqd\t%0, %0, %0";
8887 else
8888 return "pcmpeqd\t%0, %0";
8890 default:
8891 break;
8893 gcc_unreachable ();
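/* Illustrative example (the %xmm0 operand is just a placeholder for whatever
   register the insn actually uses): an all-zero V4SF constant comes out as
   "xorps %xmm0, %xmm0" (or the VEX form "vxorps" under AVX), while an
   all-ones vector is built with "pcmpeqd %xmm0, %xmm0" ("vpcmpeqd" under
   AVX), since comparing a register against itself sets every element
   to -1.  */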
8896 /* Returns true if OP contains a symbol reference */
8898 bool
8899 symbolic_reference_mentioned_p (rtx op)
8901 const char *fmt;
8902 int i;
8904 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8905 return true;
8907 fmt = GET_RTX_FORMAT (GET_CODE (op));
8908 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8910 if (fmt[i] == 'E')
8912 int j;
8914 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8915 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8916 return true;
8919 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8920 return true;
8923 return false;
8926 /* Return true if it is appropriate to emit `ret' instructions in the
8927 body of a function. Do this only if the epilogue is simple, needing a
8928 couple of insns. Prior to reloading, we can't tell how many registers
8929 must be saved, so return false then. Return false if there is no frame
8930 marker to de-allocate. */
8932 bool
8933 ix86_can_use_return_insn_p (void)
8935 struct ix86_frame frame;
8937 if (! reload_completed || frame_pointer_needed)
8938 return 0;
8940 /* Don't allow more than 32k pop, since that's all we can do
8941 with one instruction. */
8942 if (crtl->args.pops_args && crtl->args.size >= 32768)
8943 return 0;
8945 ix86_compute_frame_layout (&frame);
8946 return (frame.stack_pointer_offset == UNITS_PER_WORD
8947 && (frame.nregs + frame.nsseregs) == 0);
8950 /* Value should be nonzero if functions must have frame pointers.
8951 Zero means the frame pointer need not be set up (and parms may
8952 be accessed via the stack pointer) in functions that seem suitable. */
8954 static bool
8955 ix86_frame_pointer_required (void)
8957 /* If we accessed previous frames, then the generated code expects
8958 to be able to access the saved ebp value in our frame. */
8959 if (cfun->machine->accesses_prev_frame)
8960 return true;
8962 /* Several x86 OSes need a frame pointer for other reasons,
8963 usually pertaining to setjmp. */
8964 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8965 return true;
8967 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8968 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8969 return true;
8971 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8972 stack allocation is 4GB. */
8973 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8974 return true;
8976 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8977 turns off the frame pointer by default. Turn it back on now if
8978 we've not got a leaf function. */
8979 if (TARGET_OMIT_LEAF_FRAME_POINTER
8980 && (!crtl->is_leaf
8981 || ix86_current_function_calls_tls_descriptor))
8982 return true;
8984 if (crtl->profile && !flag_fentry)
8985 return true;
8987 return false;
8990 /* Record that the current function accesses previous call frames. */
8992 void
8993 ix86_setup_frame_addresses (void)
8995 cfun->machine->accesses_prev_frame = 1;
8998 #ifndef USE_HIDDEN_LINKONCE
8999 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
9000 # define USE_HIDDEN_LINKONCE 1
9001 # else
9002 # define USE_HIDDEN_LINKONCE 0
9003 # endif
9004 #endif
9006 static int pic_labels_used;
9008 /* Fills in the label name that should be used for a pc thunk for
9009 the given register. */
9011 static void
9012 get_pc_thunk_name (char name[32], unsigned int regno)
9014 gcc_assert (!TARGET_64BIT);
9016 if (USE_HIDDEN_LINKONCE)
9017 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
9018 else
9019 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
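/* Illustrative example (assuming USE_HIDDEN_LINKONCE and regno == BX_REG):
   the name chosen is "__x86.get_pc_thunk.bx", and the thunk body emitted by
   ix86_code_end below is roughly
       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret
   i.e. it copies the caller's return address (its PC) into the requested
   register; output_set_got then adds _GLOBAL_OFFSET_TABLE_ to it.  */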
9023 /* This function generates code for -fpic that loads %ebx with
9024 the return address of the caller and then returns. */
9026 static void
9027 ix86_code_end (void)
9029 rtx xops[2];
9030 int regno;
9032 for (regno = AX_REG; regno <= SP_REG; regno++)
9034 char name[32];
9035 tree decl;
9037 if (!(pic_labels_used & (1 << regno)))
9038 continue;
9040 get_pc_thunk_name (name, regno);
9042 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
9043 get_identifier (name),
9044 build_function_type_list (void_type_node, NULL_TREE));
9045 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
9046 NULL_TREE, void_type_node);
9047 TREE_PUBLIC (decl) = 1;
9048 TREE_STATIC (decl) = 1;
9049 DECL_IGNORED_P (decl) = 1;
9051 #if TARGET_MACHO
9052 if (TARGET_MACHO)
9054 switch_to_section (darwin_sections[text_coal_section]);
9055 fputs ("\t.weak_definition\t", asm_out_file);
9056 assemble_name (asm_out_file, name);
9057 fputs ("\n\t.private_extern\t", asm_out_file);
9058 assemble_name (asm_out_file, name);
9059 putc ('\n', asm_out_file);
9060 ASM_OUTPUT_LABEL (asm_out_file, name);
9061 DECL_WEAK (decl) = 1;
9063 else
9064 #endif
9065 if (USE_HIDDEN_LINKONCE)
9067 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
9069 targetm.asm_out.unique_section (decl, 0);
9070 switch_to_section (get_named_section (decl, NULL, 0));
9072 targetm.asm_out.globalize_label (asm_out_file, name);
9073 fputs ("\t.hidden\t", asm_out_file);
9074 assemble_name (asm_out_file, name);
9075 putc ('\n', asm_out_file);
9076 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
9078 else
9080 switch_to_section (text_section);
9081 ASM_OUTPUT_LABEL (asm_out_file, name);
9084 DECL_INITIAL (decl) = make_node (BLOCK);
9085 current_function_decl = decl;
9086 init_function_start (decl);
9087 first_function_block_is_cold = false;
9088 /* Make sure unwind info is emitted for the thunk if needed. */
9089 final_start_function (emit_barrier (), asm_out_file, 1);
9091 /* Pad stack IP move with 4 instructions (two NOPs count
9092 as one instruction). */
9093 if (TARGET_PAD_SHORT_FUNCTION)
9095 int i = 8;
9097 while (i--)
9098 fputs ("\tnop\n", asm_out_file);
9101 xops[0] = gen_rtx_REG (Pmode, regno);
9102 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
9103 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
9104 fputs ("\tret\n", asm_out_file);
9105 final_end_function ();
9106 init_insn_lengths ();
9107 free_after_compilation (cfun);
9108 set_cfun (NULL);
9109 current_function_decl = NULL;
9112 if (flag_split_stack)
9113 file_end_indicate_split_stack ();
9116 /* Emit code for the SET_GOT patterns. */
9118 const char *
9119 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
9121 rtx xops[3];
9123 xops[0] = dest;
9125 if (TARGET_VXWORKS_RTP && flag_pic)
9127 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
9128 xops[2] = gen_rtx_MEM (Pmode,
9129 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
9130 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
9132 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
9133 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
9134 an unadorned address. */
9135 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
9136 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
9137 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
9138 return "";
9141 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
9143 if (!flag_pic)
9145 if (TARGET_MACHO)
9146 /* We don't need a pic base, we're not producing pic. */
9147 gcc_unreachable ();
9149 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
9150 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
9151 targetm.asm_out.internal_label (asm_out_file, "L",
9152 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
9154 else
9156 char name[32];
9157 get_pc_thunk_name (name, REGNO (dest));
9158 pic_labels_used |= 1 << REGNO (dest);
9160 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
9161 xops[2] = gen_rtx_MEM (QImode, xops[2]);
9162 output_asm_insn ("call\t%X2", xops);
9164 #if TARGET_MACHO
9165 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
9166 This is what will be referenced by the Mach-O PIC subsystem. */
9167 if (machopic_should_output_picbase_label () || !label)
9168 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
9170 /* When we are restoring the pic base at the site of a nonlocal label,
9171 and we decided to emit the pic base above, we will still output a
9172 local label used for calculating the correction offset (even though
9173 the offset will be 0 in that case). */
9174 if (label)
9175 targetm.asm_out.internal_label (asm_out_file, "L",
9176 CODE_LABEL_NUMBER (label));
9177 #endif
9180 if (!TARGET_MACHO)
9181 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
9183 return "";
9186 /* Generate a "push" pattern for input ARG. */
9188 static rtx
9189 gen_push (rtx arg)
9191 struct machine_function *m = cfun->machine;
9193 if (m->fs.cfa_reg == stack_pointer_rtx)
9194 m->fs.cfa_offset += UNITS_PER_WORD;
9195 m->fs.sp_offset += UNITS_PER_WORD;
9197 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9198 arg = gen_rtx_REG (word_mode, REGNO (arg));
9200 return gen_rtx_SET (VOIDmode,
9201 gen_rtx_MEM (word_mode,
9202 gen_rtx_PRE_DEC (Pmode,
9203 stack_pointer_rtx)),
9204 arg);
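/* Illustrative example: in 32-bit mode gen_push (eax) builds roughly
     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI ax))
   which is emitted as a single "push %eax"; the bookkeeping above keeps
   m->fs.sp_offset (and the CFA offset, while SP is still the CFA register)
   in sync with the implicit stack-pointer decrement.  */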
9207 /* Generate a "pop" pattern for input ARG. */
9209 static rtx
9210 gen_pop (rtx arg)
9212 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9213 arg = gen_rtx_REG (word_mode, REGNO (arg));
9215 return gen_rtx_SET (VOIDmode,
9216 arg,
9217 gen_rtx_MEM (word_mode,
9218 gen_rtx_POST_INC (Pmode,
9219 stack_pointer_rtx)));
9222 /* Return >= 0 if there is an unused call-clobbered register available
9223 for the entire function. */
9225 static unsigned int
9226 ix86_select_alt_pic_regnum (void)
9228 if (crtl->is_leaf
9229 && !crtl->profile
9230 && !ix86_current_function_calls_tls_descriptor)
9232 int i, drap;
9233 /* Can't use the same register for both PIC and DRAP. */
9234 if (crtl->drap_reg)
9235 drap = REGNO (crtl->drap_reg);
9236 else
9237 drap = -1;
9238 for (i = 2; i >= 0; --i)
9239 if (i != drap && !df_regs_ever_live_p (i))
9240 return i;
9243 return INVALID_REGNUM;
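/* Illustrative note (assuming this port's usual register numbering, where
   hard regs 0..2 are %eax, %edx and %ecx): the loop above hands back %ecx,
   %edx or %eax, in that order of preference, when one of them is unused for
   the whole function and not reserved for DRAP.  */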
9246 /* Return TRUE if we need to save REGNO. */
9248 static bool
9249 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9251 if (pic_offset_table_rtx
9252 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9253 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9254 || crtl->profile
9255 || crtl->calls_eh_return
9256 || crtl->uses_const_pool
9257 || cfun->has_nonlocal_label))
9258 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9260 if (crtl->calls_eh_return && maybe_eh_return)
9262 unsigned i;
9263 for (i = 0; ; i++)
9265 unsigned test = EH_RETURN_DATA_REGNO (i);
9266 if (test == INVALID_REGNUM)
9267 break;
9268 if (test == regno)
9269 return true;
9273 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9274 return true;
9276 return (df_regs_ever_live_p (regno)
9277 && !call_used_regs[regno]
9278 && !fixed_regs[regno]
9279 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9282 /* Return the number of saved general purpose registers. */
9284 static int
9285 ix86_nsaved_regs (void)
9287 int nregs = 0;
9288 int regno;
9290 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9291 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9292 nregs ++;
9293 return nregs;
9296 /* Return the number of saved SSE registers. */
9298 static int
9299 ix86_nsaved_sseregs (void)
9301 int nregs = 0;
9302 int regno;
9304 if (!TARGET_64BIT_MS_ABI)
9305 return 0;
9306 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9307 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9308 nregs ++;
9309 return nregs;
9312 /* Given FROM and TO register numbers, say whether this elimination is
9313 allowed. If stack alignment is needed, we can only replace argument
9314 pointer with hard frame pointer, or replace frame pointer with stack
9315 pointer. Otherwise, frame pointer elimination is automatically
9316 handled and all other eliminations are valid. */
9318 static bool
9319 ix86_can_eliminate (const int from, const int to)
9321 if (stack_realign_fp)
9322 return ((from == ARG_POINTER_REGNUM
9323 && to == HARD_FRAME_POINTER_REGNUM)
9324 || (from == FRAME_POINTER_REGNUM
9325 && to == STACK_POINTER_REGNUM));
9326 else
9327 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9330 /* Return the offset between two registers, one to be eliminated, and the other
9331 its replacement, at the start of a routine. */
9333 HOST_WIDE_INT
9334 ix86_initial_elimination_offset (int from, int to)
9336 struct ix86_frame frame;
9337 ix86_compute_frame_layout (&frame);
9339 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9340 return frame.hard_frame_pointer_offset;
9341 else if (from == FRAME_POINTER_REGNUM
9342 && to == HARD_FRAME_POINTER_REGNUM)
9343 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9344 else
9346 gcc_assert (to == STACK_POINTER_REGNUM);
9348 if (from == ARG_POINTER_REGNUM)
9349 return frame.stack_pointer_offset;
9351 gcc_assert (from == FRAME_POINTER_REGNUM);
9352 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9356 /* In a dynamically-aligned function, we can't know the offset from
9357 stack pointer to frame pointer, so we must ensure that setjmp
9358 eliminates fp against the hard fp (%ebp) rather than trying to
9359 index from %esp up to the top of the frame across a gap that is
9360 of unknown (at compile-time) size. */
9361 static rtx
9362 ix86_builtin_setjmp_frame_value (void)
9364 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9367 /* When using -fsplit-stack, the allocation routines set a field in
9368 the TCB to the bottom of the stack plus this much space, measured
9369 in bytes. */
9371 #define SPLIT_STACK_AVAILABLE 256
9373 /* Fill the ix86_frame structure describing the frame of the current function. */
9375 static void
9376 ix86_compute_frame_layout (struct ix86_frame *frame)
9378 unsigned HOST_WIDE_INT stack_alignment_needed;
9379 HOST_WIDE_INT offset;
9380 unsigned HOST_WIDE_INT preferred_alignment;
9381 HOST_WIDE_INT size = get_frame_size ();
9382 HOST_WIDE_INT to_allocate;
9384 frame->nregs = ix86_nsaved_regs ();
9385 frame->nsseregs = ix86_nsaved_sseregs ();
9387 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9388 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9390 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9391 except for function prologues and leaf functions. */
9392 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9393 && (!crtl->is_leaf || cfun->calls_alloca != 0
9394 || ix86_current_function_calls_tls_descriptor))
9396 preferred_alignment = 16;
9397 stack_alignment_needed = 16;
9398 crtl->preferred_stack_boundary = 128;
9399 crtl->stack_alignment_needed = 128;
9402 gcc_assert (!size || stack_alignment_needed);
9403 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9404 gcc_assert (preferred_alignment <= stack_alignment_needed);
9406 /* For SEH we have to limit the amount of code movement into the prologue.
9407 At present we do this via a BLOCKAGE, at which point there's very little
9408 scheduling that can be done, which means that there's very little point
9409 in doing anything except PUSHs. */
9410 if (TARGET_SEH)
9411 cfun->machine->use_fast_prologue_epilogue = false;
9413 /* During reload iterations the number of registers saved can change.
9414 Recompute the value as needed. Do not recompute when the number of registers
9415 didn't change, as reload makes multiple calls to this function and does not
9416 expect the decision to change within a single iteration. */
9417 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9418 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9420 int count = frame->nregs;
9421 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9423 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9425 /* The fast prologue uses move instead of push to save registers. This
9426 is significantly longer, but also executes faster as modern hardware
9427 can execute the moves in parallel, but can't do that for push/pop.
9429 Be careful about choosing which prologue to emit: when the function takes
9430 many instructions to execute, we may use the slow version, as well as
9431 when the function is known to be outside a hot spot (this is known only
9432 with profile feedback). Weight the size of the function by the number of
9433 registers to save, as it is cheap to use one or two push instructions but
9434 very slow to use many of them. */
9435 if (count)
9436 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9437 if (node->frequency < NODE_FREQUENCY_NORMAL
9438 || (flag_branch_probabilities
9439 && node->frequency < NODE_FREQUENCY_HOT))
9440 cfun->machine->use_fast_prologue_epilogue = false;
9441 else
9442 cfun->machine->use_fast_prologue_epilogue
9443 = !expensive_function_p (count);
9446 frame->save_regs_using_mov
9447 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9448 /* If static stack checking is enabled and done with probes,
9449 the registers need to be saved before allocating the frame. */
9450 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9452 /* Skip return address. */
9453 offset = UNITS_PER_WORD;
9455 /* Skip pushed static chain. */
9456 if (ix86_static_chain_on_stack)
9457 offset += UNITS_PER_WORD;
9459 /* Skip saved base pointer. */
9460 if (frame_pointer_needed)
9461 offset += UNITS_PER_WORD;
9462 frame->hfp_save_offset = offset;
9464 /* The traditional frame pointer location is at the top of the frame. */
9465 frame->hard_frame_pointer_offset = offset;
9467 /* Register save area */
9468 offset += frame->nregs * UNITS_PER_WORD;
9469 frame->reg_save_offset = offset;
9471 /* On SEH target, registers are pushed just before the frame pointer
9472 location. */
9473 if (TARGET_SEH)
9474 frame->hard_frame_pointer_offset = offset;
9476 /* Align and set SSE register save area. */
9477 if (frame->nsseregs)
9479 /* The only ABI that has saved SSE registers (Win64) also has a
9480 16-byte aligned default stack, and thus we don't need to be
9481 within the re-aligned local stack frame to save them. */
9482 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9483 offset = (offset + 16 - 1) & -16;
9484 offset += frame->nsseregs * 16;
9486 frame->sse_reg_save_offset = offset;
9488 /* The re-aligned stack starts here. Values before this point are not
9489 directly comparable with values below this point. In order to make
9490 sure that no value happens to be the same before and after, force
9491 the alignment computation below to add a non-zero value. */
9492 if (stack_realign_fp)
9493 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9495 /* Va-arg area */
9496 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9497 offset += frame->va_arg_size;
9499 /* Align start of frame for local function. */
9500 if (stack_realign_fp
9501 || offset != frame->sse_reg_save_offset
9502 || size != 0
9503 || !crtl->is_leaf
9504 || cfun->calls_alloca
9505 || ix86_current_function_calls_tls_descriptor)
9506 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9508 /* Frame pointer points here. */
9509 frame->frame_pointer_offset = offset;
9511 offset += size;
9513 /* Add the outgoing arguments area. It can be skipped if we eliminated
9514 all the function calls as dead code.
9515 Skipping is however impossible when the function calls alloca, as the
9516 alloca expander assumes that the last crtl->outgoing_args_size bytes
9517 of the stack frame are unused. */
9518 if (ACCUMULATE_OUTGOING_ARGS
9519 && (!crtl->is_leaf || cfun->calls_alloca
9520 || ix86_current_function_calls_tls_descriptor))
9522 offset += crtl->outgoing_args_size;
9523 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9525 else
9526 frame->outgoing_arguments_size = 0;
9528 /* Align stack boundary. Only needed if we're calling another function
9529 or using alloca. */
9530 if (!crtl->is_leaf || cfun->calls_alloca
9531 || ix86_current_function_calls_tls_descriptor)
9532 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9534 /* We've reached end of stack frame. */
9535 frame->stack_pointer_offset = offset;
9537 /* The size the prologue needs to allocate. */
9538 to_allocate = offset - frame->sse_reg_save_offset;
9540 if ((!to_allocate && frame->nregs <= 1)
9541 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9542 frame->save_regs_using_mov = false;
9544 if (ix86_using_red_zone ()
9545 && crtl->sp_is_unchanging
9546 && crtl->is_leaf
9547 && !ix86_current_function_calls_tls_descriptor)
9549 frame->red_zone_size = to_allocate;
9550 if (frame->save_regs_using_mov)
9551 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9552 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9553 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9555 else
9556 frame->red_zone_size = 0;
9557 frame->stack_pointer_offset -= frame->red_zone_size;
9559 /* The SEH frame pointer location is near the bottom of the frame.
9560 This is enforced by the fact that the difference between the
9561 stack pointer and the frame pointer is limited to 240 bytes in
9562 the unwind data structure. */
9563 if (TARGET_SEH)
9565 HOST_WIDE_INT diff;
9567 /* If we can leave the frame pointer where it is, do so. Also, this returns
9568 the establisher frame for __builtin_frame_address (0). */
9569 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9570 if (diff <= SEH_MAX_FRAME_SIZE
9571 && (diff > 240 || (diff & 15) != 0)
9572 && !crtl->accesses_prior_frames)
9574 /* Ideally we'd determine what portion of the local stack frame
9575 (within the constraint of the lowest 240) is most heavily used.
9576 But without that complication, simply bias the frame pointer
9577 by 128 bytes so as to maximize the amount of the local stack
9578 frame that is addressable with 8-bit offsets. */
9579 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
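/* Illustrative note on the 128-byte bias above: a signed 8-bit displacement
   covers [-128, +127], so parking the frame pointer 128 bytes above the
   stack pointer makes the bottom 256 bytes of the local frame (roughly
   [SP, SP+255]) addressable with one-byte offsets from the frame
   pointer.  */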
9584 /* This is semi-inlined memory_address_length, but simplified
9585 since we know that we're always dealing with reg+offset, and
9586 to avoid having to create and discard all that rtl. */
9588 static inline int
9589 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9591 int len = 4;
9593 if (offset == 0)
9595 /* EBP and R13 cannot be encoded without an offset. */
9596 len = (regno == BP_REG || regno == R13_REG);
9598 else if (IN_RANGE (offset, -128, 127))
9599 len = 1;
9601 /* ESP and R12 must be encoded with a SIB byte. */
9602 if (regno == SP_REG || regno == R12_REG)
9603 len++;
9605 return len;
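/* Illustrative examples of the extra-byte counts computed above:
     (%ebx)            -> 0  (no displacement or SIB byte needed)
     (%ebp), (%r13)    -> 1  (a zero disp8 must still be encoded)
     (%esp), (%r12)    -> 1  (mandatory SIB byte)
     8(%eax)           -> 1  (disp8)
     8(%esp)           -> 2  (disp8 plus SIB byte)
     1024(%ecx)        -> 4  (disp32)  */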
9608 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9609 The valid base registers are taken from CFUN->MACHINE->FS. */
9611 static rtx
9612 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9614 const struct machine_function *m = cfun->machine;
9615 rtx base_reg = NULL;
9616 HOST_WIDE_INT base_offset = 0;
9618 if (m->use_fast_prologue_epilogue)
9620 /* Choose the base register most likely to allow the most scheduling
9621 opportunities. Generally FP is valid throughout the function,
9622 while DRAP must be reloaded within the epilogue. But choose either
9623 over the SP due to increased encoding size. */
9625 if (m->fs.fp_valid)
9627 base_reg = hard_frame_pointer_rtx;
9628 base_offset = m->fs.fp_offset - cfa_offset;
9630 else if (m->fs.drap_valid)
9632 base_reg = crtl->drap_reg;
9633 base_offset = 0 - cfa_offset;
9635 else if (m->fs.sp_valid)
9637 base_reg = stack_pointer_rtx;
9638 base_offset = m->fs.sp_offset - cfa_offset;
9641 else
9643 HOST_WIDE_INT toffset;
9644 int len = 16, tlen;
9646 /* Choose the base register with the smallest address encoding.
9647 With a tie, choose FP > DRAP > SP. */
9648 if (m->fs.sp_valid)
9650 base_reg = stack_pointer_rtx;
9651 base_offset = m->fs.sp_offset - cfa_offset;
9652 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9654 if (m->fs.drap_valid)
9656 toffset = 0 - cfa_offset;
9657 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9658 if (tlen <= len)
9660 base_reg = crtl->drap_reg;
9661 base_offset = toffset;
9662 len = tlen;
9665 if (m->fs.fp_valid)
9667 toffset = m->fs.fp_offset - cfa_offset;
9668 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9669 if (tlen <= len)
9671 base_reg = hard_frame_pointer_rtx;
9672 base_offset = toffset;
9673 len = tlen;
9677 gcc_assert (base_reg != NULL);
9679 return plus_constant (Pmode, base_reg, base_offset);
9682 /* Emit code to save registers in the prologue. */
9684 static void
9685 ix86_emit_save_regs (void)
9687 unsigned int regno;
9688 rtx insn;
9690 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9691 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9693 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9694 RTX_FRAME_RELATED_P (insn) = 1;
9698 /* Emit a single register save at CFA - CFA_OFFSET. */
9700 static void
9701 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9702 HOST_WIDE_INT cfa_offset)
9704 struct machine_function *m = cfun->machine;
9705 rtx reg = gen_rtx_REG (mode, regno);
9706 rtx mem, addr, base, insn;
9708 addr = choose_baseaddr (cfa_offset);
9709 mem = gen_frame_mem (mode, addr);
9711 /* For SSE saves, we need to indicate the 128-bit alignment. */
9712 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9714 insn = emit_move_insn (mem, reg);
9715 RTX_FRAME_RELATED_P (insn) = 1;
9717 base = addr;
9718 if (GET_CODE (base) == PLUS)
9719 base = XEXP (base, 0);
9720 gcc_checking_assert (REG_P (base));
9722 /* When saving registers into a re-aligned local stack frame, avoid
9723 any tricky guessing by dwarf2out. */
9724 if (m->fs.realigned)
9726 gcc_checking_assert (stack_realign_drap);
9728 if (regno == REGNO (crtl->drap_reg))
9730 /* A bit of a hack. We force the DRAP register to be saved in
9731 the re-aligned stack frame, which provides us with a copy
9732 of the CFA that will last past the prologue. Install it. */
9733 gcc_checking_assert (cfun->machine->fs.fp_valid);
9734 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9735 cfun->machine->fs.fp_offset - cfa_offset);
9736 mem = gen_rtx_MEM (mode, addr);
9737 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9739 else
9741 /* The frame pointer is a stable reference within the
9742 aligned frame. Use it. */
9743 gcc_checking_assert (cfun->machine->fs.fp_valid);
9744 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9745 cfun->machine->fs.fp_offset - cfa_offset);
9746 mem = gen_rtx_MEM (mode, addr);
9747 add_reg_note (insn, REG_CFA_EXPRESSION,
9748 gen_rtx_SET (VOIDmode, mem, reg));
9752 /* The memory may not be relative to the current CFA register,
9753 which means that we may need to generate a new pattern for
9754 use by the unwind info. */
9755 else if (base != m->fs.cfa_reg)
9757 addr = plus_constant (Pmode, m->fs.cfa_reg,
9758 m->fs.cfa_offset - cfa_offset);
9759 mem = gen_rtx_MEM (mode, addr);
9760 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9764 /* Emit code to save registers using MOV insns.
9765 First register is stored at CFA - CFA_OFFSET. */
9766 static void
9767 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9769 unsigned int regno;
9771 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9772 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9774 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9775 cfa_offset -= UNITS_PER_WORD;
9779 /* Emit code to save SSE registers using MOV insns.
9780 First register is stored at CFA - CFA_OFFSET. */
9781 static void
9782 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9784 unsigned int regno;
9786 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9787 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9789 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9790 cfa_offset -= 16;
9794 static GTY(()) rtx queued_cfa_restores;
9796 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9797 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9798 Don't add the note if the previously saved value will be left untouched
9799 within stack red-zone till return, as unwinders can find the same value
9800 in the register and on the stack. */
9802 static void
9803 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9805 if (!crtl->shrink_wrapped
9806 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9807 return;
9809 if (insn)
9811 add_reg_note (insn, REG_CFA_RESTORE, reg);
9812 RTX_FRAME_RELATED_P (insn) = 1;
9814 else
9815 queued_cfa_restores
9816 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9819 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9821 static void
9822 ix86_add_queued_cfa_restore_notes (rtx insn)
9824 rtx last;
9825 if (!queued_cfa_restores)
9826 return;
9827 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9829 XEXP (last, 1) = REG_NOTES (insn);
9830 REG_NOTES (insn) = queued_cfa_restores;
9831 queued_cfa_restores = NULL_RTX;
9832 RTX_FRAME_RELATED_P (insn) = 1;
9835 /* Expand prologue or epilogue stack adjustment.
9836 The pattern exists to put a dependency on all ebp-based memory accesses.
9837 STYLE should be negative if instructions should be marked as frame related,
9838 zero if the %r11 register is live and cannot be freely used, and positive
9839 otherwise. */
9841 static void
9842 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9843 int style, bool set_cfa)
9845 struct machine_function *m = cfun->machine;
9846 rtx insn;
9847 bool add_frame_related_expr = false;
9849 if (Pmode == SImode)
9850 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9851 else if (x86_64_immediate_operand (offset, DImode))
9852 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9853 else
9855 rtx tmp;
9856 /* r11 is used by indirect sibcall return as well, set before the
9857 epilogue and used after the epilogue. */
9858 if (style)
9859 tmp = gen_rtx_REG (DImode, R11_REG);
9860 else
9862 gcc_assert (src != hard_frame_pointer_rtx
9863 && dest != hard_frame_pointer_rtx);
9864 tmp = hard_frame_pointer_rtx;
9866 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9867 if (style < 0)
9868 add_frame_related_expr = true;
9870 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9873 insn = emit_insn (insn);
9874 if (style >= 0)
9875 ix86_add_queued_cfa_restore_notes (insn);
9877 if (set_cfa)
9879 rtx r;
9881 gcc_assert (m->fs.cfa_reg == src);
9882 m->fs.cfa_offset += INTVAL (offset);
9883 m->fs.cfa_reg = dest;
9885 r = gen_rtx_PLUS (Pmode, src, offset);
9886 r = gen_rtx_SET (VOIDmode, dest, r);
9887 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9888 RTX_FRAME_RELATED_P (insn) = 1;
9890 else if (style < 0)
9892 RTX_FRAME_RELATED_P (insn) = 1;
9893 if (add_frame_related_expr)
9895 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9896 r = gen_rtx_SET (VOIDmode, dest, r);
9897 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9901 if (dest == stack_pointer_rtx)
9903 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9904 bool valid = m->fs.sp_valid;
9906 if (src == hard_frame_pointer_rtx)
9908 valid = m->fs.fp_valid;
9909 ooffset = m->fs.fp_offset;
9911 else if (src == crtl->drap_reg)
9913 valid = m->fs.drap_valid;
9914 ooffset = 0;
9916 else
9918 /* Else there are two possibilities: SP itself, which we set
9919 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9920 taken care of by hand along the eh_return path. */
9921 gcc_checking_assert (src == stack_pointer_rtx
9922 || offset == const0_rtx);
9925 m->fs.sp_offset = ooffset - INTVAL (offset);
9926 m->fs.sp_valid = valid;
9930 /* Find an available register to be used as the dynamic realign argument
9931 pointer register. Such a register will be written in the prologue and
9932 used at the beginning of the body, so it must not be
9933 1. a parameter-passing register, or
9934 2. the GOT pointer.
9935 We reuse the static-chain register if it is available. Otherwise, we
9936 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9937 shorter encoding.
9939 Return: the regno of the chosen register. */
9941 static unsigned int
9942 find_drap_reg (void)
9944 tree decl = cfun->decl;
9946 if (TARGET_64BIT)
9948 /* Use R13 for a nested function or a function that needs a static chain.
9949 Since a function with a tail call may use any caller-saved
9950 registers in the epilogue, DRAP must not use a caller-saved
9951 register in that case. */
9952 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9953 return R13_REG;
9955 return R10_REG;
9957 else
9959 /* Use DI for a nested function or a function that needs a static chain.
9960 Since a function with a tail call may use any caller-saved
9961 registers in the epilogue, DRAP must not use a caller-saved
9962 register in that case. */
9963 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9964 return DI_REG;
9966 /* Reuse static chain register if it isn't used for parameter
9967 passing. */
9968 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9970 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9971 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9972 return CX_REG;
9974 return DI_REG;
9978 /* Return minimum incoming stack alignment. */
9980 static unsigned int
9981 ix86_minimum_incoming_stack_boundary (bool sibcall)
9983 unsigned int incoming_stack_boundary;
9985 /* Prefer the one specified at command line. */
9986 if (ix86_user_incoming_stack_boundary)
9987 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9988 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9989 if -mstackrealign is used, this isn't a sibcall check, and the
9990 estimated stack alignment is 128 bits. */
9991 else if (!sibcall
9992 && !TARGET_64BIT
9993 && ix86_force_align_arg_pointer
9994 && crtl->stack_alignment_estimated == 128)
9995 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9996 else
9997 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9999 /* Incoming stack alignment can be changed on individual functions
10000 via force_align_arg_pointer attribute. We use the smallest
10001 incoming stack boundary. */
10002 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
10003 && lookup_attribute (ix86_force_align_arg_pointer_string,
10004 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
10005 incoming_stack_boundary = MIN_STACK_BOUNDARY;
10007 /* The incoming stack frame has to be aligned at least at
10008 parm_stack_boundary. */
10009 if (incoming_stack_boundary < crtl->parm_stack_boundary)
10010 incoming_stack_boundary = crtl->parm_stack_boundary;
10012 /* The stack at the entry of main is aligned by the runtime. We use the
10013 smallest incoming stack boundary. */
10014 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
10015 && DECL_NAME (current_function_decl)
10016 && MAIN_NAME_P (DECL_NAME (current_function_decl))
10017 && DECL_FILE_SCOPE_P (current_function_decl))
10018 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
10020 return incoming_stack_boundary;
10023 /* Update incoming stack boundary and estimated stack alignment. */
10025 static void
10026 ix86_update_stack_boundary (void)
10028 ix86_incoming_stack_boundary
10029 = ix86_minimum_incoming_stack_boundary (false);
10031 /* x86_64 varargs need 16-byte stack alignment for the register save
10032 area. */
10033 if (TARGET_64BIT
10034 && cfun->stdarg
10035 && crtl->stack_alignment_estimated < 128)
10036 crtl->stack_alignment_estimated = 128;
10039 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
10040 needed or an rtx for DRAP otherwise. */
10042 static rtx
10043 ix86_get_drap_rtx (void)
10045 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
10046 crtl->need_drap = true;
10048 if (stack_realign_drap)
10050 /* Assign DRAP to vDRAP and return vDRAP. */
10051 unsigned int regno = find_drap_reg ();
10052 rtx drap_vreg;
10053 rtx arg_ptr;
10054 rtx seq, insn;
10056 arg_ptr = gen_rtx_REG (Pmode, regno);
10057 crtl->drap_reg = arg_ptr;
10059 start_sequence ();
10060 drap_vreg = copy_to_reg (arg_ptr);
10061 seq = get_insns ();
10062 end_sequence ();
10064 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
10065 if (!optimize)
10067 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
10068 RTX_FRAME_RELATED_P (insn) = 1;
10070 return drap_vreg;
10072 else
10073 return NULL;
10076 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
10078 static rtx
10079 ix86_internal_arg_pointer (void)
10081 return virtual_incoming_args_rtx;
10084 struct scratch_reg {
10085 rtx reg;
10086 bool saved;
10089 /* Return a short-lived scratch register for use on function entry.
10090 In 32-bit mode, it is valid only after the registers are saved
10091 in the prologue. This register must be released by means of
10092 release_scratch_register_on_entry once it is dead. */
10094 static void
10095 get_scratch_register_on_entry (struct scratch_reg *sr)
10097 int regno;
10099 sr->saved = false;
10101 if (TARGET_64BIT)
10103 /* We always use R11 in 64-bit mode. */
10104 regno = R11_REG;
10106 else
10108 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
10109 bool fastcall_p
10110 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10111 bool thiscall_p
10112 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10113 bool static_chain_p = DECL_STATIC_CHAIN (decl);
10114 int regparm = ix86_function_regparm (fntype, decl);
10115 int drap_regno
10116 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
10118 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
10119 for the static chain register. */
10120 if ((regparm < 1 || (fastcall_p && !static_chain_p))
10121 && drap_regno != AX_REG)
10122 regno = AX_REG;
10123 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
10124 for the static chain register. */
10125 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
10126 regno = AX_REG;
10127 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
10128 regno = DX_REG;
10129 /* ecx is the static chain register. */
10130 else if (regparm < 3 && !fastcall_p && !thiscall_p
10131 && !static_chain_p
10132 && drap_regno != CX_REG)
10133 regno = CX_REG;
10134 else if (ix86_save_reg (BX_REG, true))
10135 regno = BX_REG;
10136 /* esi is the static chain register. */
10137 else if (!(regparm == 3 && static_chain_p)
10138 && ix86_save_reg (SI_REG, true))
10139 regno = SI_REG;
10140 else if (ix86_save_reg (DI_REG, true))
10141 regno = DI_REG;
10142 else
10144 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
10145 sr->saved = true;
10149 sr->reg = gen_rtx_REG (Pmode, regno);
10150 if (sr->saved)
10152 rtx insn = emit_insn (gen_push (sr->reg));
10153 RTX_FRAME_RELATED_P (insn) = 1;
10157 /* Release a scratch register obtained from the preceding function. */
10159 static void
10160 release_scratch_register_on_entry (struct scratch_reg *sr)
10162 if (sr->saved)
10164 struct machine_function *m = cfun->machine;
10165 rtx x, insn = emit_insn (gen_pop (sr->reg));
10167 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
10168 RTX_FRAME_RELATED_P (insn) = 1;
10169 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
10170 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10171 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
10172 m->fs.sp_offset -= UNITS_PER_WORD;
10176 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
10178 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
10180 static void
10181 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
10183 /* We skip the probe for the first interval + a small dope of 4 words and
10184 probe that many bytes past the specified size to maintain a protection
10185 area at the bottom of the stack. */
10186 const int dope = 4 * UNITS_PER_WORD;
10187 rtx size_rtx = GEN_INT (size), last;
10189 /* See if we have a constant small number of probes to generate. If so,
10190 that's the easy case. The run-time loop is made up of 11 insns in the
10191 generic case while the compile-time loop is made up of 3+2*(n-1) insns
10192 for n # of intervals. */
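/* Illustrative arithmetic (assuming the usual 4096-byte PROBE_INTERVAL):
   a 12288-byte allocation needs 3 intervals, so the unrolled form costs
   3 + 2*(3-1) = 7 insns versus the 11-insn run-time loop; at the
   5-interval cut-off both forms cost 11 insns, and anything larger takes
   the loop below.  */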
10193 if (size <= 5 * PROBE_INTERVAL)
10195 HOST_WIDE_INT i, adjust;
10196 bool first_probe = true;
10198 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
10199 values of N from 1 until it exceeds SIZE. If only one probe is
10200 needed, this will not generate any code. Then adjust and probe
10201 to PROBE_INTERVAL + SIZE. */
10202 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10204 if (first_probe)
10206 adjust = 2 * PROBE_INTERVAL + dope;
10207 first_probe = false;
10209 else
10210 adjust = PROBE_INTERVAL;
10212 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10213 plus_constant (Pmode, stack_pointer_rtx,
10214 -adjust)));
10215 emit_stack_probe (stack_pointer_rtx);
10218 if (first_probe)
10219 adjust = size + PROBE_INTERVAL + dope;
10220 else
10221 adjust = size + PROBE_INTERVAL - i;
10223 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10224 plus_constant (Pmode, stack_pointer_rtx,
10225 -adjust)));
10226 emit_stack_probe (stack_pointer_rtx);
10228 /* Adjust back to account for the additional first interval. */
10229 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10230 plus_constant (Pmode, stack_pointer_rtx,
10231 PROBE_INTERVAL + dope)));
10234 /* Otherwise, do the same as above, but in a loop. Note that we must be
10235 extra careful with variables wrapping around because we might be at
10236 the very top (or the very bottom) of the address space and we have
10237 to be able to handle this case properly; in particular, we use an
10238 equality test for the loop condition. */
10239 else
10241 HOST_WIDE_INT rounded_size;
10242 struct scratch_reg sr;
10244 get_scratch_register_on_entry (&sr);
10247 /* Step 1: round SIZE to the previous multiple of the interval. */
10249 rounded_size = size & -PROBE_INTERVAL;
10252 /* Step 2: compute initial and final value of the loop counter. */
10254 /* SP = SP_0 + PROBE_INTERVAL. */
10255 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10256 plus_constant (Pmode, stack_pointer_rtx,
10257 - (PROBE_INTERVAL + dope))));
10259 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10260 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10261 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10262 gen_rtx_PLUS (Pmode, sr.reg,
10263 stack_pointer_rtx)));
10266 /* Step 3: the loop
10268 while (SP != LAST_ADDR)
10270 SP = SP + PROBE_INTERVAL
10271 probe at SP
10274 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10275 values of N from 1 until it is equal to ROUNDED_SIZE. */
10277 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10280 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10281 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10283 if (size != rounded_size)
10285 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10286 plus_constant (Pmode, stack_pointer_rtx,
10287 rounded_size - size)));
10288 emit_stack_probe (stack_pointer_rtx);
10291 /* Adjust back to account for the additional first interval. */
10292 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10293 plus_constant (Pmode, stack_pointer_rtx,
10294 PROBE_INTERVAL + dope)));
10296 release_scratch_register_on_entry (&sr);
10299 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10301 /* Even if the stack pointer isn't the CFA register, we need to correctly
10302 describe the adjustments made to it, in particular differentiate the
10303 frame-related ones from the frame-unrelated ones. */
10304 if (size > 0)
10306 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10307 XVECEXP (expr, 0, 0)
10308 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10309 plus_constant (Pmode, stack_pointer_rtx, -size));
10310 XVECEXP (expr, 0, 1)
10311 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10312 plus_constant (Pmode, stack_pointer_rtx,
10313 PROBE_INTERVAL + dope + size));
10314 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10315 RTX_FRAME_RELATED_P (last) = 1;
10317 cfun->machine->fs.sp_offset += size;
10320 /* Make sure nothing is scheduled before we are done. */
10321 emit_insn (gen_blockage ());
10324 /* Adjust the stack pointer up to REG while probing it. */
10326 const char *
10327 output_adjust_stack_and_probe (rtx reg)
10329 static int labelno = 0;
10330 char loop_lab[32], end_lab[32];
10331 rtx xops[2];
10333 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10334 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10336 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10338 /* Jump to END_LAB if SP == LAST_ADDR. */
10339 xops[0] = stack_pointer_rtx;
10340 xops[1] = reg;
10341 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10342 fputs ("\tje\t", asm_out_file);
10343 assemble_name_raw (asm_out_file, end_lab);
10344 fputc ('\n', asm_out_file);
10346 /* SP = SP + PROBE_INTERVAL. */
10347 xops[1] = GEN_INT (PROBE_INTERVAL);
10348 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10350 /* Probe at SP. */
10351 xops[1] = const0_rtx;
10352 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10354 fprintf (asm_out_file, "\tjmp\t");
10355 assemble_name_raw (asm_out_file, loop_lab);
10356 fputc ('\n', asm_out_file);
10358 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10360 return "";
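/* Illustrative output (32-bit AT&T syntax, REG == %eax, assuming a
   4096-byte PROBE_INTERVAL; exact label names depend on the target macros):
       .LPSRL0:
               cmpl    %eax, %esp
               je      .LPSRE0
               subl    $4096, %esp
               orl     $0, (%esp)
               jmp     .LPSRL0
       .LPSRE0:
*/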
10363 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10364 inclusive. These are offsets from the current stack pointer. */
10366 static void
10367 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10369 /* See if we have a constant small number of probes to generate. If so,
10370 that's the easy case. The run-time loop is made up of 7 insns in the
10371 generic case while the compile-time loop is made up of n insns for n #
10372 of intervals. */
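/* Illustrative arithmetic (assuming a 4096-byte PROBE_INTERVAL): a
   12288-byte range emits probes at FIRST+4096 and FIRST+8192 and then a
   final probe at FIRST+12288, three probes in all; anything above 7
   intervals falls through to the run-time loop below.  */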
10373 if (size <= 7 * PROBE_INTERVAL)
10375 HOST_WIDE_INT i;
10377 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10378 it exceeds SIZE. If only one probe is needed, this will not
10379 generate any code. Then probe at FIRST + SIZE. */
10380 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10381 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10382 -(first + i)));
10384 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10385 -(first + size)));
10388 /* Otherwise, do the same as above, but in a loop. Note that we must be
10389 extra careful with variables wrapping around because we might be at
10390 the very top (or the very bottom) of the address space and we have
10391 to be able to handle this case properly; in particular, we use an
10392 equality test for the loop condition. */
10393 else
10395 HOST_WIDE_INT rounded_size, last;
10396 struct scratch_reg sr;
10398 get_scratch_register_on_entry (&sr);
10401 /* Step 1: round SIZE to the previous multiple of the interval. */
10403 rounded_size = size & -PROBE_INTERVAL;
10406 /* Step 2: compute initial and final value of the loop counter. */
10408 /* TEST_OFFSET = FIRST. */
10409 emit_move_insn (sr.reg, GEN_INT (-first));
10411 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10412 last = first + rounded_size;
10415 /* Step 3: the loop
10417 while (TEST_ADDR != LAST_ADDR)
10419 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10420 probe at TEST_ADDR
10423 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10424 until it is equal to ROUNDED_SIZE. */
10426 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10429 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10430 that SIZE is equal to ROUNDED_SIZE. */
10432 if (size != rounded_size)
10433 emit_stack_probe (plus_constant (Pmode,
10434 gen_rtx_PLUS (Pmode,
10435 stack_pointer_rtx,
10436 sr.reg),
10437 rounded_size - size));
10439 release_scratch_register_on_entry (&sr);
10442 /* Make sure nothing is scheduled before we are done. */
10443 emit_insn (gen_blockage ());
10446 /* Probe a range of stack addresses from REG to END, inclusive. These are
10447 offsets from the current stack pointer. */
10449 const char *
10450 output_probe_stack_range (rtx reg, rtx end)
10452 static int labelno = 0;
10453 char loop_lab[32], end_lab[32];
10454 rtx xops[3];
10456 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10457 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10459 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10461 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10462 xops[0] = reg;
10463 xops[1] = end;
10464 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10465 fputs ("\tje\t", asm_out_file);
10466 assemble_name_raw (asm_out_file, end_lab);
10467 fputc ('\n', asm_out_file);
10469 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10470 xops[1] = GEN_INT (PROBE_INTERVAL);
10471 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10473 /* Probe at TEST_ADDR. */
10474 xops[0] = stack_pointer_rtx;
10475 xops[1] = reg;
10476 xops[2] = const0_rtx;
10477 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10479 fprintf (asm_out_file, "\tjmp\t");
10480 assemble_name_raw (asm_out_file, loop_lab);
10481 fputc ('\n', asm_out_file);
10483 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10485 return "";
10488 /* Finalize the stack_realign_needed flag, which guides prologue/epilogue
10489 generation so that it is emitted in the correct form. */
10490 static void
10491 ix86_finalize_stack_realign_flags (void)
10493 /* Check if stack realignment is really needed after reload, and
10494 store the result in cfun. */
10495 unsigned int incoming_stack_boundary
10496 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10497 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10498 unsigned int stack_realign = (incoming_stack_boundary
10499 < (crtl->is_leaf
10500 ? crtl->max_used_stack_slot_alignment
10501 : crtl->stack_alignment_needed));
10503 if (crtl->stack_realign_finalized)
10505 /* After stack_realign_needed is finalized, we can no longer
10506 change it. */
10507 gcc_assert (crtl->stack_realign_needed == stack_realign);
10508 return;
10511 /* If the only reason for frame_pointer_needed is that we conservatively
10512 assumed stack realignment might be needed, but in the end nothing that
10513 needed the stack alignment had been spilled, clear frame_pointer_needed
10514 and say we don't need stack realignment. */
10515 if (stack_realign
10516 && !crtl->need_drap
10517 && frame_pointer_needed
10518 && crtl->is_leaf
10519 && flag_omit_frame_pointer
10520 && crtl->sp_is_unchanging
10521 && !ix86_current_function_calls_tls_descriptor
10522 && !crtl->accesses_prior_frames
10523 && !cfun->calls_alloca
10524 && !crtl->calls_eh_return
10525 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10526 && !ix86_frame_pointer_required ()
10527 && get_frame_size () == 0
10528 && ix86_nsaved_sseregs () == 0
10529 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10531 HARD_REG_SET set_up_by_prologue, prologue_used;
10532 basic_block bb;
10534 CLEAR_HARD_REG_SET (prologue_used);
10535 CLEAR_HARD_REG_SET (set_up_by_prologue);
10536 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10537 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10538 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10539 HARD_FRAME_POINTER_REGNUM);
10540 FOR_EACH_BB (bb)
10542 rtx insn;
10543 FOR_BB_INSNS (bb, insn)
10544 if (NONDEBUG_INSN_P (insn)
10545 && requires_stack_frame_p (insn, prologue_used,
10546 set_up_by_prologue))
10548 crtl->stack_realign_needed = stack_realign;
10549 crtl->stack_realign_finalized = true;
10550 return;
10554 frame_pointer_needed = false;
10555 stack_realign = false;
10556 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10557 crtl->stack_alignment_needed = incoming_stack_boundary;
10558 crtl->stack_alignment_estimated = incoming_stack_boundary;
10559 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10560 crtl->preferred_stack_boundary = incoming_stack_boundary;
10561 df_finish_pass (true);
10562 df_scan_alloc (NULL);
10563 df_scan_blocks ();
10564 df_compute_regs_ever_live (true);
10565 df_analyze ();
10568 crtl->stack_realign_needed = stack_realign;
10569 crtl->stack_realign_finalized = true;
10572 /* Expand the prologue into a bunch of separate insns. */
10574 void
10575 ix86_expand_prologue (void)
10577 struct machine_function *m = cfun->machine;
10578 rtx insn, t;
10579 bool pic_reg_used;
10580 struct ix86_frame frame;
10581 HOST_WIDE_INT allocate;
10582 bool int_registers_saved;
10583 bool sse_registers_saved;
10585 ix86_finalize_stack_realign_flags ();
10587 /* DRAP should not coexist with stack_realign_fp */
10588 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10590 memset (&m->fs, 0, sizeof (m->fs));
10592 /* Initialize CFA state for before the prologue. */
10593 m->fs.cfa_reg = stack_pointer_rtx;
10594 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10596 /* Track SP offset to the CFA. We continue tracking this after we've
10597 swapped the CFA register away from SP. In the case of re-alignment
10598 this is fudged; we're interested in offsets within the local frame. */
10599 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10600 m->fs.sp_valid = true;
10602 ix86_compute_frame_layout (&frame);
10604 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10606 /* We should have already generated an error for any use of
10607 ms_hook on a nested function. */
10608 gcc_checking_assert (!ix86_static_chain_on_stack);
10610 /* Check if profiling is active and we shall use the profile-before-prologue
10611 variant. If so, report sorry. */
10612 if (crtl->profile && flag_fentry != 0)
10613 sorry ("ms_hook_prologue attribute isn%'t compatible "
10614 "with -mfentry for 32-bit");
10616 /* In ix86_asm_output_function_label we emitted:
10617 8b ff movl.s %edi,%edi
10618 55 push %ebp
10619 8b ec movl.s %esp,%ebp
10621 This matches the hookable function prologue in Win32 API
10622 functions in Microsoft Windows XP Service Pack 2 and newer.
10623 Wine uses this to enable Windows apps to hook the Win32 API
10624 functions provided by Wine.
10626 What that means is that we've already set up the frame pointer. */
10628 if (frame_pointer_needed
10629 && !(crtl->drap_reg && crtl->stack_realign_needed))
10631 rtx push, mov;
10633 /* We've decided to use the frame pointer already set up.
10634 Describe this to the unwinder by pretending that both
10635 push and mov insns happen right here.
10637 Putting the unwind info here at the end of the ms_hook
10638 is done so that we can make absolutely certain we get
10639 the required byte sequence at the start of the function,
10640 rather than relying on an assembler that can produce
10641 the exact encoding required.
10643 However it does mean (in the unpatched case) that we have
10644 a 1 insn window where the asynchronous unwind info is
10645 incorrect. However, if we placed the unwind info at
10646 its correct location we would have incorrect unwind info
10647 in the patched case. Which is probably all moot since
10648 I don't expect Wine generates dwarf2 unwind info for the
10649 system libraries that use this feature. */
10651 insn = emit_insn (gen_blockage ());
10653 push = gen_push (hard_frame_pointer_rtx);
10654 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10655 stack_pointer_rtx);
10656 RTX_FRAME_RELATED_P (push) = 1;
10657 RTX_FRAME_RELATED_P (mov) = 1;
10659 RTX_FRAME_RELATED_P (insn) = 1;
10660 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10661 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10663 /* Note that gen_push incremented m->fs.cfa_offset, even
10664 though we didn't emit the push insn here. */
10665 m->fs.cfa_reg = hard_frame_pointer_rtx;
10666 m->fs.fp_offset = m->fs.cfa_offset;
10667 m->fs.fp_valid = true;
10669 else
10671 /* The frame pointer is not needed so pop %ebp again.
10672 This leaves us with a pristine state. */
10673 emit_insn (gen_pop (hard_frame_pointer_rtx));
10677 /* The first insn of a function that accepts its static chain on the
10678 stack is to push the register that would be filled in by a direct
10679 call. This insn will be skipped by the trampoline. */
10680 else if (ix86_static_chain_on_stack)
10682 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10683 emit_insn (gen_blockage ());
10685 /* We don't want to interpret this push insn as a register save,
10686 only as a stack adjustment. The real copy of the register as
10687 a save will be done later, if needed. */
10688 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10689 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10690 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10691 RTX_FRAME_RELATED_P (insn) = 1;
10694 /* Emit prologue code to adjust stack alignment and set up the DRAP, in case
10695 the DRAP is needed and stack realignment is really needed after reload. */
10696 if (stack_realign_drap)
10698 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10700 /* Only need to push parameter pointer reg if it is caller saved. */
10701 if (!call_used_regs[REGNO (crtl->drap_reg)])
10703 /* Push arg pointer reg */
10704 insn = emit_insn (gen_push (crtl->drap_reg));
10705 RTX_FRAME_RELATED_P (insn) = 1;
10708 /* Grab the argument pointer. */
10709 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10710 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10711 RTX_FRAME_RELATED_P (insn) = 1;
10712 m->fs.cfa_reg = crtl->drap_reg;
10713 m->fs.cfa_offset = 0;
10715 /* Align the stack. */
10716 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10717 stack_pointer_rtx,
10718 GEN_INT (-align_bytes)));
10719 RTX_FRAME_RELATED_P (insn) = 1;
10721 /* Replicate the return address on the stack so that return
10722 address can be reached via (argp - 1) slot. This is needed
10723 to implement macro RETURN_ADDR_RTX and intrinsic function
10724 expand_builtin_return_addr etc. */
10725 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10726 t = gen_frame_mem (word_mode, t);
10727 insn = emit_insn (gen_push (t));
10728 RTX_FRAME_RELATED_P (insn) = 1;
10730 /* For the purposes of frame and register save area addressing,
10731 we've started over with a new frame. */
10732 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10733 m->fs.realigned = true;
10736 int_registers_saved = (frame.nregs == 0);
10737 sse_registers_saved = (frame.nsseregs == 0);
10739 if (frame_pointer_needed && !m->fs.fp_valid)
10741 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10742 slower on all targets. Also sdb doesn't like it. */
10743 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10744 RTX_FRAME_RELATED_P (insn) = 1;
10746 /* Push registers now, before setting the frame pointer
10747 on SEH target. */
10748 if (!int_registers_saved
10749 && TARGET_SEH
10750 && !frame.save_regs_using_mov)
10752 ix86_emit_save_regs ();
10753 int_registers_saved = true;
10754 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10757 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10759 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10760 RTX_FRAME_RELATED_P (insn) = 1;
10762 if (m->fs.cfa_reg == stack_pointer_rtx)
10763 m->fs.cfa_reg = hard_frame_pointer_rtx;
10764 m->fs.fp_offset = m->fs.sp_offset;
10765 m->fs.fp_valid = true;
10769 if (!int_registers_saved)
10771 /* If saving registers via PUSH, do so now. */
10772 if (!frame.save_regs_using_mov)
10774 ix86_emit_save_regs ();
10775 int_registers_saved = true;
10776 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10779 /* When using the red zone we may start register saving before allocating
10780 the stack frame, saving one cycle of the prologue. However, avoid
10781 doing this if we have to probe the stack; at least on x86_64 the
10782 stack probe can turn into a call that clobbers a red zone location. */
10783 else if (ix86_using_red_zone ()
10784 && (! TARGET_STACK_PROBE
10785 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10787 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10788 int_registers_saved = true;
10792 if (stack_realign_fp)
10794 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10795 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10797 /* The computation of the size of the re-aligned stack frame means
10798 that we must allocate the size of the register save area before
10799 performing the actual alignment. Otherwise we cannot guarantee
10800 that there's enough storage above the realignment point. */
10801 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10802 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10803 GEN_INT (m->fs.sp_offset
10804 - frame.sse_reg_save_offset),
10805 -1, false);
10807 /* Align the stack. */
10808 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10809 stack_pointer_rtx,
10810 GEN_INT (-align_bytes)));
10812 /* For the purposes of register save area addressing, the stack
10813 pointer is no longer valid. As for the value of sp_offset,
10814 see ix86_compute_frame_layout, which we need to match in order
10815 to pass verification of stack_pointer_offset at the end. */
10816 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10817 m->fs.sp_valid = false;
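/* Whatever lies between the current sp_offset and the final
stack_pointer_offset still has to be allocated below. */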
10820 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10822 if (flag_stack_usage_info)
10824 /* We start to count from ARG_POINTER. */
10825 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10827 /* If it was realigned, take into account the fake frame. */
10828 if (stack_realign_drap)
10830 if (ix86_static_chain_on_stack)
10831 stack_size += UNITS_PER_WORD;
10833 if (!call_used_regs[REGNO (crtl->drap_reg)])
10834 stack_size += UNITS_PER_WORD;
10836 /* This over-estimates by 1 minimal-stack-alignment-unit but
10837 mitigates that by counting in the new return address slot. */
10838 current_function_dynamic_stack_size
10839 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10842 current_function_static_stack_size = stack_size;
10845 /* On SEH target with very large frame size, allocate an area to save
10846 SSE registers (as the very large allocation won't be described). */
10847 if (TARGET_SEH
10848 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10849 && !sse_registers_saved)
10851 HOST_WIDE_INT sse_size =
10852 frame.sse_reg_save_offset - frame.reg_save_offset;
10854 gcc_assert (int_registers_saved);
10856 /* No need to do stack checking as the area will be immediately
10857 written. */
10858 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10859 GEN_INT (-sse_size), -1,
10860 m->fs.cfa_reg == stack_pointer_rtx);
10861 allocate -= sse_size;
10862 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10863 sse_registers_saved = true;
10866 /* The stack has already been decremented by the instruction calling us
10867 so probe if the size is non-negative to preserve the protection area. */
10868 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10870 /* We expect the registers to be saved when probes are used. */
10871 gcc_assert (int_registers_saved);
10873 if (STACK_CHECK_MOVING_SP)
10875 ix86_adjust_stack_and_probe (allocate);
10876 allocate = 0;
10878 else
10880 HOST_WIDE_INT size = allocate;
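/* On 64-bit targets, clamp the size handed to the probing code, presumably
so that the emitted probe offsets still fit in a sign-extended 32-bit
immediate. */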
10882 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10883 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
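/* With TARGET_STACK_PROBE the probed range starts right at the stack
pointer and also covers the protection area; otherwise the first
STACK_CHECK_PROTECT bytes are taken as already accessible and probing
starts below them. */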
10885 if (TARGET_STACK_PROBE)
10886 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10887 else
10888 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10892 if (allocate == 0)
10894 else if (!ix86_target_stack_probe ()
10895 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10897 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10898 GEN_INT (-allocate), -1,
10899 m->fs.cfa_reg == stack_pointer_rtx);
10901 else
10903 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10904 rtx r10 = NULL;
10905 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10906 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10907 bool eax_live = false;
10908 bool r10_live = false;
10910 if (TARGET_64BIT)
10911 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10912 if (!TARGET_64BIT_MS_ABI)
10913 eax_live = ix86_eax_live_at_start_p ();
10915 /* Note that SEH directives need to continue tracking the stack
10916 pointer even after the frame pointer has been set up. */
10917 if (eax_live)
10919 insn = emit_insn (gen_push (eax));
10920 allocate -= UNITS_PER_WORD;
10921 if (sp_is_cfa_reg || TARGET_SEH)
10923 if (sp_is_cfa_reg)
10924 m->fs.cfa_offset += UNITS_PER_WORD;
10925 RTX_FRAME_RELATED_P (insn) = 1;
10929 if (r10_live)
10931 r10 = gen_rtx_REG (Pmode, R10_REG);
10932 insn = emit_insn (gen_push (r10));
10933 allocate -= UNITS_PER_WORD;
10934 if (sp_is_cfa_reg || TARGET_SEH)
10936 if (sp_is_cfa_reg)
10937 m->fs.cfa_offset += UNITS_PER_WORD;
10938 RTX_FRAME_RELATED_P (insn) = 1;
10942 emit_move_insn (eax, GEN_INT (allocate));
10943 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10945 /* Use the fact that AX still contains ALLOCATE. */
10946 adjust_stack_insn = (Pmode == DImode
10947 ? gen_pro_epilogue_adjust_stack_di_sub
10948 : gen_pro_epilogue_adjust_stack_si_sub);
10950 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10951 stack_pointer_rtx, eax));
10953 if (sp_is_cfa_reg || TARGET_SEH)
10955 if (sp_is_cfa_reg)
10956 m->fs.cfa_offset += allocate;
10957 RTX_FRAME_RELATED_P (insn) = 1;
10958 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10959 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10960 plus_constant (Pmode, stack_pointer_rtx,
10961 -allocate)));
10963 m->fs.sp_offset += allocate;
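/* Reload the registers clobbered above by the allocation sequence; their
save slots sit immediately above the newly allocated area. */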
10965 if (r10_live && eax_live)
10967 t = choose_baseaddr (m->fs.sp_offset - allocate);
10968 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10969 gen_frame_mem (word_mode, t));
10970 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10971 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10972 gen_frame_mem (word_mode, t));
10974 else if (eax_live || r10_live)
10976 t = choose_baseaddr (m->fs.sp_offset - allocate);
10977 emit_move_insn (gen_rtx_REG (word_mode,
10978 (eax_live ? AX_REG : R10_REG)),
10979 gen_frame_mem (word_mode, t));
10982 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10984 /* If we haven't already set up the frame pointer, do so now. */
10985 if (frame_pointer_needed && !m->fs.fp_valid)
10987 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10988 GEN_INT (frame.stack_pointer_offset
10989 - frame.hard_frame_pointer_offset));
10990 insn = emit_insn (insn);
10991 RTX_FRAME_RELATED_P (insn) = 1;
10992 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10994 if (m->fs.cfa_reg == stack_pointer_rtx)
10995 m->fs.cfa_reg = hard_frame_pointer_rtx;
10996 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10997 m->fs.fp_valid = true;
11000 if (!int_registers_saved)
11001 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
11002 if (!sse_registers_saved)
11003 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
11005 pic_reg_used = false;
11006 /* We don't use the PIC register for the pe-coff target. */
11007 if (pic_offset_table_rtx
11008 && !TARGET_PECOFF
11009 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11010 || crtl->profile))
11012 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
11014 if (alt_pic_reg_used != INVALID_REGNUM)
11015 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
11017 pic_reg_used = true;
11020 if (pic_reg_used)
11022 if (TARGET_64BIT)
11024 if (ix86_cmodel == CM_LARGE_PIC)
11026 rtx label, tmp_reg;
11028 gcc_assert (Pmode == DImode);
11029 label = gen_label_rtx ();
11030 emit_label (label);
11031 LABEL_PRESERVE_P (label) = 1;
11032 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
11033 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
11034 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
11035 label));
11036 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
11037 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
11038 pic_offset_table_rtx, tmp_reg));
11040 else
11041 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
11043 else
11045 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
11046 RTX_FRAME_RELATED_P (insn) = 1;
11047 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
11051 /* In the pic_reg_used case, make sure that the got load isn't deleted
11052 when mcount needs it. Blockage to avoid call movement across mcount
11053 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
11054 note. */
11055 if (crtl->profile && !flag_fentry && pic_reg_used)
11056 emit_insn (gen_prologue_use (pic_offset_table_rtx));
11058 if (crtl->drap_reg && !crtl->stack_realign_needed)
11060 /* The vDRAP is set up, but after reload it turns out that stack realignment
11061 isn't necessary; here we emit the prologue to set up the DRAP
11062 without the stack realignment adjustment. */
11063 t = choose_baseaddr (0);
11064 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
11067 /* Prevent instructions from being scheduled into register save push
11068 sequence when access to the redzone area is done through frame pointer.
11069 The offset between the frame pointer and the stack pointer is calculated
11070 relative to the value of the stack pointer at the end of the function
11071 prologue, and moving instructions that access redzone area via frame
11072 pointer inside push sequence violates this assumption. */
11073 if (frame_pointer_needed && frame.red_zone_size)
11074 emit_insn (gen_memory_blockage ());
11076 /* Emit cld instruction if stringops are used in the function. */
11077 if (TARGET_CLD && ix86_current_function_needs_cld)
11078 emit_insn (gen_cld ());
11080 /* SEH requires that the prologue end within 256 bytes of the start of
11081 the function. Prevent instruction schedules that would extend that.
11082 Further, prevent alloca modifications to the stack pointer from being
11083 combined with prologue modifications. */
11084 if (TARGET_SEH)
11085 emit_insn (gen_prologue_use (stack_pointer_rtx));
11088 /* Emit code to restore REG using a POP insn. */
11090 static void
11091 ix86_emit_restore_reg_using_pop (rtx reg)
11093 struct machine_function *m = cfun->machine;
11094 rtx insn = emit_insn (gen_pop (reg));
11096 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
11097 m->fs.sp_offset -= UNITS_PER_WORD;
11099 if (m->fs.cfa_reg == crtl->drap_reg
11100 && REGNO (reg) == REGNO (crtl->drap_reg))
11102 /* Previously we'd represented the CFA as an expression
11103 like *(%ebp - 8). We've just popped that value from
11104 the stack, which means we need to reset the CFA to
11105 the drap register. This will remain until we restore
11106 the stack pointer. */
11107 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11108 RTX_FRAME_RELATED_P (insn) = 1;
11110 /* This means that the DRAP register is valid for addressing too. */
11111 m->fs.drap_valid = true;
11112 return;
11115 if (m->fs.cfa_reg == stack_pointer_rtx)
11117 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11118 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11119 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11120 RTX_FRAME_RELATED_P (insn) = 1;
11122 m->fs.cfa_offset -= UNITS_PER_WORD;
11125 /* When the frame pointer is the CFA, and we pop it, we are
11126 swapping back to the stack pointer as the CFA. This happens
11127 for stack frames that don't allocate other data, so we assume
11128 the stack pointer is now pointing at the return address, i.e.
11129 the function entry state, which makes the offset be 1 word. */
11130 if (reg == hard_frame_pointer_rtx)
11132 m->fs.fp_valid = false;
11133 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11135 m->fs.cfa_reg = stack_pointer_rtx;
11136 m->fs.cfa_offset -= UNITS_PER_WORD;
11138 add_reg_note (insn, REG_CFA_DEF_CFA,
11139 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11140 GEN_INT (m->fs.cfa_offset)));
11141 RTX_FRAME_RELATED_P (insn) = 1;
11146 /* Emit code to restore saved registers using POP insns. */
11148 static void
11149 ix86_emit_restore_regs_using_pop (void)
11151 unsigned int regno;
11153 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11154 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
11155 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
11158 /* Emit code and notes for the LEAVE instruction. */
11160 static void
11161 ix86_emit_leave (void)
11163 struct machine_function *m = cfun->machine;
11164 rtx insn = emit_insn (ix86_gen_leave ());
11166 ix86_add_queued_cfa_restore_notes (insn);
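/* leave restores the stack pointer from the frame pointer and then pops the
saved frame pointer, so the stack pointer becomes valid again, one word
closer to the CFA than the frame pointer was. */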
11168 gcc_assert (m->fs.fp_valid);
11169 m->fs.sp_valid = true;
11170 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
11171 m->fs.fp_valid = false;
11173 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11175 m->fs.cfa_reg = stack_pointer_rtx;
11176 m->fs.cfa_offset = m->fs.sp_offset;
11178 add_reg_note (insn, REG_CFA_DEF_CFA,
11179 plus_constant (Pmode, stack_pointer_rtx,
11180 m->fs.sp_offset));
11181 RTX_FRAME_RELATED_P (insn) = 1;
11183 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
11184 m->fs.fp_offset);
11187 /* Emit code to restore saved registers using MOV insns.
11188 First register is restored from CFA - CFA_OFFSET. */
11189 static void
11190 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
11191 bool maybe_eh_return)
11193 struct machine_function *m = cfun->machine;
11194 unsigned int regno;
11196 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11197 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11199 rtx reg = gen_rtx_REG (word_mode, regno);
11200 rtx insn, mem;
11202 mem = choose_baseaddr (cfa_offset);
11203 mem = gen_frame_mem (word_mode, mem);
11204 insn = emit_move_insn (reg, mem);
11206 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11208 /* Previously we'd represented the CFA as an expression
11209 like *(%ebp - 8). We've just popped that value from
11210 the stack, which means we need to reset the CFA to
11211 the drap register. This will remain until we restore
11212 the stack pointer. */
11213 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11214 RTX_FRAME_RELATED_P (insn) = 1;
11216 /* This means that the DRAP register is valid for addressing. */
11217 m->fs.drap_valid = true;
11219 else
11220 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11222 cfa_offset -= UNITS_PER_WORD;
11226 /* Emit code to restore saved registers using MOV insns.
11227 First register is restored from CFA - CFA_OFFSET. */
11228 static void
11229 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11230 bool maybe_eh_return)
11232 unsigned int regno;
11234 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11235 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11237 rtx reg = gen_rtx_REG (V4SFmode, regno);
11238 rtx mem;
11240 mem = choose_baseaddr (cfa_offset);
11241 mem = gen_rtx_MEM (V4SFmode, mem);
11242 set_mem_align (mem, 128);
11243 emit_move_insn (reg, mem);
11245 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11247 cfa_offset -= 16;
11251 /* Restore function stack, frame, and registers. */
11253 void
11254 ix86_expand_epilogue (int style)
11256 struct machine_function *m = cfun->machine;
11257 struct machine_frame_state frame_state_save = m->fs;
11258 struct ix86_frame frame;
11259 bool restore_regs_via_mov;
11260 bool using_drap;
11262 ix86_finalize_stack_realign_flags ();
11263 ix86_compute_frame_layout (&frame);
11265 m->fs.sp_valid = (!frame_pointer_needed
11266 || (crtl->sp_is_unchanging
11267 && !stack_realign_fp));
11268 gcc_assert (!m->fs.sp_valid
11269 || m->fs.sp_offset == frame.stack_pointer_offset);
11271 /* The FP must be valid if the frame pointer is present. */
11272 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11273 gcc_assert (!m->fs.fp_valid
11274 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11276 /* We must have *some* valid pointer to the stack frame. */
11277 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11279 /* The DRAP is never valid at this point. */
11280 gcc_assert (!m->fs.drap_valid);
11282 /* See the comment about red zone and frame
11283 pointer usage in ix86_expand_prologue. */
11284 if (frame_pointer_needed && frame.red_zone_size)
11285 emit_insn (gen_memory_blockage ());
11287 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11288 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11290 /* Determine the CFA offset of the end of the red-zone. */
11291 m->fs.red_zone_offset = 0;
11292 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11294 /* The red-zone begins below the return address. */
11295 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11297 /* When the register save area is in the aligned portion of
11298 the stack, determine the maximum runtime displacement that
11299 matches up with the aligned frame. */
11300 if (stack_realign_drap)
11301 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11302 + UNITS_PER_WORD);
11305 /* Special care must be taken for the normal return case of a function
11306 using eh_return: the eax and edx registers are marked as saved, but
11307 not restored along this path. Adjust the save location to match. */
11308 if (crtl->calls_eh_return && style != 2)
11309 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11311 /* EH_RETURN requires the use of moves to function properly. */
11312 if (crtl->calls_eh_return)
11313 restore_regs_via_mov = true;
11314 /* SEH requires the use of pops to identify the epilogue. */
11315 else if (TARGET_SEH)
11316 restore_regs_via_mov = false;
11317 /* If we're only restoring one register and sp is not valid, then
11318 use a move instruction to restore the register, since it's
11319 less work than reloading sp and popping the register. */
11320 else if (!m->fs.sp_valid && frame.nregs <= 1)
11321 restore_regs_via_mov = true;
11322 else if (TARGET_EPILOGUE_USING_MOVE
11323 && cfun->machine->use_fast_prologue_epilogue
11324 && (frame.nregs > 1
11325 || m->fs.sp_offset != frame.reg_save_offset))
11326 restore_regs_via_mov = true;
11327 else if (frame_pointer_needed
11328 && !frame.nregs
11329 && m->fs.sp_offset != frame.reg_save_offset)
11330 restore_regs_via_mov = true;
11331 else if (frame_pointer_needed
11332 && TARGET_USE_LEAVE
11333 && cfun->machine->use_fast_prologue_epilogue
11334 && frame.nregs == 1)
11335 restore_regs_via_mov = true;
11336 else
11337 restore_regs_via_mov = false;
11339 if (restore_regs_via_mov || frame.nsseregs)
11341 /* Ensure that the entire register save area is addressable via
11342 the stack pointer, if we will restore via sp. */
11343 if (TARGET_64BIT
11344 && m->fs.sp_offset > 0x7fffffff
11345 && !(m->fs.fp_valid || m->fs.drap_valid)
11346 && (frame.nsseregs + frame.nregs) != 0)
11348 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11349 GEN_INT (m->fs.sp_offset
11350 - frame.sse_reg_save_offset),
11351 style,
11352 m->fs.cfa_reg == stack_pointer_rtx);
11356 /* If there are any SSE registers to restore, then we have to do it
11357 via moves, since there's obviously no pop for SSE regs. */
11358 if (frame.nsseregs)
11359 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11360 style == 2);
11362 if (restore_regs_via_mov)
11364 rtx t;
11366 if (frame.nregs)
11367 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11369 /* eh_return epilogues need %ecx added to the stack pointer. */
11370 if (style == 2)
11372 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11374 /* Stack align doesn't work with eh_return. */
11375 gcc_assert (!stack_realign_drap);
11376 /* Neither do regparm nested functions. */
11377 gcc_assert (!ix86_static_chain_on_stack);
11379 if (frame_pointer_needed)
11381 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11382 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11383 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11385 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11386 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11388 /* Note that we use SA as a temporary CFA, as the return
11389 address is at the proper place relative to it. We
11390 pretend this happens at the FP restore insn because
11391 prior to this insn the FP would be stored at the wrong
11392 offset relative to SA, and after this insn we have no
11393 other reasonable register to use for the CFA. We don't
11394 bother resetting the CFA to the SP for the duration of
11395 the return insn. */
11396 add_reg_note (insn, REG_CFA_DEF_CFA,
11397 plus_constant (Pmode, sa, UNITS_PER_WORD));
11398 ix86_add_queued_cfa_restore_notes (insn);
11399 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11400 RTX_FRAME_RELATED_P (insn) = 1;
11402 m->fs.cfa_reg = sa;
11403 m->fs.cfa_offset = UNITS_PER_WORD;
11404 m->fs.fp_valid = false;
11406 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11407 const0_rtx, style, false);
11409 else
11411 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11412 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11413 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11414 ix86_add_queued_cfa_restore_notes (insn);
11416 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11417 if (m->fs.cfa_offset != UNITS_PER_WORD)
11419 m->fs.cfa_offset = UNITS_PER_WORD;
11420 add_reg_note (insn, REG_CFA_DEF_CFA,
11421 plus_constant (Pmode, stack_pointer_rtx,
11422 UNITS_PER_WORD));
11423 RTX_FRAME_RELATED_P (insn) = 1;
11426 m->fs.sp_offset = UNITS_PER_WORD;
11427 m->fs.sp_valid = true;
11430 else
11432 /* SEH requires that the function end with (1) a stack adjustment
11433 if necessary, (2) a sequence of pops, and (3) a return or
11434 jump instruction. Prevent insns from the function body from
11435 being scheduled into this sequence. */
11436 if (TARGET_SEH)
11438 /* Prevent a catch region from being adjacent to the standard
11439 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11440 several other flags that would be interesting to test are
11441 set up yet. */
11442 if (flag_non_call_exceptions)
11443 emit_insn (gen_nops (const1_rtx));
11444 else
11445 emit_insn (gen_blockage ());
11448 /* The first step is to deallocate the stack frame so that we can
11449 pop the registers. Also do it on SEH targets for a very large
11450 frame, as the emitted instructions aren't allowed by the ABI in
11451 epilogues. */
11452 if (!m->fs.sp_valid
11453 || (TARGET_SEH
11454 && (m->fs.sp_offset - frame.reg_save_offset
11455 >= SEH_MAX_FRAME_SIZE)))
11457 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11458 GEN_INT (m->fs.fp_offset
11459 - frame.reg_save_offset),
11460 style, false);
11462 else if (m->fs.sp_offset != frame.reg_save_offset)
11464 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11465 GEN_INT (m->fs.sp_offset
11466 - frame.reg_save_offset),
11467 style,
11468 m->fs.cfa_reg == stack_pointer_rtx);
11471 ix86_emit_restore_regs_using_pop ();
11474 /* If we used a frame pointer and haven't already got rid of it,
11475 then do so now. */
11476 if (m->fs.fp_valid)
11478 /* If the stack pointer is valid and pointing at the frame
11479 pointer store address, then we only need a pop. */
11480 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11481 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11482 /* Leave results in shorter dependency chains on CPUs that are
11483 able to grok it fast. */
11484 else if (TARGET_USE_LEAVE
11485 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11486 || !cfun->machine->use_fast_prologue_epilogue)
11487 ix86_emit_leave ();
11488 else
11490 pro_epilogue_adjust_stack (stack_pointer_rtx,
11491 hard_frame_pointer_rtx,
11492 const0_rtx, style, !using_drap);
11493 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11497 if (using_drap)
11499 int param_ptr_offset = UNITS_PER_WORD;
11500 rtx insn;
11502 gcc_assert (stack_realign_drap);
11504 if (ix86_static_chain_on_stack)
11505 param_ptr_offset += UNITS_PER_WORD;
11506 if (!call_used_regs[REGNO (crtl->drap_reg)])
11507 param_ptr_offset += UNITS_PER_WORD;
11509 insn = emit_insn (gen_rtx_SET
11510 (VOIDmode, stack_pointer_rtx,
11511 gen_rtx_PLUS (Pmode,
11512 crtl->drap_reg,
11513 GEN_INT (-param_ptr_offset))));
11514 m->fs.cfa_reg = stack_pointer_rtx;
11515 m->fs.cfa_offset = param_ptr_offset;
11516 m->fs.sp_offset = param_ptr_offset;
11517 m->fs.realigned = false;
11519 add_reg_note (insn, REG_CFA_DEF_CFA,
11520 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11521 GEN_INT (param_ptr_offset)));
11522 RTX_FRAME_RELATED_P (insn) = 1;
11524 if (!call_used_regs[REGNO (crtl->drap_reg)])
11525 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11528 /* At this point the stack pointer must be valid, and we must have
11529 restored all of the registers. We may not have deallocated the
11530 entire stack frame. We've delayed this until now because it may
11531 be possible to merge the local stack deallocation with the
11532 deallocation forced by ix86_static_chain_on_stack. */
11533 gcc_assert (m->fs.sp_valid);
11534 gcc_assert (!m->fs.fp_valid);
11535 gcc_assert (!m->fs.realigned);
11536 if (m->fs.sp_offset != UNITS_PER_WORD)
11538 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11539 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11540 style, true);
11542 else
11543 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11545 /* Sibcall epilogues don't want a return instruction. */
11546 if (style == 0)
11548 m->fs = frame_state_save;
11549 return;
11552 if (crtl->args.pops_args && crtl->args.size)
11554 rtx popc = GEN_INT (crtl->args.pops_args);
11556 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11557 address, do an explicit add, and jump indirectly to the caller. */
11559 if (crtl->args.pops_args >= 65536)
11561 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11562 rtx insn;
11564 /* There is no "pascal" calling convention in any 64bit ABI. */
11565 gcc_assert (!TARGET_64BIT);
11567 insn = emit_insn (gen_pop (ecx));
11568 m->fs.cfa_offset -= UNITS_PER_WORD;
11569 m->fs.sp_offset -= UNITS_PER_WORD;
11571 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11572 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11573 add_reg_note (insn, REG_CFA_REGISTER,
11574 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11575 RTX_FRAME_RELATED_P (insn) = 1;
11577 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11578 popc, -1, true);
11579 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11581 else
11582 emit_jump_insn (gen_simple_return_pop_internal (popc));
11584 else
11585 emit_jump_insn (gen_simple_return_internal ());
11587 /* Restore the state back to the state from the prologue,
11588 so that it's correct for the next epilogue. */
11589 m->fs = frame_state_save;
11592 /* Reset from the function's potential modifications. */
11594 static void
11595 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11596 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11598 if (pic_offset_table_rtx)
11599 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11600 #if TARGET_MACHO
11601 /* Mach-O doesn't support labels at the end of objects, so if
11602 it looks like we might want one, insert a NOP. */
11604 rtx insn = get_last_insn ();
11605 rtx deleted_debug_label = NULL_RTX;
11606 while (insn
11607 && NOTE_P (insn)
11608 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11610 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11611 notes; instead only set their CODE_LABEL_NUMBER to -1,
11612 otherwise there would be code generation differences
11613 between -g and -g0. */
11614 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11615 deleted_debug_label = insn;
11616 insn = PREV_INSN (insn);
11618 if (insn
11619 && (LABEL_P (insn)
11620 || (NOTE_P (insn)
11621 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11622 fputs ("\tnop\n", file);
11623 else if (deleted_debug_label)
11624 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11625 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11626 CODE_LABEL_NUMBER (insn) = -1;
11628 #endif
11632 /* Return a scratch register to use in the split stack prologue. The
11633 split stack prologue is used for -fsplit-stack. It is the first
11634 instructions in the function, even before the regular prologue.
11635 The scratch register can be any caller-saved register which is not
11636 used for parameters or for the static chain. */
11638 static unsigned int
11639 split_stack_prologue_scratch_regno (void)
11641 if (TARGET_64BIT)
11642 return R11_REG;
11643 else
11645 bool is_fastcall, is_thiscall;
11646 int regparm;
11648 is_fastcall = (lookup_attribute ("fastcall",
11649 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11650 != NULL);
11651 is_thiscall = (lookup_attribute ("thiscall",
11652 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11653 != NULL);
11654 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11656 if (is_fastcall)
11658 if (DECL_STATIC_CHAIN (cfun->decl))
11660 sorry ("-fsplit-stack does not support fastcall with "
11661 "nested function");
11662 return INVALID_REGNUM;
11664 return AX_REG;
11666 else if (is_thiscall)
11668 if (!DECL_STATIC_CHAIN (cfun->decl))
11669 return DX_REG;
11670 return AX_REG;
11672 else if (regparm < 3)
11674 if (!DECL_STATIC_CHAIN (cfun->decl))
11675 return CX_REG;
11676 else
11678 if (regparm >= 2)
11680 sorry ("-fsplit-stack does not support 2 register "
11681 "parameters for a nested function");
11682 return INVALID_REGNUM;
11684 return DX_REG;
11687 else
11689 /* FIXME: We could make this work by pushing a register
11690 around the addition and comparison. */
11691 sorry ("-fsplit-stack does not support 3 register parameters");
11692 return INVALID_REGNUM;
11697 /* A SYMBOL_REF for the function which allocates new stack space for
11698 -fsplit-stack. */
11700 static GTY(()) rtx split_stack_fn;
11702 /* A SYMBOL_REF for the more stack function when using the large
11703 model. */
11705 static GTY(()) rtx split_stack_fn_large;
11707 /* Handle -fsplit-stack. These are the first instructions in the
11708 function, even before the regular prologue. */
11710 void
11711 ix86_expand_split_stack_prologue (void)
11713 struct ix86_frame frame;
11714 HOST_WIDE_INT allocate;
11715 unsigned HOST_WIDE_INT args_size;
11716 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11717 rtx scratch_reg = NULL_RTX;
11718 rtx varargs_label = NULL_RTX;
11719 rtx fn;
11721 gcc_assert (flag_split_stack && reload_completed);
11723 ix86_finalize_stack_realign_flags ();
11724 ix86_compute_frame_layout (&frame);
11725 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11727 /* This is the label we will branch to if we have enough stack
11728 space. We expect the basic block reordering pass to reverse this
11729 branch if optimizing, so that we branch in the unlikely case. */
11730 label = gen_label_rtx ();
11732 /* We need to compare the stack pointer minus the frame size with
11733 the stack boundary in the TCB. The stack boundary always gives
11734 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11735 can compare directly. Otherwise we need to do an addition. */
11737 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11738 UNSPEC_STACK_CHECK);
11739 limit = gen_rtx_CONST (Pmode, limit);
11740 limit = gen_rtx_MEM (Pmode, limit);
11741 if (allocate < SPLIT_STACK_AVAILABLE)
11742 current = stack_pointer_rtx;
11743 else
11745 unsigned int scratch_regno;
11746 rtx offset;
11748 /* We need a scratch register to hold the stack pointer minus
11749 the required frame size. Since this is the very start of the
11750 function, the scratch register can be any caller-saved
11751 register which is not used for parameters. */
11752 offset = GEN_INT (- allocate);
11753 scratch_regno = split_stack_prologue_scratch_regno ();
11754 if (scratch_regno == INVALID_REGNUM)
11755 return;
11756 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11757 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11759 /* We don't use ix86_gen_add3 in this case because it will
11760 want to split to lea, but when not optimizing the insn
11761 will not be split after this point. */
11762 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11763 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11764 offset)));
11766 else
11768 emit_move_insn (scratch_reg, offset);
11769 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11770 stack_pointer_rtx));
11772 current = scratch_reg;
11775 ix86_expand_branch (GEU, current, limit, label);
11776 jump_insn = get_last_insn ();
11777 JUMP_LABEL (jump_insn) = label;
11779 /* Mark the jump as very likely to be taken. */
11780 add_reg_note (jump_insn, REG_BR_PROB,
11781 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11783 if (split_stack_fn == NULL_RTX)
11784 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11785 fn = split_stack_fn;
11787 /* Get more stack space. We pass in the desired stack space and the
11788 size of the arguments to copy to the new stack. In 32-bit mode
11789 we push the parameters; __morestack will return on a new stack
11790 anyhow. In 64-bit mode we pass the parameters in r10 and
11791 r11. */
11792 allocate_rtx = GEN_INT (allocate);
11793 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11794 call_fusage = NULL_RTX;
11795 if (TARGET_64BIT)
11797 rtx reg10, reg11;
11799 reg10 = gen_rtx_REG (Pmode, R10_REG);
11800 reg11 = gen_rtx_REG (Pmode, R11_REG);
11802 /* If this function uses a static chain, it will be in %r10.
11803 Preserve it across the call to __morestack. */
11804 if (DECL_STATIC_CHAIN (cfun->decl))
11806 rtx rax;
11808 rax = gen_rtx_REG (word_mode, AX_REG);
11809 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11810 use_reg (&call_fusage, rax);
11813 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11814 && !TARGET_PECOFF)
11816 HOST_WIDE_INT argval;
11818 gcc_assert (Pmode == DImode);
11819 /* When using the large model we need to load the address
11820 into a register, and we've run out of registers. So we
11821 switch to a different calling convention, and we call a
11822 different function: __morestack_large. We pass the
11823 argument size in the upper 32 bits of r10 and pass the
11824 frame size in the lower 32 bits. */
11825 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11826 gcc_assert ((args_size & 0xffffffff) == args_size);
11828 if (split_stack_fn_large == NULL_RTX)
11829 split_stack_fn_large =
11830 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11832 if (ix86_cmodel == CM_LARGE_PIC)
11834 rtx label, x;
11836 label = gen_label_rtx ();
11837 emit_label (label);
11838 LABEL_PRESERVE_P (label) = 1;
11839 emit_insn (gen_set_rip_rex64 (reg10, label));
11840 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11841 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11842 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11843 UNSPEC_GOT);
11844 x = gen_rtx_CONST (Pmode, x);
11845 emit_move_insn (reg11, x);
11846 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11847 x = gen_const_mem (Pmode, x);
11848 emit_move_insn (reg11, x);
11850 else
11851 emit_move_insn (reg11, split_stack_fn_large);
11853 fn = reg11;
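/* Pack the argument size and the frame size into r10 as described above;
the 32-bit shift is split into two 16-bit shifts, presumably to stay
well defined on hosts where HOST_WIDE_INT is only 32 bits wide. */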
11855 argval = ((args_size << 16) << 16) + allocate;
11856 emit_move_insn (reg10, GEN_INT (argval));
11858 else
11860 emit_move_insn (reg10, allocate_rtx);
11861 emit_move_insn (reg11, GEN_INT (args_size));
11862 use_reg (&call_fusage, reg11);
11865 use_reg (&call_fusage, reg10);
11867 else
11869 emit_insn (gen_push (GEN_INT (args_size)));
11870 emit_insn (gen_push (allocate_rtx));
11872 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11873 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11874 NULL_RTX, false);
11875 add_function_usage_to (call_insn, call_fusage);
11877 /* In order to make call/return prediction work right, we now need
11878 to execute a return instruction. See
11879 libgcc/config/i386/morestack.S for the details on how this works.
11881 For flow purposes gcc must not see this as a return
11882 instruction--we need control flow to continue at the subsequent
11883 label. Therefore, we use an unspec. */
11884 gcc_assert (crtl->args.pops_args < 65536);
11885 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11887 /* If we are in 64-bit mode and this function uses a static chain,
11888 we saved %r10 in %rax before calling __morestack. */
11889 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11890 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11891 gen_rtx_REG (word_mode, AX_REG));
11893 /* If this function calls va_start, we need to store a pointer to
11894 the arguments on the old stack, because they may not have been
11895 all copied to the new stack. At this point the old stack can be
11896 found at the frame pointer value used by __morestack, because
11897 __morestack has set that up before calling back to us. Here we
11898 store that pointer in a scratch register, and in
11899 ix86_expand_prologue we store the scratch register in a stack
11900 slot. */
11901 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11903 unsigned int scratch_regno;
11904 rtx frame_reg;
11905 int words;
11907 scratch_regno = split_stack_prologue_scratch_regno ();
11908 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11909 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11911 /* 64-bit:
11912 fp -> old fp value
11913 return address within this function
11914 return address of caller of this function
11915 stack arguments
11916 So we add three words to get to the stack arguments.
11918 32-bit:
11919 fp -> old fp value
11920 return address within this function
11921 first argument to __morestack
11922 second argument to __morestack
11923 return address of caller of this function
11924 stack arguments
11925 So we add five words to get to the stack arguments.
11927 words = TARGET_64BIT ? 3 : 5;
11928 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11929 gen_rtx_PLUS (Pmode, frame_reg,
11930 GEN_INT (words * UNITS_PER_WORD))));
11932 varargs_label = gen_label_rtx ();
11933 emit_jump_insn (gen_jump (varargs_label));
11934 JUMP_LABEL (get_last_insn ()) = varargs_label;
11936 emit_barrier ();
11939 emit_label (label);
11940 LABEL_NUSES (label) = 1;
11942 /* If this function calls va_start, we now have to set the scratch
11943 register for the case where we do not call __morestack. In this
11944 case we need to set it based on the stack pointer. */
11945 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11947 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11948 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11949 GEN_INT (UNITS_PER_WORD))));
11951 emit_label (varargs_label);
11952 LABEL_NUSES (varargs_label) = 1;
11956 /* We may have to tell the dataflow pass that the split stack prologue
11957 is initializing a scratch register. */
11959 static void
11960 ix86_live_on_entry (bitmap regs)
11962 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11964 gcc_assert (flag_split_stack);
11965 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11969 /* Determine if OP is a suitable SUBREG RTX for an address. */
11971 static bool
11972 ix86_address_subreg_operand (rtx op)
11974 enum machine_mode mode;
11976 if (!REG_P (op))
11977 return false;
11979 mode = GET_MODE (op);
11981 if (GET_MODE_CLASS (mode) != MODE_INT)
11982 return false;
11984 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11985 failures when the register is one word out of a two word structure. */
11986 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11987 return false;
11989 /* Allow only SUBREGs of non-eliminable hard registers. */
11990 return register_no_elim_operand (op, mode);
11993 /* Extract the parts of an RTL expression that is a valid memory address
11994 for an instruction. Return 0 if the structure of the address is
11995 grossly off. Return -1 if the address contains ASHIFT, so it is not
11996 strictly valid, but is still used for computing the length of a lea instruction. */
11999 ix86_decompose_address (rtx addr, struct ix86_address *out)
12001 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
12002 rtx base_reg, index_reg;
12003 HOST_WIDE_INT scale = 1;
12004 rtx scale_rtx = NULL_RTX;
12005 rtx tmp;
12006 int retval = 1;
12007 enum ix86_address_seg seg = SEG_DEFAULT;
12009 /* Allow zero-extended SImode addresses;
12010 they will be emitted with the addr32 prefix. */
12011 if (TARGET_64BIT && GET_MODE (addr) == DImode)
12013 if (GET_CODE (addr) == ZERO_EXTEND
12014 && GET_MODE (XEXP (addr, 0)) == SImode)
12016 addr = XEXP (addr, 0);
12017 if (CONST_INT_P (addr))
12018 return 0;
12020 else if (GET_CODE (addr) == AND
12021 && const_32bit_mask (XEXP (addr, 1), DImode))
12023 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
12024 if (addr == NULL_RTX)
12025 return 0;
12027 if (CONST_INT_P (addr))
12028 return 0;
12032 /* Allow SImode subregs of DImode addresses;
12033 they will be emitted with the addr32 prefix. */
12034 if (TARGET_64BIT && GET_MODE (addr) == SImode)
12036 if (GET_CODE (addr) == SUBREG
12037 && GET_MODE (SUBREG_REG (addr)) == DImode)
12039 addr = SUBREG_REG (addr);
12040 if (CONST_INT_P (addr))
12041 return 0;
12045 if (REG_P (addr))
12046 base = addr;
12047 else if (GET_CODE (addr) == SUBREG)
12049 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
12050 base = addr;
12051 else
12052 return 0;
12054 else if (GET_CODE (addr) == PLUS)
12056 rtx addends[4], op;
12057 int n = 0, i;
12059 op = addr;
12062 if (n >= 4)
12063 return 0;
12064 addends[n++] = XEXP (op, 1);
12065 op = XEXP (op, 0);
12067 while (GET_CODE (op) == PLUS);
12068 if (n >= 4)
12069 return 0;
12070 addends[n] = op;
12072 for (i = n; i >= 0; --i)
12074 op = addends[i];
12075 switch (GET_CODE (op))
12077 case MULT:
12078 if (index)
12079 return 0;
12080 index = XEXP (op, 0);
12081 scale_rtx = XEXP (op, 1);
12082 break;
12084 case ASHIFT:
12085 if (index)
12086 return 0;
12087 index = XEXP (op, 0);
12088 tmp = XEXP (op, 1);
12089 if (!CONST_INT_P (tmp))
12090 return 0;
12091 scale = INTVAL (tmp);
12092 if ((unsigned HOST_WIDE_INT) scale > 3)
12093 return 0;
12094 scale = 1 << scale;
12095 break;
12097 case ZERO_EXTEND:
12098 op = XEXP (op, 0);
12099 if (GET_CODE (op) != UNSPEC)
12100 return 0;
12101 /* FALLTHRU */
12103 case UNSPEC:
12104 if (XINT (op, 1) == UNSPEC_TP
12105 && TARGET_TLS_DIRECT_SEG_REFS
12106 && seg == SEG_DEFAULT)
12107 seg = DEFAULT_TLS_SEG_REG;
12108 else
12109 return 0;
12110 break;
12112 case SUBREG:
12113 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
12114 return 0;
12115 /* FALLTHRU */
12117 case REG:
12118 if (!base)
12119 base = op;
12120 else if (!index)
12121 index = op;
12122 else
12123 return 0;
12124 break;
12126 case CONST:
12127 case CONST_INT:
12128 case SYMBOL_REF:
12129 case LABEL_REF:
12130 if (disp)
12131 return 0;
12132 disp = op;
12133 break;
12135 default:
12136 return 0;
12140 else if (GET_CODE (addr) == MULT)
12142 index = XEXP (addr, 0); /* index*scale */
12143 scale_rtx = XEXP (addr, 1);
12145 else if (GET_CODE (addr) == ASHIFT)
12147 /* We're called for lea too, which implements ashift on occasion. */
12148 index = XEXP (addr, 0);
12149 tmp = XEXP (addr, 1);
12150 if (!CONST_INT_P (tmp))
12151 return 0;
12152 scale = INTVAL (tmp);
12153 if ((unsigned HOST_WIDE_INT) scale > 3)
12154 return 0;
12155 scale = 1 << scale;
12156 retval = -1;
12158 else if (CONST_INT_P (addr))
12160 if (!x86_64_immediate_operand (addr, VOIDmode))
12161 return 0;
12163 /* Constant addresses are sign-extended to 64 bits; we have to
12164 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
12165 if (TARGET_X32
12166 && val_signbit_known_set_p (SImode, INTVAL (addr)))
12167 return 0;
12169 disp = addr;
12171 else
12172 disp = addr; /* displacement */
12174 if (index)
12176 if (REG_P (index))
12178 else if (GET_CODE (index) == SUBREG
12179 && ix86_address_subreg_operand (SUBREG_REG (index)))
12181 else
12182 return 0;
12185 /* Address override works only on the (%reg) part of %fs:(%reg). */
12186 if (seg != SEG_DEFAULT
12187 && ((base && GET_MODE (base) != word_mode)
12188 || (index && GET_MODE (index) != word_mode)))
12189 return 0;
12191 /* Extract the integral value of scale. */
12192 if (scale_rtx)
12194 if (!CONST_INT_P (scale_rtx))
12195 return 0;
12196 scale = INTVAL (scale_rtx);
12199 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12200 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12202 /* Avoid useless 0 displacement. */
12203 if (disp == const0_rtx && (base || index))
12204 disp = NULL_RTX;
12206 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
12207 if (base_reg && index_reg && scale == 1
12208 && (index_reg == arg_pointer_rtx
12209 || index_reg == frame_pointer_rtx
12210 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12212 rtx tmp;
12213 tmp = base, base = index, index = tmp;
12214 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12217 /* Special case: %ebp cannot be encoded as a base without a displacement.
12218 Similarly %r13. */
12219 if (!disp
12220 && base_reg
12221 && (base_reg == hard_frame_pointer_rtx
12222 || base_reg == frame_pointer_rtx
12223 || base_reg == arg_pointer_rtx
12224 || (REG_P (base_reg)
12225 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12226 || REGNO (base_reg) == R13_REG))))
12227 disp = const0_rtx;
12229 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
12230 Avoid this by transforming it to [%esi+0].
12231 Reload calls address legitimization without cfun defined, so we need
12232 to test cfun for being non-NULL. */
12233 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12234 && base_reg && !index_reg && !disp
12235 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12236 disp = const0_rtx;
12238 /* Special case: encode reg+reg instead of reg*2. */
12239 if (!base && index && scale == 2)
12240 base = index, base_reg = index_reg, scale = 1;
12242 /* Special case: scaling cannot be encoded without base or displacement. */
12243 if (!base && !disp && index && scale != 1)
12244 disp = const0_rtx;
12246 out->base = base;
12247 out->index = index;
12248 out->disp = disp;
12249 out->scale = scale;
12250 out->seg = seg;
12252 return retval;
12255 /* Return the cost of the memory address X.
12256 For i386, it is better to use a complex address than let gcc copy
12257 the address into a reg and make a new pseudo. But not if the address
12258 requires two regs - that would mean more pseudos with longer
12259 lifetimes. */
12260 static int
12261 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12262 addr_space_t as ATTRIBUTE_UNUSED,
12263 bool speed ATTRIBUTE_UNUSED)
12265 struct ix86_address parts;
12266 int cost = 1;
12267 int ok = ix86_decompose_address (x, &parts);
12269 gcc_assert (ok);
12271 if (parts.base && GET_CODE (parts.base) == SUBREG)
12272 parts.base = SUBREG_REG (parts.base);
12273 if (parts.index && GET_CODE (parts.index) == SUBREG)
12274 parts.index = SUBREG_REG (parts.index);
12276 /* Attempt to minimize number of registers in the address. */
12277 if ((parts.base
12278 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12279 || (parts.index
12280 && (!REG_P (parts.index)
12281 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12282 cost++;
12284 if (parts.base
12285 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12286 && parts.index
12287 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12288 && parts.base != parts.index)
12289 cost++;
12291 /* The AMD K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
12292 since its predecode logic can't detect the length of such instructions
12293 and they degenerate to vector decoding. Increase the cost of such
12294 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12295 to split such addresses or even refuse them altogether.
12297 Following addressing modes are affected:
12298 [base+scale*index]
12299 [scale*index+disp]
12300 [base+index]
12302 The first and last cases may be avoidable by explicitly coding the zero in
12303 the memory address, but I don't have an AMD-K6 machine handy to check this
12304 theory. */
12306 if (TARGET_K6
12307 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12308 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12309 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12310 cost += 10;
12312 return cost;
12315 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
12316 this is used to form addresses to local data when -fPIC is in
12317 use. */
12319 static bool
12320 darwin_local_data_pic (rtx disp)
12322 return (GET_CODE (disp) == UNSPEC
12323 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12326 /* Determine if a given RTX is a valid constant. We already know this
12327 satisfies CONSTANT_P. */
12329 static bool
12330 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12332 switch (GET_CODE (x))
12334 case CONST:
12335 x = XEXP (x, 0);
12337 if (GET_CODE (x) == PLUS)
12339 if (!CONST_INT_P (XEXP (x, 1)))
12340 return false;
12341 x = XEXP (x, 0);
12344 if (TARGET_MACHO && darwin_local_data_pic (x))
12345 return true;
12347 /* Only some unspecs are valid as "constants". */
12348 if (GET_CODE (x) == UNSPEC)
12349 switch (XINT (x, 1))
12351 case UNSPEC_GOT:
12352 case UNSPEC_GOTOFF:
12353 case UNSPEC_PLTOFF:
12354 return TARGET_64BIT;
12355 case UNSPEC_TPOFF:
12356 case UNSPEC_NTPOFF:
12357 x = XVECEXP (x, 0, 0);
12358 return (GET_CODE (x) == SYMBOL_REF
12359 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12360 case UNSPEC_DTPOFF:
12361 x = XVECEXP (x, 0, 0);
12362 return (GET_CODE (x) == SYMBOL_REF
12363 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12364 default:
12365 return false;
12368 /* We must have drilled down to a symbol. */
12369 if (GET_CODE (x) == LABEL_REF)
12370 return true;
12371 if (GET_CODE (x) != SYMBOL_REF)
12372 return false;
12373 /* FALLTHRU */
12375 case SYMBOL_REF:
12376 /* TLS symbols are never valid. */
12377 if (SYMBOL_REF_TLS_MODEL (x))
12378 return false;
12380 /* DLLIMPORT symbols are never valid. */
12381 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12382 && SYMBOL_REF_DLLIMPORT_P (x))
12383 return false;
12385 #if TARGET_MACHO
12386 /* mdynamic-no-pic */
12387 if (MACHO_DYNAMIC_NO_PIC_P)
12388 return machopic_symbol_defined_p (x);
12389 #endif
12390 break;
12392 case CONST_DOUBLE:
12393 if (GET_MODE (x) == TImode
12394 && x != CONST0_RTX (TImode)
12395 && !TARGET_64BIT)
12396 return false;
12397 break;
12399 case CONST_VECTOR:
12400 if (!standard_sse_constant_p (x))
12401 return false;
12403 default:
12404 break;
12407 /* Otherwise we handle everything else in the move patterns. */
12408 return true;
12411 /* Determine if it's legal to put X into the constant pool. This
12412 is not possible for the address of thread-local symbols, which
12413 is checked above. */
12415 static bool
12416 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12418 /* We can always put integral constants and vectors in memory. */
12419 switch (GET_CODE (x))
12421 case CONST_INT:
12422 case CONST_DOUBLE:
12423 case CONST_VECTOR:
12424 return false;
12426 default:
12427 break;
12429 return !ix86_legitimate_constant_p (mode, x);
12432 /* Nonzero if the symbol is marked as dllimport or as a stub variable,
12433 otherwise zero. */
12435 static bool
12436 is_imported_p (rtx x)
12438 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12439 || GET_CODE (x) != SYMBOL_REF)
12440 return false;
12442 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12446 /* Nonzero if the constant value X is a legitimate general operand
12447 when generating PIC code. It is given that flag_pic is on and
12448 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12450 bool
12451 legitimate_pic_operand_p (rtx x)
12453 rtx inner;
12455 switch (GET_CODE (x))
12457 case CONST:
12458 inner = XEXP (x, 0);
12459 if (GET_CODE (inner) == PLUS
12460 && CONST_INT_P (XEXP (inner, 1)))
12461 inner = XEXP (inner, 0);
12463 /* Only some unspecs are valid as "constants". */
12464 if (GET_CODE (inner) == UNSPEC)
12465 switch (XINT (inner, 1))
12467 case UNSPEC_GOT:
12468 case UNSPEC_GOTOFF:
12469 case UNSPEC_PLTOFF:
12470 return TARGET_64BIT;
12471 case UNSPEC_TPOFF:
12472 x = XVECEXP (inner, 0, 0);
12473 return (GET_CODE (x) == SYMBOL_REF
12474 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12475 case UNSPEC_MACHOPIC_OFFSET:
12476 return legitimate_pic_address_disp_p (x);
12477 default:
12478 return false;
12480 /* FALLTHRU */
12482 case SYMBOL_REF:
12483 case LABEL_REF:
12484 return legitimate_pic_address_disp_p (x);
12486 default:
12487 return true;
12491 /* Determine if a given CONST RTX is a valid memory displacement
12492 in PIC mode. */
12494 bool
12495 legitimate_pic_address_disp_p (rtx disp)
12497 bool saw_plus;
12499 /* In 64bit mode we can allow direct addresses of symbols and labels
12500 when they are not dynamic symbols. */
12501 if (TARGET_64BIT)
12503 rtx op0 = disp, op1;
12505 switch (GET_CODE (disp))
12507 case LABEL_REF:
12508 return true;
12510 case CONST:
12511 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12512 break;
12513 op0 = XEXP (XEXP (disp, 0), 0);
12514 op1 = XEXP (XEXP (disp, 0), 1);
12515 if (!CONST_INT_P (op1)
12516 || INTVAL (op1) >= 16*1024*1024
12517 || INTVAL (op1) < -16*1024*1024)
12518 break;
12519 if (GET_CODE (op0) == LABEL_REF)
12520 return true;
12521 if (GET_CODE (op0) == CONST
12522 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12523 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12524 return true;
12525 if (GET_CODE (op0) == UNSPEC
12526 && XINT (op0, 1) == UNSPEC_PCREL)
12527 return true;
12528 if (GET_CODE (op0) != SYMBOL_REF)
12529 break;
12530 /* FALLTHRU */
12532 case SYMBOL_REF:
12533 /* TLS references should always be enclosed in UNSPEC.
12534    A dllimported symbol always needs to be resolved. */
12535 if (SYMBOL_REF_TLS_MODEL (op0)
12536 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12537 return false;
12539 if (TARGET_PECOFF)
12541 if (is_imported_p (op0))
12542 return true;
12544 if (SYMBOL_REF_FAR_ADDR_P (op0)
12545 || !SYMBOL_REF_LOCAL_P (op0))
12546 break;
12548 /* Function symbols need to be resolved only for
12549    the large model.
12550    For the small model we don't need to resolve anything
12551    here. */
12552 if ((ix86_cmodel != CM_LARGE_PIC
12553 && SYMBOL_REF_FUNCTION_P (op0))
12554 || ix86_cmodel == CM_SMALL_PIC)
12555 return true;
12556 /* Non-external symbols don't need to be resolved for
12557    the large and medium models. */
12558 if ((ix86_cmodel == CM_LARGE_PIC
12559 || ix86_cmodel == CM_MEDIUM_PIC)
12560 && !SYMBOL_REF_EXTERNAL_P (op0))
12561 return true;
12563 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12564 && SYMBOL_REF_LOCAL_P (op0)
12565 && ix86_cmodel != CM_LARGE_PIC)
12566 return true;
12567 break;
12569 default:
12570 break;
12573 if (GET_CODE (disp) != CONST)
12574 return false;
12575 disp = XEXP (disp, 0);
12577 if (TARGET_64BIT)
12579 /* It is unsafe to allow PLUS expressions; this limits the allowed
12580    distance of GOT table references. We should not need these anyway. */
12581 if (GET_CODE (disp) != UNSPEC
12582 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12583 && XINT (disp, 1) != UNSPEC_GOTOFF
12584 && XINT (disp, 1) != UNSPEC_PCREL
12585 && XINT (disp, 1) != UNSPEC_PLTOFF))
12586 return false;
12588 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12589 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12590 return false;
12591 return true;
12594 saw_plus = false;
12595 if (GET_CODE (disp) == PLUS)
12597 if (!CONST_INT_P (XEXP (disp, 1)))
12598 return false;
12599 disp = XEXP (disp, 0);
12600 saw_plus = true;
12603 if (TARGET_MACHO && darwin_local_data_pic (disp))
12604 return true;
12606 if (GET_CODE (disp) != UNSPEC)
12607 return false;
12609 switch (XINT (disp, 1))
12611 case UNSPEC_GOT:
12612 if (saw_plus)
12613 return false;
12614 /* We need to check for both symbols and labels because VxWorks loads
12615 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12616 details. */
12617 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12618 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12619 case UNSPEC_GOTOFF:
12620 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12621    While the ABI also specifies a 32bit relocation, we don't produce
12622    it in the small PIC model at all. */
12623 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12624 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12625 && !TARGET_64BIT)
12626 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12627 return false;
12628 case UNSPEC_GOTTPOFF:
12629 case UNSPEC_GOTNTPOFF:
12630 case UNSPEC_INDNTPOFF:
12631 if (saw_plus)
12632 return false;
12633 disp = XVECEXP (disp, 0, 0);
12634 return (GET_CODE (disp) == SYMBOL_REF
12635 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12636 case UNSPEC_NTPOFF:
12637 disp = XVECEXP (disp, 0, 0);
12638 return (GET_CODE (disp) == SYMBOL_REF
12639 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12640 case UNSPEC_DTPOFF:
12641 disp = XVECEXP (disp, 0, 0);
12642 return (GET_CODE (disp) == SYMBOL_REF
12643 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12646 return false;
12649 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if
12650    some part of the address was reloaded, in which case the calling
12651    macro should goto WIN; returns false if no replacement is called
12652    for and the macro should not. */
12654 bool
12655 ix86_legitimize_reload_address (rtx x,
12656 enum machine_mode mode ATTRIBUTE_UNUSED,
12657 int opnum, int type,
12658 int ind_levels ATTRIBUTE_UNUSED)
12660 /* Reload can generate:
12662 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12663 (reg:DI 97))
12664 (reg:DI 2 cx))
12666 This RTX is rejected by ix86_legitimate_address_p due to
12667 non-strictness of base register 97. Following this rejection,
12668 reload pushes all three components into separate registers,
12669 creating an invalid memory address RTX.
12671 The following code reloads only the invalid part of the
12672 memory address RTX. */
12674 if (GET_CODE (x) == PLUS
12675 && REG_P (XEXP (x, 1))
12676 && GET_CODE (XEXP (x, 0)) == PLUS
12677 && REG_P (XEXP (XEXP (x, 0), 1)))
12679 rtx base, index;
12680 bool something_reloaded = false;
12682 base = XEXP (XEXP (x, 0), 1);
12683 if (!REG_OK_FOR_BASE_STRICT_P (base))
12685 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12686 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12687 opnum, (enum reload_type) type);
12688 something_reloaded = true;
12691 index = XEXP (x, 1);
12692 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12694 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12695 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12696 opnum, (enum reload_type) type);
12697 something_reloaded = true;
12700 gcc_assert (something_reloaded);
12701 return true;
12704 return false;
12707 /* Recognizes RTL expressions that are valid memory addresses for an
12708 instruction. The MODE argument is the machine mode for the MEM
12709 expression that wants to use this address.
12711 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12712 convert common non-canonical forms to canonical form so that they will
12713 be recognized. */
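/* A hedged example of the canonical shape this function expects:

     (plus:SI (plus:SI (mult:SI (reg:SI index) (const_int 4))
                       (reg:SI base))
              (const_int 12))

   i.e. base + index*scale + displacement, as split out by
   ix86_decompose_address.  ix86_legitimize_address, for instance,
   rewrites small shifts into multiplies so that addresses end up in
   this shape.  */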
12715 static bool
12716 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12717 rtx addr, bool strict)
12719 struct ix86_address parts;
12720 rtx base, index, disp;
12721 HOST_WIDE_INT scale;
12723 if (ix86_decompose_address (addr, &parts) <= 0)
12724 /* Decomposition failed. */
12725 return false;
12727 base = parts.base;
12728 index = parts.index;
12729 disp = parts.disp;
12730 scale = parts.scale;
12732 /* Validate base register. */
12733 if (base)
12735 rtx reg;
12737 if (REG_P (base))
12738 reg = base;
12739 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12740 reg = SUBREG_REG (base);
12741 else
12742 /* Base is not a register. */
12743 return false;
12745 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12746 return false;
12748 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12749 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12750 /* Base is not valid. */
12751 return false;
12754 /* Validate index register. */
12755 if (index)
12757 rtx reg;
12759 if (REG_P (index))
12760 reg = index;
12761 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12762 reg = SUBREG_REG (index);
12763 else
12764 /* Index is not a register. */
12765 return false;
12767 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12768 return false;
12770 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12771 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12772 /* Index is not valid. */
12773 return false;
12776 /* Index and base should have the same mode. */
12777 if (base && index
12778 && GET_MODE (base) != GET_MODE (index))
12779 return false;
12781 /* Validate scale factor. */
12782 if (scale != 1)
12784 if (!index)
12785 /* Scale without index. */
12786 return false;
12788 if (scale != 2 && scale != 4 && scale != 8)
12789 /* Scale is not a valid multiplier. */
12790 return false;
12793 /* Validate displacement. */
12794 if (disp)
12796 if (GET_CODE (disp) == CONST
12797 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12798 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12799 switch (XINT (XEXP (disp, 0), 1))
12801 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12802    used. While the ABI also specifies 32bit relocations, we don't
12803    produce them at all and use IP-relative addressing instead. */
12804 case UNSPEC_GOT:
12805 case UNSPEC_GOTOFF:
12806 gcc_assert (flag_pic);
12807 if (!TARGET_64BIT)
12808 goto is_legitimate_pic;
12810 /* 64bit address unspec. */
12811 return false;
12813 case UNSPEC_GOTPCREL:
12814 case UNSPEC_PCREL:
12815 gcc_assert (flag_pic);
12816 goto is_legitimate_pic;
12818 case UNSPEC_GOTTPOFF:
12819 case UNSPEC_GOTNTPOFF:
12820 case UNSPEC_INDNTPOFF:
12821 case UNSPEC_NTPOFF:
12822 case UNSPEC_DTPOFF:
12823 break;
12825 case UNSPEC_STACK_CHECK:
12826 gcc_assert (flag_split_stack);
12827 break;
12829 default:
12830 /* Invalid address unspec. */
12831 return false;
12834 else if (SYMBOLIC_CONST (disp)
12835 && (flag_pic
12836 || (TARGET_MACHO
12837 #if TARGET_MACHO
12838 && MACHOPIC_INDIRECT
12839 && !machopic_operand_p (disp)
12840 #endif
12844 is_legitimate_pic:
12845 if (TARGET_64BIT && (index || base))
12847 /* foo@dtpoff(%rX) is ok. */
12848 if (GET_CODE (disp) != CONST
12849 || GET_CODE (XEXP (disp, 0)) != PLUS
12850 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12851 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12852 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12853 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12854 /* Non-constant pic memory reference. */
12855 return false;
12857 else if ((!TARGET_MACHO || flag_pic)
12858 && ! legitimate_pic_address_disp_p (disp))
12859 /* Displacement is an invalid pic construct. */
12860 return false;
12861 #if TARGET_MACHO
12862 else if (MACHO_DYNAMIC_NO_PIC_P
12863 && !ix86_legitimate_constant_p (Pmode, disp))
12864 /* Displacement must be referenced via non_lazy_pointer. */
12865 return false;
12866 #endif
12868 /* This code used to verify that a symbolic pic displacement
12869 includes the pic_offset_table_rtx register.
12871 While this is a good idea, unfortunately these constructs may
12872 be created by the "adds using lea" optimization for incorrect
12873 code like:
12875 int a;
12876 int foo(int i)
12878 return *(&a+i);
12881 This code is nonsensical, but results in addressing the
12882 GOT table with a pic_offset_table_rtx base. We can't
12883 just refuse it easily, since it gets matched by the
12884 "addsi3" pattern, which later gets split to lea when the
12885 output register differs from the input. While this
12886 could be handled by a separate addsi pattern for this case
12887 that never results in lea, disabling this test seems to be
12888 the easier and correct fix for the crash. */
12890 else if (GET_CODE (disp) != LABEL_REF
12891 && !CONST_INT_P (disp)
12892 && (GET_CODE (disp) != CONST
12893 || !ix86_legitimate_constant_p (Pmode, disp))
12894 && (GET_CODE (disp) != SYMBOL_REF
12895 || !ix86_legitimate_constant_p (Pmode, disp)))
12896 /* Displacement is not constant. */
12897 return false;
12898 else if (TARGET_64BIT
12899 && !x86_64_immediate_operand (disp, VOIDmode))
12900 /* Displacement is out of range. */
12901 return false;
12904 /* Everything looks valid. */
12905 return true;
12908 /* Determine if a given RTX is a valid constant address. */
12910 bool
12911 constant_address_p (rtx x)
12913 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12916 /* Return a unique alias set for the GOT. */
12918 static alias_set_type
12919 ix86_GOT_alias_set (void)
12921 static alias_set_type set = -1;
12922 if (set == -1)
12923 set = new_alias_set ();
12924 return set;
12927 /* Return a legitimate reference for ORIG (an address) using the
12928 register REG. If REG is 0, a new pseudo is generated.
12930 There are two types of references that must be handled:
12932 1. Global data references must load the address from the GOT, via
12933 the PIC reg. An insn is emitted to do this load, and the reg is
12934 returned.
12936 2. Static data references, constant pool addresses, and code labels
12937 compute the address as an offset from the GOT, whose base is in
12938 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12939 differentiate them from global data objects. The returned
12940 address is the PIC reg + an unspec constant.
12942 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12943 reg also appears in the address. */
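/* A hedged sketch of the two cases in 32-bit PIC code, where the PIC
   register is typically %ebx:

     global "glob":  (mem:SI (plus:SI (reg:SI ebx)
                                      (const:SI (unspec:SI
                                        [(symbol_ref "glob")] UNSPEC_GOT))))
     static "loc":   (plus:SI (reg:SI ebx)
                              (const:SI (unspec:SI
                                [(symbol_ref "loc")] UNSPEC_GOTOFF)))

   The exact registers and modes depend on the target; this is only an
   orientation aid, not the verbatim output.  */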
12945 static rtx
12946 legitimize_pic_address (rtx orig, rtx reg)
12948 rtx addr = orig;
12949 rtx new_rtx = orig;
12951 #if TARGET_MACHO
12952 if (TARGET_MACHO && !TARGET_64BIT)
12954 if (reg == 0)
12955 reg = gen_reg_rtx (Pmode);
12956 /* Use the generic Mach-O PIC machinery. */
12957 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12959 #endif
12961 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12963 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12964 if (tmp)
12965 return tmp;
12968 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12969 new_rtx = addr;
12970 else if (TARGET_64BIT && !TARGET_PECOFF
12971 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12973 rtx tmpreg;
12974 /* This symbol may be referenced via a displacement from the PIC
12975 base address (@GOTOFF). */
12977 if (reload_in_progress)
12978 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12979 if (GET_CODE (addr) == CONST)
12980 addr = XEXP (addr, 0);
12981 if (GET_CODE (addr) == PLUS)
12983 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12984 UNSPEC_GOTOFF);
12985 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12987 else
12988 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12989 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12990 if (!reg)
12991 tmpreg = gen_reg_rtx (Pmode);
12992 else
12993 tmpreg = reg;
12994 emit_move_insn (tmpreg, new_rtx);
12996 if (reg != 0)
12998 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12999 tmpreg, 1, OPTAB_DIRECT);
13000 new_rtx = reg;
13002 else
13003 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
13005 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
13007 /* This symbol may be referenced via a displacement from the PIC
13008 base address (@GOTOFF). */
13010 if (reload_in_progress)
13011 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13012 if (GET_CODE (addr) == CONST)
13013 addr = XEXP (addr, 0);
13014 if (GET_CODE (addr) == PLUS)
13016 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
13017 UNSPEC_GOTOFF);
13018 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
13020 else
13021 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
13022 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13023 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13025 if (reg != 0)
13027 emit_move_insn (reg, new_rtx);
13028 new_rtx = reg;
13031 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
13032 /* We can't use @GOTOFF for text labels on VxWorks;
13033 see gotoff_operand. */
13034 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
13036 rtx tmp = legitimize_pe_coff_symbol (addr, true);
13037 if (tmp)
13038 return tmp;
13040 /* For x64 PE-COFF there is no GOT table, so we use the address
13041    directly. */
13042 if (TARGET_64BIT && TARGET_PECOFF)
13044 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
13045 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13047 if (reg == 0)
13048 reg = gen_reg_rtx (Pmode);
13049 emit_move_insn (reg, new_rtx);
13050 new_rtx = reg;
13052 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
13054 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
13055 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13056 new_rtx = gen_const_mem (Pmode, new_rtx);
13057 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
13059 if (reg == 0)
13060 reg = gen_reg_rtx (Pmode);
13061 /* Use gen_movsi directly, otherwise the address is loaded
13062    into a register for CSE. We don't want to CSE these addresses;
13063    instead we CSE addresses from the GOT table, so skip this. */
13064 emit_insn (gen_movsi (reg, new_rtx));
13065 new_rtx = reg;
13067 else
13069 /* This symbol must be referenced via a load from the
13070 Global Offset Table (@GOT). */
13072 if (reload_in_progress)
13073 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13074 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
13075 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13076 if (TARGET_64BIT)
13077 new_rtx = force_reg (Pmode, new_rtx);
13078 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13079 new_rtx = gen_const_mem (Pmode, new_rtx);
13080 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
13082 if (reg == 0)
13083 reg = gen_reg_rtx (Pmode);
13084 emit_move_insn (reg, new_rtx);
13085 new_rtx = reg;
13088 else
13090 if (CONST_INT_P (addr)
13091 && !x86_64_immediate_operand (addr, VOIDmode))
13093 if (reg)
13095 emit_move_insn (reg, addr);
13096 new_rtx = reg;
13098 else
13099 new_rtx = force_reg (Pmode, addr);
13101 else if (GET_CODE (addr) == CONST)
13103 addr = XEXP (addr, 0);
13105 /* We must match stuff we generate before. Assume the only
13106 unspecs that can get here are ours. Not that we could do
13107 anything with them anyway.... */
13108 if (GET_CODE (addr) == UNSPEC
13109 || (GET_CODE (addr) == PLUS
13110 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
13111 return orig;
13112 gcc_assert (GET_CODE (addr) == PLUS);
13114 if (GET_CODE (addr) == PLUS)
13116 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
13118 /* Check first to see if this is a constant offset from a @GOTOFF
13119 symbol reference. */
13120 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
13121 && CONST_INT_P (op1))
13123 if (!TARGET_64BIT)
13125 if (reload_in_progress)
13126 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13127 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
13128 UNSPEC_GOTOFF);
13129 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
13130 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13131 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13133 if (reg != 0)
13135 emit_move_insn (reg, new_rtx);
13136 new_rtx = reg;
13139 else
13141 if (INTVAL (op1) < -16*1024*1024
13142 || INTVAL (op1) >= 16*1024*1024)
13144 if (!x86_64_immediate_operand (op1, Pmode))
13145 op1 = force_reg (Pmode, op1);
13146 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
13150 else
13152 rtx base = legitimize_pic_address (op0, reg);
13153 enum machine_mode mode = GET_MODE (base);
13154 new_rtx
13155 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
13157 if (CONST_INT_P (new_rtx))
13159 if (INTVAL (new_rtx) < -16*1024*1024
13160 || INTVAL (new_rtx) >= 16*1024*1024)
13162 if (!x86_64_immediate_operand (new_rtx, mode))
13163 new_rtx = force_reg (mode, new_rtx);
13164 new_rtx
13165 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
13167 else
13168 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
13170 else
13172 if (GET_CODE (new_rtx) == PLUS
13173 && CONSTANT_P (XEXP (new_rtx, 1)))
13175 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
13176 new_rtx = XEXP (new_rtx, 1);
13178 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
13183 return new_rtx;
13186 /* Load the thread pointer. If TO_REG is true, force it into a register. */
13188 static rtx
13189 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
13191 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
13193 if (GET_MODE (tp) != tp_mode)
13195 gcc_assert (GET_MODE (tp) == SImode);
13196 gcc_assert (tp_mode == DImode);
13198 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13201 if (to_reg)
13202 tp = copy_to_mode_reg (tp_mode, tp);
13204 return tp;
13207 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13209 static GTY(()) rtx ix86_tls_symbol;
13211 static rtx
13212 ix86_tls_get_addr (void)
13214 if (!ix86_tls_symbol)
13216 const char *sym
13217 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13218 ? "___tls_get_addr" : "__tls_get_addr");
13220 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13223 return ix86_tls_symbol;
13226 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13228 static GTY(()) rtx ix86_tls_module_base_symbol;
13231 ix86_tls_module_base (void)
13233 if (!ix86_tls_module_base_symbol)
13235 ix86_tls_module_base_symbol
13236 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13238 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13239 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13242 return ix86_tls_module_base_symbol;
13245 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13246 false if we expect this to be used for a memory address and true if
13247 we expect to load the address into a register. */
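/* A hedged sketch of what this can return (exact forms depend on the
   target and TLS model).  For 64-bit GNU TLS local-exec the result is
   roughly

     (plus:DI (unspec:DI [(const_int 0)] UNSPEC_TP)
              (const:DI (unspec:DI [(symbol_ref "x")] UNSPEC_NTPOFF)))

   i.e. thread pointer plus a link-time offset, while initial-exec
   reads the offset from a GOT slot (UNSPEC_GOTNTPOFF) through a MEM
   with the GOT alias set.  */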
13249 static rtx
13250 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13252 rtx dest, base, off;
13253 rtx pic = NULL_RTX, tp = NULL_RTX;
13254 enum machine_mode tp_mode = Pmode;
13255 int type;
13257 switch (model)
13259 case TLS_MODEL_GLOBAL_DYNAMIC:
13260 dest = gen_reg_rtx (Pmode);
13262 if (!TARGET_64BIT)
13264 if (flag_pic && !TARGET_PECOFF)
13265 pic = pic_offset_table_rtx;
13266 else
13268 pic = gen_reg_rtx (Pmode);
13269 emit_insn (gen_set_got (pic));
13273 if (TARGET_GNU2_TLS)
13275 if (TARGET_64BIT)
13276 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13277 else
13278 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13280 tp = get_thread_pointer (Pmode, true);
13281 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13283 if (GET_MODE (x) != Pmode)
13284 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13286 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13288 else
13290 rtx caddr = ix86_tls_get_addr ();
13292 if (TARGET_64BIT)
13294 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13295 rtx insns;
13297 start_sequence ();
13298 emit_call_insn
13299 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13300 insns = get_insns ();
13301 end_sequence ();
13303 if (GET_MODE (x) != Pmode)
13304 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13306 RTL_CONST_CALL_P (insns) = 1;
13307 emit_libcall_block (insns, dest, rax, x);
13309 else
13310 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13312 break;
13314 case TLS_MODEL_LOCAL_DYNAMIC:
13315 base = gen_reg_rtx (Pmode);
13317 if (!TARGET_64BIT)
13319 if (flag_pic)
13320 pic = pic_offset_table_rtx;
13321 else
13323 pic = gen_reg_rtx (Pmode);
13324 emit_insn (gen_set_got (pic));
13328 if (TARGET_GNU2_TLS)
13330 rtx tmp = ix86_tls_module_base ();
13332 if (TARGET_64BIT)
13333 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13334 else
13335 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13337 tp = get_thread_pointer (Pmode, true);
13338 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13339 gen_rtx_MINUS (Pmode, tmp, tp));
13341 else
13343 rtx caddr = ix86_tls_get_addr ();
13345 if (TARGET_64BIT)
13347 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13348 rtx insns, eqv;
13350 start_sequence ();
13351 emit_call_insn
13352 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13353 insns = get_insns ();
13354 end_sequence ();
13356 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13357 share the LD_BASE result with other LD model accesses. */
13358 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13359 UNSPEC_TLS_LD_BASE);
13361 RTL_CONST_CALL_P (insns) = 1;
13362 emit_libcall_block (insns, base, rax, eqv);
13364 else
13365 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13368 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13369 off = gen_rtx_CONST (Pmode, off);
13371 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13373 if (TARGET_GNU2_TLS)
13375 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13377 if (GET_MODE (x) != Pmode)
13378 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13380 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13382 break;
13384 case TLS_MODEL_INITIAL_EXEC:
13385 if (TARGET_64BIT)
13387 if (TARGET_SUN_TLS && !TARGET_X32)
13389 /* The Sun linker took the AMD64 TLS spec literally
13390 and can only handle %rax as the destination of the
13391 initial-exec code sequence. */
13393 dest = gen_reg_rtx (DImode);
13394 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13395 return dest;
13398 /* Generate DImode references to avoid %fs:(%reg32)
13399 problems and the linker IE->LE relaxation bug. */
13400 tp_mode = DImode;
13401 pic = NULL;
13402 type = UNSPEC_GOTNTPOFF;
13404 else if (flag_pic)
13406 if (reload_in_progress)
13407 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13408 pic = pic_offset_table_rtx;
13409 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13411 else if (!TARGET_ANY_GNU_TLS)
13413 pic = gen_reg_rtx (Pmode);
13414 emit_insn (gen_set_got (pic));
13415 type = UNSPEC_GOTTPOFF;
13417 else
13419 pic = NULL;
13420 type = UNSPEC_INDNTPOFF;
13423 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13424 off = gen_rtx_CONST (tp_mode, off);
13425 if (pic)
13426 off = gen_rtx_PLUS (tp_mode, pic, off);
13427 off = gen_const_mem (tp_mode, off);
13428 set_mem_alias_set (off, ix86_GOT_alias_set ());
13430 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13432 base = get_thread_pointer (tp_mode,
13433 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13434 off = force_reg (tp_mode, off);
13435 return gen_rtx_PLUS (tp_mode, base, off);
13437 else
13439 base = get_thread_pointer (Pmode, true);
13440 dest = gen_reg_rtx (Pmode);
13441 emit_insn (ix86_gen_sub3 (dest, base, off));
13443 break;
13445 case TLS_MODEL_LOCAL_EXEC:
13446 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13447 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13448 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13449 off = gen_rtx_CONST (Pmode, off);
13451 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13453 base = get_thread_pointer (Pmode,
13454 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13455 return gen_rtx_PLUS (Pmode, base, off);
13457 else
13459 base = get_thread_pointer (Pmode, true);
13460 dest = gen_reg_rtx (Pmode);
13461 emit_insn (ix86_gen_sub3 (dest, base, off));
13463 break;
13465 default:
13466 gcc_unreachable ();
13469 return dest;
13472 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13473 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13474 unique refptr-DECL symbol corresponding to symbol DECL. */
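/* For example (illustrative only): for a decl named "foo" this creates
   a read-only pointer variable whose assembler name is "*__imp_foo"
   (or "*__imp__foo" when a user label prefix is in use) in the
   dllimport case, and "*.refptr.foo" (or "*refptr.foo") in the refptr
   case; the returned decl's DECL_RTL is a const MEM of that symbol.  */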
13476 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13477 htab_t dllimport_map;
13479 static tree
13480 get_dllimport_decl (tree decl, bool beimport)
13482 struct tree_map *h, in;
13483 void **loc;
13484 const char *name;
13485 const char *prefix;
13486 size_t namelen, prefixlen;
13487 char *imp_name;
13488 tree to;
13489 rtx rtl;
13491 if (!dllimport_map)
13492 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13494 in.hash = htab_hash_pointer (decl);
13495 in.base.from = decl;
13496 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13497 h = (struct tree_map *) *loc;
13498 if (h)
13499 return h->to;
13501 *loc = h = ggc_alloc_tree_map ();
13502 h->hash = in.hash;
13503 h->base.from = decl;
13504 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13505 VAR_DECL, NULL, ptr_type_node);
13506 DECL_ARTIFICIAL (to) = 1;
13507 DECL_IGNORED_P (to) = 1;
13508 DECL_EXTERNAL (to) = 1;
13509 TREE_READONLY (to) = 1;
13511 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13512 name = targetm.strip_name_encoding (name);
13513 if (beimport)
13514 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13515 ? "*__imp_" : "*__imp__";
13516 else
13517 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13518 namelen = strlen (name);
13519 prefixlen = strlen (prefix);
13520 imp_name = (char *) alloca (namelen + prefixlen + 1);
13521 memcpy (imp_name, prefix, prefixlen);
13522 memcpy (imp_name + prefixlen, name, namelen + 1);
13524 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13525 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13526 SET_SYMBOL_REF_DECL (rtl, to);
13527 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13528 if (!beimport)
13530 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13531 #ifdef SUB_TARGET_RECORD_STUB
13532 SUB_TARGET_RECORD_STUB (name);
13533 #endif
13536 rtl = gen_const_mem (Pmode, rtl);
13537 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13539 SET_DECL_RTL (to, rtl);
13540 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13542 return to;
13545 /* Expand SYMBOL into its corresponding far-address symbol.
13546 WANT_REG is true if we require the result to be a register. */
13548 static rtx
13549 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13551 tree imp_decl;
13552 rtx x;
13554 gcc_assert (SYMBOL_REF_DECL (symbol));
13555 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13557 x = DECL_RTL (imp_decl);
13558 if (want_reg)
13559 x = force_reg (Pmode, x);
13560 return x;
13563 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13564 true if we require the result to be a register. */
13566 static rtx
13567 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13569 tree imp_decl;
13570 rtx x;
13572 gcc_assert (SYMBOL_REF_DECL (symbol));
13573 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13575 x = DECL_RTL (imp_decl);
13576 if (want_reg)
13577 x = force_reg (Pmode, x);
13578 return x;
13581 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13582 is true if we require the result to be a register. */
13584 static rtx
13585 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13587 if (!TARGET_PECOFF)
13588 return NULL_RTX;
13590 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13592 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13593 return legitimize_dllimport_symbol (addr, inreg);
13594 if (GET_CODE (addr) == CONST
13595 && GET_CODE (XEXP (addr, 0)) == PLUS
13596 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13597 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13599 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13600 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13604 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13605 return NULL_RTX;
13606 if (GET_CODE (addr) == SYMBOL_REF
13607 && !is_imported_p (addr)
13608 && SYMBOL_REF_EXTERNAL_P (addr)
13609 && SYMBOL_REF_DECL (addr))
13610 return legitimize_pe_coff_extern_decl (addr, inreg);
13612 if (GET_CODE (addr) == CONST
13613 && GET_CODE (XEXP (addr, 0)) == PLUS
13614 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13615 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13616 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13617 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13619 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13620 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13622 return NULL_RTX;
13625 /* Try machine-dependent ways of modifying an illegitimate address
13626 to be legitimate. If we find one, return the new, valid address.
13627 This macro is used in only one place: `memory_address' in explow.c.
13629 OLDX is the address as it was before break_out_memory_refs was called.
13630 In some cases it is useful to look at this to decide what needs to be done.
13632 It is always safe for this macro to do nothing. It exists to recognize
13633 opportunities to optimize the output.
13635 For the 80386, we handle X+REG by loading X into a register R and
13636 using R+REG. R will go in a general reg and indexing will be used.
13637 However, if REG is a broken-out memory address or multiplication,
13638 nothing needs to be done because REG can certainly go in a general reg.
13640 When -fpic is used, special handling is needed for symbolic references.
13641 See comments by legitimize_pic_address in i386.c for details. */
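/* A hedged example of the rewrites performed below:

     (plus:SI (ashift:SI (reg:SI i) (const_int 2)) (reg:SI b))
       -> (plus:SI (mult:SI (reg:SI i) (const_int 4)) (reg:SI b))

   and, with -fpic, a symbolic operand such as (symbol_ref "foo") is
   handed to legitimize_pic_address instead.  */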
13643 static rtx
13644 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13645 enum machine_mode mode)
13647 int changed = 0;
13648 unsigned log;
13650 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13651 if (log)
13652 return legitimize_tls_address (x, (enum tls_model) log, false);
13653 if (GET_CODE (x) == CONST
13654 && GET_CODE (XEXP (x, 0)) == PLUS
13655 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13656 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13658 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13659 (enum tls_model) log, false);
13660 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13663 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13665 rtx tmp = legitimize_pe_coff_symbol (x, true);
13666 if (tmp)
13667 return tmp;
13670 if (flag_pic && SYMBOLIC_CONST (x))
13671 return legitimize_pic_address (x, 0);
13673 #if TARGET_MACHO
13674 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13675 return machopic_indirect_data_reference (x, 0);
13676 #endif
13678 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13679 if (GET_CODE (x) == ASHIFT
13680 && CONST_INT_P (XEXP (x, 1))
13681 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13683 changed = 1;
13684 log = INTVAL (XEXP (x, 1));
13685 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13686 GEN_INT (1 << log));
13689 if (GET_CODE (x) == PLUS)
13691 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13693 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13694 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13695 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13697 changed = 1;
13698 log = INTVAL (XEXP (XEXP (x, 0), 1));
13699 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13700 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13701 GEN_INT (1 << log));
13704 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13705 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13706 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13708 changed = 1;
13709 log = INTVAL (XEXP (XEXP (x, 1), 1));
13710 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13711 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13712 GEN_INT (1 << log));
13715 /* Put multiply first if it isn't already. */
13716 if (GET_CODE (XEXP (x, 1)) == MULT)
13718 rtx tmp = XEXP (x, 0);
13719 XEXP (x, 0) = XEXP (x, 1);
13720 XEXP (x, 1) = tmp;
13721 changed = 1;
13724 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13725 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13726 created by virtual register instantiation, register elimination, and
13727 similar optimizations. */
13728 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13730 changed = 1;
13731 x = gen_rtx_PLUS (Pmode,
13732 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13733 XEXP (XEXP (x, 1), 0)),
13734 XEXP (XEXP (x, 1), 1));
13737 /* Canonicalize
13738 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13739 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13740 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13741 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13742 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13743 && CONSTANT_P (XEXP (x, 1)))
13745 rtx constant;
13746 rtx other = NULL_RTX;
13748 if (CONST_INT_P (XEXP (x, 1)))
13750 constant = XEXP (x, 1);
13751 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13753 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13755 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13756 other = XEXP (x, 1);
13758 else
13759 constant = 0;
13761 if (constant)
13763 changed = 1;
13764 x = gen_rtx_PLUS (Pmode,
13765 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13766 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13767 plus_constant (Pmode, other,
13768 INTVAL (constant)));
13772 if (changed && ix86_legitimate_address_p (mode, x, false))
13773 return x;
13775 if (GET_CODE (XEXP (x, 0)) == MULT)
13777 changed = 1;
13778 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13781 if (GET_CODE (XEXP (x, 1)) == MULT)
13783 changed = 1;
13784 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13787 if (changed
13788 && REG_P (XEXP (x, 1))
13789 && REG_P (XEXP (x, 0)))
13790 return x;
13792 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13794 changed = 1;
13795 x = legitimize_pic_address (x, 0);
13798 if (changed && ix86_legitimate_address_p (mode, x, false))
13799 return x;
13801 if (REG_P (XEXP (x, 0)))
13803 rtx temp = gen_reg_rtx (Pmode);
13804 rtx val = force_operand (XEXP (x, 1), temp);
13805 if (val != temp)
13807 val = convert_to_mode (Pmode, val, 1);
13808 emit_move_insn (temp, val);
13811 XEXP (x, 1) = temp;
13812 return x;
13815 else if (REG_P (XEXP (x, 1)))
13817 rtx temp = gen_reg_rtx (Pmode);
13818 rtx val = force_operand (XEXP (x, 0), temp);
13819 if (val != temp)
13821 val = convert_to_mode (Pmode, val, 1);
13822 emit_move_insn (temp, val);
13825 XEXP (x, 0) = temp;
13826 return x;
13830 return x;
13833 /* Print an integer constant expression in assembler syntax. Addition
13834 and subtraction are the only arithmetic that may appear in these
13835 expressions. FILE is the stdio stream to write to, X is the rtx, and
13836 CODE is the operand print code from the output string. */
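/* Hedged examples of the output produced here (ATT dialect):

     (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)   ->  foo@GOTOFF
     (unspec [(symbol_ref "foo")] UNSPEC_GOTPCREL) ->  foo@GOTPCREL(%rip)
     (const_int 42)                                ->  42

   The exact suffix and punctuation depend on the unspec and on
   ASSEMBLER_DIALECT, as handled in the switch below.  */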
13838 static void
13839 output_pic_addr_const (FILE *file, rtx x, int code)
13841 char buf[256];
13843 switch (GET_CODE (x))
13845 case PC:
13846 gcc_assert (flag_pic);
13847 putc ('.', file);
13848 break;
13850 case SYMBOL_REF:
13851 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13852 output_addr_const (file, x);
13853 else
13855 const char *name = XSTR (x, 0);
13857 /* Mark the decl as referenced so that cgraph will
13858 output the function. */
13859 if (SYMBOL_REF_DECL (x))
13860 mark_decl_referenced (SYMBOL_REF_DECL (x));
13862 #if TARGET_MACHO
13863 if (MACHOPIC_INDIRECT
13864 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13865 name = machopic_indirection_name (x, /*stub_p=*/true);
13866 #endif
13867 assemble_name (file, name);
13869 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13870 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13871 fputs ("@PLT", file);
13872 break;
13874 case LABEL_REF:
13875 x = XEXP (x, 0);
13876 /* FALLTHRU */
13877 case CODE_LABEL:
13878 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13879 assemble_name (asm_out_file, buf);
13880 break;
13882 case CONST_INT:
13883 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13884 break;
13886 case CONST:
13887 /* This used to output parentheses around the expression,
13888 but that does not work on the 386 (either ATT or BSD assembler). */
13889 output_pic_addr_const (file, XEXP (x, 0), code);
13890 break;
13892 case CONST_DOUBLE:
13893 if (GET_MODE (x) == VOIDmode)
13895 /* We can use %d if the number is <32 bits and positive. */
13896 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13897 fprintf (file, "0x%lx%08lx",
13898 (unsigned long) CONST_DOUBLE_HIGH (x),
13899 (unsigned long) CONST_DOUBLE_LOW (x));
13900 else
13901 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13903 else
13904 /* We can't handle floating point constants;
13905 TARGET_PRINT_OPERAND must handle them. */
13906 output_operand_lossage ("floating constant misused");
13907 break;
13909 case PLUS:
13910 /* Some assemblers need integer constants to appear first. */
13911 if (CONST_INT_P (XEXP (x, 0)))
13913 output_pic_addr_const (file, XEXP (x, 0), code);
13914 putc ('+', file);
13915 output_pic_addr_const (file, XEXP (x, 1), code);
13917 else
13919 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13920 output_pic_addr_const (file, XEXP (x, 1), code);
13921 putc ('+', file);
13922 output_pic_addr_const (file, XEXP (x, 0), code);
13924 break;
13926 case MINUS:
13927 if (!TARGET_MACHO)
13928 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13929 output_pic_addr_const (file, XEXP (x, 0), code);
13930 putc ('-', file);
13931 output_pic_addr_const (file, XEXP (x, 1), code);
13932 if (!TARGET_MACHO)
13933 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13934 break;
13936 case UNSPEC:
13937 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13939 bool f = i386_asm_output_addr_const_extra (file, x);
13940 gcc_assert (f);
13941 break;
13944 gcc_assert (XVECLEN (x, 0) == 1);
13945 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13946 switch (XINT (x, 1))
13948 case UNSPEC_GOT:
13949 fputs ("@GOT", file);
13950 break;
13951 case UNSPEC_GOTOFF:
13952 fputs ("@GOTOFF", file);
13953 break;
13954 case UNSPEC_PLTOFF:
13955 fputs ("@PLTOFF", file);
13956 break;
13957 case UNSPEC_PCREL:
13958 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13959 "(%rip)" : "[rip]", file);
13960 break;
13961 case UNSPEC_GOTPCREL:
13962 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13963 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13964 break;
13965 case UNSPEC_GOTTPOFF:
13966 /* FIXME: This might be @TPOFF in Sun ld too. */
13967 fputs ("@gottpoff", file);
13968 break;
13969 case UNSPEC_TPOFF:
13970 fputs ("@tpoff", file);
13971 break;
13972 case UNSPEC_NTPOFF:
13973 if (TARGET_64BIT)
13974 fputs ("@tpoff", file);
13975 else
13976 fputs ("@ntpoff", file);
13977 break;
13978 case UNSPEC_DTPOFF:
13979 fputs ("@dtpoff", file);
13980 break;
13981 case UNSPEC_GOTNTPOFF:
13982 if (TARGET_64BIT)
13983 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13984 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13985 else
13986 fputs ("@gotntpoff", file);
13987 break;
13988 case UNSPEC_INDNTPOFF:
13989 fputs ("@indntpoff", file);
13990 break;
13991 #if TARGET_MACHO
13992 case UNSPEC_MACHOPIC_OFFSET:
13993 putc ('-', file);
13994 machopic_output_function_base_name (file);
13995 break;
13996 #endif
13997 default:
13998 output_operand_lossage ("invalid UNSPEC as operand");
13999 break;
14001 break;
14003 default:
14004 output_operand_lossage ("invalid expression as operand");
14008 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
14009 We need to emit DTP-relative relocations. */
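/* For example (assuming ASM_LONG expands to a ".long" directive), a
   4-byte request for symbol "x" emits roughly

     .long   x@dtpoff

   and an 8-byte request emits the same directive followed by ", 0"
   for the upper half.  */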
14011 static void ATTRIBUTE_UNUSED
14012 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
14014 fputs (ASM_LONG, file);
14015 output_addr_const (file, x);
14016 fputs ("@dtpoff", file);
14017 switch (size)
14019 case 4:
14020 break;
14021 case 8:
14022 fputs (", 0", file);
14023 break;
14024 default:
14025 gcc_unreachable ();
14029 /* Return true if X is a representation of the PIC register. This copes
14030 with calls from ix86_find_base_term, where the register might have
14031 been replaced by a cselib value. */
14033 static bool
14034 ix86_pic_register_p (rtx x)
14036 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
14037 return (pic_offset_table_rtx
14038 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
14039 else
14040 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
14043 /* Helper function for ix86_delegitimize_address.
14044 Attempt to delegitimize TLS local-exec accesses. */
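/* A hedged sketch: a thread-local access whose address decomposes to
   the TLS segment plus a displacement such as

     (const:SI (unspec:SI [(symbol_ref "x")] UNSPEC_NTPOFF))

   is mapped back to a reference based on (symbol_ref "x"), with any
   base, index and scale from the original address re-attached.  */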
14046 static rtx
14047 ix86_delegitimize_tls_address (rtx orig_x)
14049 rtx x = orig_x, unspec;
14050 struct ix86_address addr;
14052 if (!TARGET_TLS_DIRECT_SEG_REFS)
14053 return orig_x;
14054 if (MEM_P (x))
14055 x = XEXP (x, 0);
14056 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
14057 return orig_x;
14058 if (ix86_decompose_address (x, &addr) == 0
14059 || addr.seg != DEFAULT_TLS_SEG_REG
14060 || addr.disp == NULL_RTX
14061 || GET_CODE (addr.disp) != CONST)
14062 return orig_x;
14063 unspec = XEXP (addr.disp, 0);
14064 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
14065 unspec = XEXP (unspec, 0);
14066 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
14067 return orig_x;
14068 x = XVECEXP (unspec, 0, 0);
14069 gcc_assert (GET_CODE (x) == SYMBOL_REF);
14070 if (unspec != XEXP (addr.disp, 0))
14071 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
14072 if (addr.index)
14074 rtx idx = addr.index;
14075 if (addr.scale != 1)
14076 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
14077 x = gen_rtx_PLUS (Pmode, idx, x);
14079 if (addr.base)
14080 x = gen_rtx_PLUS (Pmode, addr.base, x);
14081 if (MEM_P (orig_x))
14082 x = replace_equiv_address_nv (orig_x, x);
14083 return x;
14086 /* In the name of slightly smaller debug output, and to cater to
14087 general assembler lossage, recognize PIC+GOTOFF and turn it back
14088 into a direct symbol reference.
14090 On Darwin, this is necessary to avoid a crash, because Darwin
14091 has a different PIC label for each routine but the DWARF debugging
14092 information is not associated with any particular routine, so it's
14093 necessary to remove references to the PIC label from RTL stored by
14094 the DWARF output code. */
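/* A hedged example of the simplification done below (32-bit case):

     (plus:SI (reg:SI ebx)
              (const:SI (unspec:SI [(symbol_ref "foo")] UNSPEC_GOTOFF)))
       ->  (symbol_ref "foo")

   with any constant or register addend re-attached to the result.  */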
14096 static rtx
14097 ix86_delegitimize_address (rtx x)
14099 rtx orig_x = delegitimize_mem_from_attrs (x);
14100 /* addend is NULL or some rtx if x is something+GOTOFF where
14101 something doesn't include the PIC register. */
14102 rtx addend = NULL_RTX;
14103 /* reg_addend is NULL or a multiple of some register. */
14104 rtx reg_addend = NULL_RTX;
14105 /* const_addend is NULL or a const_int. */
14106 rtx const_addend = NULL_RTX;
14107 /* This is the result, or NULL. */
14108 rtx result = NULL_RTX;
14110 x = orig_x;
14112 if (MEM_P (x))
14113 x = XEXP (x, 0);
14115 if (TARGET_64BIT)
14117 if (GET_CODE (x) == CONST
14118 && GET_CODE (XEXP (x, 0)) == PLUS
14119 && GET_MODE (XEXP (x, 0)) == Pmode
14120 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14121 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
14122 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
14124 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
14125 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
14126 if (MEM_P (orig_x))
14127 x = replace_equiv_address_nv (orig_x, x);
14128 return x;
14130 if (GET_CODE (x) != CONST
14131 || GET_CODE (XEXP (x, 0)) != UNSPEC
14132 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
14133 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
14134 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
14135 return ix86_delegitimize_tls_address (orig_x);
14136 x = XVECEXP (XEXP (x, 0), 0, 0);
14137 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
14139 x = simplify_gen_subreg (GET_MODE (orig_x), x,
14140 GET_MODE (x), 0);
14141 if (x == NULL_RTX)
14142 return orig_x;
14144 return x;
14147 if (GET_CODE (x) != PLUS
14148 || GET_CODE (XEXP (x, 1)) != CONST)
14149 return ix86_delegitimize_tls_address (orig_x);
14151 if (ix86_pic_register_p (XEXP (x, 0)))
14152 /* %ebx + GOT/GOTOFF */
14154 else if (GET_CODE (XEXP (x, 0)) == PLUS)
14156 /* %ebx + %reg * scale + GOT/GOTOFF */
14157 reg_addend = XEXP (x, 0);
14158 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
14159 reg_addend = XEXP (reg_addend, 1);
14160 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
14161 reg_addend = XEXP (reg_addend, 0);
14162 else
14164 reg_addend = NULL_RTX;
14165 addend = XEXP (x, 0);
14168 else
14169 addend = XEXP (x, 0);
14171 x = XEXP (XEXP (x, 1), 0);
14172 if (GET_CODE (x) == PLUS
14173 && CONST_INT_P (XEXP (x, 1)))
14175 const_addend = XEXP (x, 1);
14176 x = XEXP (x, 0);
14179 if (GET_CODE (x) == UNSPEC
14180 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14181 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
14182 result = XVECEXP (x, 0, 0);
14184 if (TARGET_MACHO && darwin_local_data_pic (x)
14185 && !MEM_P (orig_x))
14186 result = XVECEXP (x, 0, 0);
14188 if (! result)
14189 return ix86_delegitimize_tls_address (orig_x);
14191 if (const_addend)
14192 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14193 if (reg_addend)
14194 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14195 if (addend)
14197 /* If the rest of original X doesn't involve the PIC register, add
14198 addend and subtract pic_offset_table_rtx. This can happen e.g.
14199 for code like:
14200 leal (%ebx, %ecx, 4), %ecx
14202 movl foo@GOTOFF(%ecx), %edx
14203 in which case we return (%ecx - %ebx) + foo. */
14204 if (pic_offset_table_rtx)
14205 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14206 pic_offset_table_rtx),
14207 result);
14208 else
14209 return orig_x;
14211 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14213 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14214 if (result == NULL_RTX)
14215 return orig_x;
14217 return result;
14220 /* If X is a machine specific address (i.e. a symbol or label being
14221 referenced as a displacement from the GOT implemented using an
14222 UNSPEC), then return the base term. Otherwise return X. */
14225 ix86_find_base_term (rtx x)
14227 rtx term;
14229 if (TARGET_64BIT)
14231 if (GET_CODE (x) != CONST)
14232 return x;
14233 term = XEXP (x, 0);
14234 if (GET_CODE (term) == PLUS
14235 && (CONST_INT_P (XEXP (term, 1))
14236 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14237 term = XEXP (term, 0);
14238 if (GET_CODE (term) != UNSPEC
14239 || (XINT (term, 1) != UNSPEC_GOTPCREL
14240 && XINT (term, 1) != UNSPEC_PCREL))
14241 return x;
14243 return XVECEXP (term, 0, 0);
14246 return ix86_delegitimize_address (x);
14249 static void
14250 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14251 bool fp, FILE *file)
14253 const char *suffix;
14255 if (mode == CCFPmode || mode == CCFPUmode)
14257 code = ix86_fp_compare_code_to_integer (code);
14258 mode = CCmode;
14260 if (reverse)
14261 code = reverse_condition (code);
14263 switch (code)
14265 case EQ:
14266 switch (mode)
14268 case CCAmode:
14269 suffix = "a";
14270 break;
14272 case CCCmode:
14273 suffix = "c";
14274 break;
14276 case CCOmode:
14277 suffix = "o";
14278 break;
14280 case CCSmode:
14281 suffix = "s";
14282 break;
14284 default:
14285 suffix = "e";
14287 break;
14288 case NE:
14289 switch (mode)
14291 case CCAmode:
14292 suffix = "na";
14293 break;
14295 case CCCmode:
14296 suffix = "nc";
14297 break;
14299 case CCOmode:
14300 suffix = "no";
14301 break;
14303 case CCSmode:
14304 suffix = "ns";
14305 break;
14307 default:
14308 suffix = "ne";
14310 break;
14311 case GT:
14312 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14313 suffix = "g";
14314 break;
14315 case GTU:
14316 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14317 Those same assemblers have the same but opposite lossage on cmov. */
14318 if (mode == CCmode)
14319 suffix = fp ? "nbe" : "a";
14320 else if (mode == CCCmode)
14321 suffix = "b";
14322 else
14323 gcc_unreachable ();
14324 break;
14325 case LT:
14326 switch (mode)
14328 case CCNOmode:
14329 case CCGOCmode:
14330 suffix = "s";
14331 break;
14333 case CCmode:
14334 case CCGCmode:
14335 suffix = "l";
14336 break;
14338 default:
14339 gcc_unreachable ();
14341 break;
14342 case LTU:
14343 gcc_assert (mode == CCmode || mode == CCCmode);
14344 suffix = "b";
14345 break;
14346 case GE:
14347 switch (mode)
14349 case CCNOmode:
14350 case CCGOCmode:
14351 suffix = "ns";
14352 break;
14354 case CCmode:
14355 case CCGCmode:
14356 suffix = "ge";
14357 break;
14359 default:
14360 gcc_unreachable ();
14362 break;
14363 case GEU:
14364 /* ??? As above. */
14365 gcc_assert (mode == CCmode || mode == CCCmode);
14366 suffix = fp ? "nb" : "ae";
14367 break;
14368 case LE:
14369 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14370 suffix = "le";
14371 break;
14372 case LEU:
14373 /* ??? As above. */
14374 if (mode == CCmode)
14375 suffix = "be";
14376 else if (mode == CCCmode)
14377 suffix = fp ? "nb" : "ae";
14378 else
14379 gcc_unreachable ();
14380 break;
14381 case UNORDERED:
14382 suffix = fp ? "u" : "p";
14383 break;
14384 case ORDERED:
14385 suffix = fp ? "nu" : "np";
14386 break;
14387 default:
14388 gcc_unreachable ();
14390 fputs (suffix, file);
14393 /* Print the name of register X to FILE based on its machine mode and number.
14394 If CODE is 'w', pretend the mode is HImode.
14395 If CODE is 'b', pretend the mode is QImode.
14396 If CODE is 'k', pretend the mode is SImode.
14397 If CODE is 'q', pretend the mode is DImode.
14398 If CODE is 'x', pretend the mode is V4SFmode.
14399 If CODE is 't', pretend the mode is V8SFmode.
14400 If CODE is 'h', pretend the reg is the 'high' byte register.
14401 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
14402 If CODE is 'd', duplicate the operand for an AVX instruction.
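/* For example (hedged, ATT dialect, operand (reg:SI 0) i.e. %eax):
   code 'b' prints "%al", 'w' prints "%ax", 'k' prints "%eax",
   'q' prints "%rax" (64-bit only) and 'h' prints "%ah".  */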
14405 void
14406 print_reg (rtx x, int code, FILE *file)
14408 const char *reg;
14409 unsigned int regno;
14410 bool duplicated = code == 'd' && TARGET_AVX;
14412 if (ASSEMBLER_DIALECT == ASM_ATT)
14413 putc ('%', file);
14415 if (x == pc_rtx)
14417 gcc_assert (TARGET_64BIT);
14418 fputs ("rip", file);
14419 return;
14422 regno = true_regnum (x);
14423 gcc_assert (regno != ARG_POINTER_REGNUM
14424 && regno != FRAME_POINTER_REGNUM
14425 && regno != FLAGS_REG
14426 && regno != FPSR_REG
14427 && regno != FPCR_REG);
14429 if (code == 'w' || MMX_REG_P (x))
14430 code = 2;
14431 else if (code == 'b')
14432 code = 1;
14433 else if (code == 'k')
14434 code = 4;
14435 else if (code == 'q')
14436 code = 8;
14437 else if (code == 'y')
14438 code = 3;
14439 else if (code == 'h')
14440 code = 0;
14441 else if (code == 'x')
14442 code = 16;
14443 else if (code == 't')
14444 code = 32;
14445 else
14446 code = GET_MODE_SIZE (GET_MODE (x));
14448 /* Irritatingly, AMD extended registers use a different naming convention
14449 from the normal registers: "r%d[bwd]". */
14450 if (REX_INT_REGNO_P (regno))
14452 gcc_assert (TARGET_64BIT);
14453 putc ('r', file);
14454 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14455 switch (code)
14457 case 0:
14458 error ("extended registers have no high halves");
14459 break;
14460 case 1:
14461 putc ('b', file);
14462 break;
14463 case 2:
14464 putc ('w', file);
14465 break;
14466 case 4:
14467 putc ('d', file);
14468 break;
14469 case 8:
14470 /* no suffix */
14471 break;
14472 default:
14473 error ("unsupported operand size for extended register");
14474 break;
14476 return;
14479 reg = NULL;
14480 switch (code)
14482 case 3:
14483 if (STACK_TOP_P (x))
14485 reg = "st(0)";
14486 break;
14488 /* FALLTHRU */
14489 case 8:
14490 case 4:
14491 case 12:
14492 if (! ANY_FP_REG_P (x))
14493 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14494 /* FALLTHRU */
14495 case 16:
14496 case 2:
14497 normal:
14498 reg = hi_reg_name[regno];
14499 break;
14500 case 1:
14501 if (regno >= ARRAY_SIZE (qi_reg_name))
14502 goto normal;
14503 reg = qi_reg_name[regno];
14504 break;
14505 case 0:
14506 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14507 goto normal;
14508 reg = qi_high_reg_name[regno];
14509 break;
14510 case 32:
14511 if (SSE_REG_P (x))
14513 gcc_assert (!duplicated);
14514 putc ('y', file);
14515 fputs (hi_reg_name[regno] + 1, file);
14516 return;
14518 break;
14519 default:
14520 gcc_unreachable ();
14523 fputs (reg, file);
14524 if (duplicated)
14526 if (ASSEMBLER_DIALECT == ASM_ATT)
14527 fprintf (file, ", %%%s", reg);
14528 else
14529 fprintf (file, ", %s", reg);
14533 /* Locate some local-dynamic symbol still in use by this function
14534 so that we can print its name in some tls_local_dynamic_base
14535 pattern. */
14537 static int
14538 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14540 rtx x = *px;
14542 if (GET_CODE (x) == SYMBOL_REF
14543 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14545 cfun->machine->some_ld_name = XSTR (x, 0);
14546 return 1;
14549 return 0;
14552 static const char *
14553 get_some_local_dynamic_name (void)
14555 rtx insn;
14557 if (cfun->machine->some_ld_name)
14558 return cfun->machine->some_ld_name;
14560 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14561 if (NONDEBUG_INSN_P (insn)
14562 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14563 return cfun->machine->some_ld_name;
14565 return NULL;
14568 /* Meaning of CODE:
14569 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14570 C -- print opcode suffix for set/cmov insn.
14571 c -- like C, but print reversed condition
14572 F,f -- likewise, but for floating-point.
14573 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14574 otherwise nothing
14575 R -- print the prefix for register names.
14576 z -- print the opcode suffix for the size of the current operand.
14577 Z -- likewise, with special suffixes for x87 instructions.
14578 * -- print a star (in certain assembler syntax)
14579 A -- print an absolute memory reference.
14580 E -- print address with DImode register names if TARGET_64BIT.
14581 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14582 s -- print a shift double count, followed by the assembler's argument
14583 delimiter.
14584 b -- print the QImode name of the register for the indicated operand.
14585 %b0 would print %al if operands[0] is reg 0.
14586 w -- likewise, print the HImode name of the register.
14587 k -- likewise, print the SImode name of the register.
14588 q -- likewise, print the DImode name of the register.
14589 x -- likewise, print the V4SFmode name of the register.
14590 t -- likewise, print the V8SFmode name of the register.
14591 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14592 y -- print "st(0)" instead of "st" as a register.
14593 d -- print duplicated register operand for AVX instruction.
14594 D -- print condition for SSE cmp instruction.
14595 P -- if PIC, print an @PLT suffix.
14596 p -- print raw symbol name.
14597 X -- don't print any sort of PIC '@' suffix for a symbol.
14598 & -- print some in-use local-dynamic symbol name.
14599 H -- print a memory address offset by 8; used for sse high-parts
14600 Y -- print condition for XOP pcom* instruction.
14601 + -- print a branch hint as 'cs' or 'ds' prefix
14602 ; -- print a semicolon (after prefixes due to bug in older gas).
14603 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14604 @ -- print a segment register of thread base pointer load
14605 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
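/* Illustrative examples only (not taken from the .md files): with an
   SImode register in operands[0], a hypothetical template such as
   "mov%z0\t{%1, %0|%0, %1}" would emit "movl" under AT&T syntax, while
   "%b0", "%w0", "%k0" and "%q0" would print "%al", "%ax", "%eax" and
   (on a 64-bit target) "%rax" when operands[0] is the accumulator.  */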
14608 void
14609 ix86_print_operand (FILE *file, rtx x, int code)
14611 if (code)
14613 switch (code)
14615 case 'A':
14616 switch (ASSEMBLER_DIALECT)
14618 case ASM_ATT:
14619 putc ('*', file);
14620 break;
14622 case ASM_INTEL:
14623 /* Intel syntax. For absolute addresses, registers should not
14624 be surrounded by braces. */
14625 if (!REG_P (x))
14627 putc ('[', file);
14628 ix86_print_operand (file, x, 0);
14629 putc (']', file);
14630 return;
14632 break;
14634 default:
14635 gcc_unreachable ();
14638 ix86_print_operand (file, x, 0);
14639 return;
14641 case 'E':
14642 /* Wrap address in an UNSPEC to declare special handling. */
14643 if (TARGET_64BIT)
14644 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14646 output_address (x);
14647 return;
14649 case 'L':
14650 if (ASSEMBLER_DIALECT == ASM_ATT)
14651 putc ('l', file);
14652 return;
14654 case 'W':
14655 if (ASSEMBLER_DIALECT == ASM_ATT)
14656 putc ('w', file);
14657 return;
14659 case 'B':
14660 if (ASSEMBLER_DIALECT == ASM_ATT)
14661 putc ('b', file);
14662 return;
14664 case 'Q':
14665 if (ASSEMBLER_DIALECT == ASM_ATT)
14666 putc ('l', file);
14667 return;
14669 case 'S':
14670 if (ASSEMBLER_DIALECT == ASM_ATT)
14671 putc ('s', file);
14672 return;
14674 case 'T':
14675 if (ASSEMBLER_DIALECT == ASM_ATT)
14676 putc ('t', file);
14677 return;
14679 case 'O':
14680 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14681 if (ASSEMBLER_DIALECT != ASM_ATT)
14682 return;
14684 switch (GET_MODE_SIZE (GET_MODE (x)))
14686 case 2:
14687 putc ('w', file);
14688 break;
14690 case 4:
14691 putc ('l', file);
14692 break;
14694 case 8:
14695 putc ('q', file);
14696 break;
14698 default:
14699 output_operand_lossage
14700 ("invalid operand size for operand code 'O'");
14701 return;
14704 putc ('.', file);
14705 #endif
14706 return;
14708 case 'z':
14709 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14711 /* Opcodes don't get size suffixes if using Intel syntax. */
14712 if (ASSEMBLER_DIALECT == ASM_INTEL)
14713 return;
14715 switch (GET_MODE_SIZE (GET_MODE (x)))
14717 case 1:
14718 putc ('b', file);
14719 return;
14721 case 2:
14722 putc ('w', file);
14723 return;
14725 case 4:
14726 putc ('l', file);
14727 return;
14729 case 8:
14730 putc ('q', file);
14731 return;
14733 default:
14734 output_operand_lossage
14735 ("invalid operand size for operand code 'z'");
14736 return;
14740 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14741 warning
14742 (0, "non-integer operand used with operand code 'z'");
14743 /* FALLTHRU */
14745 case 'Z':
14746 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14747 if (ASSEMBLER_DIALECT == ASM_INTEL)
14748 return;
14750 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14752 switch (GET_MODE_SIZE (GET_MODE (x)))
14754 case 2:
14755 #ifdef HAVE_AS_IX86_FILDS
14756 putc ('s', file);
14757 #endif
14758 return;
14760 case 4:
14761 putc ('l', file);
14762 return;
14764 case 8:
14765 #ifdef HAVE_AS_IX86_FILDQ
14766 putc ('q', file);
14767 #else
14768 fputs ("ll", file);
14769 #endif
14770 return;
14772 default:
14773 break;
14776 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14778 /* 387 opcodes don't get size suffixes
14779 if the operands are registers. */
14780 if (STACK_REG_P (x))
14781 return;
14783 switch (GET_MODE_SIZE (GET_MODE (x)))
14785 case 4:
14786 putc ('s', file);
14787 return;
14789 case 8:
14790 putc ('l', file);
14791 return;
14793 case 12:
14794 case 16:
14795 putc ('t', file);
14796 return;
14798 default:
14799 break;
14802 else
14804 output_operand_lossage
14805 ("invalid operand type used with operand code 'Z'");
14806 return;
14809 output_operand_lossage
14810 ("invalid operand size for operand code 'Z'");
14811 return;
14813 case 'd':
14814 case 'b':
14815 case 'w':
14816 case 'k':
14817 case 'q':
14818 case 'h':
14819 case 't':
14820 case 'y':
14821 case 'x':
14822 case 'X':
14823 case 'P':
14824 case 'p':
14825 break;
14827 case 's':
14828 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14830 ix86_print_operand (file, x, 0);
14831 fputs (", ", file);
14833 return;
14835 case 'Y':
14836 switch (GET_CODE (x))
14838 case NE:
14839 fputs ("neq", file);
14840 break;
14841 case EQ:
14842 fputs ("eq", file);
14843 break;
14844 case GE:
14845 case GEU:
14846 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14847 break;
14848 case GT:
14849 case GTU:
14850 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14851 break;
14852 case LE:
14853 case LEU:
14854 fputs ("le", file);
14855 break;
14856 case LT:
14857 case LTU:
14858 fputs ("lt", file);
14859 break;
14860 case UNORDERED:
14861 fputs ("unord", file);
14862 break;
14863 case ORDERED:
14864 fputs ("ord", file);
14865 break;
14866 case UNEQ:
14867 fputs ("ueq", file);
14868 break;
14869 case UNGE:
14870 fputs ("nlt", file);
14871 break;
14872 case UNGT:
14873 fputs ("nle", file);
14874 break;
14875 case UNLE:
14876 fputs ("ule", file);
14877 break;
14878 case UNLT:
14879 fputs ("ult", file);
14880 break;
14881 case LTGT:
14882 fputs ("une", file);
14883 break;
14884 default:
14885 output_operand_lossage ("operand is not a condition code, "
14886 "invalid operand code 'Y'");
14887 return;
14889 return;
14891 case 'D':
14892 /* A little bit of braindamage here. The SSE compare instructions
14893 use completely different names for the comparisons than the
14894 fp conditional moves do. */
14895 switch (GET_CODE (x))
14897 case UNEQ:
14898 if (TARGET_AVX)
14900 fputs ("eq_us", file);
14901 break;
14903 case EQ:
14904 fputs ("eq", file);
14905 break;
14906 case UNLT:
14907 if (TARGET_AVX)
14909 fputs ("nge", file);
14910 break;
14912 case LT:
14913 fputs ("lt", file);
14914 break;
14915 case UNLE:
14916 if (TARGET_AVX)
14918 fputs ("ngt", file);
14919 break;
14921 case LE:
14922 fputs ("le", file);
14923 break;
14924 case UNORDERED:
14925 fputs ("unord", file);
14926 break;
14927 case LTGT:
14928 if (TARGET_AVX)
14930 fputs ("neq_oq", file);
14931 break;
14933 case NE:
14934 fputs ("neq", file);
14935 break;
14936 case GE:
14937 if (TARGET_AVX)
14939 fputs ("ge", file);
14940 break;
14942 case UNGE:
14943 fputs ("nlt", file);
14944 break;
14945 case GT:
14946 if (TARGET_AVX)
14948 fputs ("gt", file);
14949 break;
14951 case UNGT:
14952 fputs ("nle", file);
14953 break;
14954 case ORDERED:
14955 fputs ("ord", file);
14956 break;
14957 default:
14958 output_operand_lossage ("operand is not a condition code, "
14959 "invalid operand code 'D'");
14960 return;
14962 return;
14964 case 'F':
14965 case 'f':
14966 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14967 if (ASSEMBLER_DIALECT == ASM_ATT)
14968 putc ('.', file);
14969 #endif
14971 case 'C':
14972 case 'c':
14973 if (!COMPARISON_P (x))
14975 output_operand_lossage ("operand is not a condition code, "
14976 "invalid operand code '%c'", code);
14977 return;
14979 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14980 code == 'c' || code == 'f',
14981 code == 'F' || code == 'f',
14982 file);
14983 return;
14985 case 'H':
14986 if (!offsettable_memref_p (x))
14988 output_operand_lossage ("operand is not an offsettable memory "
14989 "reference, invalid operand code 'H'");
14990 return;
14992 /* It doesn't actually matter what mode we use here, as we're
14993 only going to use this for printing. */
14994 x = adjust_address_nv (x, DImode, 8);
14995 /* Output 'qword ptr' for intel assembler dialect. */
14996 if (ASSEMBLER_DIALECT == ASM_INTEL)
14997 code = 'q';
14998 break;
15000 case 'K':
15001 gcc_assert (CONST_INT_P (x));
15003 if (INTVAL (x) & IX86_HLE_ACQUIRE)
15004 #ifdef HAVE_AS_IX86_HLE
15005 fputs ("xacquire ", file);
15006 #else
15007 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
15008 #endif
15009 else if (INTVAL (x) & IX86_HLE_RELEASE)
15010 #ifdef HAVE_AS_IX86_HLE
15011 fputs ("xrelease ", file);
15012 #else
15013 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
15014 #endif
15015 /* We do not want to print value of the operand. */
15016 return;
15018 case '*':
15019 if (ASSEMBLER_DIALECT == ASM_ATT)
15020 putc ('*', file);
15021 return;
15023 case '&':
15025 const char *name = get_some_local_dynamic_name ();
15026 if (name == NULL)
15027 output_operand_lossage ("'%%&' used without any "
15028 "local dynamic TLS references");
15029 else
15030 assemble_name (file, name);
15031 return;
15034 case '+':
15036 rtx x;
15038 if (!optimize
15039 || optimize_function_for_size_p (cfun)
15040 || !TARGET_BRANCH_PREDICTION_HINTS)
15041 return;
15043 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
15044 if (x)
15046 int pred_val = INTVAL (XEXP (x, 0));
15048 if (pred_val < REG_BR_PROB_BASE * 45 / 100
15049 || pred_val > REG_BR_PROB_BASE * 55 / 100)
15051 bool taken = pred_val > REG_BR_PROB_BASE / 2;
15052 bool cputaken
15053 = final_forward_branch_p (current_output_insn) == 0;
15055 /* Emit hints only when the default branch prediction
15056 heuristics would fail. */
15057 if (taken != cputaken)
15059 /* We use 3e (DS) prefix for taken branches and
15060 2e (CS) prefix for not taken branches. */
15061 if (taken)
15062 fputs ("ds ; ", file);
15063 else
15064 fputs ("cs ; ", file);
15068 return;
15071 case ';':
15072 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
15073 putc (';', file);
15074 #endif
15075 return;
15077 case '@':
15078 if (ASSEMBLER_DIALECT == ASM_ATT)
15079 putc ('%', file);
15081 /* The kernel uses a different segment register for performance
15082 reasons; a system call would not have to trash the userspace
15083 segment register, which would be expensive. */
15084 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
15085 fputs ("fs", file);
15086 else
15087 fputs ("gs", file);
15088 return;
15090 case '~':
15091 putc (TARGET_AVX2 ? 'i' : 'f', file);
15092 return;
15094 case '^':
15095 if (TARGET_64BIT && Pmode != word_mode)
15096 fputs ("addr32 ", file);
15097 return;
15099 default:
15100 output_operand_lossage ("invalid operand code '%c'", code);
15104 if (REG_P (x))
15105 print_reg (x, code, file);
15107 else if (MEM_P (x))
15109 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
15110 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
15111 && GET_MODE (x) != BLKmode)
15113 const char * size;
15114 switch (GET_MODE_SIZE (GET_MODE (x)))
15116 case 1: size = "BYTE"; break;
15117 case 2: size = "WORD"; break;
15118 case 4: size = "DWORD"; break;
15119 case 8: size = "QWORD"; break;
15120 case 12: size = "TBYTE"; break;
15121 case 16:
15122 if (GET_MODE (x) == XFmode)
15123 size = "TBYTE";
15124 else
15125 size = "XMMWORD";
15126 break;
15127 case 32: size = "YMMWORD"; break;
15128 default:
15129 gcc_unreachable ();
15132 /* Check for explicit size override (codes 'b', 'w', 'k',
15133 'q' and 'x') */
15134 if (code == 'b')
15135 size = "BYTE";
15136 else if (code == 'w')
15137 size = "WORD";
15138 else if (code == 'k')
15139 size = "DWORD";
15140 else if (code == 'q')
15141 size = "QWORD";
15142 else if (code == 'x')
15143 size = "XMMWORD";
15145 fputs (size, file);
15146 fputs (" PTR ", file);
15149 x = XEXP (x, 0);
15150 /* Avoid (%rip) for call operands. */
15151 if (CONSTANT_ADDRESS_P (x) && code == 'P'
15152 && !CONST_INT_P (x))
15153 output_addr_const (file, x);
15154 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15155 output_operand_lossage ("invalid constraints for operand");
15156 else
15157 output_address (x);
15160 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15162 REAL_VALUE_TYPE r;
15163 long l;
15165 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15166 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15168 if (ASSEMBLER_DIALECT == ASM_ATT)
15169 putc ('$', file);
15170 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15171 if (code == 'q')
15172 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15173 (unsigned long long) (int) l);
15174 else
15175 fprintf (file, "0x%08x", (unsigned int) l);
15178 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15180 REAL_VALUE_TYPE r;
15181 long l[2];
15183 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15184 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15186 if (ASSEMBLER_DIALECT == ASM_ATT)
15187 putc ('$', file);
15188 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15191 /* These float cases don't actually occur as immediate operands. */
15192 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15194 char dstr[30];
15196 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15197 fputs (dstr, file);
15200 else
15202 /* We have patterns that allow zero sets of memory, for instance.
15203 In 64-bit mode, we should probably support all 8-byte vectors,
15204 since we can in fact encode that into an immediate. */
15205 if (GET_CODE (x) == CONST_VECTOR)
15207 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15208 x = const0_rtx;
15211 if (code != 'P' && code != 'p')
15213 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15215 if (ASSEMBLER_DIALECT == ASM_ATT)
15216 putc ('$', file);
15218 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15219 || GET_CODE (x) == LABEL_REF)
15221 if (ASSEMBLER_DIALECT == ASM_ATT)
15222 putc ('$', file);
15223 else
15224 fputs ("OFFSET FLAT:", file);
15227 if (CONST_INT_P (x))
15228 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15229 else if (flag_pic || MACHOPIC_INDIRECT)
15230 output_pic_addr_const (file, x, code);
15231 else
15232 output_addr_const (file, x);
15236 static bool
15237 ix86_print_operand_punct_valid_p (unsigned char code)
15239 return (code == '@' || code == '*' || code == '+' || code == '&'
15240 || code == ';' || code == '~' || code == '^');
15243 /* Print a memory operand whose address is ADDR. */
15245 static void
15246 ix86_print_operand_address (FILE *file, rtx addr)
15248 struct ix86_address parts;
15249 rtx base, index, disp;
15250 int scale;
15251 int ok;
15252 bool vsib = false;
15253 int code = 0;
15255 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15257 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15258 gcc_assert (parts.index == NULL_RTX);
15259 parts.index = XVECEXP (addr, 0, 1);
15260 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15261 addr = XVECEXP (addr, 0, 0);
15262 vsib = true;
15264 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15266 gcc_assert (TARGET_64BIT);
15267 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15268 code = 'q';
15270 else
15271 ok = ix86_decompose_address (addr, &parts);
15273 gcc_assert (ok);
15275 base = parts.base;
15276 index = parts.index;
15277 disp = parts.disp;
15278 scale = parts.scale;
15280 switch (parts.seg)
15282 case SEG_DEFAULT:
15283 break;
15284 case SEG_FS:
15285 case SEG_GS:
15286 if (ASSEMBLER_DIALECT == ASM_ATT)
15287 putc ('%', file);
15288 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15289 break;
15290 default:
15291 gcc_unreachable ();
15294 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15295 if (TARGET_64BIT && !base && !index)
15297 rtx symbol = disp;
15299 if (GET_CODE (disp) == CONST
15300 && GET_CODE (XEXP (disp, 0)) == PLUS
15301 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15302 symbol = XEXP (XEXP (disp, 0), 0);
15304 if (GET_CODE (symbol) == LABEL_REF
15305 || (GET_CODE (symbol) == SYMBOL_REF
15306 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15307 base = pc_rtx;
15309 if (!base && !index)
15311 /* A displacement-only address requires special attention. */
15313 if (CONST_INT_P (disp))
15315 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15316 fputs ("ds:", file);
15317 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15319 else if (flag_pic)
15320 output_pic_addr_const (file, disp, 0);
15321 else
15322 output_addr_const (file, disp);
15324 else
15326 /* Print SImode register names to force addr32 prefix. */
15327 if (SImode_address_operand (addr, VOIDmode))
15329 #ifdef ENABLE_CHECKING
15330 gcc_assert (TARGET_64BIT);
15331 switch (GET_CODE (addr))
15333 case SUBREG:
15334 gcc_assert (GET_MODE (addr) == SImode);
15335 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15336 break;
15337 case ZERO_EXTEND:
15338 case AND:
15339 gcc_assert (GET_MODE (addr) == DImode);
15340 break;
15341 default:
15342 gcc_unreachable ();
15344 #endif
15345 gcc_assert (!code);
15346 code = 'k';
15348 else if (code == 0
15349 && TARGET_X32
15350 && disp
15351 && CONST_INT_P (disp)
15352 && INTVAL (disp) < -16*1024*1024)
15354 /* X32 runs in 64-bit mode, where displacement, DISP, in
15355 address DISP(%r64), is encoded as 32-bit immediate sign-
15356 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15357 address is %r64 + 0xffffffffbffffd00. When %r64 <
15358 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15359 which is invalid for x32. The correct address is %r64
15360 - 0x40000300 == 0xf7ffdd64. To properly encode
15361 -0x40000300(%r64) for x32, we zero-extend negative
15362 displacement by forcing addr32 prefix which truncates
15363 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15364 zero-extend all negative displacements, including -1(%rsp).
15365 However, for small negative displacements, sign-extension
15366 won't cause overflow. We only zero-extend negative
15367 displacements if they are < -16*1024*1024, which is also the bound
15368 used to check legitimate address displacements for PIC. */
15369 code = 'k';
15372 if (ASSEMBLER_DIALECT == ASM_ATT)
15374 if (disp)
15376 if (flag_pic)
15377 output_pic_addr_const (file, disp, 0);
15378 else if (GET_CODE (disp) == LABEL_REF)
15379 output_asm_label (disp);
15380 else
15381 output_addr_const (file, disp);
15384 putc ('(', file);
15385 if (base)
15386 print_reg (base, code, file);
15387 if (index)
15389 putc (',', file);
15390 print_reg (index, vsib ? 0 : code, file);
15391 if (scale != 1 || vsib)
15392 fprintf (file, ",%d", scale);
15394 putc (')', file);
15396 else
15398 rtx offset = NULL_RTX;
15400 if (disp)
15402 /* Pull out the offset of a symbol; print any symbol itself. */
15403 if (GET_CODE (disp) == CONST
15404 && GET_CODE (XEXP (disp, 0)) == PLUS
15405 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15407 offset = XEXP (XEXP (disp, 0), 1);
15408 disp = gen_rtx_CONST (VOIDmode,
15409 XEXP (XEXP (disp, 0), 0));
15412 if (flag_pic)
15413 output_pic_addr_const (file, disp, 0);
15414 else if (GET_CODE (disp) == LABEL_REF)
15415 output_asm_label (disp);
15416 else if (CONST_INT_P (disp))
15417 offset = disp;
15418 else
15419 output_addr_const (file, disp);
15422 putc ('[', file);
15423 if (base)
15425 print_reg (base, code, file);
15426 if (offset)
15428 if (INTVAL (offset) >= 0)
15429 putc ('+', file);
15430 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15433 else if (offset)
15434 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15435 else
15436 putc ('0', file);
15438 if (index)
15440 putc ('+', file);
15441 print_reg (index, vsib ? 0 : code, file);
15442 if (scale != 1 || vsib)
15443 fprintf (file, "*%d", scale);
15445 putc (']', file);
15450 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15452 static bool
15453 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15455 rtx op;
15457 if (GET_CODE (x) != UNSPEC)
15458 return false;
15460 op = XVECEXP (x, 0, 0);
15461 switch (XINT (x, 1))
15463 case UNSPEC_GOTTPOFF:
15464 output_addr_const (file, op);
15465 /* FIXME: This might be @TPOFF in Sun ld. */
15466 fputs ("@gottpoff", file);
15467 break;
15468 case UNSPEC_TPOFF:
15469 output_addr_const (file, op);
15470 fputs ("@tpoff", file);
15471 break;
15472 case UNSPEC_NTPOFF:
15473 output_addr_const (file, op);
15474 if (TARGET_64BIT)
15475 fputs ("@tpoff", file);
15476 else
15477 fputs ("@ntpoff", file);
15478 break;
15479 case UNSPEC_DTPOFF:
15480 output_addr_const (file, op);
15481 fputs ("@dtpoff", file);
15482 break;
15483 case UNSPEC_GOTNTPOFF:
15484 output_addr_const (file, op);
15485 if (TARGET_64BIT)
15486 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15487 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15488 else
15489 fputs ("@gotntpoff", file);
15490 break;
15491 case UNSPEC_INDNTPOFF:
15492 output_addr_const (file, op);
15493 fputs ("@indntpoff", file);
15494 break;
15495 #if TARGET_MACHO
15496 case UNSPEC_MACHOPIC_OFFSET:
15497 output_addr_const (file, op);
15498 putc ('-', file);
15499 machopic_output_function_base_name (file);
15500 break;
15501 #endif
15503 case UNSPEC_STACK_CHECK:
15505 int offset;
15507 gcc_assert (flag_split_stack);
15509 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15510 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15511 #else
15512 gcc_unreachable ();
15513 #endif
15515 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15517 break;
15519 default:
15520 return false;
15523 return true;
15526 /* Split one or more double-mode RTL references into pairs of half-mode
15527 references. The RTL can be REG, offsettable MEM, integer constant, or
15528 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15529 split and "num" is its length. lo_half and hi_half are output arrays
15530 that parallel "operands". */
15532 void
15533 split_double_mode (enum machine_mode mode, rtx operands[],
15534 int num, rtx lo_half[], rtx hi_half[])
15536 enum machine_mode half_mode;
15537 unsigned int byte;
15539 switch (mode)
15541 case TImode:
15542 half_mode = DImode;
15543 break;
15544 case DImode:
15545 half_mode = SImode;
15546 break;
15547 default:
15548 gcc_unreachable ();
15551 byte = GET_MODE_SIZE (half_mode);
15553 while (num--)
15555 rtx op = operands[num];
15557 /* simplify_subreg refuses to split volatile memory addresses,
15558 but we still have to handle them. */
15559 if (MEM_P (op))
15561 lo_half[num] = adjust_address (op, half_mode, 0);
15562 hi_half[num] = adjust_address (op, half_mode, byte);
15564 else
15566 lo_half[num] = simplify_gen_subreg (half_mode, op,
15567 GET_MODE (op) == VOIDmode
15568 ? mode : GET_MODE (op), 0);
15569 hi_half[num] = simplify_gen_subreg (half_mode, op,
15570 GET_MODE (op) == VOIDmode
15571 ? mode : GET_MODE (op), byte);
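/* An illustrative sketch of the splitting above: on this little-endian
   target, splitting the DImode constant 0x1122334455667788 yields an
   SImode lo_half of 0x55667788 and an SImode hi_half of 0x11223344,
   while an offsettable DImode MEM is split into two SImode MEMs at
   byte offsets 0 and 4.  */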
15576 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15577 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15578 is the expression of the binary operation. The output may either be
15579 emitted here, or returned to the caller, like all output_* functions.
15581 There is no guarantee that the operands are the same mode, as they
15582 might be within FLOAT or FLOAT_EXTEND expressions. */
15584 #ifndef SYSV386_COMPAT
15585 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15586 wants to fix the assemblers because that causes incompatibility
15587 with gcc. No-one wants to fix gcc because that causes
15588 incompatibility with assemblers... You can use the option of
15589 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15590 #define SYSV386_COMPAT 1
15591 #endif
15593 const char *
15594 output_387_binary_op (rtx insn, rtx *operands)
15596 static char buf[40];
15597 const char *p;
15598 const char *ssep;
15599 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15601 #ifdef ENABLE_CHECKING
15602 /* Even if we do not want to check the inputs, this documents the input
15603 constraints, which helps in understanding the following code. */
15604 if (STACK_REG_P (operands[0])
15605 && ((REG_P (operands[1])
15606 && REGNO (operands[0]) == REGNO (operands[1])
15607 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15608 || (REG_P (operands[2])
15609 && REGNO (operands[0]) == REGNO (operands[2])
15610 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15611 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15612 ; /* ok */
15613 else
15614 gcc_assert (is_sse);
15615 #endif
15617 switch (GET_CODE (operands[3]))
15619 case PLUS:
15620 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15621 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15622 p = "fiadd";
15623 else
15624 p = "fadd";
15625 ssep = "vadd";
15626 break;
15628 case MINUS:
15629 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15630 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15631 p = "fisub";
15632 else
15633 p = "fsub";
15634 ssep = "vsub";
15635 break;
15637 case MULT:
15638 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15639 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15640 p = "fimul";
15641 else
15642 p = "fmul";
15643 ssep = "vmul";
15644 break;
15646 case DIV:
15647 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15648 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15649 p = "fidiv";
15650 else
15651 p = "fdiv";
15652 ssep = "vdiv";
15653 break;
15655 default:
15656 gcc_unreachable ();
15659 if (is_sse)
15661 if (TARGET_AVX)
15663 strcpy (buf, ssep);
15664 if (GET_MODE (operands[0]) == SFmode)
15665 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15666 else
15667 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15669 else
15671 strcpy (buf, ssep + 1);
15672 if (GET_MODE (operands[0]) == SFmode)
15673 strcat (buf, "ss\t{%2, %0|%0, %2}");
15674 else
15675 strcat (buf, "sd\t{%2, %0|%0, %2}");
15677 return buf;
15679 strcpy (buf, p);
15681 switch (GET_CODE (operands[3]))
15683 case MULT:
15684 case PLUS:
15685 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15687 rtx temp = operands[2];
15688 operands[2] = operands[1];
15689 operands[1] = temp;
15692 /* We now know that operands[0] == operands[1]. */
15694 if (MEM_P (operands[2]))
15696 p = "%Z2\t%2";
15697 break;
15700 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15702 if (STACK_TOP_P (operands[0]))
15703 /* How is it that we are storing to a dead operand[2]?
15704 Well, presumably operands[1] is dead too. We can't
15705 store the result to st(0) as st(0) gets popped on this
15706 instruction. Instead store to operands[2] (which I
15707 think has to be st(1)). st(1) will be popped later.
15708 gcc <= 2.8.1 didn't have this check and generated
15709 assembly code that the Unixware assembler rejected. */
15710 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15711 else
15712 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15713 break;
15716 if (STACK_TOP_P (operands[0]))
15717 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15718 else
15719 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15720 break;
15722 case MINUS:
15723 case DIV:
15724 if (MEM_P (operands[1]))
15726 p = "r%Z1\t%1";
15727 break;
15730 if (MEM_P (operands[2]))
15732 p = "%Z2\t%2";
15733 break;
15736 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15738 #if SYSV386_COMPAT
15739 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15740 derived assemblers, confusingly reverse the direction of
15741 the operation for fsub{r} and fdiv{r} when the
15742 destination register is not st(0). The Intel assembler
15743 doesn't have this brain damage. Read !SYSV386_COMPAT to
15744 figure out what the hardware really does. */
15745 if (STACK_TOP_P (operands[0]))
15746 p = "{p\t%0, %2|rp\t%2, %0}";
15747 else
15748 p = "{rp\t%2, %0|p\t%0, %2}";
15749 #else
15750 if (STACK_TOP_P (operands[0]))
15751 /* As above for fmul/fadd, we can't store to st(0). */
15752 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15753 else
15754 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15755 #endif
15756 break;
15759 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15761 #if SYSV386_COMPAT
15762 if (STACK_TOP_P (operands[0]))
15763 p = "{rp\t%0, %1|p\t%1, %0}";
15764 else
15765 p = "{p\t%1, %0|rp\t%0, %1}";
15766 #else
15767 if (STACK_TOP_P (operands[0]))
15768 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15769 else
15770 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15771 #endif
15772 break;
15775 if (STACK_TOP_P (operands[0]))
15777 if (STACK_TOP_P (operands[1]))
15778 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15779 else
15780 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15781 break;
15783 else if (STACK_TOP_P (operands[1]))
15785 #if SYSV386_COMPAT
15786 p = "{\t%1, %0|r\t%0, %1}";
15787 #else
15788 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15789 #endif
15791 else
15793 #if SYSV386_COMPAT
15794 p = "{r\t%2, %0|\t%0, %2}";
15795 #else
15796 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15797 #endif
15799 break;
15801 default:
15802 gcc_unreachable ();
15805 strcat (buf, p);
15806 return buf;
15809 /* Check if a 256bit AVX register is referenced inside of EXP. */
15811 static int
15812 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15814 rtx exp = *pexp;
15816 if (GET_CODE (exp) == SUBREG)
15817 exp = SUBREG_REG (exp);
15819 if (REG_P (exp)
15820 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15821 return 1;
15823 return 0;
15826 /* Return needed mode for entity in optimize_mode_switching pass. */
15828 static int
15829 ix86_avx_u128_mode_needed (rtx insn)
15831 if (CALL_P (insn))
15833 rtx link;
15835 /* Needed mode is set to AVX_U128_CLEAN if there are
15836 no 256bit modes used in function arguments. */
15837 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15838 link;
15839 link = XEXP (link, 1))
15841 if (GET_CODE (XEXP (link, 0)) == USE)
15843 rtx arg = XEXP (XEXP (link, 0), 0);
15845 if (ix86_check_avx256_register (&arg, NULL))
15846 return AVX_U128_ANY;
15850 return AVX_U128_CLEAN;
15853 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15854 changes state only when a 256bit register is written to, but we need
15855 to prevent the compiler from moving the optimal insertion point above
15856 an eventual read from a 256bit register. */
15857 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15858 return AVX_U128_DIRTY;
15860 return AVX_U128_ANY;
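/* For example (a sketch of the intended mode-switching behaviour): if a
   function has been using 256bit ymm registers (DIRTY) and then calls a
   routine whose argument list contains no 256bit values, the mode needed
   at the call is CLEAN, so the mode-switching pass inserts a vzeroupper
   before the call to avoid AVX/SSE transition penalties.  */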
15863 /* Return mode that i387 must be switched into
15864 prior to the execution of insn. */
15866 static int
15867 ix86_i387_mode_needed (int entity, rtx insn)
15869 enum attr_i387_cw mode;
15871 /* The mode UNINITIALIZED is used to store control word after a
15872 function call or ASM pattern. The mode ANY specifies that the function
15873 has no requirements on the control word and makes no changes in the
15874 bits we are interested in. */
15876 if (CALL_P (insn)
15877 || (NONJUMP_INSN_P (insn)
15878 && (asm_noperands (PATTERN (insn)) >= 0
15879 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15880 return I387_CW_UNINITIALIZED;
15882 if (recog_memoized (insn) < 0)
15883 return I387_CW_ANY;
15885 mode = get_attr_i387_cw (insn);
15887 switch (entity)
15889 case I387_TRUNC:
15890 if (mode == I387_CW_TRUNC)
15891 return mode;
15892 break;
15894 case I387_FLOOR:
15895 if (mode == I387_CW_FLOOR)
15896 return mode;
15897 break;
15899 case I387_CEIL:
15900 if (mode == I387_CW_CEIL)
15901 return mode;
15902 break;
15904 case I387_MASK_PM:
15905 if (mode == I387_CW_MASK_PM)
15906 return mode;
15907 break;
15909 default:
15910 gcc_unreachable ();
15913 return I387_CW_ANY;
15916 /* Return mode that entity must be switched into
15917 prior to the execution of insn. */
15920 ix86_mode_needed (int entity, rtx insn)
15922 switch (entity)
15924 case AVX_U128:
15925 return ix86_avx_u128_mode_needed (insn);
15926 case I387_TRUNC:
15927 case I387_FLOOR:
15928 case I387_CEIL:
15929 case I387_MASK_PM:
15930 return ix86_i387_mode_needed (entity, insn);
15931 default:
15932 gcc_unreachable ();
15934 return 0;
15937 /* Check if a 256bit AVX register is referenced in stores. */
15939 static void
15940 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15942 if (ix86_check_avx256_register (&dest, NULL))
15944 bool *used = (bool *) data;
15945 *used = true;
15949 /* Calculate mode of upper 128bit AVX registers after the insn. */
15951 static int
15952 ix86_avx_u128_mode_after (int mode, rtx insn)
15954 rtx pat = PATTERN (insn);
15956 if (vzeroupper_operation (pat, VOIDmode)
15957 || vzeroall_operation (pat, VOIDmode))
15958 return AVX_U128_CLEAN;
15960 /* We know that the state is clean after a CALL insn if no 256bit
15961 register is used for the function return value. */
15962 if (CALL_P (insn))
15964 bool avx_reg256_found = false;
15965 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15966 if (!avx_reg256_found)
15967 return AVX_U128_CLEAN;
15970 /* Otherwise, return current mode. Remember that if insn
15971 references AVX 256bit registers, the mode was already changed
15972 to DIRTY from MODE_NEEDED. */
15973 return mode;
15976 /* Return the mode that an insn results in. */
15979 ix86_mode_after (int entity, int mode, rtx insn)
15981 switch (entity)
15983 case AVX_U128:
15984 return ix86_avx_u128_mode_after (mode, insn);
15985 case I387_TRUNC:
15986 case I387_FLOOR:
15987 case I387_CEIL:
15988 case I387_MASK_PM:
15989 return mode;
15990 default:
15991 gcc_unreachable ();
15995 static int
15996 ix86_avx_u128_mode_entry (void)
15998 tree arg;
16000 /* Entry mode is set to AVX_U128_DIRTY if there are
16001 256bit modes used in function arguments. */
16002 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
16003 arg = TREE_CHAIN (arg))
16005 rtx incoming = DECL_INCOMING_RTL (arg);
16007 if (incoming && ix86_check_avx256_register (&incoming, NULL))
16008 return AVX_U128_DIRTY;
16011 return AVX_U128_CLEAN;
16014 /* Return a mode that ENTITY is assumed to be
16015 switched to at function entry. */
16018 ix86_mode_entry (int entity)
16020 switch (entity)
16022 case AVX_U128:
16023 return ix86_avx_u128_mode_entry ();
16024 case I387_TRUNC:
16025 case I387_FLOOR:
16026 case I387_CEIL:
16027 case I387_MASK_PM:
16028 return I387_CW_ANY;
16029 default:
16030 gcc_unreachable ();
16034 static int
16035 ix86_avx_u128_mode_exit (void)
16037 rtx reg = crtl->return_rtx;
16039 /* Exit mode is set to AVX_U128_DIRTY if there are
16040 256bit modes used in the function return register. */
16041 if (reg && ix86_check_avx256_register (&reg, NULL))
16042 return AVX_U128_DIRTY;
16044 return AVX_U128_CLEAN;
16047 /* Return a mode that ENTITY is assumed to be
16048 switched to at function exit. */
16051 ix86_mode_exit (int entity)
16053 switch (entity)
16055 case AVX_U128:
16056 return ix86_avx_u128_mode_exit ();
16057 case I387_TRUNC:
16058 case I387_FLOOR:
16059 case I387_CEIL:
16060 case I387_MASK_PM:
16061 return I387_CW_ANY;
16062 default:
16063 gcc_unreachable ();
16067 /* Output code to initialize the control word copies used by trunc?f?i and
16068 rounding patterns. CURRENT_MODE is set to the current control word,
16069 while NEW_MODE is set to the new control word. */
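/* For reference, the magic constants used below follow the x87 control
   word layout: the rounding-control field occupies bits 10-11
   (00 = nearest, 01 = down, 10 = up, 11 = truncate), hence the 0x0400,
   0x0800 and 0x0c00 masks, and the precision-exception mask PM is
   bit 5 (0x0020).  */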
16071 static void
16072 emit_i387_cw_initialization (int mode)
16074 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
16075 rtx new_mode;
16077 enum ix86_stack_slot slot;
16079 rtx reg = gen_reg_rtx (HImode);
16081 emit_insn (gen_x86_fnstcw_1 (stored_mode));
16082 emit_move_insn (reg, copy_rtx (stored_mode));
16084 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
16085 || optimize_insn_for_size_p ())
16087 switch (mode)
16089 case I387_CW_TRUNC:
16090 /* round toward zero (truncate) */
16091 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
16092 slot = SLOT_CW_TRUNC;
16093 break;
16095 case I387_CW_FLOOR:
16096 /* round down toward -oo */
16097 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16098 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
16099 slot = SLOT_CW_FLOOR;
16100 break;
16102 case I387_CW_CEIL:
16103 /* round up toward +oo */
16104 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16105 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
16106 slot = SLOT_CW_CEIL;
16107 break;
16109 case I387_CW_MASK_PM:
16110 /* mask precision exception for nearbyint() */
16111 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16112 slot = SLOT_CW_MASK_PM;
16113 break;
16115 default:
16116 gcc_unreachable ();
16119 else
16121 switch (mode)
16123 case I387_CW_TRUNC:
16124 /* round toward zero (truncate) */
16125 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
16126 slot = SLOT_CW_TRUNC;
16127 break;
16129 case I387_CW_FLOOR:
16130 /* round down toward -oo */
16131 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
16132 slot = SLOT_CW_FLOOR;
16133 break;
16135 case I387_CW_CEIL:
16136 /* round up toward +oo */
16137 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16138 slot = SLOT_CW_CEIL;
16139 break;
16141 case I387_CW_MASK_PM:
16142 /* mask precision exception for nearbyint() */
16143 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16144 slot = SLOT_CW_MASK_PM;
16145 break;
16147 default:
16148 gcc_unreachable ();
16152 gcc_assert (slot < MAX_386_STACK_LOCALS);
16154 new_mode = assign_386_stack_local (HImode, slot);
16155 emit_move_insn (new_mode, reg);
16158 /* Emit vzeroupper. */
16160 void
16161 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16163 int i;
16165 /* Cancel automatic vzeroupper insertion if there are
16166 live call-saved SSE registers at the insertion point. */
16168 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16169 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16170 return;
16172 if (TARGET_64BIT)
16173 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16174 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16175 return;
16177 emit_insn (gen_avx_vzeroupper ());
16180 /* Generate one or more insns to set ENTITY to MODE. */
16182 void
16183 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16185 switch (entity)
16187 case AVX_U128:
16188 if (mode == AVX_U128_CLEAN)
16189 ix86_avx_emit_vzeroupper (regs_live);
16190 break;
16191 case I387_TRUNC:
16192 case I387_FLOOR:
16193 case I387_CEIL:
16194 case I387_MASK_PM:
16195 if (mode != I387_CW_ANY
16196 && mode != I387_CW_UNINITIALIZED)
16197 emit_i387_cw_initialization (mode);
16198 break;
16199 default:
16200 gcc_unreachable ();
16204 /* Output code for INSN to convert a float to a signed int. OPERANDS
16205 are the insn operands. The output may be [HSD]Imode and the input
16206 operand may be [SDX]Fmode. */
16208 const char *
16209 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16211 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16212 int dimode_p = GET_MODE (operands[0]) == DImode;
16213 int round_mode = get_attr_i387_cw (insn);
16215 /* Jump through a hoop or two for DImode, since the hardware has no
16216 non-popping instruction. We used to do this a different way, but
16217 that was somewhat fragile and broke with post-reload splitters. */
16218 if ((dimode_p || fisttp) && !stack_top_dies)
16219 output_asm_insn ("fld\t%y1", operands);
16221 gcc_assert (STACK_TOP_P (operands[1]));
16222 gcc_assert (MEM_P (operands[0]));
16223 gcc_assert (GET_MODE (operands[1]) != TFmode);
16225 if (fisttp)
16226 output_asm_insn ("fisttp%Z0\t%0", operands);
16227 else
16229 if (round_mode != I387_CW_ANY)
16230 output_asm_insn ("fldcw\t%3", operands);
16231 if (stack_top_dies || dimode_p)
16232 output_asm_insn ("fistp%Z0\t%0", operands);
16233 else
16234 output_asm_insn ("fist%Z0\t%0", operands);
16235 if (round_mode != I387_CW_ANY)
16236 output_asm_insn ("fldcw\t%2", operands);
16239 return "";
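/* As a sketch of the above: for a DImode result where the stack top does
   not die, the value is first duplicated with "fld %st(0)"; then, unless
   fisttp is available, the popping fistp is bracketed by two "fldcw"
   instructions (operands 3 and 2) that switch the rounding mode to
   truncation and back.  */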
16242 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16243 have the values zero or one, indicates the ffreep insn's operand
16244 from the OPERANDS array. */
16246 static const char *
16247 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16249 if (TARGET_USE_FFREEP)
16250 #ifdef HAVE_AS_IX86_FFREEP
16251 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16252 #else
16254 static char retval[32];
16255 int regno = REGNO (operands[opno]);
16257 gcc_assert (STACK_REGNO_P (regno));
16259 regno -= FIRST_STACK_REG;
16261 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16262 return retval;
16264 #endif
16266 return opno ? "fstp\t%y1" : "fstp\t%y0";
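/* A note on the raw encoding used above: "ffreep %st(i)" is the two-byte
   opcode 0xdf 0xc0+i, so emitting ASM_SHORT "0xc%ddf" (a little-endian
   16-bit word) produces exactly those bytes when the assembler itself
   lacks ffreep support.  */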
16270 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16271 should be used. UNORDERED_P is true when fucom should be used. */
16273 const char *
16274 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16276 int stack_top_dies;
16277 rtx cmp_op0, cmp_op1;
16278 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16280 if (eflags_p)
16282 cmp_op0 = operands[0];
16283 cmp_op1 = operands[1];
16285 else
16287 cmp_op0 = operands[1];
16288 cmp_op1 = operands[2];
16291 if (is_sse)
16293 if (GET_MODE (operands[0]) == SFmode)
16294 if (unordered_p)
16295 return "%vucomiss\t{%1, %0|%0, %1}";
16296 else
16297 return "%vcomiss\t{%1, %0|%0, %1}";
16298 else
16299 if (unordered_p)
16300 return "%vucomisd\t{%1, %0|%0, %1}";
16301 else
16302 return "%vcomisd\t{%1, %0|%0, %1}";
16305 gcc_assert (STACK_TOP_P (cmp_op0));
16307 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16309 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16311 if (stack_top_dies)
16313 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16314 return output_387_ffreep (operands, 1);
16316 else
16317 return "ftst\n\tfnstsw\t%0";
16320 if (STACK_REG_P (cmp_op1)
16321 && stack_top_dies
16322 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16323 && REGNO (cmp_op1) != FIRST_STACK_REG)
16325 /* If the top of the 387 stack dies, and the other operand
16326 is also a stack register that dies, then this must be a
16327 `fcompp' float compare. */
16329 if (eflags_p)
16331 /* There is no double popping fcomi variant. Fortunately,
16332 eflags is immune from the fstp's cc clobbering. */
16333 if (unordered_p)
16334 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16335 else
16336 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16337 return output_387_ffreep (operands, 0);
16339 else
16341 if (unordered_p)
16342 return "fucompp\n\tfnstsw\t%0";
16343 else
16344 return "fcompp\n\tfnstsw\t%0";
16347 else
16349 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16351 static const char * const alt[16] =
16353 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16354 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16355 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16356 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16358 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16359 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16360 NULL,
16361 NULL,
16363 "fcomi\t{%y1, %0|%0, %y1}",
16364 "fcomip\t{%y1, %0|%0, %y1}",
16365 "fucomi\t{%y1, %0|%0, %y1}",
16366 "fucomip\t{%y1, %0|%0, %y1}",
16368 NULL,
16369 NULL,
16370 NULL,
16371 NULL
16374 int mask;
16375 const char *ret;
16377 mask = eflags_p << 3;
16378 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16379 mask |= unordered_p << 1;
16380 mask |= stack_top_dies;
16382 gcc_assert (mask < 16);
16383 ret = alt[mask];
16384 gcc_assert (ret);
16386 return ret;
16390 void
16391 ix86_output_addr_vec_elt (FILE *file, int value)
16393 const char *directive = ASM_LONG;
16395 #ifdef ASM_QUAD
16396 if (TARGET_LP64)
16397 directive = ASM_QUAD;
16398 #else
16399 gcc_assert (!TARGET_64BIT);
16400 #endif
16402 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16405 void
16406 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16408 const char *directive = ASM_LONG;
16410 #ifdef ASM_QUAD
16411 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16412 directive = ASM_QUAD;
16413 #else
16414 gcc_assert (!TARGET_64BIT);
16415 #endif
16416 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16417 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16418 fprintf (file, "%s%s%d-%s%d\n",
16419 directive, LPREFIX, value, LPREFIX, rel);
16420 else if (HAVE_AS_GOTOFF_IN_DATA)
16421 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16422 #if TARGET_MACHO
16423 else if (TARGET_MACHO)
16425 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16426 machopic_output_function_base_name (file);
16427 putc ('\n', file);
16429 #endif
16430 else
16431 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16432 GOT_SYMBOL_NAME, LPREFIX, value);
16435 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16436 for the target. */
16438 void
16439 ix86_expand_clear (rtx dest)
16441 rtx tmp;
16443 /* We play register width games, which are only valid after reload. */
16444 gcc_assert (reload_completed);
16446 /* Avoid HImode and its attendant prefix byte. */
16447 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16448 dest = gen_rtx_REG (SImode, REGNO (dest));
16449 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16451 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16452 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16454 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16455 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16458 emit_insn (tmp);
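/* For instance, clearing %eax is normally emitted as "xorl %eax, %eax",
   which is shorter than "movl $0, %eax" but clobbers the flags; that is
   why a CLOBBER of FLAGS_REG is attached to the SET above.  The plain
   "mov $0" form is only kept for TARGET_USE_MOV0 targets when the insn
   is not being optimized for speed.  */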
16461 /* X is an unchanging MEM. If it is a constant pool reference, return
16462 the constant pool rtx, else NULL. */
16465 maybe_get_pool_constant (rtx x)
16467 x = ix86_delegitimize_address (XEXP (x, 0));
16469 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16470 return get_pool_constant (x);
16472 return NULL_RTX;
16475 void
16476 ix86_expand_move (enum machine_mode mode, rtx operands[])
16478 rtx op0, op1;
16479 enum tls_model model;
16481 op0 = operands[0];
16482 op1 = operands[1];
16484 if (GET_CODE (op1) == SYMBOL_REF)
16486 rtx tmp;
16488 model = SYMBOL_REF_TLS_MODEL (op1);
16489 if (model)
16491 op1 = legitimize_tls_address (op1, model, true);
16492 op1 = force_operand (op1, op0);
16493 if (op1 == op0)
16494 return;
16495 op1 = convert_to_mode (mode, op1, 1);
16497 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16498 op1 = tmp;
16500 else if (GET_CODE (op1) == CONST
16501 && GET_CODE (XEXP (op1, 0)) == PLUS
16502 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16504 rtx addend = XEXP (XEXP (op1, 0), 1);
16505 rtx symbol = XEXP (XEXP (op1, 0), 0);
16506 rtx tmp;
16508 model = SYMBOL_REF_TLS_MODEL (symbol);
16509 if (model)
16510 tmp = legitimize_tls_address (symbol, model, true);
16511 else
16512 tmp = legitimize_pe_coff_symbol (symbol, true);
16514 if (tmp)
16516 tmp = force_operand (tmp, NULL);
16517 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16518 op0, 1, OPTAB_DIRECT);
16519 if (tmp == op0)
16520 return;
16521 op1 = convert_to_mode (mode, tmp, 1);
16525 if ((flag_pic || MACHOPIC_INDIRECT)
16526 && symbolic_operand (op1, mode))
16528 if (TARGET_MACHO && !TARGET_64BIT)
16530 #if TARGET_MACHO
16531 /* dynamic-no-pic */
16532 if (MACHOPIC_INDIRECT)
16534 rtx temp = ((reload_in_progress
16535 || ((op0 && REG_P (op0))
16536 && mode == Pmode))
16537 ? op0 : gen_reg_rtx (Pmode));
16538 op1 = machopic_indirect_data_reference (op1, temp);
16539 if (MACHOPIC_PURE)
16540 op1 = machopic_legitimize_pic_address (op1, mode,
16541 temp == op1 ? 0 : temp);
16543 if (op0 != op1 && GET_CODE (op0) != MEM)
16545 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16546 emit_insn (insn);
16547 return;
16549 if (GET_CODE (op0) == MEM)
16550 op1 = force_reg (Pmode, op1);
16551 else
16553 rtx temp = op0;
16554 if (GET_CODE (temp) != REG)
16555 temp = gen_reg_rtx (Pmode);
16556 temp = legitimize_pic_address (op1, temp);
16557 if (temp == op0)
16558 return;
16559 op1 = temp;
16561 /* dynamic-no-pic */
16562 #endif
16564 else
16566 if (MEM_P (op0))
16567 op1 = force_reg (mode, op1);
16568 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16570 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16571 op1 = legitimize_pic_address (op1, reg);
16572 if (op0 == op1)
16573 return;
16574 op1 = convert_to_mode (mode, op1, 1);
16578 else
16580 if (MEM_P (op0)
16581 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16582 || !push_operand (op0, mode))
16583 && MEM_P (op1))
16584 op1 = force_reg (mode, op1);
16586 if (push_operand (op0, mode)
16587 && ! general_no_elim_operand (op1, mode))
16588 op1 = copy_to_mode_reg (mode, op1);
16590 /* Force large constants in 64bit compilation into registers
16591 to get them CSEed. */
16592 if (can_create_pseudo_p ()
16593 && (mode == DImode) && TARGET_64BIT
16594 && immediate_operand (op1, mode)
16595 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16596 && !register_operand (op0, mode)
16597 && optimize)
16598 op1 = copy_to_mode_reg (mode, op1);
16600 if (can_create_pseudo_p ()
16601 && FLOAT_MODE_P (mode)
16602 && GET_CODE (op1) == CONST_DOUBLE)
16604 /* If we are loading a floating point constant to a register,
16605 force the value to memory now, since we'll get better code
16606 out of the back end. */
16608 op1 = validize_mem (force_const_mem (mode, op1));
16609 if (!register_operand (op0, mode))
16611 rtx temp = gen_reg_rtx (mode);
16612 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16613 emit_move_insn (op0, temp);
16614 return;
16619 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16622 void
16623 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16625 rtx op0 = operands[0], op1 = operands[1];
16626 unsigned int align = GET_MODE_ALIGNMENT (mode);
16628 /* Force constants other than zero into memory. We do not know how
16629 the instructions used to build constants modify the upper 64 bits
16630 of the register; once we have that information we may be able
16631 to handle some of them more efficiently. */
16632 if (can_create_pseudo_p ()
16633 && register_operand (op0, mode)
16634 && (CONSTANT_P (op1)
16635 || (GET_CODE (op1) == SUBREG
16636 && CONSTANT_P (SUBREG_REG (op1))))
16637 && !standard_sse_constant_p (op1))
16638 op1 = validize_mem (force_const_mem (mode, op1));
16640 /* We need to check memory alignment for SSE modes since attributes
16641 can make operands unaligned. */
16642 if (can_create_pseudo_p ()
16643 && SSE_REG_MODE_P (mode)
16644 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16645 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16647 rtx tmp[2];
16649 /* ix86_expand_vector_move_misalign() does not like constants ... */
16650 if (CONSTANT_P (op1)
16651 || (GET_CODE (op1) == SUBREG
16652 && CONSTANT_P (SUBREG_REG (op1))))
16653 op1 = validize_mem (force_const_mem (mode, op1));
16655 /* ... nor both arguments in memory. */
16656 if (!register_operand (op0, mode)
16657 && !register_operand (op1, mode))
16658 op1 = force_reg (mode, op1);
16660 tmp[0] = op0; tmp[1] = op1;
16661 ix86_expand_vector_move_misalign (mode, tmp);
16662 return;
16665 /* Make operand1 a register if it isn't already. */
16666 if (can_create_pseudo_p ()
16667 && !register_operand (op0, mode)
16668 && !register_operand (op1, mode))
16670 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16671 return;
16674 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16677 /* Split 32-byte AVX unaligned load and store if needed. */
16679 static void
16680 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16682 rtx m;
16683 rtx (*extract) (rtx, rtx, rtx);
16684 rtx (*load_unaligned) (rtx, rtx);
16685 rtx (*store_unaligned) (rtx, rtx);
16686 enum machine_mode mode;
16688 switch (GET_MODE (op0))
16690 default:
16691 gcc_unreachable ();
16692 case V32QImode:
16693 extract = gen_avx_vextractf128v32qi;
16694 load_unaligned = gen_avx_loaddqu256;
16695 store_unaligned = gen_avx_storedqu256;
16696 mode = V16QImode;
16697 break;
16698 case V8SFmode:
16699 extract = gen_avx_vextractf128v8sf;
16700 load_unaligned = gen_avx_loadups256;
16701 store_unaligned = gen_avx_storeups256;
16702 mode = V4SFmode;
16703 break;
16704 case V4DFmode:
16705 extract = gen_avx_vextractf128v4df;
16706 load_unaligned = gen_avx_loadupd256;
16707 store_unaligned = gen_avx_storeupd256;
16708 mode = V2DFmode;
16709 break;
16712 if (MEM_P (op1))
16714 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16716 rtx r = gen_reg_rtx (mode);
16717 m = adjust_address (op1, mode, 0);
16718 emit_move_insn (r, m);
16719 m = adjust_address (op1, mode, 16);
16720 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16721 emit_move_insn (op0, r);
16723 else
16724 emit_insn (load_unaligned (op0, op1));
16726 else if (MEM_P (op0))
16728 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16730 m = adjust_address (op0, mode, 0);
16731 emit_insn (extract (m, op1, const0_rtx));
16732 m = adjust_address (op0, mode, 16);
16733 emit_insn (extract (m, op1, const1_rtx));
16735 else
16736 emit_insn (store_unaligned (op0, op1));
16738 else
16739 gcc_unreachable ();
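/* As an illustration of the splitting above: with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD, an unaligned 32-byte V8SF load is
   expanded as a 16-byte load of the low half followed by a VEC_CONCAT
   with the high half (typically assembling to vmovups plus vinsertf128),
   while the store path uses two vextractf128 stores at offsets 0 and 16.  */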
16742 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16743 straight to ix86_expand_vector_move. */
16744 /* Code generation for scalar reg-reg moves of single and double precision data:
16745 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16746 movaps reg, reg
16747 else
16748 movss reg, reg
16749 if (x86_sse_partial_reg_dependency == true)
16750 movapd reg, reg
16751 else
16752 movsd reg, reg
16754 Code generation for scalar loads of double precision data:
16755 if (x86_sse_split_regs == true)
16756 movlpd mem, reg (gas syntax)
16757 else
16758 movsd mem, reg
16760 Code generation for unaligned packed loads of single precision data
16761 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16762 if (x86_sse_unaligned_move_optimal)
16763 movups mem, reg
16765 if (x86_sse_partial_reg_dependency == true)
16767 xorps reg, reg
16768 movlps mem, reg
16769 movhps mem+8, reg
16771 else
16773 movlps mem, reg
16774 movhps mem+8, reg
16777 Code generation for unaligned packed loads of double precision data
16778 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16779 if (x86_sse_unaligned_move_optimal)
16780 movupd mem, reg
16782 if (x86_sse_split_regs == true)
16784 movlpd mem, reg
16785 movhpd mem+8, reg
16787 else
16789 movsd mem, reg
16790 movhpd mem+8, reg
16794 void
16795 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16797 rtx op0, op1, m;
16799 op0 = operands[0];
16800 op1 = operands[1];
16802 if (TARGET_AVX
16803 && GET_MODE_SIZE (mode) == 32)
16805 switch (GET_MODE_CLASS (mode))
16807 case MODE_VECTOR_INT:
16808 case MODE_INT:
16809 op0 = gen_lowpart (V32QImode, op0);
16810 op1 = gen_lowpart (V32QImode, op1);
16811 /* FALLTHRU */
16813 case MODE_VECTOR_FLOAT:
16814 ix86_avx256_split_vector_move_misalign (op0, op1);
16815 break;
16817 default:
16818 gcc_unreachable ();
16821 return;
16824 if (MEM_P (op1))
16826 /* ??? If we have typed data, then it would appear that using
16827 movdqu is the only way to get unaligned data loaded with
16828 integer type. */
16829 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16831 op0 = gen_lowpart (V16QImode, op0);
16832 op1 = gen_lowpart (V16QImode, op1);
16833 /* We will eventually emit movups based on insn attributes. */
16834 emit_insn (gen_sse2_loaddqu (op0, op1));
16836 else if (TARGET_SSE2 && mode == V2DFmode)
16838 rtx zero;
16840 if (TARGET_AVX
16841 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16842 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16843 || optimize_insn_for_size_p ())
16845 /* We will eventually emit movups based on insn attributes. */
16846 emit_insn (gen_sse2_loadupd (op0, op1));
16847 return;
16850 /* When SSE registers are split into halves, we can avoid
16851 writing to the top half twice. */
16852 if (TARGET_SSE_SPLIT_REGS)
16854 emit_clobber (op0);
16855 zero = op0;
16857 else
16859 /* ??? Not sure about the best option for the Intel chips.
16860 The following would seem to satisfy; the register is
16861 entirely cleared, breaking the dependency chain. We
16862 then store to the upper half, with a dependency depth
16863 of one. A rumor has it that Intel recommends two movsd
16864 followed by an unpacklpd, but this is unconfirmed. And
16865 given that the dependency depth of the unpacklpd would
16866 still be one, I'm not sure why this would be better. */
16867 zero = CONST0_RTX (V2DFmode);
16870 m = adjust_address (op1, DFmode, 0);
16871 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16872 m = adjust_address (op1, DFmode, 8);
16873 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16875 else
16877 if (TARGET_AVX
16878 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16879 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16880 || optimize_insn_for_size_p ())
16882 op0 = gen_lowpart (V4SFmode, op0);
16883 op1 = gen_lowpart (V4SFmode, op1);
16884 emit_insn (gen_sse_loadups (op0, op1));
16885 return;
16888 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16889 emit_move_insn (op0, CONST0_RTX (mode));
16890 else
16891 emit_clobber (op0);
16893 if (mode != V4SFmode)
16894 op0 = gen_lowpart (V4SFmode, op0);
16896 m = adjust_address (op1, V2SFmode, 0);
16897 emit_insn (gen_sse_loadlps (op0, op0, m));
16898 m = adjust_address (op1, V2SFmode, 8);
16899 emit_insn (gen_sse_loadhps (op0, op0, m));
16902 else if (MEM_P (op0))
16904 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16906 op0 = gen_lowpart (V16QImode, op0);
16907 op1 = gen_lowpart (V16QImode, op1);
16908 /* We will eventually emit movups based on insn attributes. */
16909 emit_insn (gen_sse2_storedqu (op0, op1));
16911 else if (TARGET_SSE2 && mode == V2DFmode)
16913 if (TARGET_AVX
16914 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16915 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16916 || optimize_insn_for_size_p ())
16917 /* We will eventually emit movups based on insn attributes. */
16918 emit_insn (gen_sse2_storeupd (op0, op1));
16919 else
16921 m = adjust_address (op0, DFmode, 0);
16922 emit_insn (gen_sse2_storelpd (m, op1));
16923 m = adjust_address (op0, DFmode, 8);
16924 emit_insn (gen_sse2_storehpd (m, op1));
16927 else
16929 if (mode != V4SFmode)
16930 op1 = gen_lowpart (V4SFmode, op1);
16932 if (TARGET_AVX
16933 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16934 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16935 || optimize_insn_for_size_p ())
16937 op0 = gen_lowpart (V4SFmode, op0);
16938 emit_insn (gen_sse_storeups (op0, op1));
16940 else
16942 m = adjust_address (op0, V2SFmode, 0);
16943 emit_insn (gen_sse_storelps (m, op1));
16944 m = adjust_address (op0, V2SFmode, 8);
16945 emit_insn (gen_sse_storehps (m, op1));
16949 else
16950 gcc_unreachable ();
16953 /* Expand a push in MODE. This is some mode for which we do not support
16954 proper push instructions, at least from the registers that we expect
16955 the value to live in. */
16957 void
16958 ix86_expand_push (enum machine_mode mode, rtx x)
16960 rtx tmp;
16962 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16963 GEN_INT (-GET_MODE_SIZE (mode)),
16964 stack_pointer_rtx, 1, OPTAB_DIRECT);
16965 if (tmp != stack_pointer_rtx)
16966 emit_move_insn (stack_pointer_rtx, tmp);
16968 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16970 /* When we push an operand onto the stack, it has to be aligned at least
16971 at the function argument boundary. However, since we don't have
16972 the argument type, we can't determine the actual argument
16973 boundary. */
16974 emit_move_insn (tmp, x);
16977 /* Helper function of ix86_fixup_binary_operands to canonicalize
16978 operand order. Returns true if the operands should be swapped. */
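/* For instance (illustrative): for a commutative CODE where the
   destination is equal to src2 but not src1, we return true so that
   the operand which already matches the destination ends up as src1. */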
16980 static bool
16981 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16982 rtx operands[])
16984 rtx dst = operands[0];
16985 rtx src1 = operands[1];
16986 rtx src2 = operands[2];
16988 /* If the operation is not commutative, we can't do anything. */
16989 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16990 return false;
16992 /* Highest priority is that src1 should match dst. */
16993 if (rtx_equal_p (dst, src1))
16994 return false;
16995 if (rtx_equal_p (dst, src2))
16996 return true;
16998 /* Next highest priority is that immediate constants come second. */
16999 if (immediate_operand (src2, mode))
17000 return false;
17001 if (immediate_operand (src1, mode))
17002 return true;
17004 /* Lowest priority is that memory references should come second. */
17005 if (MEM_P (src2))
17006 return false;
17007 if (MEM_P (src1))
17008 return true;
17010 return false;
17014 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
17015 destination to use for the operation. If different from the true
17016 destination in operands[0], a copy operation will be required. */
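/* For instance (a rough sketch of what the code below does): for
   mem1 = mem2 + mem3, both memory sources are forced into registers
   and a fresh register is substituted for the destination; the caller
   then copies the result back into mem1. */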
17019 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
17020 rtx operands[])
17022 rtx dst = operands[0];
17023 rtx src1 = operands[1];
17024 rtx src2 = operands[2];
17026 /* Canonicalize operand order. */
17027 if (ix86_swap_binary_operands_p (code, mode, operands))
17029 rtx temp;
17031 /* It is invalid to swap operands of different modes. */
17032 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17034 temp = src1;
17035 src1 = src2;
17036 src2 = temp;
17039 /* Both source operands cannot be in memory. */
17040 if (MEM_P (src1) && MEM_P (src2))
17042 /* Optimization: Only read from memory once. */
17043 if (rtx_equal_p (src1, src2))
17045 src2 = force_reg (mode, src2);
17046 src1 = src2;
17048 else
17049 src2 = force_reg (mode, src2);
17052 /* If the destination is memory, and we do not have matching source
17053 operands, do things in registers. */
17054 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17055 dst = gen_reg_rtx (mode);
17057 /* Source 1 cannot be a constant. */
17058 if (CONSTANT_P (src1))
17059 src1 = force_reg (mode, src1);
17061 /* Source 1 cannot be a non-matching memory. */
17062 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17063 src1 = force_reg (mode, src1);
17065 /* Improve address combine. */
17066 if (code == PLUS
17067 && GET_MODE_CLASS (mode) == MODE_INT
17068 && MEM_P (src2))
17069 src2 = force_reg (mode, src2);
17071 operands[1] = src1;
17072 operands[2] = src2;
17073 return dst;
17076 /* Similarly, but assume that the destination has already been
17077 set up properly. */
17079 void
17080 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17081 enum machine_mode mode, rtx operands[])
17083 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17084 gcc_assert (dst == operands[0]);
17087 /* Attempt to expand a binary operator. Make the expansion closer to the
17088 actual machine, than just general_operand, which will allow 3 separate
17089 memory references (one output, two inputs) in a single insn. */
17091 void
17092 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17093 rtx operands[])
17095 rtx src1, src2, dst, op, clob;
17097 dst = ix86_fixup_binary_operands (code, mode, operands);
17098 src1 = operands[1];
17099 src2 = operands[2];
17101 /* Emit the instruction. */
17103 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17104 if (reload_in_progress)
17106 /* Reload doesn't know about the flags register, and doesn't know that
17107 it doesn't want to clobber it. We can only do this with PLUS. */
17108 gcc_assert (code == PLUS);
17109 emit_insn (op);
17111 else if (reload_completed
17112 && code == PLUS
17113 && !rtx_equal_p (dst, src1))
17115 /* This is going to be an LEA; avoid splitting it later. */
17116 emit_insn (op);
17118 else
17120 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17121 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17124 /* Fix up the destination if needed. */
17125 if (dst != operands[0])
17126 emit_move_insn (operands[0], dst);
17129 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17130 the given OPERANDS. */
17132 void
17133 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17134 rtx operands[])
17136 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17137 if (GET_CODE (operands[1]) == SUBREG)
17139 op1 = operands[1];
17140 op2 = operands[2];
17142 else if (GET_CODE (operands[2]) == SUBREG)
17144 op1 = operands[2];
17145 op2 = operands[1];
17147 /* Optimize (__m128i) d | (__m128i) e and similar code
17148 when d and e are float vectors into a float vector logical
17149 insn. In C/C++, without using intrinsics, there is no other way
17150 to express a vector logical operation on float vectors than
17151 to cast them temporarily to integer vectors. */
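/* So, for example (illustrative), an AND whose operands are really
   V4SF values is then emitted as an andps-style V4SF insn rather than
   as a pand-style integer insn. */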
17152 if (op1
17153 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17154 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17155 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17156 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17157 && SUBREG_BYTE (op1) == 0
17158 && (GET_CODE (op2) == CONST_VECTOR
17159 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17160 && SUBREG_BYTE (op2) == 0))
17161 && can_create_pseudo_p ())
17163 rtx dst;
17164 switch (GET_MODE (SUBREG_REG (op1)))
17166 case V4SFmode:
17167 case V8SFmode:
17168 case V2DFmode:
17169 case V4DFmode:
17170 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17171 if (GET_CODE (op2) == CONST_VECTOR)
17173 op2 = gen_lowpart (GET_MODE (dst), op2);
17174 op2 = force_reg (GET_MODE (dst), op2);
17176 else
17178 op1 = operands[1];
17179 op2 = SUBREG_REG (operands[2]);
17180 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17181 op2 = force_reg (GET_MODE (dst), op2);
17183 op1 = SUBREG_REG (op1);
17184 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17185 op1 = force_reg (GET_MODE (dst), op1);
17186 emit_insn (gen_rtx_SET (VOIDmode, dst,
17187 gen_rtx_fmt_ee (code, GET_MODE (dst),
17188 op1, op2)));
17189 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17190 return;
17191 default:
17192 break;
17195 if (!nonimmediate_operand (operands[1], mode))
17196 operands[1] = force_reg (mode, operands[1]);
17197 if (!nonimmediate_operand (operands[2], mode))
17198 operands[2] = force_reg (mode, operands[2]);
17199 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17200 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17201 gen_rtx_fmt_ee (code, mode, operands[1],
17202 operands[2])));
17205 /* Return TRUE or FALSE depending on whether the binary operator meets the
17206 appropriate constraints. */
17208 bool
17209 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17210 rtx operands[3])
17212 rtx dst = operands[0];
17213 rtx src1 = operands[1];
17214 rtx src2 = operands[2];
17216 /* Both source operands cannot be in memory. */
17217 if (MEM_P (src1) && MEM_P (src2))
17218 return false;
17220 /* Canonicalize operand order for commutative operators. */
17221 if (ix86_swap_binary_operands_p (code, mode, operands))
17223 rtx temp = src1;
17224 src1 = src2;
17225 src2 = temp;
17228 /* If the destination is memory, we must have a matching source operand. */
17229 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17230 return false;
17232 /* Source 1 cannot be a constant. */
17233 if (CONSTANT_P (src1))
17234 return false;
17236 /* Source 1 cannot be a non-matching memory. */
17237 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17238 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17239 return (code == AND
17240 && (mode == HImode
17241 || mode == SImode
17242 || (TARGET_64BIT && mode == DImode))
17243 && satisfies_constraint_L (src2));
17245 return true;
17248 /* Attempt to expand a unary operator. Make the expansion closer to the
17249 actual machine, than just general_operand, which will allow 2 separate
17250 memory references (one output, one input) in a single insn. */
17252 void
17253 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17254 rtx operands[])
17256 int matching_memory;
17257 rtx src, dst, op, clob;
17259 dst = operands[0];
17260 src = operands[1];
17262 /* If the destination is memory, and we do not have matching source
17263 operands, do things in registers. */
17264 matching_memory = 0;
17265 if (MEM_P (dst))
17267 if (rtx_equal_p (dst, src))
17268 matching_memory = 1;
17269 else
17270 dst = gen_reg_rtx (mode);
17273 /* When source operand is memory, destination must match. */
17274 if (MEM_P (src) && !matching_memory)
17275 src = force_reg (mode, src);
17277 /* Emit the instruction. */
17279 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17280 if (reload_in_progress || code == NOT)
17282 /* Reload doesn't know about the flags register, and doesn't know that
17283 it doesn't want to clobber it. */
17284 gcc_assert (code == NOT);
17285 emit_insn (op);
17287 else
17289 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17290 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17293 /* Fix up the destination if needed. */
17294 if (dst != operands[0])
17295 emit_move_insn (operands[0], dst);
17298 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17299 divisor are within the range [0-255]. */
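/* Roughly, the emitted code has the following shape (illustrative,
   AT&T syntax), where scratch = dividend | divisor:

       test  $-0x100, scratch
       je    .Lqimode
       div/idiv                  ; full-width divide
       jmp   .Lend
   .Lqimode:
       div   divisor8            ; 8-bit unsigned divide: AL = quot, AH = rem
   .Lend:                                                               */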
17301 void
17302 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17303 bool signed_p)
17305 rtx end_label, qimode_label;
17306 rtx insn, div, mod;
17307 rtx scratch, tmp0, tmp1, tmp2;
17308 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17309 rtx (*gen_zero_extend) (rtx, rtx);
17310 rtx (*gen_test_ccno_1) (rtx, rtx);
17312 switch (mode)
17314 case SImode:
17315 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17316 gen_test_ccno_1 = gen_testsi_ccno_1;
17317 gen_zero_extend = gen_zero_extendqisi2;
17318 break;
17319 case DImode:
17320 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17321 gen_test_ccno_1 = gen_testdi_ccno_1;
17322 gen_zero_extend = gen_zero_extendqidi2;
17323 break;
17324 default:
17325 gcc_unreachable ();
17328 end_label = gen_label_rtx ();
17329 qimode_label = gen_label_rtx ();
17331 scratch = gen_reg_rtx (mode);
17333 /* Use 8bit unsigned divmod if dividend and divisor are within
17334 the range [0-255]. */
17335 emit_move_insn (scratch, operands[2]);
17336 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17337 scratch, 1, OPTAB_DIRECT);
17338 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17339 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17340 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17341 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17342 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17343 pc_rtx);
17344 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17345 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17346 JUMP_LABEL (insn) = qimode_label;
17348 /* Generate original signed/unsigned divmod. */
17349 div = gen_divmod4_1 (operands[0], operands[1],
17350 operands[2], operands[3]);
17351 emit_insn (div);
17353 /* Branch to the end. */
17354 emit_jump_insn (gen_jump (end_label));
17355 emit_barrier ();
17357 /* Generate 8bit unsigned divide. */
17358 emit_label (qimode_label);
17359 /* Don't use operands[0] for result of 8bit divide since not all
17360 registers support QImode ZERO_EXTRACT. */
17361 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17362 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17363 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17364 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17366 if (signed_p)
17368 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17369 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17371 else
17373 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17374 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17377 /* Extract remainder from AH. */
17378 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17379 if (REG_P (operands[1]))
17380 insn = emit_move_insn (operands[1], tmp1);
17381 else
17383 /* Need a new scratch register since the old one has the result
17384 of the 8bit divide. */
17385 scratch = gen_reg_rtx (mode);
17386 emit_move_insn (scratch, tmp1);
17387 insn = emit_move_insn (operands[1], scratch);
17389 set_unique_reg_note (insn, REG_EQUAL, mod);
17391 /* Zero extend quotient from AL. */
17392 tmp1 = gen_lowpart (QImode, tmp0);
17393 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17394 set_unique_reg_note (insn, REG_EQUAL, div);
17396 emit_label (end_label);
17399 #define LEA_MAX_STALL (3)
17400 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
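/* Note: the distances accumulated by the search functions below are
   kept in half-cycles, so the search threshold is twice LEA_MAX_STALL;
   the values returned to callers are converted back to whole cycles. */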
17402 /* Increase given DISTANCE in half-cycles according to
17403 dependencies between PREV and NEXT instructions.
17404 Add 1 half-cycle if there is no dependency and
17405 go to the next cycle if there is some dependency. */
17407 static unsigned int
17408 increase_distance (rtx prev, rtx next, unsigned int distance)
17410 df_ref *use_rec;
17411 df_ref *def_rec;
17413 if (!prev || !next)
17414 return distance + (distance & 1) + 2;
17416 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17417 return distance + 1;
17419 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17420 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17421 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17422 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17423 return distance + (distance & 1) + 2;
17425 return distance + 1;
17428 /* Return true if instruction INSN defines register number
17429 REGNO1 or REGNO2. */
17431 static bool
17432 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17433 rtx insn)
17435 df_ref *def_rec;
17437 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17438 if (DF_REF_REG_DEF_P (*def_rec)
17439 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17440 && (regno1 == DF_REF_REGNO (*def_rec)
17441 || regno2 == DF_REF_REGNO (*def_rec)))
17443 return true;
17446 return false;
17449 /* Return true if instruction INSN uses register number
17450 REGNO as part of an address expression. */
17452 static bool
17453 insn_uses_reg_mem (unsigned int regno, rtx insn)
17455 df_ref *use_rec;
17457 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17458 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17459 return true;
17461 return false;
17464 /* Search backward for non-agu definition of register number REGNO1
17465 or register number REGNO2 in basic block starting from instruction
17466 START up to head of basic block or instruction INSN.
17468 Put true into *FOUND if a definition was found
17469 and false otherwise.
17471 The distance in half-cycles between START and the found instruction
17472 (or the head of the BB) is added to DISTANCE and returned. */
17474 static int
17475 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17476 rtx insn, int distance,
17477 rtx start, bool *found)
17479 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17480 rtx prev = start;
17481 rtx next = NULL;
17483 *found = false;
17485 while (prev
17486 && prev != insn
17487 && distance < LEA_SEARCH_THRESHOLD)
17489 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17491 distance = increase_distance (prev, next, distance);
17492 if (insn_defines_reg (regno1, regno2, prev))
17494 if (recog_memoized (prev) < 0
17495 || get_attr_type (prev) != TYPE_LEA)
17497 *found = true;
17498 return distance;
17502 next = prev;
17504 if (prev == BB_HEAD (bb))
17505 break;
17507 prev = PREV_INSN (prev);
17510 return distance;
17513 /* Search backward for non-agu definition of register number REGNO1
17514 or register number REGNO2 in INSN's basic block until
17515 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17516 2. Reach neighbour BBs boundary, or
17517 3. Reach agu definition.
17518 Returns the distance between the non-agu definition point and INSN.
17519 If no definition point, returns -1. */
17521 static int
17522 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17523 rtx insn)
17525 basic_block bb = BLOCK_FOR_INSN (insn);
17526 int distance = 0;
17527 bool found = false;
17529 if (insn != BB_HEAD (bb))
17530 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17531 distance, PREV_INSN (insn),
17532 &found);
17534 if (!found && distance < LEA_SEARCH_THRESHOLD)
17536 edge e;
17537 edge_iterator ei;
17538 bool simple_loop = false;
17540 FOR_EACH_EDGE (e, ei, bb->preds)
17541 if (e->src == bb)
17543 simple_loop = true;
17544 break;
17547 if (simple_loop)
17548 distance = distance_non_agu_define_in_bb (regno1, regno2,
17549 insn, distance,
17550 BB_END (bb), &found);
17551 else
17553 int shortest_dist = -1;
17554 bool found_in_bb = false;
17556 FOR_EACH_EDGE (e, ei, bb->preds)
17558 int bb_dist
17559 = distance_non_agu_define_in_bb (regno1, regno2,
17560 insn, distance,
17561 BB_END (e->src),
17562 &found_in_bb);
17563 if (found_in_bb)
17565 if (shortest_dist < 0)
17566 shortest_dist = bb_dist;
17567 else if (bb_dist > 0)
17568 shortest_dist = MIN (bb_dist, shortest_dist);
17570 found = true;
17574 distance = shortest_dist;
17578 /* get_attr_type may modify recog data. We want to make sure
17579 that recog data is valid for instruction INSN, on which
17580 distance_non_agu_define is called. INSN is unchanged here. */
17581 extract_insn_cached (insn);
17583 if (!found)
17584 return -1;
17586 return distance >> 1;
17589 /* Return the distance in half-cycles between INSN and the next
17590 insn that uses register number REGNO in a memory address, added
17591 to DISTANCE. Return -1 if REGNO is set.
17593 Put true into *FOUND if a register use was found and
17594 false otherwise.
17595 Put true into *REDEFINED if a register redefinition was
17596 found and false otherwise. */
17598 static int
17599 distance_agu_use_in_bb (unsigned int regno,
17600 rtx insn, int distance, rtx start,
17601 bool *found, bool *redefined)
17603 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17604 rtx next = start;
17605 rtx prev = NULL;
17607 *found = false;
17608 *redefined = false;
17610 while (next
17611 && next != insn
17612 && distance < LEA_SEARCH_THRESHOLD)
17614 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17616 distance = increase_distance(prev, next, distance);
17617 if (insn_uses_reg_mem (regno, next))
17619 /* Return DISTANCE if OP0 is used in memory
17620 address in NEXT. */
17621 *found = true;
17622 return distance;
17625 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17627 /* Return -1 if OP0 is set in NEXT. */
17628 *redefined = true;
17629 return -1;
17632 prev = next;
17635 if (next == BB_END (bb))
17636 break;
17638 next = NEXT_INSN (next);
17641 return distance;
17644 /* Return the distance between INSN and the next insn that uses
17645 register number REGNO0 in a memory address. Return -1 if no such
17646 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17648 static int
17649 distance_agu_use (unsigned int regno0, rtx insn)
17651 basic_block bb = BLOCK_FOR_INSN (insn);
17652 int distance = 0;
17653 bool found = false;
17654 bool redefined = false;
17656 if (insn != BB_END (bb))
17657 distance = distance_agu_use_in_bb (regno0, insn, distance,
17658 NEXT_INSN (insn),
17659 &found, &redefined);
17661 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17663 edge e;
17664 edge_iterator ei;
17665 bool simple_loop = false;
17667 FOR_EACH_EDGE (e, ei, bb->succs)
17668 if (e->dest == bb)
17670 simple_loop = true;
17671 break;
17674 if (simple_loop)
17675 distance = distance_agu_use_in_bb (regno0, insn,
17676 distance, BB_HEAD (bb),
17677 &found, &redefined);
17678 else
17680 int shortest_dist = -1;
17681 bool found_in_bb = false;
17682 bool redefined_in_bb = false;
17684 FOR_EACH_EDGE (e, ei, bb->succs)
17686 int bb_dist
17687 = distance_agu_use_in_bb (regno0, insn,
17688 distance, BB_HEAD (e->dest),
17689 &found_in_bb, &redefined_in_bb);
17690 if (found_in_bb)
17692 if (shortest_dist < 0)
17693 shortest_dist = bb_dist;
17694 else if (bb_dist > 0)
17695 shortest_dist = MIN (bb_dist, shortest_dist);
17697 found = true;
17701 distance = shortest_dist;
17705 if (!found || redefined)
17706 return -1;
17708 return distance >> 1;
17711 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17712 there is a dilemma of choosing LEA or ADD.
17713 Negative value: ADD is preferred over LEA
17714 Zero: Neutral
17715 Positive value: LEA is preferred over ADD. */
17716 #define IX86_LEA_PRIORITY 0
17718 /* Return true if use of the lea INSN has a performance advantage
17719 over a sequence of instructions. The instruction sequence has
17720 SPLIT_COST cycles higher latency than the lea. */
17722 static bool
17723 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17724 unsigned int regno2, int split_cost, bool has_scale)
17726 int dist_define, dist_use;
17728 /* For Silvermont, the use of LEA is justified if it is a 2-source or
17729 3-source LEA used for a non-destructive destination, or if the
17730 ability to use a SCALE is wanted. */
17731 if (ix86_tune == PROCESSOR_SLM)
17733 if (has_scale)
17734 return true;
17735 if (split_cost < 1)
17736 return false;
17737 if (regno0 == regno1 || regno0 == regno2)
17738 return false;
17739 return true;
17742 dist_define = distance_non_agu_define (regno1, regno2, insn);
17743 dist_use = distance_agu_use (regno0, insn);
17745 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17747 /* If there is no non-AGU operand definition, no AGU
17748 operand use and the split cost is 0, then both the lea
17749 and non-lea variants have the same priority. Currently
17750 we prefer lea for 64-bit code and non-lea for 32-bit
17751 code. */
17752 if (dist_use < 0 && split_cost == 0)
17753 return TARGET_64BIT || IX86_LEA_PRIORITY;
17754 else
17755 return true;
17758 /* With a longer definition distance, lea is preferable.
17759 Here we adjust the distance to take into account the splitting
17760 cost and lea priority. */
17761 dist_define += split_cost + IX86_LEA_PRIORITY;
17763 /* If there is no use in a memory address then we just check
17764 that the split cost exceeds the AGU stall. */
17765 if (dist_use < 0)
17766 return dist_define > LEA_MAX_STALL;
17768 /* If this insn has both backward non-agu dependence and forward
17769 agu dependence, the one with the shorter distance takes effect. */
17770 return dist_define >= dist_use;
17773 /* Return true if it is legal to clobber flags by INSN and
17774 false otherwise. */
17776 static bool
17777 ix86_ok_to_clobber_flags (rtx insn)
17779 basic_block bb = BLOCK_FOR_INSN (insn);
17780 df_ref *use;
17781 bitmap live;
17783 while (insn)
17785 if (NONDEBUG_INSN_P (insn))
17787 for (use = DF_INSN_USES (insn); *use; use++)
17788 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17789 return false;
17791 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17792 return true;
17795 if (insn == BB_END (bb))
17796 break;
17798 insn = NEXT_INSN (insn);
17801 live = df_get_live_out(bb);
17802 return !REGNO_REG_SET_P (live, FLAGS_REG);
17805 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17806 move and add to avoid AGU stalls. */
17808 bool
17809 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17811 unsigned int regno0, regno1, regno2;
17813 /* Check if we need to optimize. */
17814 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17815 return false;
17817 /* Check it is correct to split here. */
17818 if (!ix86_ok_to_clobber_flags(insn))
17819 return false;
17821 regno0 = true_regnum (operands[0]);
17822 regno1 = true_regnum (operands[1]);
17823 regno2 = true_regnum (operands[2]);
17825 /* We need to split only adds with a non-destructive
17826 destination operand. */
17827 if (regno0 == regno1 || regno0 == regno2)
17828 return false;
17829 else
17830 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17833 /* Return true if we should emit a lea instruction instead of a mov
17834 instruction. */
17836 bool
17837 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17839 unsigned int regno0, regno1;
17841 /* Check if we need to optimize. */
17842 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17843 return false;
17845 /* Use lea for reg to reg moves only. */
17846 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17847 return false;
17849 regno0 = true_regnum (operands[0]);
17850 regno1 = true_regnum (operands[1]);
17852 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17855 /* Return true if we need to split lea into a sequence of
17856 instructions to avoid AGU stalls. */
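/* For instance (illustrative), for lea 4(%ebx,%ecx,2), %eax the cost
   computed below is 3: one mov (the destination matches neither
   source), one shift for the scale, one add of the base, one add of
   the displacement, minus the lea itself. */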
17858 bool
17859 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17861 unsigned int regno0, regno1, regno2;
17862 int split_cost;
17863 struct ix86_address parts;
17864 int ok;
17866 /* Check we need to optimize. */
17867 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17868 return false;
17870 /* Check it is correct to split here. */
17871 if (!ix86_ok_to_clobber_flags(insn))
17872 return false;
17874 ok = ix86_decompose_address (operands[1], &parts);
17875 gcc_assert (ok);
17877 /* There should be at least two components in the address. */
17878 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17879 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17880 return false;
17882 /* We should not split into add if a non-legitimate PIC
17883 operand is used as the displacement. */
17884 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17885 return false;
17887 regno0 = true_regnum (operands[0]) ;
17888 regno1 = INVALID_REGNUM;
17889 regno2 = INVALID_REGNUM;
17891 if (parts.base)
17892 regno1 = true_regnum (parts.base);
17893 if (parts.index)
17894 regno2 = true_regnum (parts.index);
17896 split_cost = 0;
17898 /* Compute how many cycles we will add to the execution time
17899 if we split the lea into a sequence of instructions. */
17900 if (parts.base || parts.index)
17902 /* Have to use a mov instruction if the non-destructive
17903 destination form is used. */
17904 if (regno1 != regno0 && regno2 != regno0)
17905 split_cost += 1;
17907 /* Have to add index to base if both exist. */
17908 if (parts.base && parts.index)
17909 split_cost += 1;
17911 /* Have to use shift and adds if scale is 2 or greater. */
17912 if (parts.scale > 1)
17914 if (regno0 != regno1)
17915 split_cost += 1;
17916 else if (regno2 == regno0)
17917 split_cost += 4;
17918 else
17919 split_cost += parts.scale;
17922 /* Have to use an add instruction with an immediate if
17923 disp is non-zero. */
17924 if (parts.disp && parts.disp != const0_rtx)
17925 split_cost += 1;
17927 /* Subtract the price of lea. */
17928 split_cost -= 1;
17931 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17932 parts.scale > 1);
17935 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17936 matches the destination. The RTX includes a clobber of FLAGS_REG. */
17938 static void
17939 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17940 rtx dst, rtx src)
17942 rtx op, clob;
17944 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17945 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17947 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17950 /* Return true if the definition of REGNO1 is nearest to INSN. */
17952 static bool
17953 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17955 rtx prev = insn;
17956 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17958 if (insn == start)
17959 return false;
17960 while (prev && prev != start)
17962 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17964 prev = PREV_INSN (prev);
17965 continue;
17967 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17968 return true;
17969 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17970 return false;
17971 prev = PREV_INSN (prev);
17974 /* None of the regs is defined in the bb. */
17975 return false;
17978 /* Split a lea instruction into a sequence of instructions
17979 which are executed on the ALU to avoid AGU stalls.
17980 It is assumed that it is allowed to clobber the flags register
17981 at the lea position. */
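/* For instance (illustrative, AT&T syntax), lea 4(%ebx,%ecx,2), %eax
   may be rewritten as

       mov  %ecx, %eax    ; copy the index when it does not match dest
       shl  $1, %eax      ; apply the scale
       add  %ebx, %eax    ; add the base
       add  $4, %eax      ; add the displacement

   with the exact sequence depending on which operands match the
   destination register. */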
17983 void
17984 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17986 unsigned int regno0, regno1, regno2;
17987 struct ix86_address parts;
17988 rtx target, tmp;
17989 int ok, adds;
17991 ok = ix86_decompose_address (operands[1], &parts);
17992 gcc_assert (ok);
17994 target = gen_lowpart (mode, operands[0]);
17996 regno0 = true_regnum (target);
17997 regno1 = INVALID_REGNUM;
17998 regno2 = INVALID_REGNUM;
18000 if (parts.base)
18002 parts.base = gen_lowpart (mode, parts.base);
18003 regno1 = true_regnum (parts.base);
18006 if (parts.index)
18008 parts.index = gen_lowpart (mode, parts.index);
18009 regno2 = true_regnum (parts.index);
18012 if (parts.disp)
18013 parts.disp = gen_lowpart (mode, parts.disp);
18015 if (parts.scale > 1)
18017 /* Case r1 = r1 + ... */
18018 if (regno1 == regno0)
18020 /* If we have the case r1 = r1 + C * r1 then we
18021 would have to use multiplication, which is very
18022 expensive. Assume the cost model is wrong if we
18023 have such a case here. */
18024 gcc_assert (regno2 != regno0);
18026 for (adds = parts.scale; adds > 0; adds--)
18027 ix86_emit_binop (PLUS, mode, target, parts.index);
18029 else
18031 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18032 if (regno0 != regno2)
18033 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18035 /* Use shift for scaling. */
18036 ix86_emit_binop (ASHIFT, mode, target,
18037 GEN_INT (exact_log2 (parts.scale)));
18039 if (parts.base)
18040 ix86_emit_binop (PLUS, mode, target, parts.base);
18042 if (parts.disp && parts.disp != const0_rtx)
18043 ix86_emit_binop (PLUS, mode, target, parts.disp);
18046 else if (!parts.base && !parts.index)
18048 gcc_assert(parts.disp);
18049 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18051 else
18053 if (!parts.base)
18055 if (regno0 != regno2)
18056 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18058 else if (!parts.index)
18060 if (regno0 != regno1)
18061 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18063 else
18065 if (regno0 == regno1)
18066 tmp = parts.index;
18067 else if (regno0 == regno2)
18068 tmp = parts.base;
18069 else
18071 rtx tmp1;
18073 /* Find the better operand for the SET instruction, depending
18074 on which definition is farther from the insn. */
18075 if (find_nearest_reg_def (insn, regno1, regno2))
18076 tmp = parts.index, tmp1 = parts.base;
18077 else
18078 tmp = parts.base, tmp1 = parts.index;
18080 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18082 if (parts.disp && parts.disp != const0_rtx)
18083 ix86_emit_binop (PLUS, mode, target, parts.disp);
18085 ix86_emit_binop (PLUS, mode, target, tmp1);
18086 return;
18089 ix86_emit_binop (PLUS, mode, target, tmp);
18092 if (parts.disp && parts.disp != const0_rtx)
18093 ix86_emit_binop (PLUS, mode, target, parts.disp);
18097 /* Return true if it is ok to optimize an ADD operation to a LEA
18098 operation to avoid flag register consumption. For most processors,
18099 ADD is faster than LEA. For processors like ATOM, if the
18100 destination register of the LEA holds an actual address which will be
18101 used soon, LEA is better; otherwise ADD is better. */
18103 bool
18104 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18106 unsigned int regno0 = true_regnum (operands[0]);
18107 unsigned int regno1 = true_regnum (operands[1]);
18108 unsigned int regno2 = true_regnum (operands[2]);
18110 /* If a = b + c, (a!=b && a!=c), must use lea form. */
18111 if (regno0 != regno1 && regno0 != regno2)
18112 return true;
18114 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18115 return false;
18117 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18120 /* Return true if destination reg of SET_BODY is shift count of
18121 USE_BODY. */
18123 static bool
18124 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18126 rtx set_dest;
18127 rtx shift_rtx;
18128 int i;
18130 /* Retrieve destination of SET_BODY. */
18131 switch (GET_CODE (set_body))
18133 case SET:
18134 set_dest = SET_DEST (set_body);
18135 if (!set_dest || !REG_P (set_dest))
18136 return false;
18137 break;
18138 case PARALLEL:
18139 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18140 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18141 use_body))
18142 return true;
18143 default:
18144 return false;
18145 break;
18148 /* Retrieve shift count of USE_BODY. */
18149 switch (GET_CODE (use_body))
18151 case SET:
18152 shift_rtx = XEXP (use_body, 1);
18153 break;
18154 case PARALLEL:
18155 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18156 if (ix86_dep_by_shift_count_body (set_body,
18157 XVECEXP (use_body, 0, i)))
18158 return true;
18159 default:
18160 return false;
18161 break;
18164 if (shift_rtx
18165 && (GET_CODE (shift_rtx) == ASHIFT
18166 || GET_CODE (shift_rtx) == LSHIFTRT
18167 || GET_CODE (shift_rtx) == ASHIFTRT
18168 || GET_CODE (shift_rtx) == ROTATE
18169 || GET_CODE (shift_rtx) == ROTATERT))
18171 rtx shift_count = XEXP (shift_rtx, 1);
18173 /* Return true if shift count is dest of SET_BODY. */
18174 if (REG_P (shift_count))
18176 /* Add a check since this can be invoked before register
18177 allocation by the pre-reload scheduler. */
18178 if (reload_completed
18179 && true_regnum (set_dest) == true_regnum (shift_count))
18180 return true;
18181 else if (REGNO(set_dest) == REGNO(shift_count))
18182 return true;
18186 return false;
18189 /* Return true if destination reg of SET_INSN is shift count of
18190 USE_INSN. */
18192 bool
18193 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18195 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18196 PATTERN (use_insn));
18199 /* Return TRUE or FALSE depending on whether the unary operator meets the
18200 appropriate constraints. */
18202 bool
18203 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18204 enum machine_mode mode ATTRIBUTE_UNUSED,
18205 rtx operands[2] ATTRIBUTE_UNUSED)
18207 /* If one of the operands is memory, the source and destination must match. */
18208 if ((MEM_P (operands[0])
18209 || MEM_P (operands[1]))
18210 && ! rtx_equal_p (operands[0], operands[1]))
18211 return false;
18212 return true;
18215 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18216 are ok, keeping in mind the possible movddup alternative. */
18218 bool
18219 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18221 if (MEM_P (operands[0]))
18222 return rtx_equal_p (operands[0], operands[1 + high]);
18223 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18224 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18225 return true;
18228 /* Post-reload splitter for converting an SF or DFmode value in an
18229 SSE register into an unsigned SImode. */
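/* A sketch of the emitted sequence: lanes that are >= 0x1p31 have
   0x1p31 subtracted before the signed truncating conversion, and the
   corresponding integer lanes then have bit 31 xored back in; smaller
   values are converted directly. */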
18231 void
18232 ix86_split_convert_uns_si_sse (rtx operands[])
18234 enum machine_mode vecmode;
18235 rtx value, large, zero_or_two31, input, two31, x;
18237 large = operands[1];
18238 zero_or_two31 = operands[2];
18239 input = operands[3];
18240 two31 = operands[4];
18241 vecmode = GET_MODE (large);
18242 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18244 /* Load up the value into the low element. We must ensure that the other
18245 elements are valid floats -- zero is the easiest such value. */
18246 if (MEM_P (input))
18248 if (vecmode == V4SFmode)
18249 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18250 else
18251 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18253 else
18255 input = gen_rtx_REG (vecmode, REGNO (input));
18256 emit_move_insn (value, CONST0_RTX (vecmode));
18257 if (vecmode == V4SFmode)
18258 emit_insn (gen_sse_movss (value, value, input));
18259 else
18260 emit_insn (gen_sse2_movsd (value, value, input));
18263 emit_move_insn (large, two31);
18264 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18266 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18267 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18269 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18270 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18272 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18273 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18275 large = gen_rtx_REG (V4SImode, REGNO (large));
18276 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18278 x = gen_rtx_REG (V4SImode, REGNO (value));
18279 if (vecmode == V4SFmode)
18280 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18281 else
18282 emit_insn (gen_sse2_cvttpd2dq (x, value));
18283 value = x;
18285 emit_insn (gen_xorv4si3 (value, value, large));
18288 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18289 Expects the 64-bit DImode to be supplied in a pair of integral
18290 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18291 -mfpmath=sse, !optimize_size only. */
18293 void
18294 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18296 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18297 rtx int_xmm, fp_xmm;
18298 rtx biases, exponents;
18299 rtx x;
18301 int_xmm = gen_reg_rtx (V4SImode);
18302 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18303 emit_insn (gen_movdi_to_sse (int_xmm, input));
18304 else if (TARGET_SSE_SPLIT_REGS)
18306 emit_clobber (int_xmm);
18307 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18309 else
18311 x = gen_reg_rtx (V2DImode);
18312 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18313 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18316 x = gen_rtx_CONST_VECTOR (V4SImode,
18317 gen_rtvec (4, GEN_INT (0x43300000UL),
18318 GEN_INT (0x45300000UL),
18319 const0_rtx, const0_rtx));
18320 exponents = validize_mem (force_const_mem (V4SImode, x));
18322 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18323 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18325 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18326 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18327 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18328 (0x1.0p84 + double(fp_value_hi_xmm)).
18329 Note these exponents differ by 32. */
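  /* For example (illustrative): for the input hi:lo = 3:2 the two DF
     lanes hold 0x1.0p52 + 2.0 and 0x1.0p84 + 3.0 * 0x1.0p32; after the
     bias subtraction and the addition of the two lanes below, the
     result is 3 * 2^32 + 2, i.e. the original unsigned 64-bit value. */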
18331 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18333 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18334 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18335 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18336 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18337 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18338 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18339 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18340 biases = validize_mem (force_const_mem (V2DFmode, biases));
18341 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18343 /* Add the upper and lower DFmode values together. */
18344 if (TARGET_SSE3)
18345 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18346 else
18348 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18349 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18350 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18353 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18356 /* Not used, but eases macroization of patterns. */
18357 void
18358 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18359 rtx input ATTRIBUTE_UNUSED)
18361 gcc_unreachable ();
18364 /* Convert an unsigned SImode value into a DFmode. Only currently used
18365 for SSE, but applicable anywhere. */
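/* A minimal sketch of the approach used below: the unsigned input is
   re-biased into the signed range by adding -2^31 (with wraparound),
   converted with the signed int-to-double conversion, and then 2^31.0
   is added back as a double. */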
18367 void
18368 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18370 REAL_VALUE_TYPE TWO31r;
18371 rtx x, fp;
18373 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18374 NULL, 1, OPTAB_DIRECT);
18376 fp = gen_reg_rtx (DFmode);
18377 emit_insn (gen_floatsidf2 (fp, x));
18379 real_ldexp (&TWO31r, &dconst1, 31);
18380 x = const_double_from_real_value (TWO31r, DFmode);
18382 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18383 if (x != target)
18384 emit_move_insn (target, x);
18387 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18388 32-bit mode; otherwise we have a direct convert instruction. */
18390 void
18391 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18393 REAL_VALUE_TYPE TWO32r;
18394 rtx fp_lo, fp_hi, x;
18396 fp_lo = gen_reg_rtx (DFmode);
18397 fp_hi = gen_reg_rtx (DFmode);
18399 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18401 real_ldexp (&TWO32r, &dconst1, 32);
18402 x = const_double_from_real_value (TWO32r, DFmode);
18403 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18405 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18407 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18408 0, OPTAB_DIRECT);
18409 if (x != target)
18410 emit_move_insn (target, x);
18413 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18414 For x86_32, -mfpmath=sse, !optimize_size only. */
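/* Sketch of the approach used below: split the 32-bit input into its
   low and high 16-bit halves, convert each half with the signed
   conversion, scale the high half by 2^16 and add the two
   single-precision results. */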
18415 void
18416 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18418 REAL_VALUE_TYPE ONE16r;
18419 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18421 real_ldexp (&ONE16r, &dconst1, 16);
18422 x = const_double_from_real_value (ONE16r, SFmode);
18423 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18424 NULL, 0, OPTAB_DIRECT);
18425 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18426 NULL, 0, OPTAB_DIRECT);
18427 fp_hi = gen_reg_rtx (SFmode);
18428 fp_lo = gen_reg_rtx (SFmode);
18429 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18430 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18431 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18432 0, OPTAB_DIRECT);
18433 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18434 0, OPTAB_DIRECT);
18435 if (!rtx_equal_p (target, fp_hi))
18436 emit_move_insn (target, fp_hi);
18439 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18440 a vector of unsigned ints VAL to vector of floats TARGET. */
18442 void
18443 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18445 rtx tmp[8];
18446 REAL_VALUE_TYPE TWO16r;
18447 enum machine_mode intmode = GET_MODE (val);
18448 enum machine_mode fltmode = GET_MODE (target);
18449 rtx (*cvt) (rtx, rtx);
18451 if (intmode == V4SImode)
18452 cvt = gen_floatv4siv4sf2;
18453 else
18454 cvt = gen_floatv8siv8sf2;
18455 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18456 tmp[0] = force_reg (intmode, tmp[0]);
18457 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18458 OPTAB_DIRECT);
18459 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18460 NULL_RTX, 1, OPTAB_DIRECT);
18461 tmp[3] = gen_reg_rtx (fltmode);
18462 emit_insn (cvt (tmp[3], tmp[1]));
18463 tmp[4] = gen_reg_rtx (fltmode);
18464 emit_insn (cvt (tmp[4], tmp[2]));
18465 real_ldexp (&TWO16r, &dconst1, 16);
18466 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18467 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18468 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18469 OPTAB_DIRECT);
18470 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18471 OPTAB_DIRECT);
18472 if (tmp[7] != target)
18473 emit_move_insn (target, tmp[7]);
18476 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18477 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18478 This is done by doing just a signed conversion if < 0x1p31, and otherwise by
18479 subtracting 0x1p31 first and xoring in 0x80000000 via *XORP afterwards. */
18482 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18484 REAL_VALUE_TYPE TWO31r;
18485 rtx two31r, tmp[4];
18486 enum machine_mode mode = GET_MODE (val);
18487 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18488 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18489 rtx (*cmp) (rtx, rtx, rtx, rtx);
18490 int i;
18492 for (i = 0; i < 3; i++)
18493 tmp[i] = gen_reg_rtx (mode);
18494 real_ldexp (&TWO31r, &dconst1, 31);
18495 two31r = const_double_from_real_value (TWO31r, scalarmode);
18496 two31r = ix86_build_const_vector (mode, 1, two31r);
18497 two31r = force_reg (mode, two31r);
18498 switch (mode)
18500 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18501 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18502 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18503 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18504 default: gcc_unreachable ();
18506 tmp[3] = gen_rtx_LE (mode, two31r, val);
18507 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18508 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18509 0, OPTAB_DIRECT);
18510 if (intmode == V4SImode || TARGET_AVX2)
18511 *xorp = expand_simple_binop (intmode, ASHIFT,
18512 gen_lowpart (intmode, tmp[0]),
18513 GEN_INT (31), NULL_RTX, 0,
18514 OPTAB_DIRECT);
18515 else
18517 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18518 two31 = ix86_build_const_vector (intmode, 1, two31);
18519 *xorp = expand_simple_binop (intmode, AND,
18520 gen_lowpart (intmode, tmp[0]),
18521 two31, NULL_RTX, 0,
18522 OPTAB_DIRECT);
18524 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18525 0, OPTAB_DIRECT);
18528 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18529 then replicate the value for all elements of the vector
18530 register. */
18533 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18535 int i, n_elt;
18536 rtvec v;
18537 enum machine_mode scalar_mode;
18539 switch (mode)
18541 case V32QImode:
18542 case V16QImode:
18543 case V16HImode:
18544 case V8HImode:
18545 case V8SImode:
18546 case V4SImode:
18547 case V4DImode:
18548 case V2DImode:
18549 gcc_assert (vect);
18550 case V8SFmode:
18551 case V4SFmode:
18552 case V4DFmode:
18553 case V2DFmode:
18554 n_elt = GET_MODE_NUNITS (mode);
18555 v = rtvec_alloc (n_elt);
18556 scalar_mode = GET_MODE_INNER (mode);
18558 RTVEC_ELT (v, 0) = value;
18560 for (i = 1; i < n_elt; ++i)
18561 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18563 return gen_rtx_CONST_VECTOR (mode, v);
18565 default:
18566 gcc_unreachable ();
18570 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18571 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18572 for an SSE register. If VECT is true, then replicate the mask for
18573 all elements of the vector register. If INVERT is true, then create
18574 a mask excluding the sign bit. */
18577 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18579 enum machine_mode vec_mode, imode;
18580 HOST_WIDE_INT hi, lo;
18581 int shift = 63;
18582 rtx v;
18583 rtx mask;
18585 /* Find the sign bit, sign extended to 2*HWI. */
18586 switch (mode)
18588 case V8SImode:
18589 case V4SImode:
18590 case V8SFmode:
18591 case V4SFmode:
18592 vec_mode = mode;
18593 mode = GET_MODE_INNER (mode);
18594 imode = SImode;
18595 lo = 0x80000000, hi = lo < 0;
18596 break;
18598 case V4DImode:
18599 case V2DImode:
18600 case V4DFmode:
18601 case V2DFmode:
18602 vec_mode = mode;
18603 mode = GET_MODE_INNER (mode);
18604 imode = DImode;
18605 if (HOST_BITS_PER_WIDE_INT >= 64)
18606 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18607 else
18608 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18609 break;
18611 case TImode:
18612 case TFmode:
18613 vec_mode = VOIDmode;
18614 if (HOST_BITS_PER_WIDE_INT >= 64)
18616 imode = TImode;
18617 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18619 else
18621 rtvec vec;
18623 imode = DImode;
18624 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18626 if (invert)
18628 lo = ~lo, hi = ~hi;
18629 v = constm1_rtx;
18631 else
18632 v = const0_rtx;
18634 mask = immed_double_const (lo, hi, imode);
18636 vec = gen_rtvec (2, v, mask);
18637 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18638 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18640 return v;
18642 break;
18644 default:
18645 gcc_unreachable ();
18648 if (invert)
18649 lo = ~lo, hi = ~hi;
18651 /* Force this value into the low part of a fp vector constant. */
18652 mask = immed_double_const (lo, hi, imode);
18653 mask = gen_lowpart (mode, mask);
18655 if (vec_mode == VOIDmode)
18656 return force_reg (mode, mask);
18658 v = ix86_build_const_vector (vec_mode, vect, mask);
18659 return force_reg (vec_mode, v);
18662 /* Generate code for floating point ABS or NEG. */
18664 void
18665 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18666 rtx operands[])
18668 rtx mask, set, dst, src;
18669 bool use_sse = false;
18670 bool vector_mode = VECTOR_MODE_P (mode);
18671 enum machine_mode vmode = mode;
18673 if (vector_mode)
18674 use_sse = true;
18675 else if (mode == TFmode)
18676 use_sse = true;
18677 else if (TARGET_SSE_MATH)
18679 use_sse = SSE_FLOAT_MODE_P (mode);
18680 if (mode == SFmode)
18681 vmode = V4SFmode;
18682 else if (mode == DFmode)
18683 vmode = V2DFmode;
18686 /* NEG and ABS performed with SSE use bitwise mask operations.
18687 Create the appropriate mask now. */
18688 if (use_sse)
18689 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18690 else
18691 mask = NULL_RTX;
18693 dst = operands[0];
18694 src = operands[1];
18696 set = gen_rtx_fmt_e (code, mode, src);
18697 set = gen_rtx_SET (VOIDmode, dst, set);
18699 if (mask)
18701 rtx use, clob;
18702 rtvec par;
18704 use = gen_rtx_USE (VOIDmode, mask);
18705 if (vector_mode)
18706 par = gen_rtvec (2, set, use);
18707 else
18709 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18710 par = gen_rtvec (3, set, use, clob);
18712 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18714 else
18715 emit_insn (set);
18718 /* Expand a copysign operation. Special case operand 0 being a constant. */
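/* Roughly (a sketch of the mask-based expansion): copysign (x, y) is
   computed as (abs-mask & x) | (sign-mask & y), with the masks built
   by ix86_build_signbit_mask; the constant-operand case folds the
   absolute value of the constant at expand time. */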
18720 void
18721 ix86_expand_copysign (rtx operands[])
18723 enum machine_mode mode, vmode;
18724 rtx dest, op0, op1, mask, nmask;
18726 dest = operands[0];
18727 op0 = operands[1];
18728 op1 = operands[2];
18730 mode = GET_MODE (dest);
18732 if (mode == SFmode)
18733 vmode = V4SFmode;
18734 else if (mode == DFmode)
18735 vmode = V2DFmode;
18736 else
18737 vmode = mode;
18739 if (GET_CODE (op0) == CONST_DOUBLE)
18741 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18743 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18744 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18746 if (mode == SFmode || mode == DFmode)
18748 if (op0 == CONST0_RTX (mode))
18749 op0 = CONST0_RTX (vmode);
18750 else
18752 rtx v = ix86_build_const_vector (vmode, false, op0);
18754 op0 = force_reg (vmode, v);
18757 else if (op0 != CONST0_RTX (mode))
18758 op0 = force_reg (mode, op0);
18760 mask = ix86_build_signbit_mask (vmode, 0, 0);
18762 if (mode == SFmode)
18763 copysign_insn = gen_copysignsf3_const;
18764 else if (mode == DFmode)
18765 copysign_insn = gen_copysigndf3_const;
18766 else
18767 copysign_insn = gen_copysigntf3_const;
18769 emit_insn (copysign_insn (dest, op0, op1, mask));
18771 else
18773 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18775 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18776 mask = ix86_build_signbit_mask (vmode, 0, 0);
18778 if (mode == SFmode)
18779 copysign_insn = gen_copysignsf3_var;
18780 else if (mode == DFmode)
18781 copysign_insn = gen_copysigndf3_var;
18782 else
18783 copysign_insn = gen_copysigntf3_var;
18785 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18789 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18790 be a constant, and so has already been expanded into a vector constant. */
18792 void
18793 ix86_split_copysign_const (rtx operands[])
18795 enum machine_mode mode, vmode;
18796 rtx dest, op0, mask, x;
18798 dest = operands[0];
18799 op0 = operands[1];
18800 mask = operands[3];
18802 mode = GET_MODE (dest);
18803 vmode = GET_MODE (mask);
18805 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18806 x = gen_rtx_AND (vmode, dest, mask);
18807 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18809 if (op0 != CONST0_RTX (vmode))
18811 x = gen_rtx_IOR (vmode, dest, op0);
18812 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18816 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18817 so we have to do two masks. */
18819 void
18820 ix86_split_copysign_var (rtx operands[])
18822 enum machine_mode mode, vmode;
18823 rtx dest, scratch, op0, op1, mask, nmask, x;
18825 dest = operands[0];
18826 scratch = operands[1];
18827 op0 = operands[2];
18828 op1 = operands[3];
18829 nmask = operands[4];
18830 mask = operands[5];
18832 mode = GET_MODE (dest);
18833 vmode = GET_MODE (mask);
18835 if (rtx_equal_p (op0, op1))
18837 /* Shouldn't happen often (it's useless, obviously), but when it does
18838 we'd generate incorrect code if we continue below. */
18839 emit_move_insn (dest, op0);
18840 return;
18843 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18845 gcc_assert (REGNO (op1) == REGNO (scratch));
18847 x = gen_rtx_AND (vmode, scratch, mask);
18848 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18850 dest = mask;
18851 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18852 x = gen_rtx_NOT (vmode, dest);
18853 x = gen_rtx_AND (vmode, x, op0);
18854 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18856 else
18858 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18860 x = gen_rtx_AND (vmode, scratch, mask);
18862 else /* alternative 2,4 */
18864 gcc_assert (REGNO (mask) == REGNO (scratch));
18865 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18866 x = gen_rtx_AND (vmode, scratch, op1);
18868 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18870 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18872 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18873 x = gen_rtx_AND (vmode, dest, nmask);
18875 else /* alternative 3,4 */
18877 gcc_assert (REGNO (nmask) == REGNO (dest));
18878 dest = nmask;
18879 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18880 x = gen_rtx_AND (vmode, dest, op0);
18882 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18885 x = gen_rtx_IOR (vmode, dest, scratch);
18886 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18889 /* Return TRUE or FALSE depending on whether the first SET in INSN
18890 has source and destination with matching CC modes, and whether the
18891 CC mode is at least as constrained as REQ_MODE. */
18893 bool
18894 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18896 rtx set;
18897 enum machine_mode set_mode;
18899 set = PATTERN (insn);
18900 if (GET_CODE (set) == PARALLEL)
18901 set = XVECEXP (set, 0, 0);
18902 gcc_assert (GET_CODE (set) == SET);
18903 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18905 set_mode = GET_MODE (SET_DEST (set));
18906 switch (set_mode)
18908 case CCNOmode:
18909 if (req_mode != CCNOmode
18910 && (req_mode != CCmode
18911 || XEXP (SET_SRC (set), 1) != const0_rtx))
18912 return false;
18913 break;
18914 case CCmode:
18915 if (req_mode == CCGCmode)
18916 return false;
18917 /* FALLTHRU */
18918 case CCGCmode:
18919 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18920 return false;
18921 /* FALLTHRU */
18922 case CCGOCmode:
18923 if (req_mode == CCZmode)
18924 return false;
18925 /* FALLTHRU */
18926 case CCZmode:
18927 break;
18929 case CCAmode:
18930 case CCCmode:
18931 case CCOmode:
18932 case CCSmode:
18933 if (set_mode != req_mode)
18934 return false;
18935 break;
18937 default:
18938 gcc_unreachable ();
18941 return GET_MODE (SET_SRC (set)) == set_mode;
18944 /* Generate insn patterns to do an integer compare of OPERANDS. */
18946 static rtx
18947 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18949 enum machine_mode cmpmode;
18950 rtx tmp, flags;
18952 cmpmode = SELECT_CC_MODE (code, op0, op1);
18953 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18955 /* This is very simple, but making the interface the same as in the
18956 FP case makes the rest of the code easier. */
18957 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18958 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18960 /* Return the test that should be put into the flags user, i.e.
18961 the bcc, scc, or cmov instruction. */
18962 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18965 /* Figure out whether to use ordered or unordered fp comparisons.
18966 Return the appropriate mode to use. */
18968 enum machine_mode
18969 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18971 /* ??? In order to make all comparisons reversible, we do all comparisons
18972 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18973 all forms of trapping and nontrapping comparisons, we can make inequality
18974 comparisons trapping again, since it results in better code when using
18975 FCOM based compares. */
18976 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18979 enum machine_mode
18980 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18982 enum machine_mode mode = GET_MODE (op0);
18984 if (SCALAR_FLOAT_MODE_P (mode))
18986 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18987 return ix86_fp_compare_mode (code);
18990 switch (code)
18992 /* Only zero flag is needed. */
18993 case EQ: /* ZF=0 */
18994 case NE: /* ZF!=0 */
18995 return CCZmode;
18996 /* Codes needing carry flag. */
18997 case GEU: /* CF=0 */
18998 case LTU: /* CF=1 */
18999 /* Detect overflow checks. They need just the carry flag. */
19000 if (GET_CODE (op0) == PLUS
19001 && rtx_equal_p (op1, XEXP (op0, 0)))
19002 return CCCmode;
19003 else
19004 return CCmode;
19005 case GTU: /* CF=0 & ZF=0 */
19006 case LEU: /* CF=1 | ZF=1 */
19007 /* Detect overflow checks. They need just the carry flag. */
19008 if (GET_CODE (op0) == MINUS
19009 && rtx_equal_p (op1, XEXP (op0, 0)))
19010 return CCCmode;
19011 else
19012 return CCmode;
19013 /* Codes possibly doable only with sign flag when
19014 comparing against zero. */
19015 case GE: /* SF=OF or SF=0 */
19016 case LT: /* SF<>OF or SF=1 */
19017 if (op1 == const0_rtx)
19018 return CCGOCmode;
19019 else
19020 /* For other cases Carry flag is not required. */
19021 return CCGCmode;
19022 /* Codes doable only with the sign flag when comparing
19023 against zero, but we lack a jump instruction for that,
19024 so we need to use relational tests against overflow,
19025 which thus needs to be zero. */
19026 case GT: /* ZF=0 & SF=OF */
19027 case LE: /* ZF=1 | SF<>OF */
19028 if (op1 == const0_rtx)
19029 return CCNOmode;
19030 else
19031 return CCGCmode;
19032 /* The strcmp pattern does (use flags), and combine may ask us for the
19033 proper mode. */
19034 case USE:
19035 return CCmode;
19036 default:
19037 gcc_unreachable ();
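/* Illustrative sketch (not part of GCC): the source-level idiom that the
   CCCmode cases above detect.  An unsigned "a + b < a" test is an overflow
   check that needs only the carry flag, so the compare can reuse the flags
   set by the addition.  The function name is hypothetical.  */
#if 0
#include <stdint.h>

static int
add_overflows (uint32_t a, uint32_t b)
{
  /* LTU with op0 == (PLUS a b) and op1 == a, as matched above.  */
  return a + b < a;
}
#endif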
19041 /* Return the fixed registers used for condition codes. */
19043 static bool
19044 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19046 *p1 = FLAGS_REG;
19047 *p2 = FPSR_REG;
19048 return true;
19051 /* If two condition code modes are compatible, return a condition code
19052 mode which is compatible with both. Otherwise, return
19053 VOIDmode. */
19055 static enum machine_mode
19056 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19058 if (m1 == m2)
19059 return m1;
19061 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19062 return VOIDmode;
19064 if ((m1 == CCGCmode && m2 == CCGOCmode)
19065 || (m1 == CCGOCmode && m2 == CCGCmode))
19066 return CCGCmode;
19068 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19069 return m2;
19070 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19071 return m1;
19073 switch (m1)
19075 default:
19076 gcc_unreachable ();
19078 case CCmode:
19079 case CCGCmode:
19080 case CCGOCmode:
19081 case CCNOmode:
19082 case CCAmode:
19083 case CCCmode:
19084 case CCOmode:
19085 case CCSmode:
19086 case CCZmode:
19087 switch (m2)
19089 default:
19090 return VOIDmode;
19092 case CCmode:
19093 case CCGCmode:
19094 case CCGOCmode:
19095 case CCNOmode:
19096 case CCAmode:
19097 case CCCmode:
19098 case CCOmode:
19099 case CCSmode:
19100 case CCZmode:
19101 return CCmode;
19104 case CCFPmode:
19105 case CCFPUmode:
19106 /* These are only compatible with themselves, which we already
19107 checked above. */
19108 return VOIDmode;
19113 /* Return a comparison we can do that is equivalent to
19114 swap_condition (code), apart possibly from orderedness.
19115 But never change orderedness if TARGET_IEEE_FP, returning
19116 UNKNOWN in that case if necessary. */
19118 static enum rtx_code
19119 ix86_fp_swap_condition (enum rtx_code code)
19121 switch (code)
19123 case GT: /* GTU - CF=0 & ZF=0 */
19124 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19125 case GE: /* GEU - CF=0 */
19126 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19127 case UNLT: /* LTU - CF=1 */
19128 return TARGET_IEEE_FP ? UNKNOWN : GT;
19129 case UNLE: /* LEU - CF=1 | ZF=1 */
19130 return TARGET_IEEE_FP ? UNKNOWN : GE;
19131 default:
19132 return swap_condition (code);
19136 /* Return the cost of comparison CODE using the best strategy for performance.
19137 All following functions use the number of instructions as a cost metric.
19138 In the future this should be tweaked to compute bytes for optimize_size and
19139 take into account the performance of various instructions on various CPUs. */
19141 static int
19142 ix86_fp_comparison_cost (enum rtx_code code)
19144 int arith_cost;
19146 /* The cost of code using bit-twiddling on %ah. */
19147 switch (code)
19149 case UNLE:
19150 case UNLT:
19151 case LTGT:
19152 case GT:
19153 case GE:
19154 case UNORDERED:
19155 case ORDERED:
19156 case UNEQ:
19157 arith_cost = 4;
19158 break;
19159 case LT:
19160 case NE:
19161 case EQ:
19162 case UNGE:
19163 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19164 break;
19165 case LE:
19166 case UNGT:
19167 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19168 break;
19169 default:
19170 gcc_unreachable ();
19173 switch (ix86_fp_comparison_strategy (code))
19175 case IX86_FPCMP_COMI:
19176 return arith_cost > 4 ? 3 : 2;
19177 case IX86_FPCMP_SAHF:
19178 return arith_cost > 4 ? 4 : 3;
19179 default:
19180 return arith_cost;
19184 /* Return the strategy to use for floating-point comparisons. We assume that
19185 fcomi is always preferable where available, since that is also true when
19186 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19188 enum ix86_fpcmp_strategy
19189 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19191 /* Do fcomi/sahf based test when profitable. */
19193 if (TARGET_CMOVE)
19194 return IX86_FPCMP_COMI;
19196 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19197 return IX86_FPCMP_SAHF;
19199 return IX86_FPCMP_ARITH;
19202 /* Swap, force into registers, or otherwise massage the two operands
19203 to a fp comparison. The operands are updated in place; the new
19204 comparison code is returned. */
19206 static enum rtx_code
19207 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19209 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19210 rtx op0 = *pop0, op1 = *pop1;
19211 enum machine_mode op_mode = GET_MODE (op0);
19212 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19214 /* All of the unordered compare instructions only work on registers.
19215 The same is true of the fcomi compare instructions. The XFmode
19216 compare instructions require registers except when comparing
19217 against zero or when converting operand 1 from fixed point to
19218 floating point. */
19220 if (!is_sse
19221 && (fpcmp_mode == CCFPUmode
19222 || (op_mode == XFmode
19223 && ! (standard_80387_constant_p (op0) == 1
19224 || standard_80387_constant_p (op1) == 1)
19225 && GET_CODE (op1) != FLOAT)
19226 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19228 op0 = force_reg (op_mode, op0);
19229 op1 = force_reg (op_mode, op1);
19231 else
19233 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19234 things around if they appear profitable, otherwise force op0
19235 into a register. */
19237 if (standard_80387_constant_p (op0) == 0
19238 || (MEM_P (op0)
19239 && ! (standard_80387_constant_p (op1) == 0
19240 || MEM_P (op1))))
19242 enum rtx_code new_code = ix86_fp_swap_condition (code);
19243 if (new_code != UNKNOWN)
19245 rtx tmp;
19246 tmp = op0, op0 = op1, op1 = tmp;
19247 code = new_code;
19251 if (!REG_P (op0))
19252 op0 = force_reg (op_mode, op0);
19254 if (CONSTANT_P (op1))
19256 int tmp = standard_80387_constant_p (op1);
19257 if (tmp == 0)
19258 op1 = validize_mem (force_const_mem (op_mode, op1));
19259 else if (tmp == 1)
19261 if (TARGET_CMOVE)
19262 op1 = force_reg (op_mode, op1);
19264 else
19265 op1 = force_reg (op_mode, op1);
19269 /* Try to rearrange the comparison to make it cheaper. */
19270 if (ix86_fp_comparison_cost (code)
19271 > ix86_fp_comparison_cost (swap_condition (code))
19272 && (REG_P (op1) || can_create_pseudo_p ()))
19274 rtx tmp;
19275 tmp = op0, op0 = op1, op1 = tmp;
19276 code = swap_condition (code);
19277 if (!REG_P (op0))
19278 op0 = force_reg (op_mode, op0);
19281 *pop0 = op0;
19282 *pop1 = op1;
19283 return code;
19286 /* Convert comparison codes we use to represent FP comparison to integer
19287 code that will result in proper branch. Return UNKNOWN if no such code
19288 is available. */
19290 enum rtx_code
19291 ix86_fp_compare_code_to_integer (enum rtx_code code)
19293 switch (code)
19295 case GT:
19296 return GTU;
19297 case GE:
19298 return GEU;
19299 case ORDERED:
19300 case UNORDERED:
19301 return code;
19302 break;
19303 case UNEQ:
19304 return EQ;
19305 break;
19306 case UNLT:
19307 return LTU;
19308 break;
19309 case UNLE:
19310 return LEU;
19311 break;
19312 case LTGT:
19313 return NE;
19314 break;
19315 default:
19316 return UNKNOWN;
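/* Illustrative sketch (not part of GCC): why the mapping above turns FP
   codes into their unsigned integer counterparts.  comiss/ucomiss (and
   fnstsw+sahf) put the FP relation into ZF/PF/CF exactly as an unsigned
   integer compare would, so "greater" is tested with the "above" condition
   (GTU) rather than the signed one.  The function name is hypothetical and
   the exact instruction selection depends on the target flags.  */
#if 0
static int
fp_greater (float a, float b)
{
  /* With SSE math this becomes roughly:  comiss  b, a ;  seta  result  */
  return a > b;
}
#endif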
19320 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19322 static rtx
19323 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19325 enum machine_mode fpcmp_mode, intcmp_mode;
19326 rtx tmp, tmp2;
19328 fpcmp_mode = ix86_fp_compare_mode (code);
19329 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19331 /* Do fcomi/sahf based test when profitable. */
19332 switch (ix86_fp_comparison_strategy (code))
19334 case IX86_FPCMP_COMI:
19335 intcmp_mode = fpcmp_mode;
19336 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19337 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19338 tmp);
19339 emit_insn (tmp);
19340 break;
19342 case IX86_FPCMP_SAHF:
19343 intcmp_mode = fpcmp_mode;
19344 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19345 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19346 tmp);
19348 if (!scratch)
19349 scratch = gen_reg_rtx (HImode);
19350 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19351 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19352 break;
19354 case IX86_FPCMP_ARITH:
19355 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19356 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19357 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19358 if (!scratch)
19359 scratch = gen_reg_rtx (HImode);
19360 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19362 /* In the unordered case, we have to check C2 for NaNs, which
19363 doesn't happen to work out to anything nice combination-wise.
19364 So do some bit twiddling on the value we've got in AH to come
19365 up with an appropriate set of condition codes. */
19367 intcmp_mode = CCNOmode;
19368 switch (code)
19370 case GT:
19371 case UNGT:
19372 if (code == GT || !TARGET_IEEE_FP)
19374 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19375 code = EQ;
19377 else
19379 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19380 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19381 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19382 intcmp_mode = CCmode;
19383 code = GEU;
19385 break;
19386 case LT:
19387 case UNLT:
19388 if (code == LT && TARGET_IEEE_FP)
19390 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19391 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19392 intcmp_mode = CCmode;
19393 code = EQ;
19395 else
19397 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19398 code = NE;
19400 break;
19401 case GE:
19402 case UNGE:
19403 if (code == GE || !TARGET_IEEE_FP)
19405 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19406 code = EQ;
19408 else
19410 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19411 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19412 code = NE;
19414 break;
19415 case LE:
19416 case UNLE:
19417 if (code == LE && TARGET_IEEE_FP)
19419 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19420 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19421 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19422 intcmp_mode = CCmode;
19423 code = LTU;
19425 else
19427 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19428 code = NE;
19430 break;
19431 case EQ:
19432 case UNEQ:
19433 if (code == EQ && TARGET_IEEE_FP)
19435 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19436 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19437 intcmp_mode = CCmode;
19438 code = EQ;
19440 else
19442 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19443 code = NE;
19445 break;
19446 case NE:
19447 case LTGT:
19448 if (code == NE && TARGET_IEEE_FP)
19450 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19451 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19452 GEN_INT (0x40)));
19453 code = NE;
19455 else
19457 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19458 code = EQ;
19460 break;
19462 case UNORDERED:
19463 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19464 code = NE;
19465 break;
19466 case ORDERED:
19467 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19468 code = EQ;
19469 break;
19471 default:
19472 gcc_unreachable ();
19474 break;
19476 default:
19477 gcc_unreachable ();
19480 /* Return the test that should be put into the flags user, i.e.
19481 the bcc, scc, or cmov instruction. */
19482 return gen_rtx_fmt_ee (code, VOIDmode,
19483 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19484 const0_rtx);
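/* Illustrative sketch (not part of GCC): what the magic constants used on
   the fnstsw result above mean.  After "fnstsw %ax", AH holds the FPU
   condition bits C0 (0x01), C2 (0x04) and C3 (0x40); C2 is set when the
   compare was unordered (a NaN was involved).  For example, the
   "test $0x45, %ah" emitted for GT followed by the EQ test branches only
   when C0 = C2 = C3 = 0, i.e. the ordered greater-than case.  The macro
   names are hypothetical.  */
#if 0
#define FPSW_AH_C0 0x01 /* set when op0 < op1 */
#define FPSW_AH_C2 0x04 /* set when the result is unordered */
#define FPSW_AH_C3 0x40 /* set when op0 == op1 */
#define FPSW_AH_ANY (FPSW_AH_C0 | FPSW_AH_C2 | FPSW_AH_C3) /* 0x45 */
#endif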
19487 static rtx
19488 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19490 rtx ret;
19492 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19493 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19495 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19497 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19498 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19500 else
19501 ret = ix86_expand_int_compare (code, op0, op1);
19503 return ret;
19506 void
19507 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19509 enum machine_mode mode = GET_MODE (op0);
19510 rtx tmp;
19512 switch (mode)
19514 case SFmode:
19515 case DFmode:
19516 case XFmode:
19517 case QImode:
19518 case HImode:
19519 case SImode:
19520 simple:
19521 tmp = ix86_expand_compare (code, op0, op1);
19522 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19523 gen_rtx_LABEL_REF (VOIDmode, label),
19524 pc_rtx);
19525 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19526 return;
19528 case DImode:
19529 if (TARGET_64BIT)
19530 goto simple;
19531 case TImode:
19532 /* Expand DImode branch into multiple compare+branch. */
19534 rtx lo[2], hi[2], label2;
19535 enum rtx_code code1, code2, code3;
19536 enum machine_mode submode;
19538 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19540 tmp = op0, op0 = op1, op1 = tmp;
19541 code = swap_condition (code);
19544 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19545 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19547 submode = mode == DImode ? SImode : DImode;
19549 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19550 avoid two branches. This costs one extra insn, so disable when
19551 optimizing for size. */
19553 if ((code == EQ || code == NE)
19554 && (!optimize_insn_for_size_p ()
19555 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19557 rtx xor0, xor1;
19559 xor1 = hi[0];
19560 if (hi[1] != const0_rtx)
19561 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19562 NULL_RTX, 0, OPTAB_WIDEN);
19564 xor0 = lo[0];
19565 if (lo[1] != const0_rtx)
19566 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19567 NULL_RTX, 0, OPTAB_WIDEN);
19569 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19570 NULL_RTX, 0, OPTAB_WIDEN);
19572 ix86_expand_branch (code, tmp, const0_rtx, label);
19573 return;
19576 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19577 op1 is a constant and the low word is zero, then we can just
19578 examine the high word. Similarly for low word -1 and
19579 less-or-equal-than or greater-than. */
19581 if (CONST_INT_P (hi[1]))
19582 switch (code)
19584 case LT: case LTU: case GE: case GEU:
19585 if (lo[1] == const0_rtx)
19587 ix86_expand_branch (code, hi[0], hi[1], label);
19588 return;
19590 break;
19591 case LE: case LEU: case GT: case GTU:
19592 if (lo[1] == constm1_rtx)
19594 ix86_expand_branch (code, hi[0], hi[1], label);
19595 return;
19597 break;
19598 default:
19599 break;
19602 /* Otherwise, we need two or three jumps. */
19604 label2 = gen_label_rtx ();
19606 code1 = code;
19607 code2 = swap_condition (code);
19608 code3 = unsigned_condition (code);
19610 switch (code)
19612 case LT: case GT: case LTU: case GTU:
19613 break;
19615 case LE: code1 = LT; code2 = GT; break;
19616 case GE: code1 = GT; code2 = LT; break;
19617 case LEU: code1 = LTU; code2 = GTU; break;
19618 case GEU: code1 = GTU; code2 = LTU; break;
19620 case EQ: code1 = UNKNOWN; code2 = NE; break;
19621 case NE: code2 = UNKNOWN; break;
19623 default:
19624 gcc_unreachable ();
19628 * a < b =>
19629 * if (hi(a) < hi(b)) goto true;
19630 * if (hi(a) > hi(b)) goto false;
19631 * if (lo(a) < lo(b)) goto true;
19632 * false:
19635 if (code1 != UNKNOWN)
19636 ix86_expand_branch (code1, hi[0], hi[1], label);
19637 if (code2 != UNKNOWN)
19638 ix86_expand_branch (code2, hi[0], hi[1], label2);
19640 ix86_expand_branch (code3, lo[0], lo[1], label);
19642 if (code2 != UNKNOWN)
19643 emit_label (label2);
19644 return;
19647 default:
19648 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19649 goto simple;
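/* Illustrative sketch (not part of GCC): the double-word branch splitting
   performed above, written out in C for a signed 64-bit "<" on a 32-bit
   target.  The helper name and the HI/LO macros are hypothetical.  */
#if 0
#include <stdint.h>

#define HI(x) ((int32_t) ((x) >> 32))   /* signed compare on the high words */
#define LO(x) ((uint32_t) (x))          /* unsigned compare on the low words */

static int
di_less_than (int64_t a, int64_t b)
{
  if (HI (a) < HI (b))      /* code1: branch to the true label */
    return 1;
  if (HI (a) > HI (b))      /* code2: branch to the false label */
    return 0;
  return LO (a) < LO (b);   /* code3: unsigned compare of the low words */
}
#endif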
19653 /* Split branch based on floating point condition. */
19654 void
19655 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19656 rtx target1, rtx target2, rtx tmp, rtx pushed)
19658 rtx condition;
19659 rtx i;
19661 if (target2 != pc_rtx)
19663 rtx tmp = target2;
19664 code = reverse_condition_maybe_unordered (code);
19665 target2 = target1;
19666 target1 = tmp;
19669 condition = ix86_expand_fp_compare (code, op1, op2,
19670 tmp);
19672 /* Remove pushed operand from stack. */
19673 if (pushed)
19674 ix86_free_from_memory (GET_MODE (pushed));
19676 i = emit_jump_insn (gen_rtx_SET
19677 (VOIDmode, pc_rtx,
19678 gen_rtx_IF_THEN_ELSE (VOIDmode,
19679 condition, target1, target2)));
19680 if (split_branch_probability >= 0)
19681 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19684 void
19685 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19687 rtx ret;
19689 gcc_assert (GET_MODE (dest) == QImode);
19691 ret = ix86_expand_compare (code, op0, op1);
19692 PUT_MODE (ret, QImode);
19693 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19696 /* Expand comparison setting or clearing carry flag. Return true when
19697 successful and set pop for the operation. */
19698 static bool
19699 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19701 enum machine_mode mode =
19702 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19704 /* Do not handle double-mode compares that go through a special path. */
19705 if (mode == (TARGET_64BIT ? TImode : DImode))
19706 return false;
19708 if (SCALAR_FLOAT_MODE_P (mode))
19710 rtx compare_op, compare_seq;
19712 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19714 /* Shortcut: the following common codes never translate
19715 into carry flag compares. */
19716 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19717 || code == ORDERED || code == UNORDERED)
19718 return false;
19720 /* These comparisons require the zero flag; swap operands so they won't. */
19721 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19722 && !TARGET_IEEE_FP)
19724 rtx tmp = op0;
19725 op0 = op1;
19726 op1 = tmp;
19727 code = swap_condition (code);
19730 /* Try to expand the comparison and verify that we end up with a
19731 carry-flag-based comparison. This fails to be true only when we
19732 decide to expand the comparison using arithmetic, which is not a
19733 common scenario. */
19734 start_sequence ();
19735 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19736 compare_seq = get_insns ();
19737 end_sequence ();
19739 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19740 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19741 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19742 else
19743 code = GET_CODE (compare_op);
19745 if (code != LTU && code != GEU)
19746 return false;
19748 emit_insn (compare_seq);
19749 *pop = compare_op;
19750 return true;
19753 if (!INTEGRAL_MODE_P (mode))
19754 return false;
19756 switch (code)
19758 case LTU:
19759 case GEU:
19760 break;
19762 /* Convert a==0 into (unsigned)a<1. */
19763 case EQ:
19764 case NE:
19765 if (op1 != const0_rtx)
19766 return false;
19767 op1 = const1_rtx;
19768 code = (code == EQ ? LTU : GEU);
19769 break;
19771 /* Convert a>b into b<a or a>=b-1. */
19772 case GTU:
19773 case LEU:
19774 if (CONST_INT_P (op1))
19776 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19777 /* Bail out on overflow. We can still swap the operands, but that
19778 would force loading the constant into a register. */
19779 if (op1 == const0_rtx
19780 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19781 return false;
19782 code = (code == GTU ? GEU : LTU);
19784 else
19786 rtx tmp = op1;
19787 op1 = op0;
19788 op0 = tmp;
19789 code = (code == GTU ? LTU : GEU);
19791 break;
19793 /* Convert a>=0 into (unsigned)a<0x80000000. */
19794 case LT:
19795 case GE:
19796 if (mode == DImode || op1 != const0_rtx)
19797 return false;
19798 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19799 code = (code == LT ? GEU : LTU);
19800 break;
19801 case LE:
19802 case GT:
19803 if (mode == DImode || op1 != constm1_rtx)
19804 return false;
19805 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19806 code = (code == LE ? GEU : LTU);
19807 break;
19809 default:
19810 return false;
19812 /* Swapping operands may cause a constant to appear as the first operand. */
19813 if (!nonimmediate_operand (op0, VOIDmode))
19815 if (!can_create_pseudo_p ())
19816 return false;
19817 op0 = force_reg (mode, op0);
19819 *pop = ix86_expand_compare (code, op0, op1);
19820 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19821 return true;
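/* Illustrative sketch (not part of GCC): two of the rewrites above, which
   turn a comparison into one that sets only the carry flag (LTU/GEU) so
   the sbb-based sequences in ix86_expand_int_movcc can consume it.  The
   function names are hypothetical.  */
#if 0
#include <stdint.h>

/* a == 0 is rewritten as the carry-only test (unsigned) a < 1.  */
static int is_zero (uint32_t a) { return a < 1u; }

/* Signed a >= 0 is rewritten as (unsigned) a < 0x80000000.  */
static int is_nonnegative (int32_t a) { return (uint32_t) a < 0x80000000u; }
#endif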
19824 bool
19825 ix86_expand_int_movcc (rtx operands[])
19827 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19828 rtx compare_seq, compare_op;
19829 enum machine_mode mode = GET_MODE (operands[0]);
19830 bool sign_bit_compare_p = false;
19831 rtx op0 = XEXP (operands[1], 0);
19832 rtx op1 = XEXP (operands[1], 1);
19834 if (GET_MODE (op0) == TImode
19835 || (GET_MODE (op0) == DImode
19836 && !TARGET_64BIT))
19837 return false;
19839 start_sequence ();
19840 compare_op = ix86_expand_compare (code, op0, op1);
19841 compare_seq = get_insns ();
19842 end_sequence ();
19844 compare_code = GET_CODE (compare_op);
19846 if ((op1 == const0_rtx && (code == GE || code == LT))
19847 || (op1 == constm1_rtx && (code == GT || code == LE)))
19848 sign_bit_compare_p = true;
19850 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19851 HImode insns, we'd be swallowed in word prefix ops. */
19853 if ((mode != HImode || TARGET_FAST_PREFIX)
19854 && (mode != (TARGET_64BIT ? TImode : DImode))
19855 && CONST_INT_P (operands[2])
19856 && CONST_INT_P (operands[3]))
19858 rtx out = operands[0];
19859 HOST_WIDE_INT ct = INTVAL (operands[2]);
19860 HOST_WIDE_INT cf = INTVAL (operands[3]);
19861 HOST_WIDE_INT diff;
19863 diff = ct - cf;
19864 /* Sign bit compares are better done using shifts than by using
19865 sbb. */
19866 if (sign_bit_compare_p
19867 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19869 /* Detect overlap between destination and compare sources. */
19870 rtx tmp = out;
19872 if (!sign_bit_compare_p)
19874 rtx flags;
19875 bool fpcmp = false;
19877 compare_code = GET_CODE (compare_op);
19879 flags = XEXP (compare_op, 0);
19881 if (GET_MODE (flags) == CCFPmode
19882 || GET_MODE (flags) == CCFPUmode)
19884 fpcmp = true;
19885 compare_code
19886 = ix86_fp_compare_code_to_integer (compare_code);
19889 /* To simplify the rest of the code, restrict to the GEU case. */
19890 if (compare_code == LTU)
19892 HOST_WIDE_INT tmp = ct;
19893 ct = cf;
19894 cf = tmp;
19895 compare_code = reverse_condition (compare_code);
19896 code = reverse_condition (code);
19898 else
19900 if (fpcmp)
19901 PUT_CODE (compare_op,
19902 reverse_condition_maybe_unordered
19903 (GET_CODE (compare_op)));
19904 else
19905 PUT_CODE (compare_op,
19906 reverse_condition (GET_CODE (compare_op)));
19908 diff = ct - cf;
19910 if (reg_overlap_mentioned_p (out, op0)
19911 || reg_overlap_mentioned_p (out, op1))
19912 tmp = gen_reg_rtx (mode);
19914 if (mode == DImode)
19915 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19916 else
19917 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19918 flags, compare_op));
19920 else
19922 if (code == GT || code == GE)
19923 code = reverse_condition (code);
19924 else
19926 HOST_WIDE_INT tmp = ct;
19927 ct = cf;
19928 cf = tmp;
19929 diff = ct - cf;
19931 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19934 if (diff == 1)
19937 * cmpl op0,op1
19938 * sbbl dest,dest
19939 * [addl dest, ct]
19941 * Size 5 - 8.
19943 if (ct)
19944 tmp = expand_simple_binop (mode, PLUS,
19945 tmp, GEN_INT (ct),
19946 copy_rtx (tmp), 1, OPTAB_DIRECT);
19948 else if (cf == -1)
19951 * cmpl op0,op1
19952 * sbbl dest,dest
19953 * orl $ct, dest
19955 * Size 8.
19957 tmp = expand_simple_binop (mode, IOR,
19958 tmp, GEN_INT (ct),
19959 copy_rtx (tmp), 1, OPTAB_DIRECT);
19961 else if (diff == -1 && ct)
19964 * cmpl op0,op1
19965 * sbbl dest,dest
19966 * notl dest
19967 * [addl dest, cf]
19969 * Size 8 - 11.
19971 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19972 if (cf)
19973 tmp = expand_simple_binop (mode, PLUS,
19974 copy_rtx (tmp), GEN_INT (cf),
19975 copy_rtx (tmp), 1, OPTAB_DIRECT);
19977 else
19980 * cmpl op0,op1
19981 * sbbl dest,dest
19982 * [notl dest]
19983 * andl cf - ct, dest
19984 * [addl dest, ct]
19986 * Size 8 - 11.
19989 if (cf == 0)
19991 cf = ct;
19992 ct = 0;
19993 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19996 tmp = expand_simple_binop (mode, AND,
19997 copy_rtx (tmp),
19998 gen_int_mode (cf - ct, mode),
19999 copy_rtx (tmp), 1, OPTAB_DIRECT);
20000 if (ct)
20001 tmp = expand_simple_binop (mode, PLUS,
20002 copy_rtx (tmp), GEN_INT (ct),
20003 copy_rtx (tmp), 1, OPTAB_DIRECT);
20006 if (!rtx_equal_p (tmp, out))
20007 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
20009 return true;
20012 if (diff < 0)
20014 enum machine_mode cmp_mode = GET_MODE (op0);
20016 HOST_WIDE_INT tmp;
20017 tmp = ct, ct = cf, cf = tmp;
20018 diff = -diff;
20020 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20022 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20024 /* We may be reversing an unordered compare to a normal compare, which
20025 is not valid in general (we may convert a non-trapping condition
20026 to a trapping one); however, on i386 we currently emit all
20027 comparisons unordered. */
20028 compare_code = reverse_condition_maybe_unordered (compare_code);
20029 code = reverse_condition_maybe_unordered (code);
20031 else
20033 compare_code = reverse_condition (compare_code);
20034 code = reverse_condition (code);
20038 compare_code = UNKNOWN;
20039 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20040 && CONST_INT_P (op1))
20042 if (op1 == const0_rtx
20043 && (code == LT || code == GE))
20044 compare_code = code;
20045 else if (op1 == constm1_rtx)
20047 if (code == LE)
20048 compare_code = LT;
20049 else if (code == GT)
20050 compare_code = GE;
20054 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20055 if (compare_code != UNKNOWN
20056 && GET_MODE (op0) == GET_MODE (out)
20057 && (cf == -1 || ct == -1))
20059 /* If the lea code below could be used, only optimize
20060 if it results in a 2-insn sequence. */
20062 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20063 || diff == 3 || diff == 5 || diff == 9)
20064 || (compare_code == LT && ct == -1)
20065 || (compare_code == GE && cf == -1))
20068 * notl op1 (if necessary)
20069 * sarl $31, op1
20070 * orl cf, op1
20072 if (ct != -1)
20074 cf = ct;
20075 ct = -1;
20076 code = reverse_condition (code);
20079 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20081 out = expand_simple_binop (mode, IOR,
20082 out, GEN_INT (cf),
20083 out, 1, OPTAB_DIRECT);
20084 if (out != operands[0])
20085 emit_move_insn (operands[0], out);
20087 return true;
20092 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20093 || diff == 3 || diff == 5 || diff == 9)
20094 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20095 && (mode != DImode
20096 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20099 * xorl dest,dest
20100 * cmpl op1,op2
20101 * setcc dest
20102 * lea cf(dest*(ct-cf)),dest
20104 * Size 14.
20106 * This also catches the degenerate setcc-only case.
20109 rtx tmp;
20110 int nops;
20112 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20114 nops = 0;
20115 /* On x86_64 the lea instruction operates on Pmode, so we need
20116 to get the arithmetic done in the proper mode to match. */
20117 if (diff == 1)
20118 tmp = copy_rtx (out);
20119 else
20121 rtx out1;
20122 out1 = copy_rtx (out);
20123 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20124 nops++;
20125 if (diff & 1)
20127 tmp = gen_rtx_PLUS (mode, tmp, out1);
20128 nops++;
20131 if (cf != 0)
20133 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20134 nops++;
20136 if (!rtx_equal_p (tmp, out))
20138 if (nops == 1)
20139 out = force_operand (tmp, copy_rtx (out));
20140 else
20141 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20143 if (!rtx_equal_p (out, operands[0]))
20144 emit_move_insn (operands[0], copy_rtx (out));
20146 return true;
20150 * General case: Jumpful:
20151 * xorl dest,dest cmpl op1, op2
20152 * cmpl op1, op2 movl ct, dest
20153 * setcc dest jcc 1f
20154 * decl dest movl cf, dest
20155 * andl (cf-ct),dest 1:
20156 * addl ct,dest
20158 * Size 20. Size 14.
20160 * This is reasonably steep, but branch mispredict costs are
20161 * high on modern CPUs, so consider failing only if optimizing
20162 * for space.
20165 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20166 && BRANCH_COST (optimize_insn_for_speed_p (),
20167 false) >= 2)
20169 if (cf == 0)
20171 enum machine_mode cmp_mode = GET_MODE (op0);
20173 cf = ct;
20174 ct = 0;
20176 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20178 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20180 /* We may be reversing an unordered compare to a normal compare,
20181 which is not valid in general (we may convert a non-trapping
20182 condition to a trapping one); however, on i386 we currently
20183 emit all comparisons unordered. */
20184 code = reverse_condition_maybe_unordered (code);
20186 else
20188 code = reverse_condition (code);
20189 if (compare_code != UNKNOWN)
20190 compare_code = reverse_condition (compare_code);
20194 if (compare_code != UNKNOWN)
20196 /* notl op1 (if needed)
20197 sarl $31, op1
20198 andl (cf-ct), op1
20199 addl ct, op1
20201 For x < 0 (resp. x <= -1) there will be no notl,
20202 so if possible swap the constants to get rid of the
20203 complement.
20204 True/false will be -1/0 while code below (store flag
20205 followed by decrement) is 0/-1, so the constants need
20206 to be exchanged once more. */
20208 if (compare_code == GE || !cf)
20210 code = reverse_condition (code);
20211 compare_code = LT;
20213 else
20215 HOST_WIDE_INT tmp = cf;
20216 cf = ct;
20217 ct = tmp;
20220 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20222 else
20224 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20226 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20227 constm1_rtx,
20228 copy_rtx (out), 1, OPTAB_DIRECT);
20231 out = expand_simple_binop (mode, AND, copy_rtx (out),
20232 gen_int_mode (cf - ct, mode),
20233 copy_rtx (out), 1, OPTAB_DIRECT);
20234 if (ct)
20235 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20236 copy_rtx (out), 1, OPTAB_DIRECT);
20237 if (!rtx_equal_p (out, operands[0]))
20238 emit_move_insn (operands[0], copy_rtx (out));
20240 return true;
20244 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20246 /* Try a few things more with specific constants and a variable. */
20248 optab op;
20249 rtx var, orig_out, out, tmp;
20251 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20252 return false;
20254 /* If one of the two operands is an interesting constant, load a
20255 constant with the above and mask it in with a logical operation. */
20257 if (CONST_INT_P (operands[2]))
20259 var = operands[3];
20260 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20261 operands[3] = constm1_rtx, op = and_optab;
20262 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20263 operands[3] = const0_rtx, op = ior_optab;
20264 else
20265 return false;
20267 else if (CONST_INT_P (operands[3]))
20269 var = operands[2];
20270 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20271 operands[2] = constm1_rtx, op = and_optab;
20272 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20273 operands[2] = const0_rtx, op = ior_optab;
20274 else
20275 return false;
20277 else
20278 return false;
20280 orig_out = operands[0];
20281 tmp = gen_reg_rtx (mode);
20282 operands[0] = tmp;
20284 /* Recurse to get the constant loaded. */
20285 if (ix86_expand_int_movcc (operands) == 0)
20286 return false;
20288 /* Mask in the interesting variable. */
20289 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20290 OPTAB_WIDEN);
20291 if (!rtx_equal_p (out, orig_out))
20292 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20294 return true;
20298 * For comparison with above,
20300 * movl cf,dest
20301 * movl ct,tmp
20302 * cmpl op1,op2
20303 * cmovcc tmp,dest
20305 * Size 15.
20308 if (! nonimmediate_operand (operands[2], mode))
20309 operands[2] = force_reg (mode, operands[2]);
20310 if (! nonimmediate_operand (operands[3], mode))
20311 operands[3] = force_reg (mode, operands[3]);
20313 if (! register_operand (operands[2], VOIDmode)
20314 && (mode == QImode
20315 || ! register_operand (operands[3], VOIDmode)))
20316 operands[2] = force_reg (mode, operands[2]);
20318 if (mode == QImode
20319 && ! register_operand (operands[3], VOIDmode))
20320 operands[3] = force_reg (mode, operands[3]);
20322 emit_insn (compare_seq);
20323 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20324 gen_rtx_IF_THEN_ELSE (mode,
20325 compare_op, operands[2],
20326 operands[3])));
20327 return true;
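/* Illustrative sketch (not part of GCC): the branchless selection computed
   by the cmp/sbb/and/add sequences above.  "sbb dest,dest" after a
   carry-setting compare yields 0 or -1, and masking plus an add turns that
   into either of two constants.  The helper name is hypothetical and shows
   one common form of the idiom.  */
#if 0
#include <stdint.h>

static int32_t
select_ct_cf (uint32_t a, uint32_t b, int32_t ct, int32_t cf)
{
  uint32_t mask = -(uint32_t) (a < b);  /* 0 or 0xffffffff -- the sbb result */
  /* cf + (mask & (ct - cf)) == (a < b ? ct : cf)  */
  return (int32_t) ((uint32_t) cf + (mask & ((uint32_t) ct - (uint32_t) cf)));
}
#endif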
20330 /* Swap, force into registers, or otherwise massage the two operands
20331 to an sse comparison with a mask result. Thus we differ a bit from
20332 ix86_prepare_fp_compare_args which expects to produce a flags result.
20334 The DEST operand exists to help determine whether to commute commutative
20335 operators. The POP0/POP1 operands are updated in place. The new
20336 comparison code is returned, or UNKNOWN if not implementable. */
20338 static enum rtx_code
20339 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20340 rtx *pop0, rtx *pop1)
20342 rtx tmp;
20344 switch (code)
20346 case LTGT:
20347 case UNEQ:
20348 /* AVX supports all the needed comparisons. */
20349 if (TARGET_AVX)
20350 break;
20351 /* We have no LTGT as an operator. We could implement it with
20352 NE & ORDERED, but this requires an extra temporary. It's
20353 not clear that it's worth it. */
20354 return UNKNOWN;
20356 case LT:
20357 case LE:
20358 case UNGT:
20359 case UNGE:
20360 /* These are supported directly. */
20361 break;
20363 case EQ:
20364 case NE:
20365 case UNORDERED:
20366 case ORDERED:
20367 /* AVX has 3 operand comparisons, no need to swap anything. */
20368 if (TARGET_AVX)
20369 break;
20370 /* For commutative operators, try to canonicalize the destination
20371 operand to be first in the comparison - this helps reload to
20372 avoid extra moves. */
20373 if (!dest || !rtx_equal_p (dest, *pop1))
20374 break;
20375 /* FALLTHRU */
20377 case GE:
20378 case GT:
20379 case UNLE:
20380 case UNLT:
20381 /* These are not supported directly before AVX, and furthermore
20382 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20383 comparison operands to transform into something that is
20384 supported. */
20385 tmp = *pop0;
20386 *pop0 = *pop1;
20387 *pop1 = tmp;
20388 code = swap_condition (code);
20389 break;
20391 default:
20392 gcc_unreachable ();
20395 return code;
20398 /* Detect conditional moves that exactly match min/max operational
20399 semantics. Note that this is IEEE safe, as long as we don't
20400 interchange the operands.
20402 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20403 and TRUE if the operation is successful and instructions are emitted. */
20405 static bool
20406 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20407 rtx cmp_op1, rtx if_true, rtx if_false)
20409 enum machine_mode mode;
20410 bool is_min;
20411 rtx tmp;
20413 if (code == LT)
20415 else if (code == UNGE)
20417 tmp = if_true;
20418 if_true = if_false;
20419 if_false = tmp;
20421 else
20422 return false;
20424 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20425 is_min = true;
20426 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20427 is_min = false;
20428 else
20429 return false;
20431 mode = GET_MODE (dest);
20433 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20434 but MODE may be a vector mode and thus not appropriate. */
20435 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20437 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20438 rtvec v;
20440 if_true = force_reg (mode, if_true);
20441 v = gen_rtvec (2, if_true, if_false);
20442 tmp = gen_rtx_UNSPEC (mode, v, u);
20444 else
20446 code = is_min ? SMIN : SMAX;
20447 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20450 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20451 return true;
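/* Illustrative sketch (not part of GCC): the exact source pattern the
   min/max matcher above recognizes.  minss/minps compute "a < b ? a : b"
   and return the second operand when the compare is unordered (or when
   the inputs are zeros of opposite sign), which is why the operands must
   not be interchanged; see the IEEE-safety note above.  The function name
   is hypothetical.  */
#if 0
static float
sse_style_min (float a, float b)
{
  return a < b ? a : b;   /* LT with if_true == a and if_false == b */
}
#endif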
20454 /* Expand an sse vector comparison. Return the register with the result. */
20456 static rtx
20457 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20458 rtx op_true, rtx op_false)
20460 enum machine_mode mode = GET_MODE (dest);
20461 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20462 rtx x;
20464 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20465 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20466 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20468 if (optimize
20469 || reg_overlap_mentioned_p (dest, op_true)
20470 || reg_overlap_mentioned_p (dest, op_false))
20471 dest = gen_reg_rtx (mode);
20473 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20474 if (cmp_mode != mode)
20476 x = force_reg (cmp_mode, x);
20477 convert_move (dest, x, false);
20479 else
20480 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20482 return dest;
20485 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20486 operations. This is used for both scalar and vector conditional moves. */
20488 static void
20489 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20491 enum machine_mode mode = GET_MODE (dest);
20492 rtx t2, t3, x;
20494 if (vector_all_ones_operand (op_true, mode)
20495 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20497 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20499 else if (op_false == CONST0_RTX (mode))
20501 op_true = force_reg (mode, op_true);
20502 x = gen_rtx_AND (mode, cmp, op_true);
20503 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20505 else if (op_true == CONST0_RTX (mode))
20507 op_false = force_reg (mode, op_false);
20508 x = gen_rtx_NOT (mode, cmp);
20509 x = gen_rtx_AND (mode, x, op_false);
20510 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20512 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20514 op_false = force_reg (mode, op_false);
20515 x = gen_rtx_IOR (mode, cmp, op_false);
20516 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20518 else if (TARGET_XOP)
20520 op_true = force_reg (mode, op_true);
20522 if (!nonimmediate_operand (op_false, mode))
20523 op_false = force_reg (mode, op_false);
20525 emit_insn (gen_rtx_SET (mode, dest,
20526 gen_rtx_IF_THEN_ELSE (mode, cmp,
20527 op_true,
20528 op_false)));
20530 else
20532 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20534 if (!nonimmediate_operand (op_true, mode))
20535 op_true = force_reg (mode, op_true);
20537 op_false = force_reg (mode, op_false);
20539 switch (mode)
20541 case V4SFmode:
20542 if (TARGET_SSE4_1)
20543 gen = gen_sse4_1_blendvps;
20544 break;
20545 case V2DFmode:
20546 if (TARGET_SSE4_1)
20547 gen = gen_sse4_1_blendvpd;
20548 break;
20549 case V16QImode:
20550 case V8HImode:
20551 case V4SImode:
20552 case V2DImode:
20553 if (TARGET_SSE4_1)
20555 gen = gen_sse4_1_pblendvb;
20556 dest = gen_lowpart (V16QImode, dest);
20557 op_false = gen_lowpart (V16QImode, op_false);
20558 op_true = gen_lowpart (V16QImode, op_true);
20559 cmp = gen_lowpart (V16QImode, cmp);
20561 break;
20562 case V8SFmode:
20563 if (TARGET_AVX)
20564 gen = gen_avx_blendvps256;
20565 break;
20566 case V4DFmode:
20567 if (TARGET_AVX)
20568 gen = gen_avx_blendvpd256;
20569 break;
20570 case V32QImode:
20571 case V16HImode:
20572 case V8SImode:
20573 case V4DImode:
20574 if (TARGET_AVX2)
20576 gen = gen_avx2_pblendvb;
20577 dest = gen_lowpart (V32QImode, dest);
20578 op_false = gen_lowpart (V32QImode, op_false);
20579 op_true = gen_lowpart (V32QImode, op_true);
20580 cmp = gen_lowpart (V32QImode, cmp);
20582 break;
20583 default:
20584 break;
20587 if (gen != NULL)
20588 emit_insn (gen (dest, op_false, op_true, cmp));
20589 else
20591 op_true = force_reg (mode, op_true);
20593 t2 = gen_reg_rtx (mode);
20594 if (optimize)
20595 t3 = gen_reg_rtx (mode);
20596 else
20597 t3 = dest;
20599 x = gen_rtx_AND (mode, op_true, cmp);
20600 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20602 x = gen_rtx_NOT (mode, cmp);
20603 x = gen_rtx_AND (mode, x, op_false);
20604 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20606 x = gen_rtx_IOR (mode, t3, t2);
20607 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
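/* Illustrative sketch (not part of GCC): the mask-blend identity used by
   the fallback AND/ANDN/IOR path above when no blendv instruction is
   available.  CMP is the all-ones/all-zeros element mask produced by the
   vector compare.  The helper name is hypothetical.  */
#if 0
#include <stdint.h>

static uint32_t
mask_blend (uint32_t cmp, uint32_t op_true, uint32_t op_false)
{
  /* t2 = op_true & cmp;  t3 = ~cmp & op_false;  dest = t3 | t2.  */
  return (op_true & cmp) | (~cmp & op_false);
}
#endif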
20612 /* Expand a floating-point conditional move. Return true if successful. */
20614 bool
20615 ix86_expand_fp_movcc (rtx operands[])
20617 enum machine_mode mode = GET_MODE (operands[0]);
20618 enum rtx_code code = GET_CODE (operands[1]);
20619 rtx tmp, compare_op;
20620 rtx op0 = XEXP (operands[1], 0);
20621 rtx op1 = XEXP (operands[1], 1);
20623 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20625 enum machine_mode cmode;
20627 /* Since we have no cmove for SSE registers, don't force bad register
20628 allocation just to gain access to it. Deny movcc when the
20629 comparison mode doesn't match the move mode. */
20630 cmode = GET_MODE (op0);
20631 if (cmode == VOIDmode)
20632 cmode = GET_MODE (op1);
20633 if (cmode != mode)
20634 return false;
20636 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20637 if (code == UNKNOWN)
20638 return false;
20640 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20641 operands[2], operands[3]))
20642 return true;
20644 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20645 operands[2], operands[3]);
20646 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20647 return true;
20650 if (GET_MODE (op0) == TImode
20651 || (GET_MODE (op0) == DImode
20652 && !TARGET_64BIT))
20653 return false;
20655 /* The floating point conditional move instructions don't directly
20656 support conditions resulting from a signed integer comparison. */
20658 compare_op = ix86_expand_compare (code, op0, op1);
20659 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20661 tmp = gen_reg_rtx (QImode);
20662 ix86_expand_setcc (tmp, code, op0, op1);
20664 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20667 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20668 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20669 operands[2], operands[3])));
20671 return true;
20674 /* Expand a floating-point vector conditional move; a vcond operation
20675 rather than a movcc operation. */
20677 bool
20678 ix86_expand_fp_vcond (rtx operands[])
20680 enum rtx_code code = GET_CODE (operands[3]);
20681 rtx cmp;
20683 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20684 &operands[4], &operands[5]);
20685 if (code == UNKNOWN)
20687 rtx temp;
20688 switch (GET_CODE (operands[3]))
20690 case LTGT:
20691 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20692 operands[5], operands[0], operands[0]);
20693 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20694 operands[5], operands[1], operands[2]);
20695 code = AND;
20696 break;
20697 case UNEQ:
20698 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20699 operands[5], operands[0], operands[0]);
20700 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20701 operands[5], operands[1], operands[2]);
20702 code = IOR;
20703 break;
20704 default:
20705 gcc_unreachable ();
20707 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20708 OPTAB_DIRECT);
20709 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20710 return true;
20713 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20714 operands[5], operands[1], operands[2]))
20715 return true;
20717 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20718 operands[1], operands[2]);
20719 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20720 return true;
20723 /* Expand a signed/unsigned integral vector conditional move. */
20725 bool
20726 ix86_expand_int_vcond (rtx operands[])
20728 enum machine_mode data_mode = GET_MODE (operands[0]);
20729 enum machine_mode mode = GET_MODE (operands[4]);
20730 enum rtx_code code = GET_CODE (operands[3]);
20731 bool negate = false;
20732 rtx x, cop0, cop1;
20734 cop0 = operands[4];
20735 cop1 = operands[5];
20737 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20738 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20739 if ((code == LT || code == GE)
20740 && data_mode == mode
20741 && cop1 == CONST0_RTX (mode)
20742 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20743 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20744 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20745 && (GET_MODE_SIZE (data_mode) == 16
20746 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20748 rtx negop = operands[2 - (code == LT)];
20749 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20750 if (negop == CONST1_RTX (data_mode))
20752 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20753 operands[0], 1, OPTAB_DIRECT);
20754 if (res != operands[0])
20755 emit_move_insn (operands[0], res);
20756 return true;
20758 else if (GET_MODE_INNER (data_mode) != DImode
20759 && vector_all_ones_operand (negop, data_mode))
20761 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20762 operands[0], 0, OPTAB_DIRECT);
20763 if (res != operands[0])
20764 emit_move_insn (operands[0], res);
20765 return true;
20769 if (!nonimmediate_operand (cop1, mode))
20770 cop1 = force_reg (mode, cop1);
20771 if (!general_operand (operands[1], data_mode))
20772 operands[1] = force_reg (data_mode, operands[1]);
20773 if (!general_operand (operands[2], data_mode))
20774 operands[2] = force_reg (data_mode, operands[2]);
20776 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20777 if (TARGET_XOP
20778 && (mode == V16QImode || mode == V8HImode
20779 || mode == V4SImode || mode == V2DImode))
20781 else
20783 /* Canonicalize the comparison to EQ, GT, GTU. */
20784 switch (code)
20786 case EQ:
20787 case GT:
20788 case GTU:
20789 break;
20791 case NE:
20792 case LE:
20793 case LEU:
20794 code = reverse_condition (code);
20795 negate = true;
20796 break;
20798 case GE:
20799 case GEU:
20800 code = reverse_condition (code);
20801 negate = true;
20802 /* FALLTHRU */
20804 case LT:
20805 case LTU:
20806 code = swap_condition (code);
20807 x = cop0, cop0 = cop1, cop1 = x;
20808 break;
20810 default:
20811 gcc_unreachable ();
20814 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20815 if (mode == V2DImode)
20817 switch (code)
20819 case EQ:
20820 /* SSE4.1 supports EQ. */
20821 if (!TARGET_SSE4_1)
20822 return false;
20823 break;
20825 case GT:
20826 case GTU:
20827 /* SSE4.2 supports GT/GTU. */
20828 if (!TARGET_SSE4_2)
20829 return false;
20830 break;
20832 default:
20833 gcc_unreachable ();
20837 /* Unsigned parallel compare is not supported by the hardware.
20838 Play some tricks to turn this into a signed comparison
20839 against 0. */
20840 if (code == GTU)
20842 cop0 = force_reg (mode, cop0);
20844 switch (mode)
20846 case V8SImode:
20847 case V4DImode:
20848 case V4SImode:
20849 case V2DImode:
20851 rtx t1, t2, mask;
20852 rtx (*gen_sub3) (rtx, rtx, rtx);
20854 switch (mode)
20856 case V8SImode: gen_sub3 = gen_subv8si3; break;
20857 case V4DImode: gen_sub3 = gen_subv4di3; break;
20858 case V4SImode: gen_sub3 = gen_subv4si3; break;
20859 case V2DImode: gen_sub3 = gen_subv2di3; break;
20860 default:
20861 gcc_unreachable ();
20863 /* Subtract (-(INT MAX) - 1) from both operands to make
20864 them signed. */
20865 mask = ix86_build_signbit_mask (mode, true, false);
20866 t1 = gen_reg_rtx (mode);
20867 emit_insn (gen_sub3 (t1, cop0, mask));
20869 t2 = gen_reg_rtx (mode);
20870 emit_insn (gen_sub3 (t2, cop1, mask));
20872 cop0 = t1;
20873 cop1 = t2;
20874 code = GT;
20876 break;
20878 case V32QImode:
20879 case V16HImode:
20880 case V16QImode:
20881 case V8HImode:
20882 /* Perform a parallel unsigned saturating subtraction. */
20883 x = gen_reg_rtx (mode);
20884 emit_insn (gen_rtx_SET (VOIDmode, x,
20885 gen_rtx_US_MINUS (mode, cop0, cop1)));
20887 cop0 = x;
20888 cop1 = CONST0_RTX (mode);
20889 code = EQ;
20890 negate = !negate;
20891 break;
20893 default:
20894 gcc_unreachable ();
20899 /* Allow the comparison to be done in one mode, but the movcc to
20900 happen in another mode. */
20901 if (data_mode == mode)
20903 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20904 operands[1+negate], operands[2-negate]);
20906 else
20908 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20909 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20910 code, cop0, cop1,
20911 operands[1+negate], operands[2-negate]);
20912 x = gen_lowpart (data_mode, x);
20915 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20916 operands[2-negate]);
20917 return true;
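/* Illustrative sketch (not part of GCC): scalar versions of the two
   unsigned-compare tricks used above.  Subtracting the sign-bit constant
   turns an unsigned compare into a signed one, and for narrow elements
   "a >u b" holds exactly when the unsigned saturating difference is
   nonzero.  The function names are hypothetical.  */
#if 0
#include <stdint.h>

static int
gtu_via_signed (uint32_t a, uint32_t b)
{
  /* Matches the subtract-signbit-mask + pcmpgt sequence above.  */
  return (int32_t) (a - 0x80000000u) > (int32_t) (b - 0x80000000u);
}

static int
gtu_via_saturating_sub (uint8_t a, uint8_t b)
{
  uint8_t diff = a > b ? (uint8_t) (a - b) : 0;   /* psubusb semantics */
  return diff != 0;                               /* EQ against zero, then negated */
}
#endif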
20920 /* Expand a variable vector permutation. */
20922 void
20923 ix86_expand_vec_perm (rtx operands[])
20925 rtx target = operands[0];
20926 rtx op0 = operands[1];
20927 rtx op1 = operands[2];
20928 rtx mask = operands[3];
20929 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20930 enum machine_mode mode = GET_MODE (op0);
20931 enum machine_mode maskmode = GET_MODE (mask);
20932 int w, e, i;
20933 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20935 /* Number of elements in the vector. */
20936 w = GET_MODE_NUNITS (mode);
20937 e = GET_MODE_UNIT_SIZE (mode);
20938 gcc_assert (w <= 32);
20940 if (TARGET_AVX2)
20942 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20944 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20945 a constant shuffle operand. With a tiny bit of effort we can
20946 use VPERMD instead. A re-interpretation stall for V4DFmode is
20947 unfortunate but there's no avoiding it.
20948 Similarly, for V16HImode we don't have instructions for variable
20949 shuffling, while for V32QImode we can, after preparing suitable
20950 masks, use vpshufb; vpshufb; vpermq; vpor. */
20952 if (mode == V16HImode)
20954 maskmode = mode = V32QImode;
20955 w = 32;
20956 e = 1;
20958 else
20960 maskmode = mode = V8SImode;
20961 w = 8;
20962 e = 4;
20964 t1 = gen_reg_rtx (maskmode);
20966 /* Replicate the low bits of the V4DImode mask into V8SImode:
20967 mask = { A B C D }
20968 t1 = { A A B B C C D D }. */
20969 for (i = 0; i < w / 2; ++i)
20970 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20971 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20972 vt = force_reg (maskmode, vt);
20973 mask = gen_lowpart (maskmode, mask);
20974 if (maskmode == V8SImode)
20975 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20976 else
20977 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20979 /* Multiply the shuffle indices by two. */
20980 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20981 OPTAB_DIRECT);
20983 /* Add one to the odd shuffle indices:
20984 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20985 for (i = 0; i < w / 2; ++i)
20987 vec[i * 2] = const0_rtx;
20988 vec[i * 2 + 1] = const1_rtx;
20990 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20991 vt = validize_mem (force_const_mem (maskmode, vt));
20992 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20993 OPTAB_DIRECT);
20995 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20996 operands[3] = mask = t1;
20997 target = gen_lowpart (mode, target);
20998 op0 = gen_lowpart (mode, op0);
20999 op1 = gen_lowpart (mode, op1);
21002 switch (mode)
21004 case V8SImode:
21005 /* The VPERMD and VPERMPS instructions already properly ignore
21006 the high bits of the shuffle elements. No need for us to
21007 perform an AND ourselves. */
21008 if (one_operand_shuffle)
21009 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21010 else
21012 t1 = gen_reg_rtx (V8SImode);
21013 t2 = gen_reg_rtx (V8SImode);
21014 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21015 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21016 goto merge_two;
21018 return;
21020 case V8SFmode:
21021 mask = gen_lowpart (V8SFmode, mask);
21022 if (one_operand_shuffle)
21023 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21024 else
21026 t1 = gen_reg_rtx (V8SFmode);
21027 t2 = gen_reg_rtx (V8SFmode);
21028 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21029 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21030 goto merge_two;
21032 return;
21034 case V4SImode:
21035 /* By combining the two 128-bit input vectors into one 256-bit
21036 input vector, we can use VPERMD and VPERMPS for the full
21037 two-operand shuffle. */
21038 t1 = gen_reg_rtx (V8SImode);
21039 t2 = gen_reg_rtx (V8SImode);
21040 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21041 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21042 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21043 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21044 return;
21046 case V4SFmode:
21047 t1 = gen_reg_rtx (V8SFmode);
21048 t2 = gen_reg_rtx (V8SImode);
21049 mask = gen_lowpart (V4SImode, mask);
21050 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21051 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21052 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21053 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21054 return;
21056 case V32QImode:
21057 t1 = gen_reg_rtx (V32QImode);
21058 t2 = gen_reg_rtx (V32QImode);
21059 t3 = gen_reg_rtx (V32QImode);
21060 vt2 = GEN_INT (128);
21061 for (i = 0; i < 32; i++)
21062 vec[i] = vt2;
21063 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21064 vt = force_reg (V32QImode, vt);
21065 for (i = 0; i < 32; i++)
21066 vec[i] = i < 16 ? vt2 : const0_rtx;
21067 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21068 vt2 = force_reg (V32QImode, vt2);
21069 /* From mask create two adjusted masks, which contain the same
21070 bits as mask in the low 7 bits of each vector element.
21071 The first mask will have the most significant bit clear
21072 if it requests an element from the same 128-bit lane
21073 and MSB set if it requests an element from the other 128-bit lane.
21074 The second mask will have the opposite values of the MSB,
21075 and additionally will have its 128-bit lanes swapped.
21076 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21077 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21078 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21079 stands for the other 12 bytes. */
21080 /* The bit that tells whether an element comes from the same lane or
21081 from the other lane is bit 4, so shift it up by 3 to the MSB position. */
21082 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
21083 gen_lowpart (V4DImode, mask),
21084 GEN_INT (3)));
21085 /* Clear MSB bits from the mask just in case it had them set. */
21086 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21087 /* After this t1 will have MSB set for elements from other lane. */
21088 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
21089 /* Clear bits other than MSB. */
21090 emit_insn (gen_andv32qi3 (t1, t1, vt));
21091 /* Or in the lower bits from mask into t3. */
21092 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21093 /* And invert MSB bits in t1, so MSB is set for elements from the same
21094 lane. */
21095 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21096 /* Swap 128-bit lanes in t3. */
21097 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
21098 gen_lowpart (V4DImode, t3),
21099 const2_rtx, GEN_INT (3),
21100 const0_rtx, const1_rtx));
21101 /* And or in the lower bits from mask into t1. */
21102 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21103 if (one_operand_shuffle)
21105 /* Each of these shuffles will put 0s in places where an
21106 element from the other 128-bit lane is needed; otherwise it
21107 will shuffle in the requested value. */
21108 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
21109 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21110 /* For t3 the 128-bit lanes are swapped again. */
21111 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
21112 gen_lowpart (V4DImode, t3),
21113 const2_rtx, GEN_INT (3),
21114 const0_rtx, const1_rtx));
21115 /* And oring both together leads to the result. */
21116 emit_insn (gen_iorv32qi3 (target, t1, t3));
21117 return;
21120 t4 = gen_reg_rtx (V32QImode);
21121 /* Similarly to the one_operand_shuffle code above, just
21122 repeated twice, once for each operand. The merge_two:
21123 code will merge the two results together. */
21124 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
21125 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
21126 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21127 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21128 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
21129 gen_lowpart (V4DImode, t4),
21130 const2_rtx, GEN_INT (3),
21131 const0_rtx, const1_rtx));
21132 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
21133 gen_lowpart (V4DImode, t3),
21134 const2_rtx, GEN_INT (3),
21135 const0_rtx, const1_rtx));
21136 emit_insn (gen_iorv32qi3 (t4, t2, t4));
21137 emit_insn (gen_iorv32qi3 (t3, t1, t3));
21138 t1 = t4;
21139 t2 = t3;
21140 goto merge_two;
21142 default:
21143 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21144 break;
21148 if (TARGET_XOP)
21150 /* The XOP VPPERM insn supports three inputs. By ignoring the
21151 one_operand_shuffle special case, we avoid creating another
21152 set of constant vectors in memory. */
21153 one_operand_shuffle = false;
21155 /* mask = mask & {2*w-1, ...} */
21156 vt = GEN_INT (2*w - 1);
21158 else
21160 /* mask = mask & {w-1, ...} */
21161 vt = GEN_INT (w - 1);
21164 for (i = 0; i < w; i++)
21165 vec[i] = vt;
21166 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21167 mask = expand_simple_binop (maskmode, AND, mask, vt,
21168 NULL_RTX, 0, OPTAB_DIRECT);
21170 /* For non-QImode operations, convert the word permutation control
21171 into a byte permutation control. */
21172 if (mode != V16QImode)
21174 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21175 GEN_INT (exact_log2 (e)),
21176 NULL_RTX, 0, OPTAB_DIRECT);
21178 /* Convert mask to vector of chars. */
21179 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21181 /* Replicate each of the input bytes into byte positions:
21182 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21183 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21184 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21185 for (i = 0; i < 16; ++i)
21186 vec[i] = GEN_INT (i/e * e);
21187 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21188 vt = validize_mem (force_const_mem (V16QImode, vt));
21189 if (TARGET_XOP)
21190 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21191 else
21192 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21194 /* Convert it into the byte positions by doing
21195 mask = mask + {0,1,..,16/w-1, 0,1,..,16/w-1, ...} */
21196 for (i = 0; i < 16; ++i)
21197 vec[i] = GEN_INT (i % e);
21198 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21199 vt = validize_mem (force_const_mem (V16QImode, vt));
21200 emit_insn (gen_addv16qi3 (mask, mask, vt));
21203 /* The actual shuffle operations all operate on V16QImode. */
21204 op0 = gen_lowpart (V16QImode, op0);
21205 op1 = gen_lowpart (V16QImode, op1);
21206 target = gen_lowpart (V16QImode, target);
21208 if (TARGET_XOP)
21210 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21212 else if (one_operand_shuffle)
21214 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21216 else
21218 rtx xops[6];
21219 bool ok;
21221 /* Shuffle the two input vectors independently. */
21222 t1 = gen_reg_rtx (V16QImode);
21223 t2 = gen_reg_rtx (V16QImode);
21224 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21225 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21227 merge_two:
21228 /* Then merge them together. The key is whether any given control
21229 element contained a bit set that indicates the second word. */
21230 mask = operands[3];
21231 vt = GEN_INT (w);
21232 if (maskmode == V2DImode && !TARGET_SSE4_1)
21234 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21235 more shuffle to convert the V2DI input mask into a V4SI
21236 input mask, at which point the masking that expand_int_vcond
21237 does will work as desired. */
21238 rtx t3 = gen_reg_rtx (V4SImode);
21239 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21240 const0_rtx, const0_rtx,
21241 const2_rtx, const2_rtx));
21242 mask = t3;
21243 maskmode = V4SImode;
21244 e = w = 4;
21247 for (i = 0; i < w; i++)
21248 vec[i] = vt;
21249 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21250 vt = force_reg (maskmode, vt);
21251 mask = expand_simple_binop (maskmode, AND, mask, vt,
21252 NULL_RTX, 0, OPTAB_DIRECT);
21254 xops[0] = gen_lowpart (mode, operands[0]);
21255 xops[1] = gen_lowpart (mode, t2);
21256 xops[2] = gen_lowpart (mode, t1);
21257 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21258 xops[4] = mask;
21259 xops[5] = vt;
21260 ok = ix86_expand_int_vcond (xops);
21261 gcc_assert (ok);
21265 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
21266 true if we should do zero extension, else sign extension. HIGH_P is
21267 true if we want the N/2 high elements, else the low elements. */
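/* For example (illustrative), a signed high unpack of a V8HImode source
   with SSE4.1 first shifts the upper 8 bytes down (or, for 256-bit modes,
   extracts the high half) and then widens with pmovsxwd to V4SImode;
   without SSE4.1 the source is interleaved with zero (unsigned) or with
   its own sign mask (signed) via punpckhwd.  */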
21269 void
21270 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21272 enum machine_mode imode = GET_MODE (src);
21273 rtx tmp;
21275 if (TARGET_SSE4_1)
21277 rtx (*unpack)(rtx, rtx);
21278 rtx (*extract)(rtx, rtx) = NULL;
21279 enum machine_mode halfmode = BLKmode;
21281 switch (imode)
21283 case V32QImode:
21284 if (unsigned_p)
21285 unpack = gen_avx2_zero_extendv16qiv16hi2;
21286 else
21287 unpack = gen_avx2_sign_extendv16qiv16hi2;
21288 halfmode = V16QImode;
21289 extract
21290 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21291 break;
21292 case V16HImode:
21293 if (unsigned_p)
21294 unpack = gen_avx2_zero_extendv8hiv8si2;
21295 else
21296 unpack = gen_avx2_sign_extendv8hiv8si2;
21297 halfmode = V8HImode;
21298 extract
21299 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21300 break;
21301 case V8SImode:
21302 if (unsigned_p)
21303 unpack = gen_avx2_zero_extendv4siv4di2;
21304 else
21305 unpack = gen_avx2_sign_extendv4siv4di2;
21306 halfmode = V4SImode;
21307 extract
21308 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21309 break;
21310 case V16QImode:
21311 if (unsigned_p)
21312 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21313 else
21314 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21315 break;
21316 case V8HImode:
21317 if (unsigned_p)
21318 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21319 else
21320 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21321 break;
21322 case V4SImode:
21323 if (unsigned_p)
21324 unpack = gen_sse4_1_zero_extendv2siv2di2;
21325 else
21326 unpack = gen_sse4_1_sign_extendv2siv2di2;
21327 break;
21328 default:
21329 gcc_unreachable ();
21332 if (GET_MODE_SIZE (imode) == 32)
21334 tmp = gen_reg_rtx (halfmode);
21335 emit_insn (extract (tmp, src));
21337 else if (high_p)
21339 /* Shift higher 8 bytes to lower 8 bytes. */
21340 tmp = gen_reg_rtx (imode);
21341 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
21342 gen_lowpart (V1TImode, src),
21343 GEN_INT (64)));
21345 else
21346 tmp = src;
21348 emit_insn (unpack (dest, tmp));
21350 else
21352 rtx (*unpack)(rtx, rtx, rtx);
21354 switch (imode)
21356 case V16QImode:
21357 if (high_p)
21358 unpack = gen_vec_interleave_highv16qi;
21359 else
21360 unpack = gen_vec_interleave_lowv16qi;
21361 break;
21362 case V8HImode:
21363 if (high_p)
21364 unpack = gen_vec_interleave_highv8hi;
21365 else
21366 unpack = gen_vec_interleave_lowv8hi;
21367 break;
21368 case V4SImode:
21369 if (high_p)
21370 unpack = gen_vec_interleave_highv4si;
21371 else
21372 unpack = gen_vec_interleave_lowv4si;
21373 break;
21374 default:
21375 gcc_unreachable ();
21378 if (unsigned_p)
21379 tmp = force_reg (imode, CONST0_RTX (imode));
21380 else
21381 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21382 src, pc_rtx, pc_rtx);
21384 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
22388 /* Expand conditional increment or decrement using adc/sbb instructions.
22389 The default case using setcc followed by the conditional move can be
22390 done by generic code. */
21391 bool
21392 ix86_expand_int_addcc (rtx operands[])
21394 enum rtx_code code = GET_CODE (operands[1]);
21395 rtx flags;
21396 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21397 rtx compare_op;
21398 rtx val = const0_rtx;
21399 bool fpcmp = false;
21400 enum machine_mode mode;
21401 rtx op0 = XEXP (operands[1], 0);
21402 rtx op1 = XEXP (operands[1], 1);
21404 if (operands[3] != const1_rtx
21405 && operands[3] != constm1_rtx)
21406 return false;
21407 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21408 return false;
21409 code = GET_CODE (compare_op);
21411 flags = XEXP (compare_op, 0);
21413 if (GET_MODE (flags) == CCFPmode
21414 || GET_MODE (flags) == CCFPUmode)
21416 fpcmp = true;
21417 code = ix86_fp_compare_code_to_integer (code);
21420 if (code != LTU)
21422 val = constm1_rtx;
21423 if (fpcmp)
21424 PUT_CODE (compare_op,
21425 reverse_condition_maybe_unordered
21426 (GET_CODE (compare_op)));
21427 else
21428 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21431 mode = GET_MODE (operands[0]);
21433 /* Construct either adc or sbb insn. */
21434 if ((code == LTU) == (operands[3] == constm1_rtx))
21436 switch (mode)
21438 case QImode:
21439 insn = gen_subqi3_carry;
21440 break;
21441 case HImode:
21442 insn = gen_subhi3_carry;
21443 break;
21444 case SImode:
21445 insn = gen_subsi3_carry;
21446 break;
21447 case DImode:
21448 insn = gen_subdi3_carry;
21449 break;
21450 default:
21451 gcc_unreachable ();
21454 else
21456 switch (mode)
21458 case QImode:
21459 insn = gen_addqi3_carry;
21460 break;
21461 case HImode:
21462 insn = gen_addhi3_carry;
21463 break;
21464 case SImode:
21465 insn = gen_addsi3_carry;
21466 break;
21467 case DImode:
21468 insn = gen_adddi3_carry;
21469 break;
21470 default:
21471 gcc_unreachable ();
21474 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21476 return true;
21480 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21481 but works for floating point parameters and non-offsettable memories.
21482 For pushes, it returns just stack offsets; the values will be saved
21483 in the right order. At most four parts are generated. */
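/* For example (illustrative): on a 32-bit target an XFmode value is split
   into three SImode parts and a TFmode value into four, while on a 64-bit
   target XFmode becomes a DImode part plus an SImode upper part and
   TFmode becomes two DImode parts.  */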
21485 static int
21486 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21488 int size;
21490 if (!TARGET_64BIT)
21491 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21492 else
21493 size = (GET_MODE_SIZE (mode) + 4) / 8;
21495 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21496 gcc_assert (size >= 2 && size <= 4);
21498 /* Optimize constant pool reference to immediates. This is used by fp
21499 moves, which force all constants to memory to allow combining. */
21500 if (MEM_P (operand) && MEM_READONLY_P (operand))
21502 rtx tmp = maybe_get_pool_constant (operand);
21503 if (tmp)
21504 operand = tmp;
21507 if (MEM_P (operand) && !offsettable_memref_p (operand))
21510 /* The only non-offsettable memories we handle are pushes. */
21510 int ok = push_operand (operand, VOIDmode);
21512 gcc_assert (ok);
21514 operand = copy_rtx (operand);
21515 PUT_MODE (operand, word_mode);
21516 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21517 return size;
21520 if (GET_CODE (operand) == CONST_VECTOR)
21522 enum machine_mode imode = int_mode_for_mode (mode);
21523 /* Caution: if we looked through a constant pool memory above,
21524 the operand may actually have a different mode now. That's
21525 ok, since we want to pun this all the way back to an integer. */
21526 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21527 gcc_assert (operand != NULL);
21528 mode = imode;
21531 if (!TARGET_64BIT)
21533 if (mode == DImode)
21534 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21535 else
21537 int i;
21539 if (REG_P (operand))
21541 gcc_assert (reload_completed);
21542 for (i = 0; i < size; i++)
21543 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21545 else if (offsettable_memref_p (operand))
21547 operand = adjust_address (operand, SImode, 0);
21548 parts[0] = operand;
21549 for (i = 1; i < size; i++)
21550 parts[i] = adjust_address (operand, SImode, 4 * i);
21552 else if (GET_CODE (operand) == CONST_DOUBLE)
21554 REAL_VALUE_TYPE r;
21555 long l[4];
21557 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21558 switch (mode)
21560 case TFmode:
21561 real_to_target (l, &r, mode);
21562 parts[3] = gen_int_mode (l[3], SImode);
21563 parts[2] = gen_int_mode (l[2], SImode);
21564 break;
21565 case XFmode:
21566 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21567 long double may not be 80-bit. */
21568 real_to_target (l, &r, mode);
21569 parts[2] = gen_int_mode (l[2], SImode);
21570 break;
21571 case DFmode:
21572 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21573 break;
21574 default:
21575 gcc_unreachable ();
21577 parts[1] = gen_int_mode (l[1], SImode);
21578 parts[0] = gen_int_mode (l[0], SImode);
21580 else
21581 gcc_unreachable ();
21584 else
21586 if (mode == TImode)
21587 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21588 if (mode == XFmode || mode == TFmode)
21590 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21591 if (REG_P (operand))
21593 gcc_assert (reload_completed);
21594 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21595 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21597 else if (offsettable_memref_p (operand))
21599 operand = adjust_address (operand, DImode, 0);
21600 parts[0] = operand;
21601 parts[1] = adjust_address (operand, upper_mode, 8);
21603 else if (GET_CODE (operand) == CONST_DOUBLE)
21605 REAL_VALUE_TYPE r;
21606 long l[4];
21608 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21609 real_to_target (l, &r, mode);
21611 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21612 if (HOST_BITS_PER_WIDE_INT >= 64)
21613 parts[0]
21614 = gen_int_mode
21615 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21616 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21617 DImode);
21618 else
21619 parts[0] = immed_double_const (l[0], l[1], DImode);
21621 if (upper_mode == SImode)
21622 parts[1] = gen_int_mode (l[2], SImode);
21623 else if (HOST_BITS_PER_WIDE_INT >= 64)
21624 parts[1]
21625 = gen_int_mode
21626 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21627 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21628 DImode);
21629 else
21630 parts[1] = immed_double_const (l[2], l[3], DImode);
21632 else
21633 gcc_unreachable ();
21637 return size;
21640 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21641 All required insns are emitted directly. Operands 2 onward receive the
21642 destination parts in the correct order; operands 6 onward receive the
21643 corresponding source parts. */
21645 void
21646 ix86_split_long_move (rtx operands[])
21648 rtx part[2][4];
21649 int nparts, i, j;
21650 int push = 0;
21651 int collisions = 0;
21652 enum machine_mode mode = GET_MODE (operands[0]);
21653 bool collisionparts[4];
21655 /* The DFmode expanders may ask us to move a double.
21656 For a 64-bit target this is a single move. By hiding the fact
21657 here we simplify i386.md splitters. */
21658 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21660 /* Optimize constant pool reference to immediates. This is used by
21661 fp moves, which force all constants to memory to allow combining. */
21663 if (MEM_P (operands[1])
21664 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21665 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21666 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21667 if (push_operand (operands[0], VOIDmode))
21669 operands[0] = copy_rtx (operands[0]);
21670 PUT_MODE (operands[0], word_mode);
21672 else
21673 operands[0] = gen_lowpart (DImode, operands[0]);
21674 operands[1] = gen_lowpart (DImode, operands[1]);
21675 emit_move_insn (operands[0], operands[1]);
21676 return;
21679 /* The only non-offsettable memory we handle is push. */
21680 if (push_operand (operands[0], VOIDmode))
21681 push = 1;
21682 else
21683 gcc_assert (!MEM_P (operands[0])
21684 || offsettable_memref_p (operands[0]));
21686 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21687 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21689 /* When emitting a push, take care of source operands on the stack. */
21690 if (push && MEM_P (operands[1])
21691 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21693 rtx src_base = XEXP (part[1][nparts - 1], 0);
21695 /* Compensate for the stack decrement by 4. */
21696 if (!TARGET_64BIT && nparts == 3
21697 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21698 src_base = plus_constant (Pmode, src_base, 4);
21700 /* src_base refers to the stack pointer and is
21701 automatically decreased by emitted push. */
21702 for (i = 0; i < nparts; i++)
21703 part[1][i] = change_address (part[1][i],
21704 GET_MODE (part[1][i]), src_base);
21707 /* We need to do copy in the right order in case an address register
21708 of the source overlaps the destination. */
21709 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21711 rtx tmp;
21713 for (i = 0; i < nparts; i++)
21715 collisionparts[i]
21716 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21717 if (collisionparts[i])
21718 collisions++;
21721 /* Collision in the middle part can be handled by reordering. */
21722 if (collisions == 1 && nparts == 3 && collisionparts [1])
21724 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21725 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21727 else if (collisions == 1
21728 && nparts == 4
21729 && (collisionparts [1] || collisionparts [2]))
21731 if (collisionparts [1])
21733 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21734 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21736 else
21738 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21739 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21743 /* If there are more collisions, we can't handle it by reordering.
21744 Do an lea to the last part and use only one colliding move. */
21745 else if (collisions > 1)
21747 rtx base;
21749 collisions = 1;
21751 base = part[0][nparts - 1];
21753 /* Handle the case when the last part isn't valid for lea.
21754 Happens in 64-bit mode storing the 12-byte XFmode. */
21755 if (GET_MODE (base) != Pmode)
21756 base = gen_rtx_REG (Pmode, REGNO (base));
21758 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21759 part[1][0] = replace_equiv_address (part[1][0], base);
21760 for (i = 1; i < nparts; i++)
21762 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21763 part[1][i] = replace_equiv_address (part[1][i], tmp);
21768 if (push)
21770 if (!TARGET_64BIT)
21772 if (nparts == 3)
21774 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21775 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21776 stack_pointer_rtx, GEN_INT (-4)));
21777 emit_move_insn (part[0][2], part[1][2]);
21779 else if (nparts == 4)
21781 emit_move_insn (part[0][3], part[1][3]);
21782 emit_move_insn (part[0][2], part[1][2]);
21785 else
21787 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21788 a register, it is OK - we will just use the larger counterpart. We also
21789 retype memory - these come from an attempt to avoid a REX prefix on
21790 moving the second half of a TFmode value. */
21791 if (GET_MODE (part[1][1]) == SImode)
21793 switch (GET_CODE (part[1][1]))
21795 case MEM:
21796 part[1][1] = adjust_address (part[1][1], DImode, 0);
21797 break;
21799 case REG:
21800 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21801 break;
21803 default:
21804 gcc_unreachable ();
21807 if (GET_MODE (part[1][0]) == SImode)
21808 part[1][0] = part[1][1];
21811 emit_move_insn (part[0][1], part[1][1]);
21812 emit_move_insn (part[0][0], part[1][0]);
21813 return;
21816 /* Choose correct order to not overwrite the source before it is copied. */
21817 if ((REG_P (part[0][0])
21818 && REG_P (part[1][1])
21819 && (REGNO (part[0][0]) == REGNO (part[1][1])
21820 || (nparts == 3
21821 && REGNO (part[0][0]) == REGNO (part[1][2]))
21822 || (nparts == 4
21823 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21824 || (collisions > 0
21825 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21827 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21829 operands[2 + i] = part[0][j];
21830 operands[6 + i] = part[1][j];
21833 else
21835 for (i = 0; i < nparts; i++)
21837 operands[2 + i] = part[0][i];
21838 operands[6 + i] = part[1][i];
21842 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21843 if (optimize_insn_for_size_p ())
21845 for (j = 0; j < nparts - 1; j++)
21846 if (CONST_INT_P (operands[6 + j])
21847 && operands[6 + j] != const0_rtx
21848 && REG_P (operands[2 + j]))
21849 for (i = j; i < nparts - 1; i++)
21850 if (CONST_INT_P (operands[7 + i])
21851 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21852 operands[7 + i] = operands[2 + j];
21855 for (i = 0; i < nparts; i++)
21856 emit_move_insn (operands[2 + i], operands[6 + i]);
21858 return;
21861 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21862 left shift by a constant, either using a single shift or
21863 a sequence of add instructions. */
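/* For example (illustrative), a left shift by 1 is emitted as a single
   add of the operand to itself; a shift by 2 may become two such adds
   when the add/shift cost ratio favors it, and larger counts normally
   fall back to a single shl.  */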
21865 static void
21866 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21868 rtx (*insn)(rtx, rtx, rtx);
21870 if (count == 1
21871 || (count * ix86_cost->add <= ix86_cost->shift_const
21872 && !optimize_insn_for_size_p ()))
21874 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21875 while (count-- > 0)
21876 emit_insn (insn (operand, operand, operand));
21878 else
21880 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21881 emit_insn (insn (operand, operand, GEN_INT (count)));
21885 void
21886 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21888 rtx (*gen_ashl3)(rtx, rtx, rtx);
21889 rtx (*gen_shld)(rtx, rtx, rtx);
21890 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21892 rtx low[2], high[2];
21893 int count;
21895 if (CONST_INT_P (operands[2]))
21897 split_double_mode (mode, operands, 2, low, high);
21898 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21900 if (count >= half_width)
21902 emit_move_insn (high[0], low[1]);
21903 emit_move_insn (low[0], const0_rtx);
21905 if (count > half_width)
21906 ix86_expand_ashl_const (high[0], count - half_width, mode);
21908 else
21910 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21912 if (!rtx_equal_p (operands[0], operands[1]))
21913 emit_move_insn (operands[0], operands[1]);
21915 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21916 ix86_expand_ashl_const (low[0], count, mode);
21918 return;
21921 split_double_mode (mode, operands, 1, low, high);
21923 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21925 if (operands[1] == const1_rtx)
21927 /* Assuming we've chosen QImode-capable registers, 1 << N
21928 can be done with two 32/64-bit shifts, no branches, no cmoves. */
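/* Roughly (illustrative), for a 64-bit shift on a 32-bit target this
   expands to:
     xor low, low; xor high, high; test $32, %cl;
     sete low_byte; setne high_byte; shl %cl, low; shl %cl, high.  */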
21929 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21931 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21933 ix86_expand_clear (low[0]);
21934 ix86_expand_clear (high[0]);
21935 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21937 d = gen_lowpart (QImode, low[0]);
21938 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21939 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21940 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21942 d = gen_lowpart (QImode, high[0]);
21943 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21944 s = gen_rtx_NE (QImode, flags, const0_rtx);
21945 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21948 /* Otherwise, we can get the same results by manually performing
21949 a bit extract operation on bit 5/6, and then performing the two
21950 shifts. The two methods of getting 0/1 into low/high are exactly
21951 the same size. Avoiding the shift in the bit extract case helps
21952 pentium4 a bit; no one else seems to care much either way. */
21953 else
21955 enum machine_mode half_mode;
21956 rtx (*gen_lshr3)(rtx, rtx, rtx);
21957 rtx (*gen_and3)(rtx, rtx, rtx);
21958 rtx (*gen_xor3)(rtx, rtx, rtx);
21959 HOST_WIDE_INT bits;
21960 rtx x;
21962 if (mode == DImode)
21964 half_mode = SImode;
21965 gen_lshr3 = gen_lshrsi3;
21966 gen_and3 = gen_andsi3;
21967 gen_xor3 = gen_xorsi3;
21968 bits = 5;
21970 else
21972 half_mode = DImode;
21973 gen_lshr3 = gen_lshrdi3;
21974 gen_and3 = gen_anddi3;
21975 gen_xor3 = gen_xordi3;
21976 bits = 6;
21979 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21980 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21981 else
21982 x = gen_lowpart (half_mode, operands[2]);
21983 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21985 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21986 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21987 emit_move_insn (low[0], high[0]);
21988 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21991 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21992 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21993 return;
21996 if (operands[1] == constm1_rtx)
21998 /* For -1 << N, we can avoid the shld instruction, because we
21999 know that we're shifting 0...31/63 ones into a -1. */
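/* (Explanatory note.)  A shld here would only shift ones from the all-ones
   low word into the already all-ones high word, leaving it -1, so it can
   be skipped; the count >= half_width case is still handled by the shift
   adjustment code below.  */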
22000 emit_move_insn (low[0], constm1_rtx);
22001 if (optimize_insn_for_size_p ())
22002 emit_move_insn (high[0], low[0]);
22003 else
22004 emit_move_insn (high[0], constm1_rtx);
22006 else
22008 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22010 if (!rtx_equal_p (operands[0], operands[1]))
22011 emit_move_insn (operands[0], operands[1]);
22013 split_double_mode (mode, operands, 1, low, high);
22014 emit_insn (gen_shld (high[0], low[0], operands[2]));
22017 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22019 if (TARGET_CMOVE && scratch)
22021 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22022 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22024 ix86_expand_clear (scratch);
22025 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22027 else
22029 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22030 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22032 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22036 void
22037 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22039 rtx (*gen_ashr3)(rtx, rtx, rtx)
22040 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22041 rtx (*gen_shrd)(rtx, rtx, rtx);
22042 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22044 rtx low[2], high[2];
22045 int count;
22047 if (CONST_INT_P (operands[2]))
22049 split_double_mode (mode, operands, 2, low, high);
22050 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22052 if (count == GET_MODE_BITSIZE (mode) - 1)
22054 emit_move_insn (high[0], high[1]);
22055 emit_insn (gen_ashr3 (high[0], high[0],
22056 GEN_INT (half_width - 1)));
22057 emit_move_insn (low[0], high[0]);
22060 else if (count >= half_width)
22062 emit_move_insn (low[0], high[1]);
22063 emit_move_insn (high[0], low[0]);
22064 emit_insn (gen_ashr3 (high[0], high[0],
22065 GEN_INT (half_width - 1)));
22067 if (count > half_width)
22068 emit_insn (gen_ashr3 (low[0], low[0],
22069 GEN_INT (count - half_width)));
22071 else
22073 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22075 if (!rtx_equal_p (operands[0], operands[1]))
22076 emit_move_insn (operands[0], operands[1]);
22078 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22079 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22082 else
22084 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22086 if (!rtx_equal_p (operands[0], operands[1]))
22087 emit_move_insn (operands[0], operands[1]);
22089 split_double_mode (mode, operands, 1, low, high);
22091 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22092 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22094 if (TARGET_CMOVE && scratch)
22096 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22097 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22099 emit_move_insn (scratch, high[0]);
22100 emit_insn (gen_ashr3 (scratch, scratch,
22101 GEN_INT (half_width - 1)));
22102 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22103 scratch));
22105 else
22107 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22108 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22110 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22115 void
22116 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22118 rtx (*gen_lshr3)(rtx, rtx, rtx)
22119 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22120 rtx (*gen_shrd)(rtx, rtx, rtx);
22121 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22123 rtx low[2], high[2];
22124 int count;
22126 if (CONST_INT_P (operands[2]))
22128 split_double_mode (mode, operands, 2, low, high);
22129 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22131 if (count >= half_width)
22133 emit_move_insn (low[0], high[1]);
22134 ix86_expand_clear (high[0]);
22136 if (count > half_width)
22137 emit_insn (gen_lshr3 (low[0], low[0],
22138 GEN_INT (count - half_width)));
22140 else
22142 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22144 if (!rtx_equal_p (operands[0], operands[1]))
22145 emit_move_insn (operands[0], operands[1]);
22147 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22148 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22151 else
22153 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22155 if (!rtx_equal_p (operands[0], operands[1]))
22156 emit_move_insn (operands[0], operands[1]);
22158 split_double_mode (mode, operands, 1, low, high);
22160 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22161 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22163 if (TARGET_CMOVE && scratch)
22165 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22166 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22168 ix86_expand_clear (scratch);
22169 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22170 scratch));
22172 else
22174 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22175 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22177 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22182 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
22183 static void
22184 predict_jump (int prob)
22186 rtx insn = get_last_insn ();
22187 gcc_assert (JUMP_P (insn));
22188 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
22191 /* Helper function for the string operations below. Test VARIABLE whether
22192 it is aligned to VALUE bytes. If so, jump to the label. */
22193 static rtx
22194 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22196 rtx label = gen_label_rtx ();
22197 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22198 if (GET_MODE (variable) == DImode)
22199 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22200 else
22201 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22202 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22203 1, label);
22204 if (epilogue)
22205 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22206 else
22207 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22208 return label;
22211 /* Decrease COUNTREG by VALUE. */
22212 static void
22213 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22215 rtx (*gen_add)(rtx, rtx, rtx)
22216 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22218 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22221 /* Zero extend possibly SImode EXP to Pmode register. */
22223 ix86_zero_extend_to_Pmode (rtx exp)
22225 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22228 /* Divide COUNTREG by SCALE. */
22229 static rtx
22230 scale_counter (rtx countreg, int scale)
22232 rtx sc;
22234 if (scale == 1)
22235 return countreg;
22236 if (CONST_INT_P (countreg))
22237 return GEN_INT (INTVAL (countreg) / scale);
22238 gcc_assert (REG_P (countreg));
22240 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22241 GEN_INT (exact_log2 (scale)),
22242 NULL, 1, OPTAB_DIRECT);
22243 return sc;
22246 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22247 DImode for constant loop counts. */
22249 static enum machine_mode
22250 counter_mode (rtx count_exp)
22252 if (GET_MODE (count_exp) != VOIDmode)
22253 return GET_MODE (count_exp);
22254 if (!CONST_INT_P (count_exp))
22255 return Pmode;
22256 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22257 return DImode;
22258 return SImode;
22261 /* When SRCPTR is non-NULL, output a simple loop to move memory
22262 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
22263 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
22264 the equivalent loop to set memory to VALUE (assumed to be in MODE).
22266 The size is rounded down to a whole number of chunks moved at once.
22267 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
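/* Illustrative shape of the emitted loop for a DImode copy unrolled twice
   (byte count in COUNT):
     size = count & ~15;  iter = 0;
   top:
     dst[iter] = src[iter];  dst[iter+8] = src[iter+8];
     iter += 16;
     if (iter < size) goto top;
     destptr += iter;  srcptr += iter;  */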
22270 static void
22271 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22272 rtx destptr, rtx srcptr, rtx value,
22273 rtx count, enum machine_mode mode, int unroll,
22274 int expected_size)
22276 rtx out_label, top_label, iter, tmp;
22277 enum machine_mode iter_mode = counter_mode (count);
22278 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22279 rtx piece_size = GEN_INT (piece_size_n);
22280 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22281 rtx size;
22282 int i;
22284 top_label = gen_label_rtx ();
22285 out_label = gen_label_rtx ();
22286 iter = gen_reg_rtx (iter_mode);
22288 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22289 NULL, 1, OPTAB_DIRECT);
22290 /* Those two should combine. */
22291 if (piece_size == const1_rtx)
22293 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22294 true, out_label);
22295 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22297 emit_move_insn (iter, const0_rtx);
22299 emit_label (top_label);
22301 tmp = convert_modes (Pmode, iter_mode, iter, true);
22303 /* This assert could be relaxed - in that case we'd need to compute the
22304 smallest power of two containing PIECE_SIZE_N and pass it to
22305 offset_address. */
22306 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22307 destmem = offset_address (destmem, tmp, piece_size_n);
22308 destmem = adjust_address (destmem, mode, 0);
22310 if (srcmem)
22312 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22313 srcmem = adjust_address (srcmem, mode, 0);
22315 /* When unrolling for chips that reorder memory reads and writes,
22316 we can save registers by using a single temporary.
22317 Also, using 4 temporaries is overkill in 32-bit mode. */
22318 if (!TARGET_64BIT && 0)
22320 for (i = 0; i < unroll; i++)
22322 if (i)
22324 destmem =
22325 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22326 srcmem =
22327 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22329 emit_move_insn (destmem, srcmem);
22332 else
22334 rtx tmpreg[4];
22335 gcc_assert (unroll <= 4);
22336 for (i = 0; i < unroll; i++)
22338 tmpreg[i] = gen_reg_rtx (mode);
22339 if (i)
22341 srcmem =
22342 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22344 emit_move_insn (tmpreg[i], srcmem);
22346 for (i = 0; i < unroll; i++)
22348 if (i)
22350 destmem =
22351 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22353 emit_move_insn (destmem, tmpreg[i]);
22357 else
22358 for (i = 0; i < unroll; i++)
22360 if (i)
22361 destmem =
22362 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22363 emit_move_insn (destmem, value);
22366 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22367 true, OPTAB_LIB_WIDEN);
22368 if (tmp != iter)
22369 emit_move_insn (iter, tmp);
22371 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22372 true, top_label);
22373 if (expected_size != -1)
22375 expected_size /= GET_MODE_SIZE (mode) * unroll;
22376 if (expected_size == 0)
22377 predict_jump (0);
22378 else if (expected_size > REG_BR_PROB_BASE)
22379 predict_jump (REG_BR_PROB_BASE - 1);
22380 else
22381 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22383 else
22384 predict_jump (REG_BR_PROB_BASE * 80 / 100);
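/* (Explanatory note.)  In the known-expected-size branch above, the computed
   probability is roughly REG_BR_PROB_BASE * (1 - 1/expected_size), i.e. the
   backward branch is predicted taken on all but one of the expected
   iterations; with no estimate, a flat 80% is used.  */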
22385 iter = ix86_zero_extend_to_Pmode (iter);
22386 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22387 true, OPTAB_LIB_WIDEN);
22388 if (tmp != destptr)
22389 emit_move_insn (destptr, tmp);
22390 if (srcptr)
22392 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22393 true, OPTAB_LIB_WIDEN);
22394 if (tmp != srcptr)
22395 emit_move_insn (srcptr, tmp);
22397 emit_label (out_label);
22400 /* Output a "rep; mov" instruction.
22401 Arguments have the same meaning as for the previous function. */
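/* For example (illustrative), copying a known 40-byte block with SImode
   chunks loads 10 into the count register and emits a single rep movsl
   (movsd), with DESTEXP/SRCEXP describing the final pointer values for
   the RTL.  */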
22402 static void
22403 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
22404 rtx destptr, rtx srcptr,
22405 rtx count,
22406 enum machine_mode mode)
22408 rtx destexp;
22409 rtx srcexp;
22410 rtx countreg;
22411 HOST_WIDE_INT rounded_count;
22413 /* If the size is known, it is shorter to use rep movs. */
22414 if (mode == QImode && CONST_INT_P (count)
22415 && !(INTVAL (count) & 3))
22416 mode = SImode;
22418 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22419 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22420 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22421 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22422 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22423 if (mode != QImode)
22425 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22426 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22427 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22428 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22429 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22430 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22432 else
22434 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22435 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22437 if (CONST_INT_P (count))
22439 rounded_count = (INTVAL (count)
22440 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22441 destmem = shallow_copy_rtx (destmem);
22442 srcmem = shallow_copy_rtx (srcmem);
22443 set_mem_size (destmem, rounded_count);
22444 set_mem_size (srcmem, rounded_count);
22446 else
22448 if (MEM_SIZE_KNOWN_P (destmem))
22449 clear_mem_size (destmem);
22450 if (MEM_SIZE_KNOWN_P (srcmem))
22451 clear_mem_size (srcmem);
22453 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22454 destexp, srcexp));
22457 /* Output a "rep; stos" instruction.
22458 Arguments have the same meaning as for the previous function. */
22459 static void
22460 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22461 rtx count, enum machine_mode mode,
22462 rtx orig_value)
22464 rtx destexp;
22465 rtx countreg;
22466 HOST_WIDE_INT rounded_count;
22468 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22469 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22470 value = force_reg (mode, gen_lowpart (mode, value));
22471 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22472 if (mode != QImode)
22474 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22475 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22476 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22478 else
22479 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22480 if (orig_value == const0_rtx && CONST_INT_P (count))
22482 rounded_count = (INTVAL (count)
22483 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22484 destmem = shallow_copy_rtx (destmem);
22485 set_mem_size (destmem, rounded_count);
22487 else if (MEM_SIZE_KNOWN_P (destmem))
22488 clear_mem_size (destmem);
22489 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22492 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22493 DESTMEM.
22494 SRCMEM is passed by pointer so it can be updated on return.
22495 The return value is the updated DESTMEM. */
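/* For example (illustrative), SIZE_TO_MOVE == 16 on x86_64 is done as one
   16-byte move through a V2DImode register when vector moves are
   available, and otherwise as two DImode moves.  */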
22496 static rtx
22497 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22498 HOST_WIDE_INT size_to_move)
22500 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22501 enum insn_code code;
22502 enum machine_mode move_mode;
22503 int piece_size, i;
22505 /* Find the widest mode in which we could perform moves.
22506 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
22507 it until a move of such size is supported. */
22508 piece_size = 1 << floor_log2 (size_to_move);
22509 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22510 code = optab_handler (mov_optab, move_mode);
22511 while (code == CODE_FOR_nothing && piece_size > 1)
22513 piece_size >>= 1;
22514 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22515 code = optab_handler (mov_optab, move_mode);
22518 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22519 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22520 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22522 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22523 move_mode = mode_for_vector (word_mode, nunits);
22524 code = optab_handler (mov_optab, move_mode);
22525 if (code == CODE_FOR_nothing)
22527 move_mode = word_mode;
22528 piece_size = GET_MODE_SIZE (move_mode);
22529 code = optab_handler (mov_optab, move_mode);
22532 gcc_assert (code != CODE_FOR_nothing);
22534 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22535 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22537 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22538 gcc_assert (size_to_move % piece_size == 0);
22539 adjust = GEN_INT (piece_size);
22540 for (i = 0; i < size_to_move; i += piece_size)
22542 /* We move from memory to memory, so we'll need to do it via
22543 a temporary register. */
22544 tempreg = gen_reg_rtx (move_mode);
22545 emit_insn (GEN_FCN (code) (tempreg, src));
22546 emit_insn (GEN_FCN (code) (dst, tempreg));
22548 emit_move_insn (destptr,
22549 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22550 emit_move_insn (srcptr,
22551 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22553 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22554 piece_size);
22555 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22556 piece_size);
22559 /* Update DST and SRC rtx. */
22560 *srcmem = src;
22561 return dst;
22564 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
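/* For example (illustrative), with a constant COUNT and MAX_SIZE == 16 a
   residue of 11 bytes is copied as one 8-byte, one 2-byte and one 1-byte
   move, following the set bits of COUNT % MAX_SIZE.  */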
22565 static void
22566 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22567 rtx destptr, rtx srcptr, rtx count, int max_size)
22569 rtx src, dest;
22570 if (CONST_INT_P (count))
22572 HOST_WIDE_INT countval = INTVAL (count);
22573 HOST_WIDE_INT epilogue_size = countval % max_size;
22574 int i;
22576 /* For now MAX_SIZE should be a power of 2. This assert could be
22577 relaxed, but that would require a somewhat more complicated epilogue
22578 expansion. */
22579 gcc_assert ((max_size & (max_size - 1)) == 0);
22580 for (i = max_size; i >= 1; i >>= 1)
22582 if (epilogue_size & i)
22583 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22585 return;
22587 if (max_size > 8)
22589 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22590 count, 1, OPTAB_DIRECT);
22591 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22592 count, QImode, 1, 4);
22593 return;
22596 /* When there are stringops, we can cheaply increase dest and src pointers.
22597 Otherwise we save code size by maintaining offset (zero is readily
22598 available from preceding rep operation) and using x86 addressing modes. */
22600 if (TARGET_SINGLE_STRINGOP)
22602 if (max_size > 4)
22604 rtx label = ix86_expand_aligntest (count, 4, true);
22605 src = change_address (srcmem, SImode, srcptr);
22606 dest = change_address (destmem, SImode, destptr);
22607 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22608 emit_label (label);
22609 LABEL_NUSES (label) = 1;
22611 if (max_size > 2)
22613 rtx label = ix86_expand_aligntest (count, 2, true);
22614 src = change_address (srcmem, HImode, srcptr);
22615 dest = change_address (destmem, HImode, destptr);
22616 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22617 emit_label (label);
22618 LABEL_NUSES (label) = 1;
22620 if (max_size > 1)
22622 rtx label = ix86_expand_aligntest (count, 1, true);
22623 src = change_address (srcmem, QImode, srcptr);
22624 dest = change_address (destmem, QImode, destptr);
22625 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22626 emit_label (label);
22627 LABEL_NUSES (label) = 1;
22630 else
22632 rtx offset = force_reg (Pmode, const0_rtx);
22633 rtx tmp;
22635 if (max_size > 4)
22637 rtx label = ix86_expand_aligntest (count, 4, true);
22638 src = change_address (srcmem, SImode, srcptr);
22639 dest = change_address (destmem, SImode, destptr);
22640 emit_move_insn (dest, src);
22641 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22642 true, OPTAB_LIB_WIDEN);
22643 if (tmp != offset)
22644 emit_move_insn (offset, tmp);
22645 emit_label (label);
22646 LABEL_NUSES (label) = 1;
22648 if (max_size > 2)
22650 rtx label = ix86_expand_aligntest (count, 2, true);
22651 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22652 src = change_address (srcmem, HImode, tmp);
22653 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22654 dest = change_address (destmem, HImode, tmp);
22655 emit_move_insn (dest, src);
22656 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22657 true, OPTAB_LIB_WIDEN);
22658 if (tmp != offset)
22659 emit_move_insn (offset, tmp);
22660 emit_label (label);
22661 LABEL_NUSES (label) = 1;
22663 if (max_size > 1)
22665 rtx label = ix86_expand_aligntest (count, 1, true);
22666 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22667 src = change_address (srcmem, QImode, tmp);
22668 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22669 dest = change_address (destmem, QImode, tmp);
22670 emit_move_insn (dest, src);
22671 emit_label (label);
22672 LABEL_NUSES (label) = 1;
22677 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22678 static void
22679 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22680 rtx count, int max_size)
22682 count =
22683 expand_simple_binop (counter_mode (count), AND, count,
22684 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22685 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22686 gen_lowpart (QImode, value), count, QImode,
22687 1, max_size / 2);
22690 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22691 static void
22692 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22694 rtx dest;
22696 if (CONST_INT_P (count))
22698 HOST_WIDE_INT countval = INTVAL (count);
22699 int offset = 0;
22701 if ((countval & 0x10) && max_size > 16)
22703 if (TARGET_64BIT)
22705 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22706 emit_insn (gen_strset (destptr, dest, value));
22707 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22708 emit_insn (gen_strset (destptr, dest, value));
22710 else
22711 gcc_unreachable ();
22712 offset += 16;
22714 if ((countval & 0x08) && max_size > 8)
22716 if (TARGET_64BIT)
22718 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22719 emit_insn (gen_strset (destptr, dest, value));
22721 else
22723 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22724 emit_insn (gen_strset (destptr, dest, value));
22725 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22726 emit_insn (gen_strset (destptr, dest, value));
22728 offset += 8;
22730 if ((countval & 0x04) && max_size > 4)
22732 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22733 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22734 offset += 4;
22736 if ((countval & 0x02) && max_size > 2)
22738 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22739 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22740 offset += 2;
22742 if ((countval & 0x01) && max_size > 1)
22744 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22745 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22746 offset += 1;
22748 return;
22750 if (max_size > 32)
22752 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22753 return;
22755 if (max_size > 16)
22757 rtx label = ix86_expand_aligntest (count, 16, true);
22758 if (TARGET_64BIT)
22760 dest = change_address (destmem, DImode, destptr);
22761 emit_insn (gen_strset (destptr, dest, value));
22762 emit_insn (gen_strset (destptr, dest, value));
22764 else
22766 dest = change_address (destmem, SImode, destptr);
22767 emit_insn (gen_strset (destptr, dest, value));
22768 emit_insn (gen_strset (destptr, dest, value));
22769 emit_insn (gen_strset (destptr, dest, value));
22770 emit_insn (gen_strset (destptr, dest, value));
22772 emit_label (label);
22773 LABEL_NUSES (label) = 1;
22775 if (max_size > 8)
22777 rtx label = ix86_expand_aligntest (count, 8, true);
22778 if (TARGET_64BIT)
22780 dest = change_address (destmem, DImode, destptr);
22781 emit_insn (gen_strset (destptr, dest, value));
22783 else
22785 dest = change_address (destmem, SImode, destptr);
22786 emit_insn (gen_strset (destptr, dest, value));
22787 emit_insn (gen_strset (destptr, dest, value));
22789 emit_label (label);
22790 LABEL_NUSES (label) = 1;
22792 if (max_size > 4)
22794 rtx label = ix86_expand_aligntest (count, 4, true);
22795 dest = change_address (destmem, SImode, destptr);
22796 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22797 emit_label (label);
22798 LABEL_NUSES (label) = 1;
22800 if (max_size > 2)
22802 rtx label = ix86_expand_aligntest (count, 2, true);
22803 dest = change_address (destmem, HImode, destptr);
22804 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22805 emit_label (label);
22806 LABEL_NUSES (label) = 1;
22808 if (max_size > 1)
22810 rtx label = ix86_expand_aligntest (count, 1, true);
22811 dest = change_address (destmem, QImode, destptr);
22812 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22813 emit_label (label);
22814 LABEL_NUSES (label) = 1;
22818 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22819 to DESIRED_ALIGNMENT.
22820 Return value is the updated DESTMEM. */
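/* For example (illustrative), going from ALIGN == 1 to a desired alignment
   of 16 emits conditional 1-, 2-, 4- and 8-byte copies, each guarded by a
   test of the corresponding low bit of DESTPTR, and adjusts COUNT as it
   goes.  */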
22821 static rtx
22822 expand_movmem_prologue (rtx destmem, rtx srcmem,
22823 rtx destptr, rtx srcptr, rtx count,
22824 int align, int desired_alignment)
22826 int i;
22827 for (i = 1; i < desired_alignment; i <<= 1)
22829 if (align <= i)
22831 rtx label = ix86_expand_aligntest (destptr, i, false);
22832 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22833 ix86_adjust_counter (count, i);
22834 emit_label (label);
22835 LABEL_NUSES (label) = 1;
22836 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22839 return destmem;
22842 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22843 ALIGN_BYTES is how many bytes need to be copied.
22844 The function updates DST and SRC, namely, it sets the proper alignment on them.
22845 DST is returned via the return value, SRC is updated via the pointer SRCP. */
22846 static rtx
22847 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22848 int desired_align, int align_bytes)
22850 rtx src = *srcp;
22851 rtx orig_dst = dst;
22852 rtx orig_src = src;
22853 int piece_size = 1;
22854 int copied_bytes = 0;
22855 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22856 if (src_align_bytes >= 0)
22857 src_align_bytes = desired_align - src_align_bytes;
22859 for (piece_size = 1;
22860 piece_size <= desired_align && copied_bytes < align_bytes;
22861 piece_size <<= 1)
22863 if (align_bytes & piece_size)
22865 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
22866 copied_bytes += piece_size;
22870 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22871 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22872 if (src_align_bytes >= 0)
22874 unsigned int src_align;
22875 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
22877 if ((src_align_bytes & (src_align - 1))
22878 == (align_bytes & (src_align - 1)))
22879 break;
22881 if (src_align > (unsigned int) desired_align)
22882 src_align = desired_align;
22883 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22884 set_mem_align (src, src_align * BITS_PER_UNIT);
22886 if (MEM_SIZE_KNOWN_P (orig_dst))
22887 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22888 if (MEM_SIZE_KNOWN_P (orig_src))
22889 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22890 *srcp = src;
22891 return dst;
22894 /* Store enough into DEST to align DEST, known to be aligned by ALIGN, to
22895 DESIRED_ALIGNMENT. */
22896 static void
22897 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22898 int align, int desired_alignment)
22900 if (align <= 1 && desired_alignment > 1)
22902 rtx label = ix86_expand_aligntest (destptr, 1, false);
22903 destmem = change_address (destmem, QImode, destptr);
22904 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22905 ix86_adjust_counter (count, 1);
22906 emit_label (label);
22907 LABEL_NUSES (label) = 1;
22909 if (align <= 2 && desired_alignment > 2)
22911 rtx label = ix86_expand_aligntest (destptr, 2, false);
22912 destmem = change_address (destmem, HImode, destptr);
22913 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22914 ix86_adjust_counter (count, 2);
22915 emit_label (label);
22916 LABEL_NUSES (label) = 1;
22918 if (align <= 4 && desired_alignment > 4)
22920 rtx label = ix86_expand_aligntest (destptr, 4, false);
22921 destmem = change_address (destmem, SImode, destptr);
22922 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22923 ix86_adjust_counter (count, 4);
22924 emit_label (label);
22925 LABEL_NUSES (label) = 1;
22927 gcc_assert (desired_alignment <= 8);
22930 /* Store enough bytes to DST to align DST, known to be aligned by ALIGN, to
22931 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22932 static rtx
22933 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22934 int desired_align, int align_bytes)
22936 int off = 0;
22937 rtx orig_dst = dst;
22938 if (align_bytes & 1)
22940 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22941 off = 1;
22942 emit_insn (gen_strset (destreg, dst,
22943 gen_lowpart (QImode, value)));
22945 if (align_bytes & 2)
22947 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22948 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22949 set_mem_align (dst, 2 * BITS_PER_UNIT);
22950 off = 2;
22951 emit_insn (gen_strset (destreg, dst,
22952 gen_lowpart (HImode, value)));
22954 if (align_bytes & 4)
22956 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22957 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22958 set_mem_align (dst, 4 * BITS_PER_UNIT);
22959 off = 4;
22960 emit_insn (gen_strset (destreg, dst,
22961 gen_lowpart (SImode, value)));
22963 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22964 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22965 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22966 if (MEM_SIZE_KNOWN_P (orig_dst))
22967 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22968 return dst;
22971 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22972 static enum stringop_alg
22973 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22974 int *dynamic_check, bool *noalign)
22976 const struct stringop_algs * algs;
22977 bool optimize_for_speed;
22978 /* Algorithms using the rep prefix want at least edi and ecx;
22979 additionally, memset wants eax and memcpy wants esi. Don't
22980 consider such algorithms if the user has appropriated those
22981 registers for their own purposes. */
22982 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22983 || (memset
22984 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22985 *noalign = false;
22987 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22988 || (alg != rep_prefix_1_byte \
22989 && alg != rep_prefix_4_byte \
22990 && alg != rep_prefix_8_byte))
22991 const struct processor_costs *cost;
22993 /* Even if the string operation call is cold, we still might spend a lot
22994 of time processing large blocks. */
22995 if (optimize_function_for_size_p (cfun)
22996 || (optimize_insn_for_size_p ()
22997 && expected_size != -1 && expected_size < 256))
22998 optimize_for_speed = false;
22999 else
23000 optimize_for_speed = true;
23002 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23004 *dynamic_check = -1;
23005 if (memset)
23006 algs = &cost->memset[TARGET_64BIT != 0];
23007 else
23008 algs = &cost->memcpy[TARGET_64BIT != 0];
23009 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
23010 return ix86_stringop_alg;
23011 /* rep; movq or rep; movl is the smallest variant. */
23012 else if (!optimize_for_speed)
23014 if (!count || (count & 3))
23015 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
23016 else
23017 return rep_prefix_usable ? rep_prefix_4_byte : loop;
23019 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.  */
23021 else if (expected_size != -1 && expected_size < 4)
23022 return loop_1_byte;
23023 else if (expected_size != -1)
23025 unsigned int i;
23026 enum stringop_alg alg = libcall;
23027 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23029 /* We get here if the algorithms that were not libcall-based
23030 were rep-prefix based and we are unable to use rep prefixes
23031 based on global register usage. Break out of the loop and
23032 use the heuristic below. */
23033 if (algs->size[i].max == 0)
23034 break;
23035 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23037 enum stringop_alg candidate = algs->size[i].alg;
23039 if (candidate != libcall && ALG_USABLE_P (candidate))
23040 alg = candidate;
23041 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23042 last non-libcall inline algorithm. */
23043 if (TARGET_INLINE_ALL_STRINGOPS)
23045 /* When a libcall is the best choice for the current size but we
23046    are still forced to inline, run the heuristic below that picks
23047    code for medium-sized blocks. */
23048 if (alg != libcall)
23049 return alg;
23050 break;
23052 else if (ALG_USABLE_P (candidate))
23054 *noalign = algs->size[i].noalign;
23055 return candidate;
23059 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
23061 /* When asked to inline the call anyway, try to pick a meaningful choice.
23062    We look for the maximal block size that is faster to copy by hand and
23063    take blocks of at most that size, guessing that the average size will
23064    be roughly half of the maximum.
23066 If this turns out to be bad, we might simply specify the preferred
23067 choice in ix86_costs. */
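/* Worked example, assuming the largest non-libcall entry in the size table
   covers blocks of up to 4096 bytes: the code below recurses with an assumed
   block size of 4096 / 2 = 2048 and returns whatever inline algorithm the
   table picks for that size; with -minline-stringops-dynamically a runtime
   check against 4096 is also emitted so that larger blocks still go through
   the library call.  */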
23068 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23069 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
23071 int max = -1;
23072 enum stringop_alg alg;
23073 int i;
23074 bool any_alg_usable_p = true;
23076 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23078 enum stringop_alg candidate = algs->size[i].alg;
23079 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
23081 if (candidate != libcall && candidate
23082 && ALG_USABLE_P (candidate))
23083 max = algs->size[i].max;
23085 /* If there aren't any usable algorithms, then recursing on
23086 smaller sizes isn't going to find anything. Just return the
23087 simple byte-at-a-time copy loop. */
23088 if (!any_alg_usable_p)
23090 /* Pick something reasonable. */
23091 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23092 *dynamic_check = 128;
23093 return loop_1_byte;
23095 if (max == -1)
23096 max = 4096;
23097 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
23098 gcc_assert (*dynamic_check == -1);
23099 gcc_assert (alg != libcall);
23100 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23101 *dynamic_check = max;
23102 return alg;
23104 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
23105 #undef ALG_USABLE_P
23108 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23109 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23110 static int
23111 decide_alignment (int align,
23112 enum stringop_alg alg,
23113 int expected_size,
23114 enum machine_mode move_mode)
23116 int desired_align = 0;
23118 gcc_assert (alg != no_stringop);
23120 if (alg == libcall)
23121 return 0;
23122 if (move_mode == VOIDmode)
23123 return 0;
23125 desired_align = GET_MODE_SIZE (move_mode);
23126 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
23127    copying a whole cache line at once. */
23128 if (TARGET_PENTIUMPRO
23129 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23130 desired_align = 8;
23132 if (optimize_size)
23133 desired_align = 1;
23134 if (desired_align < align)
23135 desired_align = align;
23136 if (expected_size != -1 && expected_size < 4)
23137 desired_align = align;
23139 return desired_align;
23142 /* Expand string move (memcpy) operation. Use i386 string operations
23143 when profitable. expand_setmem contains similar code. The code
23144 depends upon architecture, block size and alignment, but always has
23145 the same overall structure:
23147 1) Prologue guard: Conditional that jumps up to epilogues for small
23148 blocks that can be handled by epilogue alone. This is faster
23149    but also needed for correctness, since the prologue assumes the block
23150    is larger than the desired alignment.
23152 Optional dynamic check for size and libcall for large
23153 blocks is emitted here too, with -minline-stringops-dynamically.
23155 2) Prologue: copy first few bytes in order to get destination
23156 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23157 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23158 copied. We emit either a jump tree on power of two sized
23159 blocks, or a byte loop.
23161 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23162 with specified algorithm.
23164 4) Epilogue: code copying tail of the block that is too small to be
23165 handled by main body (or up to size guarded by prologue guard). */
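/* Illustrative example, assuming a 100-byte copy with the unrolled_loop
   algorithm on a 64-bit target: SIZE_NEEDED is 8 * 4 = 32 bytes, so step 1
   branches straight to the epilogue for blocks smaller than 32 bytes,
   step 2 copies up to DESIRED_ALIGN - ALIGN bytes to align the destination,
   step 3 copies 32 bytes per iteration, and step 4 copies the up-to-31
   remaining bytes.  */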
23167 bool
23168 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
23169 rtx expected_align_exp, rtx expected_size_exp)
23171 rtx destreg;
23172 rtx srcreg;
23173 rtx label = NULL;
23174 rtx tmp;
23175 rtx jump_around_label = NULL;
23176 HOST_WIDE_INT align = 1;
23177 unsigned HOST_WIDE_INT count = 0;
23178 HOST_WIDE_INT expected_size = -1;
23179 int size_needed = 0, epilogue_size_needed;
23180 int desired_align = 0, align_bytes = 0;
23181 enum stringop_alg alg;
23182 int dynamic_check;
23183 bool need_zero_guard = false;
23184 bool noalign;
23185 enum machine_mode move_mode = VOIDmode;
23186 int unroll_factor = 1;
23188 if (CONST_INT_P (align_exp))
23189 align = INTVAL (align_exp);
23190 /* i386 can do misaligned access at a reasonably increased cost. */
23191 if (CONST_INT_P (expected_align_exp)
23192 && INTVAL (expected_align_exp) > align)
23193 align = INTVAL (expected_align_exp);
23194 /* ALIGN is the minimum of destination and source alignment, but we care here
23195 just about destination alignment. */
23196 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23197 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23199 if (CONST_INT_P (count_exp))
23200 count = expected_size = INTVAL (count_exp);
23201 if (CONST_INT_P (expected_size_exp) && count == 0)
23202 expected_size = INTVAL (expected_size_exp);
23204 /* Make sure we don't need to care about overflow later on. */
23205 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23206 return false;
23208 /* Step 0: Decide on preferred algorithm, desired alignment and
23209 size of chunks to be copied by main loop. */
23210 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
23211 if (alg == libcall)
23212 return false;
23213 gcc_assert (alg != no_stringop);
23215 if (!count)
23216 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23217 destreg = copy_addr_to_reg (XEXP (dst, 0));
23218 srcreg = copy_addr_to_reg (XEXP (src, 0));
23220 unroll_factor = 1;
23221 move_mode = word_mode;
23222 switch (alg)
23224 case libcall:
23225 case no_stringop:
23226 case last_alg:
23227 gcc_unreachable ();
23228 case loop_1_byte:
23229 need_zero_guard = true;
23230 move_mode = QImode;
23231 break;
23232 case loop:
23233 need_zero_guard = true;
23234 break;
23235 case unrolled_loop:
23236 need_zero_guard = true;
23237 unroll_factor = (TARGET_64BIT ? 4 : 2);
23238 break;
23239 case vector_loop:
23240 need_zero_guard = true;
23241 unroll_factor = 4;
23242 /* Find the widest supported mode. */
23243 move_mode = word_mode;
23244 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23245 != CODE_FOR_nothing)
23246 move_mode = GET_MODE_WIDER_MODE (move_mode);
23248 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23249 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23250 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23252 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23253 move_mode = mode_for_vector (word_mode, nunits);
23254 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23255 move_mode = word_mode;
23257 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23258 break;
23259 case rep_prefix_8_byte:
23260 move_mode = DImode;
23261 break;
23262 case rep_prefix_4_byte:
23263 move_mode = SImode;
23264 break;
23265 case rep_prefix_1_byte:
23266 move_mode = QImode;
23267 break;
23269 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23270 epilogue_size_needed = size_needed;
23272 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23273 if (!TARGET_ALIGN_STRINGOPS || noalign)
23274 align = desired_align;
23276 /* Step 1: Prologue guard. */
23278 /* Alignment code needs count to be in register. */
23279 if (CONST_INT_P (count_exp) && desired_align > align)
23281 if (INTVAL (count_exp) > desired_align
23282 && INTVAL (count_exp) > size_needed)
23284 align_bytes
23285 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23286 if (align_bytes <= 0)
23287 align_bytes = 0;
23288 else
23289 align_bytes = desired_align - align_bytes;
23291 if (align_bytes == 0)
23292 count_exp = force_reg (counter_mode (count_exp), count_exp);
23294 gcc_assert (desired_align >= 1 && align >= 1);
23296 /* Ensure that alignment prologue won't copy past end of block. */
23297 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23299 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23300 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23301    Make sure it is a power of 2. */
23302 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
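/* Worked example of the rounding above: with SIZE_NEEDED == 32 and
   DESIRED_ALIGN - ALIGN == 15, the maximum is 31 and
   1 << (floor_log2 (31) + 1) == 32, so the epilogue mask becomes 31.  */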
23304 if (count)
23306 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23308 /* If main algorithm works on QImode, no epilogue is needed.
23309 For small sizes just don't align anything. */
23310 if (size_needed == 1)
23311 desired_align = align;
23312 else
23313 goto epilogue;
23316 else
23318 label = gen_label_rtx ();
23319 emit_cmp_and_jump_insns (count_exp,
23320 GEN_INT (epilogue_size_needed),
23321 LTU, 0, counter_mode (count_exp), 1, label);
23322 if (expected_size == -1 || expected_size < epilogue_size_needed)
23323 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23324 else
23325 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23329 /* Emit code to decide on runtime whether library call or inline should be
23330 used. */
23331 if (dynamic_check != -1)
23333 if (CONST_INT_P (count_exp))
23335 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23337 emit_block_move_via_libcall (dst, src, count_exp, false);
23338 count_exp = const0_rtx;
23339 goto epilogue;
23342 else
23344 rtx hot_label = gen_label_rtx ();
23345 jump_around_label = gen_label_rtx ();
23346 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23347 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23348 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23349 emit_block_move_via_libcall (dst, src, count_exp, false);
23350 emit_jump (jump_around_label);
23351 emit_label (hot_label);
23355 /* Step 2: Alignment prologue. */
23357 if (desired_align > align)
23359 if (align_bytes == 0)
23361 /* Except for the first move in the epilogue, we no longer know
23362    the constant offset in aliasing info.  It does not seem worth
23363    the pain to maintain it for the first move, so throw away
23364    the info early. */
23365 src = change_address (src, BLKmode, srcreg);
23366 dst = change_address (dst, BLKmode, destreg);
23367 dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
23368 desired_align);
23370 else
23372 /* If we know how many bytes need to be stored before dst is
23373 sufficiently aligned, maintain aliasing info accurately. */
23374 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
23375 desired_align, align_bytes);
23376 count_exp = plus_constant (counter_mode (count_exp),
23377 count_exp, -align_bytes);
23378 count -= align_bytes;
23380 if (need_zero_guard
23381 && (count < (unsigned HOST_WIDE_INT) size_needed
23382 || (align_bytes == 0
23383 && count < ((unsigned HOST_WIDE_INT) size_needed
23384 + desired_align - align))))
23386 /* It is possible that we copied enough so the main loop will not
23387 execute. */
23388 gcc_assert (size_needed > 1);
23389 if (label == NULL_RTX)
23390 label = gen_label_rtx ();
23391 emit_cmp_and_jump_insns (count_exp,
23392 GEN_INT (size_needed),
23393 LTU, 0, counter_mode (count_exp), 1, label);
23394 if (expected_size == -1
23395 || expected_size < (desired_align - align) / 2 + size_needed)
23396 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23397 else
23398 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23401 if (label && size_needed == 1)
23403 emit_label (label);
23404 LABEL_NUSES (label) = 1;
23405 label = NULL;
23406 epilogue_size_needed = 1;
23408 else if (label == NULL_RTX)
23409 epilogue_size_needed = size_needed;
23411 /* Step 3: Main loop. */
23413 switch (alg)
23415 case libcall:
23416 case no_stringop:
23417 case last_alg:
23418 gcc_unreachable ();
23419 case loop_1_byte:
23420 case loop:
23421 case unrolled_loop:
23422 case vector_loop:
23423 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23424 count_exp, move_mode, unroll_factor,
23425 expected_size);
23426 break;
23427 case rep_prefix_8_byte:
23428 case rep_prefix_4_byte:
23429 case rep_prefix_1_byte:
23430 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23431 move_mode);
23432 break;
23434 /* Properly adjust the offset of src and dest memory for aliasing. */
23435 if (CONST_INT_P (count_exp))
23437 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23438 (count / size_needed) * size_needed);
23439 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23440 (count / size_needed) * size_needed);
23442 else
23444 src = change_address (src, BLKmode, srcreg);
23445 dst = change_address (dst, BLKmode, destreg);
23448 /* Step 4: Epilogue to copy the remaining bytes. */
23449 epilogue:
23450 if (label)
23452 /* When the main loop is done, COUNT_EXP might hold the original count,
23453    while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23454    Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23455    bytes.  Compensate if needed. */
23457 if (size_needed < epilogue_size_needed)
23459 tmp =
23460 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23461 GEN_INT (size_needed - 1), count_exp, 1,
23462 OPTAB_DIRECT);
23463 if (tmp != count_exp)
23464 emit_move_insn (count_exp, tmp);
23466 emit_label (label);
23467 LABEL_NUSES (label) = 1;
23470 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23471 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23472 size_needed);
23473 if (jump_around_label)
23474 emit_label (jump_around_label);
23475 return true;
23478 /* Helper function for memset.  For QImode value 0xXY produce
23479    0xXYXYXYXY of the width specified by MODE.  This is essentially
23480    a * 0x01010101, but we can do slightly better than
23481    synth_mult by unwinding the sequence by hand on CPUs with
23482    slow multiply. */
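/* Illustrative sketch, assuming val == 0xAB and MODE == SImode: the
   shift/or path below computes
       reg = 0x000000AB
       reg |= reg << 8;    now 0x0000ABAB
       reg |= reg << 16;   now 0xABABABAB
   while the multiply path computes 0xAB * 0x01010101 directly; the cost
   comparison picks whichever sequence is cheaper on the target.  */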
23483 static rtx
23484 promote_duplicated_reg (enum machine_mode mode, rtx val)
23486 enum machine_mode valmode = GET_MODE (val);
23487 rtx tmp;
23488 int nops = mode == DImode ? 3 : 2;
23490 gcc_assert (mode == SImode || mode == DImode);
23491 if (val == const0_rtx)
23492 return copy_to_mode_reg (mode, const0_rtx);
23493 if (CONST_INT_P (val))
23495 HOST_WIDE_INT v = INTVAL (val) & 255;
23497 v |= v << 8;
23498 v |= v << 16;
23499 if (mode == DImode)
23500 v |= (v << 16) << 16;
23501 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23504 if (valmode == VOIDmode)
23505 valmode = QImode;
23506 if (valmode != QImode)
23507 val = gen_lowpart (QImode, val);
23508 if (mode == QImode)
23509 return val;
23510 if (!TARGET_PARTIAL_REG_STALL)
23511 nops--;
23512 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23513 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23514 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23515 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23517 rtx reg = convert_modes (mode, QImode, val, true);
23518 tmp = promote_duplicated_reg (mode, const1_rtx);
23519 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23520 OPTAB_DIRECT);
23522 else
23524 rtx reg = convert_modes (mode, QImode, val, true);
23526 if (!TARGET_PARTIAL_REG_STALL)
23527 if (mode == SImode)
23528 emit_insn (gen_movsi_insv_1 (reg, reg));
23529 else
23530 emit_insn (gen_movdi_insv_1 (reg, reg));
23531 else
23533 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23534 NULL, 1, OPTAB_DIRECT);
23535 reg =
23536 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23538 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23539 NULL, 1, OPTAB_DIRECT);
23540 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23541 if (mode == SImode)
23542 return reg;
23543 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23544 NULL, 1, OPTAB_DIRECT);
23545 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23546 return reg;
23550 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23551 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23552 alignment from ALIGN to DESIRED_ALIGN. */
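/* Illustrative note, assuming a few concrete values: with SIZE_NEEDED == 16
   (a vector or unrolled loop) on a 64-bit target the value is promoted to
   DImode (the byte replicated into all eight byte positions); with
   SIZE_NEEDED == 4 on a 32-bit target SImode suffices; a plain QImode loop
   with no alignment prologue keeps the original byte value unpromoted.  */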
23553 static rtx
23554 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23556 rtx promoted_val;
23558 if (TARGET_64BIT
23559 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23560 promoted_val = promote_duplicated_reg (DImode, val);
23561 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23562 promoted_val = promote_duplicated_reg (SImode, val);
23563 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23564 promoted_val = promote_duplicated_reg (HImode, val);
23565 else
23566 promoted_val = val;
23568 return promoted_val;
23571 /* Expand string set operation (memset).  Use i386 string operations when
23572 profitable. See expand_movmem comment for explanation of individual
23573 steps performed. */
23574 bool
23575 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23576 rtx expected_align_exp, rtx expected_size_exp)
23578 rtx destreg;
23579 rtx label = NULL;
23580 rtx tmp;
23581 rtx jump_around_label = NULL;
23582 HOST_WIDE_INT align = 1;
23583 unsigned HOST_WIDE_INT count = 0;
23584 HOST_WIDE_INT expected_size = -1;
23585 int size_needed = 0, epilogue_size_needed;
23586 int desired_align = 0, align_bytes = 0;
23587 enum stringop_alg alg;
23588 rtx promoted_val = NULL;
23589 bool force_loopy_epilogue = false;
23590 int dynamic_check;
23591 bool need_zero_guard = false;
23592 bool noalign;
23593 enum machine_mode move_mode = VOIDmode;
23594 int unroll_factor;
23596 if (CONST_INT_P (align_exp))
23597 align = INTVAL (align_exp);
23598 /* i386 can do misaligned access at a reasonably increased cost. */
23599 if (CONST_INT_P (expected_align_exp)
23600 && INTVAL (expected_align_exp) > align)
23601 align = INTVAL (expected_align_exp);
23602 if (CONST_INT_P (count_exp))
23603 count = expected_size = INTVAL (count_exp);
23604 if (CONST_INT_P (expected_size_exp) && count == 0)
23605 expected_size = INTVAL (expected_size_exp);
23607 /* Make sure we don't need to care about overflow later on. */
23608 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23609 return false;
23611 /* Step 0: Decide on preferred algorithm, desired alignment and
23612 size of chunks to be copied by main loop. */
23614 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23615 if (alg == libcall)
23616 return false;
23617 gcc_assert (alg != no_stringop);
23619 if (!count)
23620 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23621 destreg = copy_addr_to_reg (XEXP (dst, 0));
23623 move_mode = word_mode;
23624 unroll_factor = 1;
23625 switch (alg)
23627 case libcall:
23628 case no_stringop:
23629 case last_alg:
23630 gcc_unreachable ();
23631 case loop:
23632 need_zero_guard = true;
23633 break;
23634 case vector_loop:
23635 case unrolled_loop:
23636 need_zero_guard = true;
23637 unroll_factor = 4;
23638 break;
23639 case rep_prefix_8_byte:
23640 move_mode = DImode;
23641 break;
23642 case rep_prefix_4_byte:
23643 move_mode = SImode;
23644 break;
23645 case rep_prefix_1_byte:
23646 move_mode = QImode;
23647 break;
23648 case loop_1_byte:
23649 need_zero_guard = true;
23650 move_mode = QImode;
23651 break;
23653 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23654 epilogue_size_needed = size_needed;
23656 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23657 if (!TARGET_ALIGN_STRINGOPS || noalign)
23658 align = desired_align;
23660 /* Step 1: Prologue guard. */
23662 /* Alignment code needs count to be in register. */
23663 if (CONST_INT_P (count_exp) && desired_align > align)
23665 if (INTVAL (count_exp) > desired_align
23666 && INTVAL (count_exp) > size_needed)
23668 align_bytes
23669 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23670 if (align_bytes <= 0)
23671 align_bytes = 0;
23672 else
23673 align_bytes = desired_align - align_bytes;
23675 if (align_bytes == 0)
23677 enum machine_mode mode = SImode;
23678 if (TARGET_64BIT && (count & ~0xffffffff))
23679 mode = DImode;
23680 count_exp = force_reg (mode, count_exp);
23683 /* Do the cheap promotion to allow better CSE across the
23684    main loop and epilogue (i.e. one load of the big constant in
23685    front of all the code). */
23686 if (CONST_INT_P (val_exp))
23687 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23688 desired_align, align);
23689 /* Ensure that alignment prologue won't copy past end of block. */
23690 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23692 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23693 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23694 Make sure it is power of 2. */
23695 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23697 /* To improve performance of small blocks, we jump around the VAL
23698    promoting code.  This means that if the promoted VAL is not constant,
23699    we might not use it in the epilogue and have to use the byte
23700    loop variant. */
23701 if (epilogue_size_needed > 2 && !promoted_val)
23702 force_loopy_epilogue = true;
23703 if (count)
23705 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23707 /* If main algorithm works on QImode, no epilogue is needed.
23708 For small sizes just don't align anything. */
23709 if (size_needed == 1)
23710 desired_align = align;
23711 else
23712 goto epilogue;
23715 else
23717 label = gen_label_rtx ();
23718 emit_cmp_and_jump_insns (count_exp,
23719 GEN_INT (epilogue_size_needed),
23720 LTU, 0, counter_mode (count_exp), 1, label);
23721 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23722 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23723 else
23724 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23727 if (dynamic_check != -1)
23729 rtx hot_label = gen_label_rtx ();
23730 jump_around_label = gen_label_rtx ();
23731 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23732 LEU, 0, counter_mode (count_exp), 1, hot_label);
23733 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23734 set_storage_via_libcall (dst, count_exp, val_exp, false);
23735 emit_jump (jump_around_label);
23736 emit_label (hot_label);
23739 /* Step 2: Alignment prologue. */
23741 /* Do the expensive promotion once we branched off the small blocks. */
23742 if (!promoted_val)
23743 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23744 desired_align, align);
23745 gcc_assert (desired_align >= 1 && align >= 1);
23747 if (desired_align > align)
23749 if (align_bytes == 0)
23751 /* Except for the first move in the epilogue, we no longer know
23752    the constant offset in aliasing info.  It does not seem worth
23753    the pain to maintain it for the first move, so throw away
23754    the info early. */
23755 dst = change_address (dst, BLKmode, destreg);
23756 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23757 desired_align);
23759 else
23761 /* If we know how many bytes need to be stored before dst is
23762 sufficiently aligned, maintain aliasing info accurately. */
23763 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23764 desired_align, align_bytes);
23765 count_exp = plus_constant (counter_mode (count_exp),
23766 count_exp, -align_bytes);
23767 count -= align_bytes;
23769 if (need_zero_guard
23770 && (count < (unsigned HOST_WIDE_INT) size_needed
23771 || (align_bytes == 0
23772 && count < ((unsigned HOST_WIDE_INT) size_needed
23773 + desired_align - align))))
23775 /* It is possible that we copied enough so the main loop will not
23776 execute. */
23777 gcc_assert (size_needed > 1);
23778 if (label == NULL_RTX)
23779 label = gen_label_rtx ();
23780 emit_cmp_and_jump_insns (count_exp,
23781 GEN_INT (size_needed),
23782 LTU, 0, counter_mode (count_exp), 1, label);
23783 if (expected_size == -1
23784 || expected_size < (desired_align - align) / 2 + size_needed)
23785 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23786 else
23787 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23790 if (label && size_needed == 1)
23792 emit_label (label);
23793 LABEL_NUSES (label) = 1;
23794 label = NULL;
23795 promoted_val = val_exp;
23796 epilogue_size_needed = 1;
23798 else if (label == NULL_RTX)
23799 epilogue_size_needed = size_needed;
23801 /* Step 3: Main loop. */
23803 switch (alg)
23805 case libcall:
23806 case no_stringop:
23807 case last_alg:
23808 gcc_unreachable ();
23809 case loop_1_byte:
23810 case loop:
23811 case vector_loop:
23812 case unrolled_loop:
23813 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23814 count_exp, move_mode, unroll_factor,
23815 expected_size);
23816 break;
23817 case rep_prefix_8_byte:
23818 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23819 DImode, val_exp);
23820 break;
23821 case rep_prefix_4_byte:
23822 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23823 SImode, val_exp);
23824 break;
23825 case rep_prefix_1_byte:
23826 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23827 QImode, val_exp);
23828 break;
23830 /* Properly adjust the offset of the destination memory for aliasing. */
23831 if (CONST_INT_P (count_exp))
23832 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23833 (count / size_needed) * size_needed);
23834 else
23835 dst = change_address (dst, BLKmode, destreg);
23837 /* Step 4: Epilogue to copy the remaining bytes. */
23839 if (label)
23841 /* When the main loop is done, COUNT_EXP might hold the original count,
23842    while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23843    Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23844    bytes.  Compensate if needed. */
23846 if (size_needed < epilogue_size_needed)
23848 tmp =
23849 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23850 GEN_INT (size_needed - 1), count_exp, 1,
23851 OPTAB_DIRECT);
23852 if (tmp != count_exp)
23853 emit_move_insn (count_exp, tmp);
23855 emit_label (label);
23856 LABEL_NUSES (label) = 1;
23858 epilogue:
23859 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23861 if (force_loopy_epilogue)
23862 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23863 epilogue_size_needed);
23864 else
23865 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23866 epilogue_size_needed);
23868 if (jump_around_label)
23869 emit_label (jump_around_label);
23870 return true;
23873 /* Expand the appropriate insns for doing strlen if not just doing
23874 repnz; scasb
23876 out = result, initialized with the start address
23877 align_rtx = alignment of the address.
23878    scratch = scratch register, initialized with the start address when
23879 not aligned, otherwise undefined
23881 This is just the body. It needs the initializations mentioned above and
23882 some address computing at the end. These things are done in i386.md. */
23884 static void
23885 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23887 int align;
23888 rtx tmp;
23889 rtx align_2_label = NULL_RTX;
23890 rtx align_3_label = NULL_RTX;
23891 rtx align_4_label = gen_label_rtx ();
23892 rtx end_0_label = gen_label_rtx ();
23893 rtx mem;
23894 rtx tmpreg = gen_reg_rtx (SImode);
23895 rtx scratch = gen_reg_rtx (SImode);
23896 rtx cmp;
23898 align = 0;
23899 if (CONST_INT_P (align_rtx))
23900 align = INTVAL (align_rtx);
23902 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23904 /* Is there a known alignment and is it less than 4? */
23905 if (align < 4)
23907 rtx scratch1 = gen_reg_rtx (Pmode);
23908 emit_move_insn (scratch1, out);
23909 /* Is there a known alignment and is it not 2? */
23910 if (align != 2)
23912 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23913 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23915 /* Leave just the 3 lower bits. */
23916 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23917 NULL_RTX, 0, OPTAB_WIDEN);
23919 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23920 Pmode, 1, align_4_label);
23921 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23922 Pmode, 1, align_2_label);
23923 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23924 Pmode, 1, align_3_label);
23926 else
23928 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23929    check whether it is aligned to 4 bytes. */
23931 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23932 NULL_RTX, 0, OPTAB_WIDEN);
23934 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23935 Pmode, 1, align_4_label);
23938 mem = change_address (src, QImode, out);
23940 /* Now compare the bytes. */
23942 /* Compare the first n unaligned byte on a byte per byte basis. */
23943 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23944 QImode, 1, end_0_label);
23946 /* Increment the address. */
23947 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23949 /* Not needed with an alignment of 2 */
23950 if (align != 2)
23952 emit_label (align_2_label);
23954 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23955 end_0_label);
23957 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23959 emit_label (align_3_label);
23962 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23963 end_0_label);
23965 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23968 /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
23969    align this loop; doing so only bloats the program without making it
23970    any faster. */
23971 emit_label (align_4_label);
23973 mem = change_address (src, SImode, out);
23974 emit_move_insn (scratch, mem);
23975 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23977 /* This formula yields a nonzero result iff one of the bytes is zero.
23978    This saves three branches inside the loop and many cycles. */
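/* Worked example of the test computed below, which is
   (x - 0x01010101) & ~x & 0x80808080: for x == 0x64636261 ("abcd" loaded
   little-endian) this is 0x63626160 & 0x9B9C9D9E & 0x80808080 == 0, so the
   loop continues; for x == 0x00636261 ("abc\0") it is
   0xFF626160 & 0xFF9C9D9E & 0x80808080 == 0x80000000, so the zero byte is
   detected.  */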
23980 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23981 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23982 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23983 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23984 gen_int_mode (0x80808080, SImode)));
23985 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23986 align_4_label);
23988 if (TARGET_CMOVE)
23990 rtx reg = gen_reg_rtx (SImode);
23991 rtx reg2 = gen_reg_rtx (Pmode);
23992 emit_move_insn (reg, tmpreg);
23993 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23995 /* If zero is not in the first two bytes, move two bytes forward. */
23996 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23997 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23998 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23999 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24000 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24001 reg,
24002 tmpreg)));
24003 /* Emit lea manually to avoid clobbering of flags. */
24004 emit_insn (gen_rtx_SET (SImode, reg2,
24005 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24007 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24008 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24009 emit_insn (gen_rtx_SET (VOIDmode, out,
24010 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24011 reg2,
24012 out)));
24014 else
24016 rtx end_2_label = gen_label_rtx ();
24017 /* Is zero in the first two bytes? */
24019 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24020 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24021 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24022 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24023 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24024 pc_rtx);
24025 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24026 JUMP_LABEL (tmp) = end_2_label;
24028 /* Not in the first two. Move two bytes forward. */
24029 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24030 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24032 emit_label (end_2_label);
24036 /* Avoid branch in fixing the byte. */
24037 tmpreg = gen_lowpart (QImode, tmpreg);
24038 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24039 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24040 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24041 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24043 emit_label (end_0_label);
24046 /* Expand strlen. */
24048 bool
24049 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24051 rtx addr, scratch1, scratch2, scratch3, scratch4;
24053 /* The generic case of the strlen expander is long.  Avoid its
24054    expansion unless TARGET_INLINE_ALL_STRINGOPS. */
24056 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24057 && !TARGET_INLINE_ALL_STRINGOPS
24058 && !optimize_insn_for_size_p ()
24059 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24060 return false;
24062 addr = force_reg (Pmode, XEXP (src, 0));
24063 scratch1 = gen_reg_rtx (Pmode);
24065 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24066 && !optimize_insn_for_size_p ())
24068 /* Well, it seems that some optimizer does not combine a call like
24069    foo(strlen(bar), strlen(bar));
24070    when the move and the subtraction are done here.  It does calculate
24071    the length just once when these instructions are done inside
24072    output_strlen_unroll().  But since &bar[strlen(bar)] is
24073    often used and this uses one fewer register for the lifetime of
24074    output_strlen_unroll(), this is better. */
24076 emit_move_insn (out, addr);
24078 ix86_expand_strlensi_unroll_1 (out, src, align);
24080 /* strlensi_unroll_1 returns the address of the zero at the end of
24081 the string, like memchr(), so compute the length by subtracting
24082 the start address. */
24083 emit_insn (ix86_gen_sub3 (out, out, addr));
24085 else
24087 rtx unspec;
24089 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24090 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24091 return false;
24093 scratch2 = gen_reg_rtx (Pmode);
24094 scratch3 = gen_reg_rtx (Pmode);
24095 scratch4 = force_reg (Pmode, constm1_rtx);
24097 emit_move_insn (scratch3, addr);
24098 eoschar = force_reg (QImode, eoschar);
24100 src = replace_equiv_address_nv (src, scratch3);
24102 /* If .md starts supporting :P, this can be done in .md. */
24103 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24104 scratch4), UNSPEC_SCAS);
24105 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24106 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24107 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24109 return true;
24112 /* For a given symbol (function) construct code to compute the address of its
24113    PLT entry in the large x86-64 PIC model. */
24114 static rtx
24115 construct_plt_address (rtx symbol)
24117 rtx tmp, unspec;
24119 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24120 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24121 gcc_assert (Pmode == DImode);
24123 tmp = gen_reg_rtx (Pmode);
24124 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24126 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24127 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24128 return tmp;
24132 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24133 rtx callarg2,
24134 rtx pop, bool sibcall)
24136 unsigned int const cregs_size
24137 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24138 rtx vec[3 + cregs_size];
24139 rtx use = NULL, call;
24140 unsigned int vec_len = 0;
24142 if (pop == const0_rtx)
24143 pop = NULL;
24144 gcc_assert (!TARGET_64BIT || !pop);
24146 if (TARGET_MACHO && !TARGET_64BIT)
24148 #if TARGET_MACHO
24149 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24150 fnaddr = machopic_indirect_call_target (fnaddr);
24151 #endif
24153 else
24155 /* Static functions and indirect calls don't need the pic register. */
24156 if (flag_pic
24157 && (!TARGET_64BIT
24158 || (ix86_cmodel == CM_LARGE_PIC
24159 && DEFAULT_ABI != MS_ABI))
24160 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24161 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24162 use_reg (&use, pic_offset_table_rtx);
24165 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24167 rtx al = gen_rtx_REG (QImode, AX_REG);
24168 emit_move_insn (al, callarg2);
24169 use_reg (&use, al);
24172 if (ix86_cmodel == CM_LARGE_PIC
24173 && !TARGET_PECOFF
24174 && MEM_P (fnaddr)
24175 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24176 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24177 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24178 else if (sibcall
24179 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24180 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24182 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24183 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24186 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24187 if (retval)
24188 call = gen_rtx_SET (VOIDmode, retval, call);
24189 vec[vec_len++] = call;
24191 if (pop)
24193 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24194 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24195 vec[vec_len++] = pop;
24198 if (TARGET_64BIT_MS_ABI
24199 && (!callarg2 || INTVAL (callarg2) != -2))
24201 unsigned i;
24203 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24204 UNSPEC_MS_TO_SYSV_CALL);
24206 for (i = 0; i < cregs_size; i++)
24208 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24209 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24211 vec[vec_len++]
24212 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24216 if (vec_len > 1)
24217 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24218 call = emit_call_insn (call);
24219 if (use)
24220 CALL_INSN_FUNCTION_USAGE (call) = use;
24222 return call;
24225 /* Output the assembly for a call instruction. */
24227 const char *
24228 ix86_output_call_insn (rtx insn, rtx call_op)
24230 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24231 bool seh_nop_p = false;
24232 const char *xasm;
24234 if (SIBLING_CALL_P (insn))
24236 if (direct_p)
24237 xasm = "jmp\t%P0";
24238 /* SEH epilogue detection requires the indirect branch case
24239 to include REX.W. */
24240 else if (TARGET_SEH)
24241 xasm = "rex.W jmp %A0";
24242 else
24243 xasm = "jmp\t%A0";
24245 output_asm_insn (xasm, &call_op);
24246 return "";
24249 /* SEH unwinding can require an extra nop to be emitted in several
24250 circumstances. Determine if we have one of those. */
24251 if (TARGET_SEH)
24253 rtx i;
24255 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24257 /* If we get to another real insn, we don't need the nop. */
24258 if (INSN_P (i))
24259 break;
24261 /* If we get to the epilogue note, prevent a catch region from
24262    being adjacent to the standard epilogue sequence.  If non-call
24263    exceptions are enabled, we'll have done this during epilogue emission. */
24264 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24265 && !flag_non_call_exceptions
24266 && !can_throw_internal (insn))
24268 seh_nop_p = true;
24269 break;
24273 /* If we didn't find a real insn following the call, prevent the
24274 unwinder from looking into the next function. */
24275 if (i == NULL)
24276 seh_nop_p = true;
24279 if (direct_p)
24280 xasm = "call\t%P0";
24281 else
24282 xasm = "call\t%A0";
24284 output_asm_insn (xasm, &call_op);
24286 if (seh_nop_p)
24287 return "nop";
24289 return "";
24292 /* Clear stack slot assignments remembered from previous functions.
24293 This is called from INIT_EXPANDERS once before RTL is emitted for each
24294 function. */
24296 static struct machine_function *
24297 ix86_init_machine_status (void)
24299 struct machine_function *f;
24301 f = ggc_alloc_cleared_machine_function ();
24302 f->use_fast_prologue_epilogue_nregs = -1;
24303 f->call_abi = ix86_abi;
24305 return f;
24308 /* Return a MEM corresponding to a stack slot with mode MODE.
24309 Allocate a new slot if necessary.
24311 The RTL for a function can have several slots available: N is
24312 which slot to use. */
24315 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24317 struct stack_local_entry *s;
24319 gcc_assert (n < MAX_386_STACK_LOCALS);
24321 for (s = ix86_stack_locals; s; s = s->next)
24322 if (s->mode == mode && s->n == n)
24323 return validize_mem (copy_rtx (s->rtl));
24325 s = ggc_alloc_stack_local_entry ();
24326 s->n = n;
24327 s->mode = mode;
24328 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24330 s->next = ix86_stack_locals;
24331 ix86_stack_locals = s;
24332 return validize_mem (s->rtl);
24335 static void
24336 ix86_instantiate_decls (void)
24338 struct stack_local_entry *s;
24340 for (s = ix86_stack_locals; s; s = s->next)
24341 if (s->rtl != NULL_RTX)
24342 instantiate_decl_rtl (s->rtl);
24345 /* Calculate the length of the memory address in the instruction encoding.
24346    Includes the addr32 prefix, does not include the one-byte modrm, opcode,
24347    or other prefixes.  We never generate the addr32 prefix for the LEA insn. */
24350 memory_address_length (rtx addr, bool lea)
24352 struct ix86_address parts;
24353 rtx base, index, disp;
24354 int len;
24355 int ok;
24357 if (GET_CODE (addr) == PRE_DEC
24358 || GET_CODE (addr) == POST_INC
24359 || GET_CODE (addr) == PRE_MODIFY
24360 || GET_CODE (addr) == POST_MODIFY)
24361 return 0;
24363 ok = ix86_decompose_address (addr, &parts);
24364 gcc_assert (ok);
24366 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24368 /* If this is not a LEA instruction, add the length of the addr32 prefix. */
24369 if (TARGET_64BIT && !lea
24370 && (SImode_address_operand (addr, VOIDmode)
24371 || (parts.base && GET_MODE (parts.base) == SImode)
24372 || (parts.index && GET_MODE (parts.index) == SImode)))
24373 len++;
24375 base = parts.base;
24376 index = parts.index;
24377 disp = parts.disp;
24379 if (base && GET_CODE (base) == SUBREG)
24380 base = SUBREG_REG (base);
24381 if (index && GET_CODE (index) == SUBREG)
24382 index = SUBREG_REG (index);
24384 gcc_assert (base == NULL_RTX || REG_P (base));
24385 gcc_assert (index == NULL_RTX || REG_P (index));
24387 /* Rule of thumb:
24388 - esp as the base always wants an index,
24389 - ebp as the base always wants a displacement,
24390 - r12 as the base always wants an index,
24391 - r13 as the base always wants a displacement. */
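/* Illustrative examples of the values computed below, assuming common
   operands: (%eax) needs nothing extra, so 0; (%esp) needs a SIB byte, so 1;
   8(%ebp) needs a disp8, so 1; foo(%rip) needs a disp32, so 4;
   16(%eax,%ebx,4) needs a SIB byte plus a disp8, so 2.  */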
24393 /* Register Indirect. */
24394 if (base && !index && !disp)
24396 /* esp (for its index) and ebp (for its displacement) need
24397 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24398 code. */
24399 if (base == arg_pointer_rtx
24400 || base == frame_pointer_rtx
24401 || REGNO (base) == SP_REG
24402 || REGNO (base) == BP_REG
24403 || REGNO (base) == R12_REG
24404 || REGNO (base) == R13_REG)
24405 len++;
24408 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24409 is not disp32, but disp32(%rip), so for disp32
24410 SIB byte is needed, unless print_operand_address
24411 optimizes it into disp32(%rip) or (%rip) is implied
24412 by UNSPEC. */
24413 else if (disp && !base && !index)
24415 len += 4;
24416 if (TARGET_64BIT)
24418 rtx symbol = disp;
24420 if (GET_CODE (disp) == CONST)
24421 symbol = XEXP (disp, 0);
24422 if (GET_CODE (symbol) == PLUS
24423 && CONST_INT_P (XEXP (symbol, 1)))
24424 symbol = XEXP (symbol, 0);
24426 if (GET_CODE (symbol) != LABEL_REF
24427 && (GET_CODE (symbol) != SYMBOL_REF
24428 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24429 && (GET_CODE (symbol) != UNSPEC
24430 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24431 && XINT (symbol, 1) != UNSPEC_PCREL
24432 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24433 len++;
24436 else
24438 /* Find the length of the displacement constant. */
24439 if (disp)
24441 if (base && satisfies_constraint_K (disp))
24442 len += 1;
24443 else
24444 len += 4;
24446 /* ebp always wants a displacement. Similarly r13. */
24447 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24448 len++;
24450 /* An index requires the two-byte modrm form.... */
24451 if (index
24452 /* ...like esp (or r12), which always wants an index. */
24453 || base == arg_pointer_rtx
24454 || base == frame_pointer_rtx
24455 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24456 len++;
24459 return len;
24462 /* Compute the default value for the "length_immediate" attribute.  When SHORTFORM
24463    is set, expect that the insn has an 8-bit immediate alternative. */
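/* Illustrative note, assuming typical operands: for "addl $3, %eax" with
   SHORTFORM set the immediate fits in -128..127, so the length is 1; for
   "addl $1000, %eax" it does not fit and the full 4-byte SImode immediate
   is counted; DImode instructions also count 4 bytes, since their
   immediates are 32-bit sign-extended.  */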
24465 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24467 int len = 0;
24468 int i;
24469 extract_insn_cached (insn);
24470 for (i = recog_data.n_operands - 1; i >= 0; --i)
24471 if (CONSTANT_P (recog_data.operand[i]))
24473 enum attr_mode mode = get_attr_mode (insn);
24475 gcc_assert (!len);
24476 if (shortform && CONST_INT_P (recog_data.operand[i]))
24478 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24479 switch (mode)
24481 case MODE_QI:
24482 len = 1;
24483 continue;
24484 case MODE_HI:
24485 ival = trunc_int_for_mode (ival, HImode);
24486 break;
24487 case MODE_SI:
24488 ival = trunc_int_for_mode (ival, SImode);
24489 break;
24490 default:
24491 break;
24493 if (IN_RANGE (ival, -128, 127))
24495 len = 1;
24496 continue;
24499 switch (mode)
24501 case MODE_QI:
24502 len = 1;
24503 break;
24504 case MODE_HI:
24505 len = 2;
24506 break;
24507 case MODE_SI:
24508 len = 4;
24509 break;
24510 /* Immediates for DImode instructions are encoded
24511    as 32-bit sign-extended values. */
24512 case MODE_DI:
24513 len = 4;
24514 break;
24515 default:
24516 fatal_insn ("unknown insn mode", insn);
24519 return len;
24522 /* Compute default value for "length_address" attribute. */
24524 ix86_attr_length_address_default (rtx insn)
24526 int i;
24528 if (get_attr_type (insn) == TYPE_LEA)
24530 rtx set = PATTERN (insn), addr;
24532 if (GET_CODE (set) == PARALLEL)
24533 set = XVECEXP (set, 0, 0);
24535 gcc_assert (GET_CODE (set) == SET);
24537 addr = SET_SRC (set);
24539 return memory_address_length (addr, true);
24542 extract_insn_cached (insn);
24543 for (i = recog_data.n_operands - 1; i >= 0; --i)
24544 if (MEM_P (recog_data.operand[i]))
24546 constrain_operands_cached (reload_completed);
24547 if (which_alternative != -1)
24549 const char *constraints = recog_data.constraints[i];
24550 int alt = which_alternative;
24552 while (*constraints == '=' || *constraints == '+')
24553 constraints++;
24554 while (alt-- > 0)
24555 while (*constraints++ != ',')
24557 /* Skip ignored operands. */
24558 if (*constraints == 'X')
24559 continue;
24561 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24563 return 0;
24566 /* Compute the default value for the "length_vex" attribute.  It includes
24567    the 2- or 3-byte VEX prefix and 1 opcode byte. */
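/* Illustrative note, assuming typical operands: "vaddps %xmm1, %xmm2, %xmm3"
   can use the 2-byte VEX prefix, so the value is 2 + 1 = 3; an insn that uses
   a non-0f opcode map, needs VEX.W, a DImode general register, or an extended
   register in a memory operand in 64-bit mode needs the 3-byte prefix, giving
   3 + 1 = 4.  */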
24570 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24572 int i;
24574 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
24575    requires the 3-byte VEX prefix. */
24576 if (!has_0f_opcode || has_vex_w)
24577 return 3 + 1;
24579 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24580 if (!TARGET_64BIT)
24581 return 2 + 1;
24583 extract_insn_cached (insn);
24585 for (i = recog_data.n_operands - 1; i >= 0; --i)
24586 if (REG_P (recog_data.operand[i]))
24588 /* REX.W bit uses 3 byte VEX prefix. */
24589 if (GET_MODE (recog_data.operand[i]) == DImode
24590 && GENERAL_REG_P (recog_data.operand[i]))
24591 return 3 + 1;
24593 else
24595 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24596 if (MEM_P (recog_data.operand[i])
24597 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24598 return 3 + 1;
24601 return 2 + 1;
24604 /* Return the maximum number of instructions a cpu can issue. */
24606 static int
24607 ix86_issue_rate (void)
24609 switch (ix86_tune)
24611 case PROCESSOR_PENTIUM:
24612 case PROCESSOR_ATOM:
24613 case PROCESSOR_SLM:
24614 case PROCESSOR_K6:
24615 case PROCESSOR_BTVER2:
24616 return 2;
24618 case PROCESSOR_PENTIUMPRO:
24619 case PROCESSOR_PENTIUM4:
24620 case PROCESSOR_CORE2:
24621 case PROCESSOR_COREI7:
24622 case PROCESSOR_HASWELL:
24623 case PROCESSOR_ATHLON:
24624 case PROCESSOR_K8:
24625 case PROCESSOR_AMDFAM10:
24626 case PROCESSOR_NOCONA:
24627 case PROCESSOR_GENERIC32:
24628 case PROCESSOR_GENERIC64:
24629 case PROCESSOR_BDVER1:
24630 case PROCESSOR_BDVER2:
24631 case PROCESSOR_BDVER3:
24632 case PROCESSOR_BTVER1:
24633 return 3;
24635 default:
24636 return 1;
24640 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24641    by DEP_INSN and nothing else set by DEP_INSN. */
24643 static bool
24644 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24646 rtx set, set2;
24648 /* Simplify the test for uninteresting insns. */
24649 if (insn_type != TYPE_SETCC
24650 && insn_type != TYPE_ICMOV
24651 && insn_type != TYPE_FCMOV
24652 && insn_type != TYPE_IBR)
24653 return false;
24655 if ((set = single_set (dep_insn)) != 0)
24657 set = SET_DEST (set);
24658 set2 = NULL_RTX;
24660 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24661 && XVECLEN (PATTERN (dep_insn), 0) == 2
24662 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24663 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24665 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24666 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24668 else
24669 return false;
24671 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24672 return false;
24674 /* This test is true if the dependent insn reads the flags but
24675 not any other potentially set register. */
24676 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24677 return false;
24679 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24680 return false;
24682 return true;
24685 /* Return true iff USE_INSN has a memory address with operands set by
24686 SET_INSN. */
24688 bool
24689 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24691 int i;
24692 extract_insn_cached (use_insn);
24693 for (i = recog_data.n_operands - 1; i >= 0; --i)
24694 if (MEM_P (recog_data.operand[i]))
24696 rtx addr = XEXP (recog_data.operand[i], 0);
24697 return modified_in_p (addr, set_insn) != 0;
24699 return false;
24702 /* Helper function for exact_store_load_dependency.
24703 Return true if addr is found in insn. */
24704 static bool
24705 exact_dependency_1 (rtx addr, rtx insn)
24707 enum rtx_code code;
24708 const char *format_ptr;
24709 int i, j;
24711 code = GET_CODE (insn);
24712 switch (code)
24714 case MEM:
24715 if (rtx_equal_p (addr, insn))
24716 return true;
24717 break;
24718 case REG:
24719 CASE_CONST_ANY:
24720 case SYMBOL_REF:
24721 case CODE_LABEL:
24722 case PC:
24723 case CC0:
24724 case EXPR_LIST:
24725 return false;
24726 default:
24727 break;
24730 format_ptr = GET_RTX_FORMAT (code);
24731 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24733 switch (*format_ptr++)
24735 case 'e':
24736 if (exact_dependency_1 (addr, XEXP (insn, i)))
24737 return true;
24738 break;
24739 case 'E':
24740 for (j = 0; j < XVECLEN (insn, i); j++)
24741 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24742 return true;
24743 break;
24746 return false;
24749 /* Return true if there exists an exact dependency for store & load, i.e.
24750    the same memory address is used in both. */
24751 static bool
24752 exact_store_load_dependency (rtx store, rtx load)
24754 rtx set1, set2;
24756 set1 = single_set (store);
24757 if (!set1)
24758 return false;
24759 if (!MEM_P (SET_DEST (set1)))
24760 return false;
24761 set2 = single_set (load);
24762 if (!set2)
24763 return false;
24764 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24765 return true;
24766 return false;
24769 static int
24770 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24772 enum attr_type insn_type, dep_insn_type;
24773 enum attr_memory memory;
24774 rtx set, set2;
24775 int dep_insn_code_number;
24777 /* Anti and output dependencies have zero cost on all CPUs. */
24778 if (REG_NOTE_KIND (link) != 0)
24779 return 0;
24781 dep_insn_code_number = recog_memoized (dep_insn);
24783 /* If we can't recognize the insns, we can't really do anything. */
24784 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24785 return cost;
24787 insn_type = get_attr_type (insn);
24788 dep_insn_type = get_attr_type (dep_insn);
24790 switch (ix86_tune)
24792 case PROCESSOR_PENTIUM:
24793 /* Address Generation Interlock adds a cycle of latency. */
24794 if (insn_type == TYPE_LEA)
24796 rtx addr = PATTERN (insn);
24798 if (GET_CODE (addr) == PARALLEL)
24799 addr = XVECEXP (addr, 0, 0);
24801 gcc_assert (GET_CODE (addr) == SET);
24803 addr = SET_SRC (addr);
24804 if (modified_in_p (addr, dep_insn))
24805 cost += 1;
24807 else if (ix86_agi_dependent (dep_insn, insn))
24808 cost += 1;
24810 /* ??? Compares pair with jump/setcc. */
24811 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24812 cost = 0;
24814 /* Floating point stores require value to be ready one cycle earlier. */
24815 if (insn_type == TYPE_FMOV
24816 && get_attr_memory (insn) == MEMORY_STORE
24817 && !ix86_agi_dependent (dep_insn, insn))
24818 cost += 1;
24819 break;
24821 case PROCESSOR_PENTIUMPRO:
24822 memory = get_attr_memory (insn);
24824 /* INT->FP conversion is expensive. */
24825 if (get_attr_fp_int_src (dep_insn))
24826 cost += 5;
24828 /* There is one cycle extra latency between an FP op and a store. */
24829 if (insn_type == TYPE_FMOV
24830 && (set = single_set (dep_insn)) != NULL_RTX
24831 && (set2 = single_set (insn)) != NULL_RTX
24832 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24833 && MEM_P (SET_DEST (set2)))
24834 cost += 1;
24836 /* Show the ability of the reorder buffer to hide the latency of a load
24837 by executing it in parallel with the previous instruction, when the
24838 previous instruction is not needed to compute the address. */
24839 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24840 && !ix86_agi_dependent (dep_insn, insn))
24842 /* Claim moves take one cycle, as the core can issue one load
24843 at a time and the next load can start a cycle later. */
24844 if (dep_insn_type == TYPE_IMOV
24845 || dep_insn_type == TYPE_FMOV)
24846 cost = 1;
24847 else if (cost > 1)
24848 cost--;
24850 break;
24852 case PROCESSOR_K6:
24853 memory = get_attr_memory (insn);
24855 /* The esp dependency is resolved before the instruction is really
24856 finished. */
24857 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24858 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24859 return 1;
24861 /* INT->FP conversion is expensive. */
24862 if (get_attr_fp_int_src (dep_insn))
24863 cost += 5;
24865 /* Show the ability of the reorder buffer to hide the latency of a load
24866 by executing it in parallel with the previous instruction, when the
24867 previous instruction is not needed to compute the address. */
24868 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24869 && !ix86_agi_dependent (dep_insn, insn))
24871 /* Claim moves take one cycle, as the core can issue one load
24872 at a time and the next load can start a cycle later. */
24873 if (dep_insn_type == TYPE_IMOV
24874 || dep_insn_type == TYPE_FMOV)
24875 cost = 1;
24876 else if (cost > 2)
24877 cost -= 2;
24878 else
24879 cost = 1;
24881 break;
24883 case PROCESSOR_ATHLON:
24884 case PROCESSOR_K8:
24885 case PROCESSOR_AMDFAM10:
24886 case PROCESSOR_BDVER1:
24887 case PROCESSOR_BDVER2:
24888 case PROCESSOR_BDVER3:
24889 case PROCESSOR_BTVER1:
24890 case PROCESSOR_BTVER2:
24891 case PROCESSOR_ATOM:
24892 case PROCESSOR_GENERIC32:
24893 case PROCESSOR_GENERIC64:
24894 memory = get_attr_memory (insn);
24896 /* Show the ability of the reorder buffer to hide the latency of a load
24897 by executing it in parallel with the previous instruction, when the
24898 previous instruction is not needed to compute the address. */
24899 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24900 && !ix86_agi_dependent (dep_insn, insn))
24902 enum attr_unit unit = get_attr_unit (insn);
24903 int loadcost = 3;
24905 /* Because of the difference between the length of integer and
24906 floating unit pipeline preparation stages, the memory operands
24907 for floating point are cheaper.
24909 ??? For Athlon the difference is most probably 2. */
24910 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24911 loadcost = 3;
24912 else
24913 loadcost = TARGET_ATHLON ? 2 : 0;
24915 if (cost >= loadcost)
24916 cost -= loadcost;
24917 else
24918 cost = 0;
24920 break;
24922 case PROCESSOR_SLM:
24923 if (!reload_completed)
24924 return cost;
24926 /* Increase cost of integer loads. */
24927 memory = get_attr_memory (dep_insn);
24928 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24930 enum attr_unit unit = get_attr_unit (dep_insn);
24931 if (unit == UNIT_INTEGER && cost == 1)
24933 if (memory == MEMORY_LOAD)
24934 cost = 3;
24935 else
24937 /* Increase cost of ld/st for short int types only
24938 because of store forwarding issue. */
24939 rtx set = single_set (dep_insn);
24940 if (set && (GET_MODE (SET_DEST (set)) == QImode
24941 || GET_MODE (SET_DEST (set)) == HImode))
24943 /* Increase the cost of the store/load insn if an exact
24944 dependence exists and INSN is a load. */
24945 enum attr_memory insn_memory = get_attr_memory (insn);
24946 if (insn_memory == MEMORY_LOAD
24947 && exact_store_load_dependency (dep_insn, insn))
24948 cost = 3;
24954 default:
24955 break;
24958 return cost;
24961 /* How many alternative schedules to try. This should be as wide as the
24962 scheduling freedom in the DFA, but no wider. Making this value too
24963 large results in extra work for the scheduler. */
24965 static int
24966 ia32_multipass_dfa_lookahead (void)
24968 switch (ix86_tune)
24970 case PROCESSOR_PENTIUM:
24971 return 2;
24973 case PROCESSOR_PENTIUMPRO:
24974 case PROCESSOR_K6:
24975 return 1;
24977 case PROCESSOR_CORE2:
24978 case PROCESSOR_COREI7:
24979 case PROCESSOR_HASWELL:
24980 case PROCESSOR_ATOM:
24981 case PROCESSOR_SLM:
24982 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24983 as the number of instructions that can be executed in one cycle, i.e.,
24984 issue_rate. I wonder why tuning for many CPUs does not do this. */
24985 if (reload_completed)
24986 return ix86_issue_rate ();
24987 /* Don't use lookahead for pre-reload schedule to save compile time. */
24988 return 0;
24990 default:
24991 return 0;
24995 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24996 execution. It is applied if
24997 (1) an IMUL instruction is at the top of the ready list;
24998 (2) there is exactly one producer of an independent IMUL instruction in
24999 the ready list.
25000 Return the index of the IMUL producer if it was found, and -1 otherwise. */
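/* A hedged example of the intent (not in the original comment): if the
   ready list ends with { ..., addl-that-feeds-imul2, imul1 } with imul1
   on top, returning the index of the addl lets the caller move it to the
   top, so that imul2 becomes ready right after imul1 and the two
   multiplies can overlap in Atom's pipelined IMUL unit.  */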
25001 static int
25002 do_reorder_for_imul (rtx *ready, int n_ready)
25004 rtx insn, set, insn1, insn2;
25005 sd_iterator_def sd_it;
25006 dep_t dep;
25007 int index = -1;
25008 int i;
25010 if (ix86_tune != PROCESSOR_ATOM)
25011 return index;
25013 /* Check that IMUL instruction is on the top of ready list. */
25014 insn = ready[n_ready - 1];
25015 set = single_set (insn);
25016 if (!set)
25017 return index;
25018 if (!(GET_CODE (SET_SRC (set)) == MULT
25019 && GET_MODE (SET_SRC (set)) == SImode))
25020 return index;
25022 /* Search for producer of independent IMUL instruction. */
25023 for (i = n_ready - 2; i >= 0; i--)
25025 insn = ready[i];
25026 if (!NONDEBUG_INSN_P (insn))
25027 continue;
25028 /* Skip IMUL instruction. */
25029 insn2 = PATTERN (insn);
25030 if (GET_CODE (insn2) == PARALLEL)
25031 insn2 = XVECEXP (insn2, 0, 0);
25032 if (GET_CODE (insn2) == SET
25033 && GET_CODE (SET_SRC (insn2)) == MULT
25034 && GET_MODE (SET_SRC (insn2)) == SImode)
25035 continue;
25037 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25039 rtx con;
25040 con = DEP_CON (dep);
25041 if (!NONDEBUG_INSN_P (con))
25042 continue;
25043 insn1 = PATTERN (con);
25044 if (GET_CODE (insn1) == PARALLEL)
25045 insn1 = XVECEXP (insn1, 0, 0);
25047 if (GET_CODE (insn1) == SET
25048 && GET_CODE (SET_SRC (insn1)) == MULT
25049 && GET_MODE (SET_SRC (insn1)) == SImode)
25051 sd_iterator_def sd_it1;
25052 dep_t dep1;
25053 /* Check if there is no other dependee for IMUL. */
25054 index = i;
25055 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25057 rtx pro;
25058 pro = DEP_PRO (dep1);
25059 if (!NONDEBUG_INSN_P (pro))
25060 continue;
25061 if (pro != insn)
25062 index = -1;
25064 if (index >= 0)
25065 break;
25068 if (index >= 0)
25069 break;
25071 return index;
25074 /* Try to find the best candidate at the top of the ready list when two insns
25075 have the same priority - the best candidate is the one whose producers were
25076 scheduled earlier. Applied to Silvermont only.
25077 Return true if the top two insns must be interchanged. */
25078 static bool
25079 swap_top_of_ready_list (rtx *ready, int n_ready)
25081 rtx top = ready[n_ready - 1];
25082 rtx next = ready[n_ready - 2];
25083 rtx set;
25084 sd_iterator_def sd_it;
25085 dep_t dep;
25086 int clock1 = -1;
25087 int clock2 = -1;
25088 #define INSN_TICK(INSN) (HID (INSN)->tick)
25090 if (ix86_tune != PROCESSOR_SLM)
25091 return false;
25093 if (!NONDEBUG_INSN_P (top))
25094 return false;
25095 if (!NONJUMP_INSN_P (top))
25096 return false;
25097 if (!NONDEBUG_INSN_P (next))
25098 return false;
25099 if (!NONJUMP_INSN_P (next))
25100 return false;
25101 set = single_set (top);
25102 if (!set)
25103 return false;
25104 set = single_set (next);
25105 if (!set)
25106 return false;
25108 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25110 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25111 return false;
25112 /* Determine the winner more precisely. */
25113 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25115 rtx pro;
25116 pro = DEP_PRO (dep);
25117 if (!NONDEBUG_INSN_P (pro))
25118 continue;
25119 if (INSN_TICK (pro) > clock1)
25120 clock1 = INSN_TICK (pro);
25122 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25124 rtx pro;
25125 pro = DEP_PRO (dep);
25126 if (!NONDEBUG_INSN_P (pro))
25127 continue;
25128 if (INSN_TICK (pro) > clock2)
25129 clock2 = INSN_TICK (pro);
25132 if (clock1 == clock2)
25134 /* Determine winner - load must win. */
25135 enum attr_memory memory1, memory2;
25136 memory1 = get_attr_memory (top);
25137 memory2 = get_attr_memory (next);
25138 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25139 return true;
25141 return (bool) (clock2 < clock1);
25143 return false;
25144 #undef INSN_TICK
25147 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
25148 Return the issue rate. */
25149 static int
25150 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25151 int clock_var)
25153 int issue_rate = -1;
25154 int n_ready = *pn_ready;
25155 int i;
25156 rtx insn;
25157 int index = -1;
25159 /* Set up issue rate. */
25160 issue_rate = ix86_issue_rate ();
25162 /* Do reordering for Atom/SLM only. */
25163 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25164 return issue_rate;
25166 /* Nothing to do if ready list contains only 1 instruction. */
25167 if (n_ready <= 1)
25168 return issue_rate;
25170 /* Do reordering for the post-reload scheduler only. */
25171 if (!reload_completed)
25172 return issue_rate;
25174 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25176 if (sched_verbose > 1)
25177 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25178 INSN_UID (ready[index]));
25180 /* Put IMUL producer (ready[index]) at the top of ready list. */
25181 insn = ready[index];
25182 for (i = index; i < n_ready - 1; i++)
25183 ready[i] = ready[i + 1];
25184 ready[n_ready - 1] = insn;
25185 return issue_rate;
25187 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25189 if (sched_verbose > 1)
25190 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25191 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25192 /* Swap 2 top elements of ready list. */
25193 insn = ready[n_ready - 1];
25194 ready[n_ready - 1] = ready[n_ready - 2];
25195 ready[n_ready - 2] = insn;
25197 return issue_rate;
25200 static bool
25201 ix86_class_likely_spilled_p (reg_class_t);
25203 /* Return true if the lhs of INSN is a HW function argument register; set
25204 IS_SPILLED to true if it is a likely-spilled HW register. */
25205 static bool
25206 insn_is_function_arg (rtx insn, bool* is_spilled)
25208 rtx dst;
25210 if (!NONDEBUG_INSN_P (insn))
25211 return false;
25212 /* Call instructions are not movable, ignore them. */
25213 if (CALL_P (insn))
25214 return false;
25215 insn = PATTERN (insn);
25216 if (GET_CODE (insn) == PARALLEL)
25217 insn = XVECEXP (insn, 0, 0);
25218 if (GET_CODE (insn) != SET)
25219 return false;
25220 dst = SET_DEST (insn);
25221 if (REG_P (dst) && HARD_REGISTER_P (dst)
25222 && ix86_function_arg_regno_p (REGNO (dst)))
25224 /* Is it a likely-spilled HW register? */
25225 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25226 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25227 *is_spilled = true;
25228 return true;
25230 return false;
25233 /* Add output dependencies for a chain of adjacent function arguments, but only
25234 if there is a move to a likely-spilled HW register. Return the first argument
25235 if at least one dependence was added, or NULL otherwise. */
25236 static rtx
25237 add_parameter_dependencies (rtx call, rtx head)
25239 rtx insn;
25240 rtx last = call;
25241 rtx first_arg = NULL;
25242 bool is_spilled = false;
25244 head = PREV_INSN (head);
25246 /* Find the argument-passing instruction nearest to the call. */
25247 while (true)
25249 last = PREV_INSN (last);
25250 if (last == head)
25251 return NULL;
25252 if (!NONDEBUG_INSN_P (last))
25253 continue;
25254 if (insn_is_function_arg (last, &is_spilled))
25255 break;
25256 return NULL;
25259 first_arg = last;
25260 while (true)
25262 insn = PREV_INSN (last);
25263 if (!INSN_P (insn))
25264 break;
25265 if (insn == head)
25266 break;
25267 if (!NONDEBUG_INSN_P (insn))
25269 last = insn;
25270 continue;
25272 if (insn_is_function_arg (insn, &is_spilled))
25274 /* Add an output dependence between two function arguments if the chain
25275 of output arguments contains likely-spilled HW registers. */
25276 if (is_spilled)
25277 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25278 first_arg = last = insn;
25280 else
25281 break;
25283 if (!is_spilled)
25284 return NULL;
25285 return first_arg;
25288 /* Add output or anti dependency from insn to first_arg to restrict its code
25289 motion. */
25290 static void
25291 avoid_func_arg_motion (rtx first_arg, rtx insn)
25293 rtx set;
25294 rtx tmp;
25296 set = single_set (insn);
25297 if (!set)
25298 return;
25299 tmp = SET_DEST (set);
25300 if (REG_P (tmp))
25302 /* Add output dependency to the first function argument. */
25303 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25304 return;
25306 /* Add anti dependency. */
25307 add_dependence (first_arg, insn, REG_DEP_ANTI);
25310 /* Avoid cross-block motion of a function argument by adding a dependency
25311 from the first non-jump instruction in bb. */
25312 static void
25313 add_dependee_for_func_arg (rtx arg, basic_block bb)
25315 rtx insn = BB_END (bb);
25317 while (insn)
25319 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25321 rtx set = single_set (insn);
25322 if (set)
25324 avoid_func_arg_motion (arg, insn);
25325 return;
25328 if (insn == BB_HEAD (bb))
25329 return;
25330 insn = PREV_INSN (insn);
25334 /* Hook for pre-reload schedule - avoid motion of function arguments
25335 passed in likely spilled HW registers. */
25336 static void
25337 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25339 rtx insn;
25340 rtx first_arg = NULL;
25341 if (reload_completed)
25342 return;
25343 while (head != tail && DEBUG_INSN_P (head))
25344 head = NEXT_INSN (head);
25345 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25346 if (INSN_P (insn) && CALL_P (insn))
25348 first_arg = add_parameter_dependencies (insn, head);
25349 if (first_arg)
25351 /* Add a dependee for the first argument to predecessors, but only if the
25352 region contains more than one block. */
25353 basic_block bb = BLOCK_FOR_INSN (insn);
25354 int rgn = CONTAINING_RGN (bb->index);
25355 int nr_blks = RGN_NR_BLOCKS (rgn);
25356 /* Skip trivial regions and region head blocks that can have
25357 predecessors outside of region. */
25358 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25360 edge e;
25361 edge_iterator ei;
25362 /* Assume that region is SCC, i.e. all immediate predecessors
25363 of non-head block are in the same region. */
25364 FOR_EACH_EDGE (e, ei, bb->preds)
25366 /* Avoid creating loop-carried dependencies by using the
25367 topological ordering of the region. */
25368 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25369 add_dependee_for_func_arg (first_arg, e->src);
25372 insn = first_arg;
25373 if (insn == head)
25374 break;
25377 else if (first_arg)
25378 avoid_func_arg_motion (first_arg, insn);
25381 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25382 HW registers to maximum, to schedule them as soon as possible. These are
25383 moves from function argument registers at the top of the function entry
25384 and moves from function return value registers after call. */
25385 static int
25386 ix86_adjust_priority (rtx insn, int priority)
25388 rtx set;
25390 if (reload_completed)
25391 return priority;
25393 if (!NONDEBUG_INSN_P (insn))
25394 return priority;
25396 set = single_set (insn);
25397 if (set)
25399 rtx tmp = SET_SRC (set);
25400 if (REG_P (tmp)
25401 && HARD_REGISTER_P (tmp)
25402 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25403 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25404 return current_sched_info->sched_max_insns_priority;
25407 return priority;
25410 /* Model decoder of Core 2/i7.
25411 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25412 track the instruction fetch block boundaries and make sure that long
25413 (9+ bytes) instructions are assigned to D0. */
25415 /* Maximum length of an insn that can be handled by
25416 a secondary decoder unit. '8' for Core 2/i7. */
25417 static int core2i7_secondary_decoder_max_insn_size;
25419 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25420 '16' for Core 2/i7. */
25421 static int core2i7_ifetch_block_size;
25423 /* Maximum number of instructions decoder can handle per cycle.
25424 '6' for Core 2/i7. */
25425 static int core2i7_ifetch_block_max_insns;
25427 typedef struct ix86_first_cycle_multipass_data_ *
25428 ix86_first_cycle_multipass_data_t;
25429 typedef const struct ix86_first_cycle_multipass_data_ *
25430 const_ix86_first_cycle_multipass_data_t;
25432 /* A variable to store target state across calls to max_issue within
25433 one cycle. */
25434 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25435 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25437 /* Initialize DATA. */
25438 static void
25439 core2i7_first_cycle_multipass_init (void *_data)
25441 ix86_first_cycle_multipass_data_t data
25442 = (ix86_first_cycle_multipass_data_t) _data;
25444 data->ifetch_block_len = 0;
25445 data->ifetch_block_n_insns = 0;
25446 data->ready_try_change = NULL;
25447 data->ready_try_change_size = 0;
25450 /* Advancing the cycle; reset ifetch block counts. */
25451 static void
25452 core2i7_dfa_post_advance_cycle (void)
25454 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25456 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25458 data->ifetch_block_len = 0;
25459 data->ifetch_block_n_insns = 0;
25462 static int min_insn_size (rtx);
25464 /* Filter out insns from ready_try that the core will not be able to issue
25465 on current cycle due to decoder. */
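/* Worked example under the parameters set in ix86_sched_init_global
   (an illustration, not part of the original code): with a 16-byte
   ifetch block and ready insns of 7, 6 and 5 bytes on one cycle, the
   7- and 6-byte insns fit (13 bytes used); the 5-byte insn would
   exceed the block, so it is masked out of ready_try until the cycle
   advances.  */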
25466 static void
25467 core2i7_first_cycle_multipass_filter_ready_try
25468 (const_ix86_first_cycle_multipass_data_t data,
25469 char *ready_try, int n_ready, bool first_cycle_insn_p)
25471 while (n_ready--)
25473 rtx insn;
25474 int insn_size;
25476 if (ready_try[n_ready])
25477 continue;
25479 insn = get_ready_element (n_ready);
25480 insn_size = min_insn_size (insn);
25482 if (/* If this is too long an insn for a secondary decoder ... */
25483 (!first_cycle_insn_p
25484 && insn_size > core2i7_secondary_decoder_max_insn_size)
25485 /* ... or it would not fit into the ifetch block ... */
25486 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25487 /* ... or the decoder is full already ... */
25488 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25489 /* ... mask the insn out. */
25491 ready_try[n_ready] = 1;
25493 if (data->ready_try_change)
25494 bitmap_set_bit (data->ready_try_change, n_ready);
25499 /* Prepare for a new round of multipass lookahead scheduling. */
25500 static void
25501 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25502 bool first_cycle_insn_p)
25504 ix86_first_cycle_multipass_data_t data
25505 = (ix86_first_cycle_multipass_data_t) _data;
25506 const_ix86_first_cycle_multipass_data_t prev_data
25507 = ix86_first_cycle_multipass_data;
25509 /* Restore the state from the end of the previous round. */
25510 data->ifetch_block_len = prev_data->ifetch_block_len;
25511 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25513 /* Filter instructions that cannot be issued on current cycle due to
25514 decoder restrictions. */
25515 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25516 first_cycle_insn_p);
25519 /* INSN is being issued in current solution. Account for its impact on
25520 the decoder model. */
25521 static void
25522 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25523 rtx insn, const void *_prev_data)
25525 ix86_first_cycle_multipass_data_t data
25526 = (ix86_first_cycle_multipass_data_t) _data;
25527 const_ix86_first_cycle_multipass_data_t prev_data
25528 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25530 int insn_size = min_insn_size (insn);
25532 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25533 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25534 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25535 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25537 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25538 if (!data->ready_try_change)
25540 data->ready_try_change = sbitmap_alloc (n_ready);
25541 data->ready_try_change_size = n_ready;
25543 else if (data->ready_try_change_size < n_ready)
25545 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25546 n_ready, 0);
25547 data->ready_try_change_size = n_ready;
25549 bitmap_clear (data->ready_try_change);
25551 /* Filter out insns from ready_try that the core will not be able to issue
25552 on current cycle due to decoder. */
25553 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25554 false);
25557 /* Revert the effect on ready_try. */
25558 static void
25559 core2i7_first_cycle_multipass_backtrack (const void *_data,
25560 char *ready_try,
25561 int n_ready ATTRIBUTE_UNUSED)
25563 const_ix86_first_cycle_multipass_data_t data
25564 = (const_ix86_first_cycle_multipass_data_t) _data;
25565 unsigned int i = 0;
25566 sbitmap_iterator sbi;
25568 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25569 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25571 ready_try[i] = 0;
25575 /* Save the result of multipass lookahead scheduling for the next round. */
25576 static void
25577 core2i7_first_cycle_multipass_end (const void *_data)
25579 const_ix86_first_cycle_multipass_data_t data
25580 = (const_ix86_first_cycle_multipass_data_t) _data;
25581 ix86_first_cycle_multipass_data_t next_data
25582 = ix86_first_cycle_multipass_data;
25584 if (data != NULL)
25586 next_data->ifetch_block_len = data->ifetch_block_len;
25587 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25591 /* Deallocate target data. */
25592 static void
25593 core2i7_first_cycle_multipass_fini (void *_data)
25595 ix86_first_cycle_multipass_data_t data
25596 = (ix86_first_cycle_multipass_data_t) _data;
25598 if (data->ready_try_change)
25600 sbitmap_free (data->ready_try_change);
25601 data->ready_try_change = NULL;
25602 data->ready_try_change_size = 0;
25606 /* Prepare for scheduling pass. */
25607 static void
25608 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25609 int verbose ATTRIBUTE_UNUSED,
25610 int max_uid ATTRIBUTE_UNUSED)
25612 /* Install scheduling hooks for current CPU. Some of these hooks are used
25613 in time-critical parts of the scheduler, so we only set them up when
25614 they are actually used. */
25615 switch (ix86_tune)
25617 case PROCESSOR_CORE2:
25618 case PROCESSOR_COREI7:
25619 case PROCESSOR_HASWELL:
25620 /* Do not perform multipass scheduling for pre-reload schedule
25621 to save compile time. */
25622 if (reload_completed)
25624 targetm.sched.dfa_post_advance_cycle
25625 = core2i7_dfa_post_advance_cycle;
25626 targetm.sched.first_cycle_multipass_init
25627 = core2i7_first_cycle_multipass_init;
25628 targetm.sched.first_cycle_multipass_begin
25629 = core2i7_first_cycle_multipass_begin;
25630 targetm.sched.first_cycle_multipass_issue
25631 = core2i7_first_cycle_multipass_issue;
25632 targetm.sched.first_cycle_multipass_backtrack
25633 = core2i7_first_cycle_multipass_backtrack;
25634 targetm.sched.first_cycle_multipass_end
25635 = core2i7_first_cycle_multipass_end;
25636 targetm.sched.first_cycle_multipass_fini
25637 = core2i7_first_cycle_multipass_fini;
25639 /* Set decoder parameters. */
25640 core2i7_secondary_decoder_max_insn_size = 8;
25641 core2i7_ifetch_block_size = 16;
25642 core2i7_ifetch_block_max_insns = 6;
25643 break;
25645 /* ... Fall through ... */
25646 default:
25647 targetm.sched.dfa_post_advance_cycle = NULL;
25648 targetm.sched.first_cycle_multipass_init = NULL;
25649 targetm.sched.first_cycle_multipass_begin = NULL;
25650 targetm.sched.first_cycle_multipass_issue = NULL;
25651 targetm.sched.first_cycle_multipass_backtrack = NULL;
25652 targetm.sched.first_cycle_multipass_end = NULL;
25653 targetm.sched.first_cycle_multipass_fini = NULL;
25654 break;
25659 /* Compute the alignment given to a constant that is being placed in memory.
25660 EXP is the constant and ALIGN is the alignment that the object would
25661 ordinarily have.
25662 The value of this function is used instead of that alignment to align
25663 the object. */
25666 ix86_constant_alignment (tree exp, int align)
25668 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25669 || TREE_CODE (exp) == INTEGER_CST)
25671 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25672 return 64;
25673 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25674 return 128;
25676 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25677 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25678 return BITS_PER_WORD;
25680 return align;
25683 /* Compute the alignment for a static variable.
25684 TYPE is the data type, and ALIGN is the alignment that
25685 the object would ordinarily have. The value of this function is used
25686 instead of that alignment to align the object. */
25689 ix86_data_alignment (tree type, int align, bool opt)
25691 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25693 if (opt
25694 && AGGREGATE_TYPE_P (type)
25695 && TYPE_SIZE (type)
25696 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25697 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25698 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25699 && align < max_align)
25700 align = max_align;
25702 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25703 to 16byte boundary. */
25704 if (TARGET_64BIT)
25706 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
25707 && TYPE_SIZE (type)
25708 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25709 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25710 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25711 return 128;
25714 if (!opt)
25715 return align;
25717 if (TREE_CODE (type) == ARRAY_TYPE)
25719 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25720 return 64;
25721 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25722 return 128;
25724 else if (TREE_CODE (type) == COMPLEX_TYPE)
25727 if (TYPE_MODE (type) == DCmode && align < 64)
25728 return 64;
25729 if ((TYPE_MODE (type) == XCmode
25730 || TYPE_MODE (type) == TCmode) && align < 128)
25731 return 128;
25733 else if ((TREE_CODE (type) == RECORD_TYPE
25734 || TREE_CODE (type) == UNION_TYPE
25735 || TREE_CODE (type) == QUAL_UNION_TYPE)
25736 && TYPE_FIELDS (type))
25738 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25739 return 64;
25740 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25741 return 128;
25743 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25744 || TREE_CODE (type) == INTEGER_TYPE)
25746 if (TYPE_MODE (type) == DFmode && align < 64)
25747 return 64;
25748 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25749 return 128;
25752 return align;
25755 /* Compute the alignment for a local variable or a stack slot. EXP is
25756 the data type or decl itself, MODE is the widest mode available and
25757 ALIGN is the alignment that the object would ordinarily have. The
25758 value of this function is used instead of that alignment to align the
25759 object. */
25761 unsigned int
25762 ix86_local_alignment (tree exp, enum machine_mode mode,
25763 unsigned int align)
25765 tree type, decl;
25767 if (exp && DECL_P (exp))
25769 type = TREE_TYPE (exp);
25770 decl = exp;
25772 else
25774 type = exp;
25775 decl = NULL;
25778 /* Don't do dynamic stack realignment for long long objects with
25779 -mpreferred-stack-boundary=2. */
25780 if (!TARGET_64BIT
25781 && align == 64
25782 && ix86_preferred_stack_boundary < 64
25783 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25784 && (!type || !TYPE_USER_ALIGN (type))
25785 && (!decl || !DECL_USER_ALIGN (decl)))
25786 align = 32;
25788 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25789 register in MODE. We will return the largest alignment of XF
25790 and DF. */
25791 if (!type)
25793 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25794 align = GET_MODE_ALIGNMENT (DFmode);
25795 return align;
25798 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25799 to 16byte boundary. Exact wording is:
25801 An array uses the same alignment as its elements, except that a local or
25802 global array variable of length at least 16 bytes or
25803 a C99 variable-length array variable always has alignment of at least 16 bytes.
25805 This was added to allow use of aligned SSE instructions on arrays. The
25806 rule is meant for static storage (where the compiler cannot do the analysis
25807 by itself). We follow it for automatic variables only when convenient.
25808 We fully control everything in the function being compiled, and functions
25809 from other units cannot rely on the alignment.
25811 Exclude the va_list type. It is the common case of a local array where
25812 we cannot benefit from the alignment.
25814 TODO: Probably one should optimize for size only when var is not escaping. */
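/* A hedged example of the rule above (not from the original source):
   in a 64-bit function compiled for speed with SSE enabled, a local
   "double buf[4]" (32 bytes, default 8-byte alignment) is an aggregate
   of at least 16 bytes, so the code below returns 128 bits and the
   slot is aligned to 16 bytes, enabling aligned SSE accesses.  */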
25815 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25816 && TARGET_SSE)
25818 if (AGGREGATE_TYPE_P (type)
25819 && (va_list_type_node == NULL_TREE
25820 || (TYPE_MAIN_VARIANT (type)
25821 != TYPE_MAIN_VARIANT (va_list_type_node)))
25822 && TYPE_SIZE (type)
25823 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25824 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25825 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25826 return 128;
25828 if (TREE_CODE (type) == ARRAY_TYPE)
25830 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25831 return 64;
25832 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25833 return 128;
25835 else if (TREE_CODE (type) == COMPLEX_TYPE)
25837 if (TYPE_MODE (type) == DCmode && align < 64)
25838 return 64;
25839 if ((TYPE_MODE (type) == XCmode
25840 || TYPE_MODE (type) == TCmode) && align < 128)
25841 return 128;
25843 else if ((TREE_CODE (type) == RECORD_TYPE
25844 || TREE_CODE (type) == UNION_TYPE
25845 || TREE_CODE (type) == QUAL_UNION_TYPE)
25846 && TYPE_FIELDS (type))
25848 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25849 return 64;
25850 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25851 return 128;
25853 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25854 || TREE_CODE (type) == INTEGER_TYPE)
25857 if (TYPE_MODE (type) == DFmode && align < 64)
25858 return 64;
25859 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25860 return 128;
25862 return align;
25865 /* Compute the minimum required alignment for dynamic stack realignment
25866 purposes for a local variable, parameter or a stack slot. EXP is
25867 the data type or decl itself, MODE is its mode and ALIGN is the
25868 alignment that the object would ordinarily have. */
25870 unsigned int
25871 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25872 unsigned int align)
25874 tree type, decl;
25876 if (exp && DECL_P (exp))
25878 type = TREE_TYPE (exp);
25879 decl = exp;
25881 else
25883 type = exp;
25884 decl = NULL;
25887 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25888 return align;
25890 /* Don't do dynamic stack realignment for long long objects with
25891 -mpreferred-stack-boundary=2. */
25892 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25893 && (!type || !TYPE_USER_ALIGN (type))
25894 && (!decl || !DECL_USER_ALIGN (decl)))
25895 return 32;
25897 return align;
25900 /* Find a location for the static chain incoming to a nested function.
25901 This is a register, unless all free registers are used by arguments. */
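/* Summary example (an illustration, not part of the original comment):
   in 64-bit mode the static chain always arrives in %r10; in 32-bit
   mode it is normally %ecx, %eax for fastcall/thiscall, and for
   regparm(3) functions it is pushed on the stack by the trampoline,
   with %esi used at the alternate entry point, as detailed below.  */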
25903 static rtx
25904 ix86_static_chain (const_tree fndecl, bool incoming_p)
25906 unsigned regno;
25908 if (!DECL_STATIC_CHAIN (fndecl))
25909 return NULL;
25911 if (TARGET_64BIT)
25913 /* We always use R10 in 64-bit mode. */
25914 regno = R10_REG;
25916 else
25918 tree fntype;
25919 unsigned int ccvt;
25921 /* By default in 32-bit mode we use ECX to pass the static chain. */
25922 regno = CX_REG;
25924 fntype = TREE_TYPE (fndecl);
25925 ccvt = ix86_get_callcvt (fntype);
25926 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25928 /* Fastcall functions use ecx/edx for arguments, which leaves
25929 us with EAX for the static chain.
25930 Thiscall functions use ecx for arguments, which also
25931 leaves us with EAX for the static chain. */
25932 regno = AX_REG;
25934 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25936 /* Thiscall functions use ecx for arguments, which leaves
25937 us with EAX and EDX for the static chain.
25938 We use EAX for ABI compatibility. */
25939 regno = AX_REG;
25941 else if (ix86_function_regparm (fntype, fndecl) == 3)
25943 /* For regparm 3, we have no free call-clobbered registers in
25944 which to store the static chain. In order to implement this,
25945 we have the trampoline push the static chain to the stack.
25946 However, we can't push a value below the return address when
25947 we call the nested function directly, so we have to use an
25948 alternate entry point. For this we use ESI, and have the
25949 alternate entry point push ESI, so that things appear the
25950 same once we're executing the nested function. */
25951 if (incoming_p)
25953 if (fndecl == current_function_decl)
25954 ix86_static_chain_on_stack = true;
25955 return gen_frame_mem (SImode,
25956 plus_constant (Pmode,
25957 arg_pointer_rtx, -8));
25959 regno = SI_REG;
25963 return gen_rtx_REG (Pmode, regno);
25966 /* Emit RTL insns to initialize the variable parts of a trampoline.
25967 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25968 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25969 to be passed to the target function. */
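/* Byte layout of the 64-bit trampoline emitted below, shown here as a
   hedged illustration derived from the stores in the code (not an
   official layout comment); the DImode/movabs variant occupies 24 bytes:

     49 BB <imm64 fnaddr>      movabs $fnaddr, %r11
     49 BA <imm64 chain>       movabs $chain,  %r10
     49 FF E3                  jmp    *%r11
     90                        nop (pads the final 32-bit store)

   When ptr_mode == SImode the 41 BB / 41 BA (movl) forms are used
   instead and the immediates shrink to 32 bits.  */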
25971 static void
25972 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25974 rtx mem, fnaddr;
25975 int opcode;
25976 int offset = 0;
25978 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25980 if (TARGET_64BIT)
25982 int size;
25984 /* Load the function address into r11. Try to load the address using
25985 the shorter movl instead of movabs. We may want to support
25986 movq for kernel mode, but the kernel does not use trampolines at
25987 the moment. FNADDR is a 32-bit address and may not be in
25988 DImode when ptr_mode == SImode. Always use movl in this
25989 case. */
25990 if (ptr_mode == SImode
25991 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25993 fnaddr = copy_addr_to_reg (fnaddr);
25995 mem = adjust_address (m_tramp, HImode, offset);
25996 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25998 mem = adjust_address (m_tramp, SImode, offset + 2);
25999 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26000 offset += 6;
26002 else
26004 mem = adjust_address (m_tramp, HImode, offset);
26005 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26007 mem = adjust_address (m_tramp, DImode, offset + 2);
26008 emit_move_insn (mem, fnaddr);
26009 offset += 10;
26012 /* Load static chain using movabs to r10. Use the shorter movl
26013 instead of movabs when ptr_mode == SImode. */
26014 if (ptr_mode == SImode)
26016 opcode = 0xba41;
26017 size = 6;
26019 else
26021 opcode = 0xba49;
26022 size = 10;
26025 mem = adjust_address (m_tramp, HImode, offset);
26026 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26028 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26029 emit_move_insn (mem, chain_value);
26030 offset += size;
26032 /* Jump to r11; the last (unused) byte is a nop, only there to
26033 pad the write out to a single 32-bit store. */
26034 mem = adjust_address (m_tramp, SImode, offset);
26035 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26036 offset += 4;
26038 else
26040 rtx disp, chain;
26042 /* Depending on the static chain location, either load a register
26043 with a constant, or push the constant to the stack. All of the
26044 instructions are the same size. */
26045 chain = ix86_static_chain (fndecl, true);
26046 if (REG_P (chain))
26048 switch (REGNO (chain))
26050 case AX_REG:
26051 opcode = 0xb8; break;
26052 case CX_REG:
26053 opcode = 0xb9; break;
26054 default:
26055 gcc_unreachable ();
26058 else
26059 opcode = 0x68;
26061 mem = adjust_address (m_tramp, QImode, offset);
26062 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26064 mem = adjust_address (m_tramp, SImode, offset + 1);
26065 emit_move_insn (mem, chain_value);
26066 offset += 5;
26068 mem = adjust_address (m_tramp, QImode, offset);
26069 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26071 mem = adjust_address (m_tramp, SImode, offset + 1);
26073 /* Compute offset from the end of the jmp to the target function.
26074 In the case in which the trampoline stores the static chain on
26075 the stack, we need to skip the first insn which pushes the
26076 (call-saved) register static chain; this push is 1 byte. */
26077 offset += 5;
26078 disp = expand_binop (SImode, sub_optab, fnaddr,
26079 plus_constant (Pmode, XEXP (m_tramp, 0),
26080 offset - (MEM_P (chain) ? 1 : 0)),
26081 NULL_RTX, 1, OPTAB_DIRECT);
26082 emit_move_insn (mem, disp);
26085 gcc_assert (offset <= TRAMPOLINE_SIZE);
26087 #ifdef HAVE_ENABLE_EXECUTE_STACK
26088 #ifdef CHECK_EXECUTE_STACK_ENABLED
26089 if (CHECK_EXECUTE_STACK_ENABLED)
26090 #endif
26091 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26092 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26093 #endif
26096 /* The following file contains several enumerations and data structures
26097 built from the definitions in i386-builtin-types.def. */
26099 #include "i386-builtin-types.inc"
26101 /* Table for the ix86 builtin non-function types. */
26102 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26104 /* Retrieve an element from the above table, building some of
26105 the types lazily. */
26107 static tree
26108 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26110 unsigned int index;
26111 tree type, itype;
26113 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26115 type = ix86_builtin_type_tab[(int) tcode];
26116 if (type != NULL)
26117 return type;
26119 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26120 if (tcode <= IX86_BT_LAST_VECT)
26122 enum machine_mode mode;
26124 index = tcode - IX86_BT_LAST_PRIM - 1;
26125 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26126 mode = ix86_builtin_type_vect_mode[index];
26128 type = build_vector_type_for_mode (itype, mode);
26130 else
26132 int quals;
26134 index = tcode - IX86_BT_LAST_VECT - 1;
26135 if (tcode <= IX86_BT_LAST_PTR)
26136 quals = TYPE_UNQUALIFIED;
26137 else
26138 quals = TYPE_QUAL_CONST;
26140 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26141 if (quals != TYPE_UNQUALIFIED)
26142 itype = build_qualified_type (itype, quals);
26144 type = build_pointer_type (itype);
26147 ix86_builtin_type_tab[(int) tcode] = type;
26148 return type;
26151 /* Table for the ix86 builtin function types. */
26152 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26154 /* Retrieve an element from the above table, building some of
26155 the types lazily. */
26157 static tree
26158 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26160 tree type;
26162 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26164 type = ix86_builtin_func_type_tab[(int) tcode];
26165 if (type != NULL)
26166 return type;
26168 if (tcode <= IX86_BT_LAST_FUNC)
26170 unsigned start = ix86_builtin_func_start[(int) tcode];
26171 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26172 tree rtype, atype, args = void_list_node;
26173 unsigned i;
26175 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26176 for (i = after - 1; i > start; --i)
26178 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26179 args = tree_cons (NULL, atype, args);
26182 type = build_function_type (rtype, args);
26184 else
26186 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26187 enum ix86_builtin_func_type icode;
26189 icode = ix86_builtin_func_alias_base[index];
26190 type = ix86_get_builtin_func_type (icode);
26193 ix86_builtin_func_type_tab[(int) tcode] = type;
26194 return type;
26198 /* Codes for all the SSE/MMX builtins. */
26199 enum ix86_builtins
26201 IX86_BUILTIN_ADDPS,
26202 IX86_BUILTIN_ADDSS,
26203 IX86_BUILTIN_DIVPS,
26204 IX86_BUILTIN_DIVSS,
26205 IX86_BUILTIN_MULPS,
26206 IX86_BUILTIN_MULSS,
26207 IX86_BUILTIN_SUBPS,
26208 IX86_BUILTIN_SUBSS,
26210 IX86_BUILTIN_CMPEQPS,
26211 IX86_BUILTIN_CMPLTPS,
26212 IX86_BUILTIN_CMPLEPS,
26213 IX86_BUILTIN_CMPGTPS,
26214 IX86_BUILTIN_CMPGEPS,
26215 IX86_BUILTIN_CMPNEQPS,
26216 IX86_BUILTIN_CMPNLTPS,
26217 IX86_BUILTIN_CMPNLEPS,
26218 IX86_BUILTIN_CMPNGTPS,
26219 IX86_BUILTIN_CMPNGEPS,
26220 IX86_BUILTIN_CMPORDPS,
26221 IX86_BUILTIN_CMPUNORDPS,
26222 IX86_BUILTIN_CMPEQSS,
26223 IX86_BUILTIN_CMPLTSS,
26224 IX86_BUILTIN_CMPLESS,
26225 IX86_BUILTIN_CMPNEQSS,
26226 IX86_BUILTIN_CMPNLTSS,
26227 IX86_BUILTIN_CMPNLESS,
26228 IX86_BUILTIN_CMPORDSS,
26229 IX86_BUILTIN_CMPUNORDSS,
26231 IX86_BUILTIN_COMIEQSS,
26232 IX86_BUILTIN_COMILTSS,
26233 IX86_BUILTIN_COMILESS,
26234 IX86_BUILTIN_COMIGTSS,
26235 IX86_BUILTIN_COMIGESS,
26236 IX86_BUILTIN_COMINEQSS,
26237 IX86_BUILTIN_UCOMIEQSS,
26238 IX86_BUILTIN_UCOMILTSS,
26239 IX86_BUILTIN_UCOMILESS,
26240 IX86_BUILTIN_UCOMIGTSS,
26241 IX86_BUILTIN_UCOMIGESS,
26242 IX86_BUILTIN_UCOMINEQSS,
26244 IX86_BUILTIN_CVTPI2PS,
26245 IX86_BUILTIN_CVTPS2PI,
26246 IX86_BUILTIN_CVTSI2SS,
26247 IX86_BUILTIN_CVTSI642SS,
26248 IX86_BUILTIN_CVTSS2SI,
26249 IX86_BUILTIN_CVTSS2SI64,
26250 IX86_BUILTIN_CVTTPS2PI,
26251 IX86_BUILTIN_CVTTSS2SI,
26252 IX86_BUILTIN_CVTTSS2SI64,
26254 IX86_BUILTIN_MAXPS,
26255 IX86_BUILTIN_MAXSS,
26256 IX86_BUILTIN_MINPS,
26257 IX86_BUILTIN_MINSS,
26259 IX86_BUILTIN_LOADUPS,
26260 IX86_BUILTIN_STOREUPS,
26261 IX86_BUILTIN_MOVSS,
26263 IX86_BUILTIN_MOVHLPS,
26264 IX86_BUILTIN_MOVLHPS,
26265 IX86_BUILTIN_LOADHPS,
26266 IX86_BUILTIN_LOADLPS,
26267 IX86_BUILTIN_STOREHPS,
26268 IX86_BUILTIN_STORELPS,
26270 IX86_BUILTIN_MASKMOVQ,
26271 IX86_BUILTIN_MOVMSKPS,
26272 IX86_BUILTIN_PMOVMSKB,
26274 IX86_BUILTIN_MOVNTPS,
26275 IX86_BUILTIN_MOVNTQ,
26277 IX86_BUILTIN_LOADDQU,
26278 IX86_BUILTIN_STOREDQU,
26280 IX86_BUILTIN_PACKSSWB,
26281 IX86_BUILTIN_PACKSSDW,
26282 IX86_BUILTIN_PACKUSWB,
26284 IX86_BUILTIN_PADDB,
26285 IX86_BUILTIN_PADDW,
26286 IX86_BUILTIN_PADDD,
26287 IX86_BUILTIN_PADDQ,
26288 IX86_BUILTIN_PADDSB,
26289 IX86_BUILTIN_PADDSW,
26290 IX86_BUILTIN_PADDUSB,
26291 IX86_BUILTIN_PADDUSW,
26292 IX86_BUILTIN_PSUBB,
26293 IX86_BUILTIN_PSUBW,
26294 IX86_BUILTIN_PSUBD,
26295 IX86_BUILTIN_PSUBQ,
26296 IX86_BUILTIN_PSUBSB,
26297 IX86_BUILTIN_PSUBSW,
26298 IX86_BUILTIN_PSUBUSB,
26299 IX86_BUILTIN_PSUBUSW,
26301 IX86_BUILTIN_PAND,
26302 IX86_BUILTIN_PANDN,
26303 IX86_BUILTIN_POR,
26304 IX86_BUILTIN_PXOR,
26306 IX86_BUILTIN_PAVGB,
26307 IX86_BUILTIN_PAVGW,
26309 IX86_BUILTIN_PCMPEQB,
26310 IX86_BUILTIN_PCMPEQW,
26311 IX86_BUILTIN_PCMPEQD,
26312 IX86_BUILTIN_PCMPGTB,
26313 IX86_BUILTIN_PCMPGTW,
26314 IX86_BUILTIN_PCMPGTD,
26316 IX86_BUILTIN_PMADDWD,
26318 IX86_BUILTIN_PMAXSW,
26319 IX86_BUILTIN_PMAXUB,
26320 IX86_BUILTIN_PMINSW,
26321 IX86_BUILTIN_PMINUB,
26323 IX86_BUILTIN_PMULHUW,
26324 IX86_BUILTIN_PMULHW,
26325 IX86_BUILTIN_PMULLW,
26327 IX86_BUILTIN_PSADBW,
26328 IX86_BUILTIN_PSHUFW,
26330 IX86_BUILTIN_PSLLW,
26331 IX86_BUILTIN_PSLLD,
26332 IX86_BUILTIN_PSLLQ,
26333 IX86_BUILTIN_PSRAW,
26334 IX86_BUILTIN_PSRAD,
26335 IX86_BUILTIN_PSRLW,
26336 IX86_BUILTIN_PSRLD,
26337 IX86_BUILTIN_PSRLQ,
26338 IX86_BUILTIN_PSLLWI,
26339 IX86_BUILTIN_PSLLDI,
26340 IX86_BUILTIN_PSLLQI,
26341 IX86_BUILTIN_PSRAWI,
26342 IX86_BUILTIN_PSRADI,
26343 IX86_BUILTIN_PSRLWI,
26344 IX86_BUILTIN_PSRLDI,
26345 IX86_BUILTIN_PSRLQI,
26347 IX86_BUILTIN_PUNPCKHBW,
26348 IX86_BUILTIN_PUNPCKHWD,
26349 IX86_BUILTIN_PUNPCKHDQ,
26350 IX86_BUILTIN_PUNPCKLBW,
26351 IX86_BUILTIN_PUNPCKLWD,
26352 IX86_BUILTIN_PUNPCKLDQ,
26354 IX86_BUILTIN_SHUFPS,
26356 IX86_BUILTIN_RCPPS,
26357 IX86_BUILTIN_RCPSS,
26358 IX86_BUILTIN_RSQRTPS,
26359 IX86_BUILTIN_RSQRTPS_NR,
26360 IX86_BUILTIN_RSQRTSS,
26361 IX86_BUILTIN_RSQRTF,
26362 IX86_BUILTIN_SQRTPS,
26363 IX86_BUILTIN_SQRTPS_NR,
26364 IX86_BUILTIN_SQRTSS,
26366 IX86_BUILTIN_UNPCKHPS,
26367 IX86_BUILTIN_UNPCKLPS,
26369 IX86_BUILTIN_ANDPS,
26370 IX86_BUILTIN_ANDNPS,
26371 IX86_BUILTIN_ORPS,
26372 IX86_BUILTIN_XORPS,
26374 IX86_BUILTIN_EMMS,
26375 IX86_BUILTIN_LDMXCSR,
26376 IX86_BUILTIN_STMXCSR,
26377 IX86_BUILTIN_SFENCE,
26379 IX86_BUILTIN_FXSAVE,
26380 IX86_BUILTIN_FXRSTOR,
26381 IX86_BUILTIN_FXSAVE64,
26382 IX86_BUILTIN_FXRSTOR64,
26384 IX86_BUILTIN_XSAVE,
26385 IX86_BUILTIN_XRSTOR,
26386 IX86_BUILTIN_XSAVE64,
26387 IX86_BUILTIN_XRSTOR64,
26389 IX86_BUILTIN_XSAVEOPT,
26390 IX86_BUILTIN_XSAVEOPT64,
26392 /* 3DNow! Original */
26393 IX86_BUILTIN_FEMMS,
26394 IX86_BUILTIN_PAVGUSB,
26395 IX86_BUILTIN_PF2ID,
26396 IX86_BUILTIN_PFACC,
26397 IX86_BUILTIN_PFADD,
26398 IX86_BUILTIN_PFCMPEQ,
26399 IX86_BUILTIN_PFCMPGE,
26400 IX86_BUILTIN_PFCMPGT,
26401 IX86_BUILTIN_PFMAX,
26402 IX86_BUILTIN_PFMIN,
26403 IX86_BUILTIN_PFMUL,
26404 IX86_BUILTIN_PFRCP,
26405 IX86_BUILTIN_PFRCPIT1,
26406 IX86_BUILTIN_PFRCPIT2,
26407 IX86_BUILTIN_PFRSQIT1,
26408 IX86_BUILTIN_PFRSQRT,
26409 IX86_BUILTIN_PFSUB,
26410 IX86_BUILTIN_PFSUBR,
26411 IX86_BUILTIN_PI2FD,
26412 IX86_BUILTIN_PMULHRW,
26414 /* 3DNow! Athlon Extensions */
26415 IX86_BUILTIN_PF2IW,
26416 IX86_BUILTIN_PFNACC,
26417 IX86_BUILTIN_PFPNACC,
26418 IX86_BUILTIN_PI2FW,
26419 IX86_BUILTIN_PSWAPDSI,
26420 IX86_BUILTIN_PSWAPDSF,
26422 /* SSE2 */
26423 IX86_BUILTIN_ADDPD,
26424 IX86_BUILTIN_ADDSD,
26425 IX86_BUILTIN_DIVPD,
26426 IX86_BUILTIN_DIVSD,
26427 IX86_BUILTIN_MULPD,
26428 IX86_BUILTIN_MULSD,
26429 IX86_BUILTIN_SUBPD,
26430 IX86_BUILTIN_SUBSD,
26432 IX86_BUILTIN_CMPEQPD,
26433 IX86_BUILTIN_CMPLTPD,
26434 IX86_BUILTIN_CMPLEPD,
26435 IX86_BUILTIN_CMPGTPD,
26436 IX86_BUILTIN_CMPGEPD,
26437 IX86_BUILTIN_CMPNEQPD,
26438 IX86_BUILTIN_CMPNLTPD,
26439 IX86_BUILTIN_CMPNLEPD,
26440 IX86_BUILTIN_CMPNGTPD,
26441 IX86_BUILTIN_CMPNGEPD,
26442 IX86_BUILTIN_CMPORDPD,
26443 IX86_BUILTIN_CMPUNORDPD,
26444 IX86_BUILTIN_CMPEQSD,
26445 IX86_BUILTIN_CMPLTSD,
26446 IX86_BUILTIN_CMPLESD,
26447 IX86_BUILTIN_CMPNEQSD,
26448 IX86_BUILTIN_CMPNLTSD,
26449 IX86_BUILTIN_CMPNLESD,
26450 IX86_BUILTIN_CMPORDSD,
26451 IX86_BUILTIN_CMPUNORDSD,
26453 IX86_BUILTIN_COMIEQSD,
26454 IX86_BUILTIN_COMILTSD,
26455 IX86_BUILTIN_COMILESD,
26456 IX86_BUILTIN_COMIGTSD,
26457 IX86_BUILTIN_COMIGESD,
26458 IX86_BUILTIN_COMINEQSD,
26459 IX86_BUILTIN_UCOMIEQSD,
26460 IX86_BUILTIN_UCOMILTSD,
26461 IX86_BUILTIN_UCOMILESD,
26462 IX86_BUILTIN_UCOMIGTSD,
26463 IX86_BUILTIN_UCOMIGESD,
26464 IX86_BUILTIN_UCOMINEQSD,
26466 IX86_BUILTIN_MAXPD,
26467 IX86_BUILTIN_MAXSD,
26468 IX86_BUILTIN_MINPD,
26469 IX86_BUILTIN_MINSD,
26471 IX86_BUILTIN_ANDPD,
26472 IX86_BUILTIN_ANDNPD,
26473 IX86_BUILTIN_ORPD,
26474 IX86_BUILTIN_XORPD,
26476 IX86_BUILTIN_SQRTPD,
26477 IX86_BUILTIN_SQRTSD,
26479 IX86_BUILTIN_UNPCKHPD,
26480 IX86_BUILTIN_UNPCKLPD,
26482 IX86_BUILTIN_SHUFPD,
26484 IX86_BUILTIN_LOADUPD,
26485 IX86_BUILTIN_STOREUPD,
26486 IX86_BUILTIN_MOVSD,
26488 IX86_BUILTIN_LOADHPD,
26489 IX86_BUILTIN_LOADLPD,
26491 IX86_BUILTIN_CVTDQ2PD,
26492 IX86_BUILTIN_CVTDQ2PS,
26494 IX86_BUILTIN_CVTPD2DQ,
26495 IX86_BUILTIN_CVTPD2PI,
26496 IX86_BUILTIN_CVTPD2PS,
26497 IX86_BUILTIN_CVTTPD2DQ,
26498 IX86_BUILTIN_CVTTPD2PI,
26500 IX86_BUILTIN_CVTPI2PD,
26501 IX86_BUILTIN_CVTSI2SD,
26502 IX86_BUILTIN_CVTSI642SD,
26504 IX86_BUILTIN_CVTSD2SI,
26505 IX86_BUILTIN_CVTSD2SI64,
26506 IX86_BUILTIN_CVTSD2SS,
26507 IX86_BUILTIN_CVTSS2SD,
26508 IX86_BUILTIN_CVTTSD2SI,
26509 IX86_BUILTIN_CVTTSD2SI64,
26511 IX86_BUILTIN_CVTPS2DQ,
26512 IX86_BUILTIN_CVTPS2PD,
26513 IX86_BUILTIN_CVTTPS2DQ,
26515 IX86_BUILTIN_MOVNTI,
26516 IX86_BUILTIN_MOVNTI64,
26517 IX86_BUILTIN_MOVNTPD,
26518 IX86_BUILTIN_MOVNTDQ,
26520 IX86_BUILTIN_MOVQ128,
26522 /* SSE2 MMX */
26523 IX86_BUILTIN_MASKMOVDQU,
26524 IX86_BUILTIN_MOVMSKPD,
26525 IX86_BUILTIN_PMOVMSKB128,
26527 IX86_BUILTIN_PACKSSWB128,
26528 IX86_BUILTIN_PACKSSDW128,
26529 IX86_BUILTIN_PACKUSWB128,
26531 IX86_BUILTIN_PADDB128,
26532 IX86_BUILTIN_PADDW128,
26533 IX86_BUILTIN_PADDD128,
26534 IX86_BUILTIN_PADDQ128,
26535 IX86_BUILTIN_PADDSB128,
26536 IX86_BUILTIN_PADDSW128,
26537 IX86_BUILTIN_PADDUSB128,
26538 IX86_BUILTIN_PADDUSW128,
26539 IX86_BUILTIN_PSUBB128,
26540 IX86_BUILTIN_PSUBW128,
26541 IX86_BUILTIN_PSUBD128,
26542 IX86_BUILTIN_PSUBQ128,
26543 IX86_BUILTIN_PSUBSB128,
26544 IX86_BUILTIN_PSUBSW128,
26545 IX86_BUILTIN_PSUBUSB128,
26546 IX86_BUILTIN_PSUBUSW128,
26548 IX86_BUILTIN_PAND128,
26549 IX86_BUILTIN_PANDN128,
26550 IX86_BUILTIN_POR128,
26551 IX86_BUILTIN_PXOR128,
26553 IX86_BUILTIN_PAVGB128,
26554 IX86_BUILTIN_PAVGW128,
26556 IX86_BUILTIN_PCMPEQB128,
26557 IX86_BUILTIN_PCMPEQW128,
26558 IX86_BUILTIN_PCMPEQD128,
26559 IX86_BUILTIN_PCMPGTB128,
26560 IX86_BUILTIN_PCMPGTW128,
26561 IX86_BUILTIN_PCMPGTD128,
26563 IX86_BUILTIN_PMADDWD128,
26565 IX86_BUILTIN_PMAXSW128,
26566 IX86_BUILTIN_PMAXUB128,
26567 IX86_BUILTIN_PMINSW128,
26568 IX86_BUILTIN_PMINUB128,
26570 IX86_BUILTIN_PMULUDQ,
26571 IX86_BUILTIN_PMULUDQ128,
26572 IX86_BUILTIN_PMULHUW128,
26573 IX86_BUILTIN_PMULHW128,
26574 IX86_BUILTIN_PMULLW128,
26576 IX86_BUILTIN_PSADBW128,
26577 IX86_BUILTIN_PSHUFHW,
26578 IX86_BUILTIN_PSHUFLW,
26579 IX86_BUILTIN_PSHUFD,
26581 IX86_BUILTIN_PSLLDQI128,
26582 IX86_BUILTIN_PSLLWI128,
26583 IX86_BUILTIN_PSLLDI128,
26584 IX86_BUILTIN_PSLLQI128,
26585 IX86_BUILTIN_PSRAWI128,
26586 IX86_BUILTIN_PSRADI128,
26587 IX86_BUILTIN_PSRLDQI128,
26588 IX86_BUILTIN_PSRLWI128,
26589 IX86_BUILTIN_PSRLDI128,
26590 IX86_BUILTIN_PSRLQI128,
26592 IX86_BUILTIN_PSLLDQ128,
26593 IX86_BUILTIN_PSLLW128,
26594 IX86_BUILTIN_PSLLD128,
26595 IX86_BUILTIN_PSLLQ128,
26596 IX86_BUILTIN_PSRAW128,
26597 IX86_BUILTIN_PSRAD128,
26598 IX86_BUILTIN_PSRLW128,
26599 IX86_BUILTIN_PSRLD128,
26600 IX86_BUILTIN_PSRLQ128,
26602 IX86_BUILTIN_PUNPCKHBW128,
26603 IX86_BUILTIN_PUNPCKHWD128,
26604 IX86_BUILTIN_PUNPCKHDQ128,
26605 IX86_BUILTIN_PUNPCKHQDQ128,
26606 IX86_BUILTIN_PUNPCKLBW128,
26607 IX86_BUILTIN_PUNPCKLWD128,
26608 IX86_BUILTIN_PUNPCKLDQ128,
26609 IX86_BUILTIN_PUNPCKLQDQ128,
26611 IX86_BUILTIN_CLFLUSH,
26612 IX86_BUILTIN_MFENCE,
26613 IX86_BUILTIN_LFENCE,
26614 IX86_BUILTIN_PAUSE,
26616 IX86_BUILTIN_BSRSI,
26617 IX86_BUILTIN_BSRDI,
26618 IX86_BUILTIN_RDPMC,
26619 IX86_BUILTIN_RDTSC,
26620 IX86_BUILTIN_RDTSCP,
26621 IX86_BUILTIN_ROLQI,
26622 IX86_BUILTIN_ROLHI,
26623 IX86_BUILTIN_RORQI,
26624 IX86_BUILTIN_RORHI,
26626 /* SSE3. */
26627 IX86_BUILTIN_ADDSUBPS,
26628 IX86_BUILTIN_HADDPS,
26629 IX86_BUILTIN_HSUBPS,
26630 IX86_BUILTIN_MOVSHDUP,
26631 IX86_BUILTIN_MOVSLDUP,
26632 IX86_BUILTIN_ADDSUBPD,
26633 IX86_BUILTIN_HADDPD,
26634 IX86_BUILTIN_HSUBPD,
26635 IX86_BUILTIN_LDDQU,
26637 IX86_BUILTIN_MONITOR,
26638 IX86_BUILTIN_MWAIT,
26640 /* SSSE3. */
26641 IX86_BUILTIN_PHADDW,
26642 IX86_BUILTIN_PHADDD,
26643 IX86_BUILTIN_PHADDSW,
26644 IX86_BUILTIN_PHSUBW,
26645 IX86_BUILTIN_PHSUBD,
26646 IX86_BUILTIN_PHSUBSW,
26647 IX86_BUILTIN_PMADDUBSW,
26648 IX86_BUILTIN_PMULHRSW,
26649 IX86_BUILTIN_PSHUFB,
26650 IX86_BUILTIN_PSIGNB,
26651 IX86_BUILTIN_PSIGNW,
26652 IX86_BUILTIN_PSIGND,
26653 IX86_BUILTIN_PALIGNR,
26654 IX86_BUILTIN_PABSB,
26655 IX86_BUILTIN_PABSW,
26656 IX86_BUILTIN_PABSD,
26658 IX86_BUILTIN_PHADDW128,
26659 IX86_BUILTIN_PHADDD128,
26660 IX86_BUILTIN_PHADDSW128,
26661 IX86_BUILTIN_PHSUBW128,
26662 IX86_BUILTIN_PHSUBD128,
26663 IX86_BUILTIN_PHSUBSW128,
26664 IX86_BUILTIN_PMADDUBSW128,
26665 IX86_BUILTIN_PMULHRSW128,
26666 IX86_BUILTIN_PSHUFB128,
26667 IX86_BUILTIN_PSIGNB128,
26668 IX86_BUILTIN_PSIGNW128,
26669 IX86_BUILTIN_PSIGND128,
26670 IX86_BUILTIN_PALIGNR128,
26671 IX86_BUILTIN_PABSB128,
26672 IX86_BUILTIN_PABSW128,
26673 IX86_BUILTIN_PABSD128,
26675 /* AMDFAM10 - SSE4A New Instructions. */
26676 IX86_BUILTIN_MOVNTSD,
26677 IX86_BUILTIN_MOVNTSS,
26678 IX86_BUILTIN_EXTRQI,
26679 IX86_BUILTIN_EXTRQ,
26680 IX86_BUILTIN_INSERTQI,
26681 IX86_BUILTIN_INSERTQ,
26683 /* SSE4.1. */
26684 IX86_BUILTIN_BLENDPD,
26685 IX86_BUILTIN_BLENDPS,
26686 IX86_BUILTIN_BLENDVPD,
26687 IX86_BUILTIN_BLENDVPS,
26688 IX86_BUILTIN_PBLENDVB128,
26689 IX86_BUILTIN_PBLENDW128,
26691 IX86_BUILTIN_DPPD,
26692 IX86_BUILTIN_DPPS,
26694 IX86_BUILTIN_INSERTPS128,
26696 IX86_BUILTIN_MOVNTDQA,
26697 IX86_BUILTIN_MPSADBW128,
26698 IX86_BUILTIN_PACKUSDW128,
26699 IX86_BUILTIN_PCMPEQQ,
26700 IX86_BUILTIN_PHMINPOSUW128,
26702 IX86_BUILTIN_PMAXSB128,
26703 IX86_BUILTIN_PMAXSD128,
26704 IX86_BUILTIN_PMAXUD128,
26705 IX86_BUILTIN_PMAXUW128,
26707 IX86_BUILTIN_PMINSB128,
26708 IX86_BUILTIN_PMINSD128,
26709 IX86_BUILTIN_PMINUD128,
26710 IX86_BUILTIN_PMINUW128,
26712 IX86_BUILTIN_PMOVSXBW128,
26713 IX86_BUILTIN_PMOVSXBD128,
26714 IX86_BUILTIN_PMOVSXBQ128,
26715 IX86_BUILTIN_PMOVSXWD128,
26716 IX86_BUILTIN_PMOVSXWQ128,
26717 IX86_BUILTIN_PMOVSXDQ128,
26719 IX86_BUILTIN_PMOVZXBW128,
26720 IX86_BUILTIN_PMOVZXBD128,
26721 IX86_BUILTIN_PMOVZXBQ128,
26722 IX86_BUILTIN_PMOVZXWD128,
26723 IX86_BUILTIN_PMOVZXWQ128,
26724 IX86_BUILTIN_PMOVZXDQ128,
26726 IX86_BUILTIN_PMULDQ128,
26727 IX86_BUILTIN_PMULLD128,
26729 IX86_BUILTIN_ROUNDSD,
26730 IX86_BUILTIN_ROUNDSS,
26732 IX86_BUILTIN_ROUNDPD,
26733 IX86_BUILTIN_ROUNDPS,
26735 IX86_BUILTIN_FLOORPD,
26736 IX86_BUILTIN_CEILPD,
26737 IX86_BUILTIN_TRUNCPD,
26738 IX86_BUILTIN_RINTPD,
26739 IX86_BUILTIN_ROUNDPD_AZ,
26741 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26742 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26743 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26745 IX86_BUILTIN_FLOORPS,
26746 IX86_BUILTIN_CEILPS,
26747 IX86_BUILTIN_TRUNCPS,
26748 IX86_BUILTIN_RINTPS,
26749 IX86_BUILTIN_ROUNDPS_AZ,
26751 IX86_BUILTIN_FLOORPS_SFIX,
26752 IX86_BUILTIN_CEILPS_SFIX,
26753 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26755 IX86_BUILTIN_PTESTZ,
26756 IX86_BUILTIN_PTESTC,
26757 IX86_BUILTIN_PTESTNZC,
26759 IX86_BUILTIN_VEC_INIT_V2SI,
26760 IX86_BUILTIN_VEC_INIT_V4HI,
26761 IX86_BUILTIN_VEC_INIT_V8QI,
26762 IX86_BUILTIN_VEC_EXT_V2DF,
26763 IX86_BUILTIN_VEC_EXT_V2DI,
26764 IX86_BUILTIN_VEC_EXT_V4SF,
26765 IX86_BUILTIN_VEC_EXT_V4SI,
26766 IX86_BUILTIN_VEC_EXT_V8HI,
26767 IX86_BUILTIN_VEC_EXT_V2SI,
26768 IX86_BUILTIN_VEC_EXT_V4HI,
26769 IX86_BUILTIN_VEC_EXT_V16QI,
26770 IX86_BUILTIN_VEC_SET_V2DI,
26771 IX86_BUILTIN_VEC_SET_V4SF,
26772 IX86_BUILTIN_VEC_SET_V4SI,
26773 IX86_BUILTIN_VEC_SET_V8HI,
26774 IX86_BUILTIN_VEC_SET_V4HI,
26775 IX86_BUILTIN_VEC_SET_V16QI,
26777 IX86_BUILTIN_VEC_PACK_SFIX,
26778 IX86_BUILTIN_VEC_PACK_SFIX256,
26780 /* SSE4.2. */
26781 IX86_BUILTIN_CRC32QI,
26782 IX86_BUILTIN_CRC32HI,
26783 IX86_BUILTIN_CRC32SI,
26784 IX86_BUILTIN_CRC32DI,
26786 IX86_BUILTIN_PCMPESTRI128,
26787 IX86_BUILTIN_PCMPESTRM128,
26788 IX86_BUILTIN_PCMPESTRA128,
26789 IX86_BUILTIN_PCMPESTRC128,
26790 IX86_BUILTIN_PCMPESTRO128,
26791 IX86_BUILTIN_PCMPESTRS128,
26792 IX86_BUILTIN_PCMPESTRZ128,
26793 IX86_BUILTIN_PCMPISTRI128,
26794 IX86_BUILTIN_PCMPISTRM128,
26795 IX86_BUILTIN_PCMPISTRA128,
26796 IX86_BUILTIN_PCMPISTRC128,
26797 IX86_BUILTIN_PCMPISTRO128,
26798 IX86_BUILTIN_PCMPISTRS128,
26799 IX86_BUILTIN_PCMPISTRZ128,
26801 IX86_BUILTIN_PCMPGTQ,
26803 /* AES instructions */
26804 IX86_BUILTIN_AESENC128,
26805 IX86_BUILTIN_AESENCLAST128,
26806 IX86_BUILTIN_AESDEC128,
26807 IX86_BUILTIN_AESDECLAST128,
26808 IX86_BUILTIN_AESIMC128,
26809 IX86_BUILTIN_AESKEYGENASSIST128,
26811 /* PCLMUL instruction */
26812 IX86_BUILTIN_PCLMULQDQ128,
26814 /* AVX */
26815 IX86_BUILTIN_ADDPD256,
26816 IX86_BUILTIN_ADDPS256,
26817 IX86_BUILTIN_ADDSUBPD256,
26818 IX86_BUILTIN_ADDSUBPS256,
26819 IX86_BUILTIN_ANDPD256,
26820 IX86_BUILTIN_ANDPS256,
26821 IX86_BUILTIN_ANDNPD256,
26822 IX86_BUILTIN_ANDNPS256,
26823 IX86_BUILTIN_BLENDPD256,
26824 IX86_BUILTIN_BLENDPS256,
26825 IX86_BUILTIN_BLENDVPD256,
26826 IX86_BUILTIN_BLENDVPS256,
26827 IX86_BUILTIN_DIVPD256,
26828 IX86_BUILTIN_DIVPS256,
26829 IX86_BUILTIN_DPPS256,
26830 IX86_BUILTIN_HADDPD256,
26831 IX86_BUILTIN_HADDPS256,
26832 IX86_BUILTIN_HSUBPD256,
26833 IX86_BUILTIN_HSUBPS256,
26834 IX86_BUILTIN_MAXPD256,
26835 IX86_BUILTIN_MAXPS256,
26836 IX86_BUILTIN_MINPD256,
26837 IX86_BUILTIN_MINPS256,
26838 IX86_BUILTIN_MULPD256,
26839 IX86_BUILTIN_MULPS256,
26840 IX86_BUILTIN_ORPD256,
26841 IX86_BUILTIN_ORPS256,
26842 IX86_BUILTIN_SHUFPD256,
26843 IX86_BUILTIN_SHUFPS256,
26844 IX86_BUILTIN_SUBPD256,
26845 IX86_BUILTIN_SUBPS256,
26846 IX86_BUILTIN_XORPD256,
26847 IX86_BUILTIN_XORPS256,
26848 IX86_BUILTIN_CMPSD,
26849 IX86_BUILTIN_CMPSS,
26850 IX86_BUILTIN_CMPPD,
26851 IX86_BUILTIN_CMPPS,
26852 IX86_BUILTIN_CMPPD256,
26853 IX86_BUILTIN_CMPPS256,
26854 IX86_BUILTIN_CVTDQ2PD256,
26855 IX86_BUILTIN_CVTDQ2PS256,
26856 IX86_BUILTIN_CVTPD2PS256,
26857 IX86_BUILTIN_CVTPS2DQ256,
26858 IX86_BUILTIN_CVTPS2PD256,
26859 IX86_BUILTIN_CVTTPD2DQ256,
26860 IX86_BUILTIN_CVTPD2DQ256,
26861 IX86_BUILTIN_CVTTPS2DQ256,
26862 IX86_BUILTIN_EXTRACTF128PD256,
26863 IX86_BUILTIN_EXTRACTF128PS256,
26864 IX86_BUILTIN_EXTRACTF128SI256,
26865 IX86_BUILTIN_VZEROALL,
26866 IX86_BUILTIN_VZEROUPPER,
26867 IX86_BUILTIN_VPERMILVARPD,
26868 IX86_BUILTIN_VPERMILVARPS,
26869 IX86_BUILTIN_VPERMILVARPD256,
26870 IX86_BUILTIN_VPERMILVARPS256,
26871 IX86_BUILTIN_VPERMILPD,
26872 IX86_BUILTIN_VPERMILPS,
26873 IX86_BUILTIN_VPERMILPD256,
26874 IX86_BUILTIN_VPERMILPS256,
26875 IX86_BUILTIN_VPERMIL2PD,
26876 IX86_BUILTIN_VPERMIL2PS,
26877 IX86_BUILTIN_VPERMIL2PD256,
26878 IX86_BUILTIN_VPERMIL2PS256,
26879 IX86_BUILTIN_VPERM2F128PD256,
26880 IX86_BUILTIN_VPERM2F128PS256,
26881 IX86_BUILTIN_VPERM2F128SI256,
26882 IX86_BUILTIN_VBROADCASTSS,
26883 IX86_BUILTIN_VBROADCASTSD256,
26884 IX86_BUILTIN_VBROADCASTSS256,
26885 IX86_BUILTIN_VBROADCASTPD256,
26886 IX86_BUILTIN_VBROADCASTPS256,
26887 IX86_BUILTIN_VINSERTF128PD256,
26888 IX86_BUILTIN_VINSERTF128PS256,
26889 IX86_BUILTIN_VINSERTF128SI256,
26890 IX86_BUILTIN_LOADUPD256,
26891 IX86_BUILTIN_LOADUPS256,
26892 IX86_BUILTIN_STOREUPD256,
26893 IX86_BUILTIN_STOREUPS256,
26894 IX86_BUILTIN_LDDQU256,
26895 IX86_BUILTIN_MOVNTDQ256,
26896 IX86_BUILTIN_MOVNTPD256,
26897 IX86_BUILTIN_MOVNTPS256,
26898 IX86_BUILTIN_LOADDQU256,
26899 IX86_BUILTIN_STOREDQU256,
26900 IX86_BUILTIN_MASKLOADPD,
26901 IX86_BUILTIN_MASKLOADPS,
26902 IX86_BUILTIN_MASKSTOREPD,
26903 IX86_BUILTIN_MASKSTOREPS,
26904 IX86_BUILTIN_MASKLOADPD256,
26905 IX86_BUILTIN_MASKLOADPS256,
26906 IX86_BUILTIN_MASKSTOREPD256,
26907 IX86_BUILTIN_MASKSTOREPS256,
26908 IX86_BUILTIN_MOVSHDUP256,
26909 IX86_BUILTIN_MOVSLDUP256,
26910 IX86_BUILTIN_MOVDDUP256,
26912 IX86_BUILTIN_SQRTPD256,
26913 IX86_BUILTIN_SQRTPS256,
26914 IX86_BUILTIN_SQRTPS_NR256,
26915 IX86_BUILTIN_RSQRTPS256,
26916 IX86_BUILTIN_RSQRTPS_NR256,
26918 IX86_BUILTIN_RCPPS256,
26920 IX86_BUILTIN_ROUNDPD256,
26921 IX86_BUILTIN_ROUNDPS256,
26923 IX86_BUILTIN_FLOORPD256,
26924 IX86_BUILTIN_CEILPD256,
26925 IX86_BUILTIN_TRUNCPD256,
26926 IX86_BUILTIN_RINTPD256,
26927 IX86_BUILTIN_ROUNDPD_AZ256,
26929 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26930 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26931 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26933 IX86_BUILTIN_FLOORPS256,
26934 IX86_BUILTIN_CEILPS256,
26935 IX86_BUILTIN_TRUNCPS256,
26936 IX86_BUILTIN_RINTPS256,
26937 IX86_BUILTIN_ROUNDPS_AZ256,
26939 IX86_BUILTIN_FLOORPS_SFIX256,
26940 IX86_BUILTIN_CEILPS_SFIX256,
26941 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26943 IX86_BUILTIN_UNPCKHPD256,
26944 IX86_BUILTIN_UNPCKLPD256,
26945 IX86_BUILTIN_UNPCKHPS256,
26946 IX86_BUILTIN_UNPCKLPS256,
26948 IX86_BUILTIN_SI256_SI,
26949 IX86_BUILTIN_PS256_PS,
26950 IX86_BUILTIN_PD256_PD,
26951 IX86_BUILTIN_SI_SI256,
26952 IX86_BUILTIN_PS_PS256,
26953 IX86_BUILTIN_PD_PD256,
26955 IX86_BUILTIN_VTESTZPD,
26956 IX86_BUILTIN_VTESTCPD,
26957 IX86_BUILTIN_VTESTNZCPD,
26958 IX86_BUILTIN_VTESTZPS,
26959 IX86_BUILTIN_VTESTCPS,
26960 IX86_BUILTIN_VTESTNZCPS,
26961 IX86_BUILTIN_VTESTZPD256,
26962 IX86_BUILTIN_VTESTCPD256,
26963 IX86_BUILTIN_VTESTNZCPD256,
26964 IX86_BUILTIN_VTESTZPS256,
26965 IX86_BUILTIN_VTESTCPS256,
26966 IX86_BUILTIN_VTESTNZCPS256,
26967 IX86_BUILTIN_PTESTZ256,
26968 IX86_BUILTIN_PTESTC256,
26969 IX86_BUILTIN_PTESTNZC256,
26971 IX86_BUILTIN_MOVMSKPD256,
26972 IX86_BUILTIN_MOVMSKPS256,
26974 /* AVX2 */
26975 IX86_BUILTIN_MPSADBW256,
26976 IX86_BUILTIN_PABSB256,
26977 IX86_BUILTIN_PABSW256,
26978 IX86_BUILTIN_PABSD256,
26979 IX86_BUILTIN_PACKSSDW256,
26980 IX86_BUILTIN_PACKSSWB256,
26981 IX86_BUILTIN_PACKUSDW256,
26982 IX86_BUILTIN_PACKUSWB256,
26983 IX86_BUILTIN_PADDB256,
26984 IX86_BUILTIN_PADDW256,
26985 IX86_BUILTIN_PADDD256,
26986 IX86_BUILTIN_PADDQ256,
26987 IX86_BUILTIN_PADDSB256,
26988 IX86_BUILTIN_PADDSW256,
26989 IX86_BUILTIN_PADDUSB256,
26990 IX86_BUILTIN_PADDUSW256,
26991 IX86_BUILTIN_PALIGNR256,
26992 IX86_BUILTIN_AND256I,
26993 IX86_BUILTIN_ANDNOT256I,
26994 IX86_BUILTIN_PAVGB256,
26995 IX86_BUILTIN_PAVGW256,
26996 IX86_BUILTIN_PBLENDVB256,
26997 IX86_BUILTIN_PBLENDVW256,
26998 IX86_BUILTIN_PCMPEQB256,
26999 IX86_BUILTIN_PCMPEQW256,
27000 IX86_BUILTIN_PCMPEQD256,
27001 IX86_BUILTIN_PCMPEQQ256,
27002 IX86_BUILTIN_PCMPGTB256,
27003 IX86_BUILTIN_PCMPGTW256,
27004 IX86_BUILTIN_PCMPGTD256,
27005 IX86_BUILTIN_PCMPGTQ256,
27006 IX86_BUILTIN_PHADDW256,
27007 IX86_BUILTIN_PHADDD256,
27008 IX86_BUILTIN_PHADDSW256,
27009 IX86_BUILTIN_PHSUBW256,
27010 IX86_BUILTIN_PHSUBD256,
27011 IX86_BUILTIN_PHSUBSW256,
27012 IX86_BUILTIN_PMADDUBSW256,
27013 IX86_BUILTIN_PMADDWD256,
27014 IX86_BUILTIN_PMAXSB256,
27015 IX86_BUILTIN_PMAXSW256,
27016 IX86_BUILTIN_PMAXSD256,
27017 IX86_BUILTIN_PMAXUB256,
27018 IX86_BUILTIN_PMAXUW256,
27019 IX86_BUILTIN_PMAXUD256,
27020 IX86_BUILTIN_PMINSB256,
27021 IX86_BUILTIN_PMINSW256,
27022 IX86_BUILTIN_PMINSD256,
27023 IX86_BUILTIN_PMINUB256,
27024 IX86_BUILTIN_PMINUW256,
27025 IX86_BUILTIN_PMINUD256,
27026 IX86_BUILTIN_PMOVMSKB256,
27027 IX86_BUILTIN_PMOVSXBW256,
27028 IX86_BUILTIN_PMOVSXBD256,
27029 IX86_BUILTIN_PMOVSXBQ256,
27030 IX86_BUILTIN_PMOVSXWD256,
27031 IX86_BUILTIN_PMOVSXWQ256,
27032 IX86_BUILTIN_PMOVSXDQ256,
27033 IX86_BUILTIN_PMOVZXBW256,
27034 IX86_BUILTIN_PMOVZXBD256,
27035 IX86_BUILTIN_PMOVZXBQ256,
27036 IX86_BUILTIN_PMOVZXWD256,
27037 IX86_BUILTIN_PMOVZXWQ256,
27038 IX86_BUILTIN_PMOVZXDQ256,
27039 IX86_BUILTIN_PMULDQ256,
27040 IX86_BUILTIN_PMULHRSW256,
27041 IX86_BUILTIN_PMULHUW256,
27042 IX86_BUILTIN_PMULHW256,
27043 IX86_BUILTIN_PMULLW256,
27044 IX86_BUILTIN_PMULLD256,
27045 IX86_BUILTIN_PMULUDQ256,
27046 IX86_BUILTIN_POR256,
27047 IX86_BUILTIN_PSADBW256,
27048 IX86_BUILTIN_PSHUFB256,
27049 IX86_BUILTIN_PSHUFD256,
27050 IX86_BUILTIN_PSHUFHW256,
27051 IX86_BUILTIN_PSHUFLW256,
27052 IX86_BUILTIN_PSIGNB256,
27053 IX86_BUILTIN_PSIGNW256,
27054 IX86_BUILTIN_PSIGND256,
27055 IX86_BUILTIN_PSLLDQI256,
27056 IX86_BUILTIN_PSLLWI256,
27057 IX86_BUILTIN_PSLLW256,
27058 IX86_BUILTIN_PSLLDI256,
27059 IX86_BUILTIN_PSLLD256,
27060 IX86_BUILTIN_PSLLQI256,
27061 IX86_BUILTIN_PSLLQ256,
27062 IX86_BUILTIN_PSRAWI256,
27063 IX86_BUILTIN_PSRAW256,
27064 IX86_BUILTIN_PSRADI256,
27065 IX86_BUILTIN_PSRAD256,
27066 IX86_BUILTIN_PSRLDQI256,
27067 IX86_BUILTIN_PSRLWI256,
27068 IX86_BUILTIN_PSRLW256,
27069 IX86_BUILTIN_PSRLDI256,
27070 IX86_BUILTIN_PSRLD256,
27071 IX86_BUILTIN_PSRLQI256,
27072 IX86_BUILTIN_PSRLQ256,
27073 IX86_BUILTIN_PSUBB256,
27074 IX86_BUILTIN_PSUBW256,
27075 IX86_BUILTIN_PSUBD256,
27076 IX86_BUILTIN_PSUBQ256,
27077 IX86_BUILTIN_PSUBSB256,
27078 IX86_BUILTIN_PSUBSW256,
27079 IX86_BUILTIN_PSUBUSB256,
27080 IX86_BUILTIN_PSUBUSW256,
27081 IX86_BUILTIN_PUNPCKHBW256,
27082 IX86_BUILTIN_PUNPCKHWD256,
27083 IX86_BUILTIN_PUNPCKHDQ256,
27084 IX86_BUILTIN_PUNPCKHQDQ256,
27085 IX86_BUILTIN_PUNPCKLBW256,
27086 IX86_BUILTIN_PUNPCKLWD256,
27087 IX86_BUILTIN_PUNPCKLDQ256,
27088 IX86_BUILTIN_PUNPCKLQDQ256,
27089 IX86_BUILTIN_PXOR256,
27090 IX86_BUILTIN_MOVNTDQA256,
27091 IX86_BUILTIN_VBROADCASTSS_PS,
27092 IX86_BUILTIN_VBROADCASTSS_PS256,
27093 IX86_BUILTIN_VBROADCASTSD_PD256,
27094 IX86_BUILTIN_VBROADCASTSI256,
27095 IX86_BUILTIN_PBLENDD256,
27096 IX86_BUILTIN_PBLENDD128,
27097 IX86_BUILTIN_PBROADCASTB256,
27098 IX86_BUILTIN_PBROADCASTW256,
27099 IX86_BUILTIN_PBROADCASTD256,
27100 IX86_BUILTIN_PBROADCASTQ256,
27101 IX86_BUILTIN_PBROADCASTB128,
27102 IX86_BUILTIN_PBROADCASTW128,
27103 IX86_BUILTIN_PBROADCASTD128,
27104 IX86_BUILTIN_PBROADCASTQ128,
27105 IX86_BUILTIN_VPERMVARSI256,
27106 IX86_BUILTIN_VPERMDF256,
27107 IX86_BUILTIN_VPERMVARSF256,
27108 IX86_BUILTIN_VPERMDI256,
27109 IX86_BUILTIN_VPERMTI256,
27110 IX86_BUILTIN_VEXTRACT128I256,
27111 IX86_BUILTIN_VINSERT128I256,
27112 IX86_BUILTIN_MASKLOADD,
27113 IX86_BUILTIN_MASKLOADQ,
27114 IX86_BUILTIN_MASKLOADD256,
27115 IX86_BUILTIN_MASKLOADQ256,
27116 IX86_BUILTIN_MASKSTORED,
27117 IX86_BUILTIN_MASKSTOREQ,
27118 IX86_BUILTIN_MASKSTORED256,
27119 IX86_BUILTIN_MASKSTOREQ256,
27120 IX86_BUILTIN_PSLLVV4DI,
27121 IX86_BUILTIN_PSLLVV2DI,
27122 IX86_BUILTIN_PSLLVV8SI,
27123 IX86_BUILTIN_PSLLVV4SI,
27124 IX86_BUILTIN_PSRAVV8SI,
27125 IX86_BUILTIN_PSRAVV4SI,
27126 IX86_BUILTIN_PSRLVV4DI,
27127 IX86_BUILTIN_PSRLVV2DI,
27128 IX86_BUILTIN_PSRLVV8SI,
27129 IX86_BUILTIN_PSRLVV4SI,
27131 IX86_BUILTIN_GATHERSIV2DF,
27132 IX86_BUILTIN_GATHERSIV4DF,
27133 IX86_BUILTIN_GATHERDIV2DF,
27134 IX86_BUILTIN_GATHERDIV4DF,
27135 IX86_BUILTIN_GATHERSIV4SF,
27136 IX86_BUILTIN_GATHERSIV8SF,
27137 IX86_BUILTIN_GATHERDIV4SF,
27138 IX86_BUILTIN_GATHERDIV8SF,
27139 IX86_BUILTIN_GATHERSIV2DI,
27140 IX86_BUILTIN_GATHERSIV4DI,
27141 IX86_BUILTIN_GATHERDIV2DI,
27142 IX86_BUILTIN_GATHERDIV4DI,
27143 IX86_BUILTIN_GATHERSIV4SI,
27144 IX86_BUILTIN_GATHERSIV8SI,
27145 IX86_BUILTIN_GATHERDIV4SI,
27146 IX86_BUILTIN_GATHERDIV8SI,
27148 /* Alternate 4-element gather for the vectorizer where
27149 all operands are 32-byte wide. */
27150 IX86_BUILTIN_GATHERALTSIV4DF,
27151 IX86_BUILTIN_GATHERALTDIV8SF,
27152 IX86_BUILTIN_GATHERALTSIV4DI,
27153 IX86_BUILTIN_GATHERALTDIV8SI,
27155 /* TFmode support builtins. */
27156 IX86_BUILTIN_INFQ,
27157 IX86_BUILTIN_HUGE_VALQ,
27158 IX86_BUILTIN_FABSQ,
27159 IX86_BUILTIN_COPYSIGNQ,
27161 /* Vectorizer support builtins. */
27162 IX86_BUILTIN_CPYSGNPS,
27163 IX86_BUILTIN_CPYSGNPD,
27164 IX86_BUILTIN_CPYSGNPS256,
27165 IX86_BUILTIN_CPYSGNPD256,
27167 /* FMA4 instructions. */
27168 IX86_BUILTIN_VFMADDSS,
27169 IX86_BUILTIN_VFMADDSD,
27170 IX86_BUILTIN_VFMADDPS,
27171 IX86_BUILTIN_VFMADDPD,
27172 IX86_BUILTIN_VFMADDPS256,
27173 IX86_BUILTIN_VFMADDPD256,
27174 IX86_BUILTIN_VFMADDSUBPS,
27175 IX86_BUILTIN_VFMADDSUBPD,
27176 IX86_BUILTIN_VFMADDSUBPS256,
27177 IX86_BUILTIN_VFMADDSUBPD256,
27179 /* FMA3 instructions. */
27180 IX86_BUILTIN_VFMADDSS3,
27181 IX86_BUILTIN_VFMADDSD3,
27183 /* XOP instructions. */
27184 IX86_BUILTIN_VPCMOV,
27185 IX86_BUILTIN_VPCMOV_V2DI,
27186 IX86_BUILTIN_VPCMOV_V4SI,
27187 IX86_BUILTIN_VPCMOV_V8HI,
27188 IX86_BUILTIN_VPCMOV_V16QI,
27189 IX86_BUILTIN_VPCMOV_V4SF,
27190 IX86_BUILTIN_VPCMOV_V2DF,
27191 IX86_BUILTIN_VPCMOV256,
27192 IX86_BUILTIN_VPCMOV_V4DI256,
27193 IX86_BUILTIN_VPCMOV_V8SI256,
27194 IX86_BUILTIN_VPCMOV_V16HI256,
27195 IX86_BUILTIN_VPCMOV_V32QI256,
27196 IX86_BUILTIN_VPCMOV_V8SF256,
27197 IX86_BUILTIN_VPCMOV_V4DF256,
27199 IX86_BUILTIN_VPPERM,
27201 IX86_BUILTIN_VPMACSSWW,
27202 IX86_BUILTIN_VPMACSWW,
27203 IX86_BUILTIN_VPMACSSWD,
27204 IX86_BUILTIN_VPMACSWD,
27205 IX86_BUILTIN_VPMACSSDD,
27206 IX86_BUILTIN_VPMACSDD,
27207 IX86_BUILTIN_VPMACSSDQL,
27208 IX86_BUILTIN_VPMACSSDQH,
27209 IX86_BUILTIN_VPMACSDQL,
27210 IX86_BUILTIN_VPMACSDQH,
27211 IX86_BUILTIN_VPMADCSSWD,
27212 IX86_BUILTIN_VPMADCSWD,
27214 IX86_BUILTIN_VPHADDBW,
27215 IX86_BUILTIN_VPHADDBD,
27216 IX86_BUILTIN_VPHADDBQ,
27217 IX86_BUILTIN_VPHADDWD,
27218 IX86_BUILTIN_VPHADDWQ,
27219 IX86_BUILTIN_VPHADDDQ,
27220 IX86_BUILTIN_VPHADDUBW,
27221 IX86_BUILTIN_VPHADDUBD,
27222 IX86_BUILTIN_VPHADDUBQ,
27223 IX86_BUILTIN_VPHADDUWD,
27224 IX86_BUILTIN_VPHADDUWQ,
27225 IX86_BUILTIN_VPHADDUDQ,
27226 IX86_BUILTIN_VPHSUBBW,
27227 IX86_BUILTIN_VPHSUBWD,
27228 IX86_BUILTIN_VPHSUBDQ,
27230 IX86_BUILTIN_VPROTB,
27231 IX86_BUILTIN_VPROTW,
27232 IX86_BUILTIN_VPROTD,
27233 IX86_BUILTIN_VPROTQ,
27234 IX86_BUILTIN_VPROTB_IMM,
27235 IX86_BUILTIN_VPROTW_IMM,
27236 IX86_BUILTIN_VPROTD_IMM,
27237 IX86_BUILTIN_VPROTQ_IMM,
27239 IX86_BUILTIN_VPSHLB,
27240 IX86_BUILTIN_VPSHLW,
27241 IX86_BUILTIN_VPSHLD,
27242 IX86_BUILTIN_VPSHLQ,
27243 IX86_BUILTIN_VPSHAB,
27244 IX86_BUILTIN_VPSHAW,
27245 IX86_BUILTIN_VPSHAD,
27246 IX86_BUILTIN_VPSHAQ,
27248 IX86_BUILTIN_VFRCZSS,
27249 IX86_BUILTIN_VFRCZSD,
27250 IX86_BUILTIN_VFRCZPS,
27251 IX86_BUILTIN_VFRCZPD,
27252 IX86_BUILTIN_VFRCZPS256,
27253 IX86_BUILTIN_VFRCZPD256,
27255 IX86_BUILTIN_VPCOMEQUB,
27256 IX86_BUILTIN_VPCOMNEUB,
27257 IX86_BUILTIN_VPCOMLTUB,
27258 IX86_BUILTIN_VPCOMLEUB,
27259 IX86_BUILTIN_VPCOMGTUB,
27260 IX86_BUILTIN_VPCOMGEUB,
27261 IX86_BUILTIN_VPCOMFALSEUB,
27262 IX86_BUILTIN_VPCOMTRUEUB,
27264 IX86_BUILTIN_VPCOMEQUW,
27265 IX86_BUILTIN_VPCOMNEUW,
27266 IX86_BUILTIN_VPCOMLTUW,
27267 IX86_BUILTIN_VPCOMLEUW,
27268 IX86_BUILTIN_VPCOMGTUW,
27269 IX86_BUILTIN_VPCOMGEUW,
27270 IX86_BUILTIN_VPCOMFALSEUW,
27271 IX86_BUILTIN_VPCOMTRUEUW,
27273 IX86_BUILTIN_VPCOMEQUD,
27274 IX86_BUILTIN_VPCOMNEUD,
27275 IX86_BUILTIN_VPCOMLTUD,
27276 IX86_BUILTIN_VPCOMLEUD,
27277 IX86_BUILTIN_VPCOMGTUD,
27278 IX86_BUILTIN_VPCOMGEUD,
27279 IX86_BUILTIN_VPCOMFALSEUD,
27280 IX86_BUILTIN_VPCOMTRUEUD,
27282 IX86_BUILTIN_VPCOMEQUQ,
27283 IX86_BUILTIN_VPCOMNEUQ,
27284 IX86_BUILTIN_VPCOMLTUQ,
27285 IX86_BUILTIN_VPCOMLEUQ,
27286 IX86_BUILTIN_VPCOMGTUQ,
27287 IX86_BUILTIN_VPCOMGEUQ,
27288 IX86_BUILTIN_VPCOMFALSEUQ,
27289 IX86_BUILTIN_VPCOMTRUEUQ,
27291 IX86_BUILTIN_VPCOMEQB,
27292 IX86_BUILTIN_VPCOMNEB,
27293 IX86_BUILTIN_VPCOMLTB,
27294 IX86_BUILTIN_VPCOMLEB,
27295 IX86_BUILTIN_VPCOMGTB,
27296 IX86_BUILTIN_VPCOMGEB,
27297 IX86_BUILTIN_VPCOMFALSEB,
27298 IX86_BUILTIN_VPCOMTRUEB,
27300 IX86_BUILTIN_VPCOMEQW,
27301 IX86_BUILTIN_VPCOMNEW,
27302 IX86_BUILTIN_VPCOMLTW,
27303 IX86_BUILTIN_VPCOMLEW,
27304 IX86_BUILTIN_VPCOMGTW,
27305 IX86_BUILTIN_VPCOMGEW,
27306 IX86_BUILTIN_VPCOMFALSEW,
27307 IX86_BUILTIN_VPCOMTRUEW,
27309 IX86_BUILTIN_VPCOMEQD,
27310 IX86_BUILTIN_VPCOMNED,
27311 IX86_BUILTIN_VPCOMLTD,
27312 IX86_BUILTIN_VPCOMLED,
27313 IX86_BUILTIN_VPCOMGTD,
27314 IX86_BUILTIN_VPCOMGED,
27315 IX86_BUILTIN_VPCOMFALSED,
27316 IX86_BUILTIN_VPCOMTRUED,
27318 IX86_BUILTIN_VPCOMEQQ,
27319 IX86_BUILTIN_VPCOMNEQ,
27320 IX86_BUILTIN_VPCOMLTQ,
27321 IX86_BUILTIN_VPCOMLEQ,
27322 IX86_BUILTIN_VPCOMGTQ,
27323 IX86_BUILTIN_VPCOMGEQ,
27324 IX86_BUILTIN_VPCOMFALSEQ,
27325 IX86_BUILTIN_VPCOMTRUEQ,
27327 /* LWP instructions. */
27328 IX86_BUILTIN_LLWPCB,
27329 IX86_BUILTIN_SLWPCB,
27330 IX86_BUILTIN_LWPVAL32,
27331 IX86_BUILTIN_LWPVAL64,
27332 IX86_BUILTIN_LWPINS32,
27333 IX86_BUILTIN_LWPINS64,
27335 IX86_BUILTIN_CLZS,
27337 /* RTM */
27338 IX86_BUILTIN_XBEGIN,
27339 IX86_BUILTIN_XEND,
27340 IX86_BUILTIN_XABORT,
27341 IX86_BUILTIN_XTEST,
27343 /* BMI instructions. */
27344 IX86_BUILTIN_BEXTR32,
27345 IX86_BUILTIN_BEXTR64,
27346 IX86_BUILTIN_CTZS,
27348 /* TBM instructions. */
27349 IX86_BUILTIN_BEXTRI32,
27350 IX86_BUILTIN_BEXTRI64,
27352 /* BMI2 instructions. */
27353 IX86_BUILTIN_BZHI32,
27354 IX86_BUILTIN_BZHI64,
27355 IX86_BUILTIN_PDEP32,
27356 IX86_BUILTIN_PDEP64,
27357 IX86_BUILTIN_PEXT32,
27358 IX86_BUILTIN_PEXT64,
27360 /* ADX instructions. */
27361 IX86_BUILTIN_ADDCARRYX32,
27362 IX86_BUILTIN_ADDCARRYX64,
27364 /* FSGSBASE instructions. */
27365 IX86_BUILTIN_RDFSBASE32,
27366 IX86_BUILTIN_RDFSBASE64,
27367 IX86_BUILTIN_RDGSBASE32,
27368 IX86_BUILTIN_RDGSBASE64,
27369 IX86_BUILTIN_WRFSBASE32,
27370 IX86_BUILTIN_WRFSBASE64,
27371 IX86_BUILTIN_WRGSBASE32,
27372 IX86_BUILTIN_WRGSBASE64,
27374 /* RDRND instructions. */
27375 IX86_BUILTIN_RDRAND16_STEP,
27376 IX86_BUILTIN_RDRAND32_STEP,
27377 IX86_BUILTIN_RDRAND64_STEP,
27379 /* RDSEED instructions. */
27380 IX86_BUILTIN_RDSEED16_STEP,
27381 IX86_BUILTIN_RDSEED32_STEP,
27382 IX86_BUILTIN_RDSEED64_STEP,
27384 /* F16C instructions. */
27385 IX86_BUILTIN_CVTPH2PS,
27386 IX86_BUILTIN_CVTPH2PS256,
27387 IX86_BUILTIN_CVTPS2PH,
27388 IX86_BUILTIN_CVTPS2PH256,
27390 /* CFString built-in for Darwin. */
27391 IX86_BUILTIN_CFSTRING,
27393 /* Builtins to get CPU type and supported features. */
27394 IX86_BUILTIN_CPU_INIT,
27395 IX86_BUILTIN_CPU_IS,
27396 IX86_BUILTIN_CPU_SUPPORTS,
27398 IX86_BUILTIN_MAX
27399 };
27401 /* Table for the ix86 builtin decls. */
27402 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27404 /* Table of all of the builtin functions that are possible with different ISAs
27405 but are waiting to be built until a function is declared to use that
27406 ISA. */
27407 struct builtin_isa {
27408 const char *name; /* function name */
27409 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27410 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27411 bool const_p; /* true if the declaration is constant */
27412 bool set_and_not_built_p;
27413 };
27415 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27418 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27419 of which isa_flags to use in the ix86_builtins_isa array. Store the
27420 function decl in the ix86_builtins array. Return the function decl or
27421 NULL_TREE if the builtin was not added.
27423 If the front end has a special hook for builtin functions, delay adding
27424 builtin functions that aren't in the current ISA until the ISA is changed
27425 with function-specific optimization. Doing so can save about 300K for the
27426 default compiler. When the builtin is expanded, check at that time whether
27427 it is valid.
27429 If the front end doesn't have a special hook, record all builtins, even
27430 those that aren't in the current ISA, in case the user uses
27431 function-specific options for a different ISA, so that we don't get scope
27432 errors if a builtin is added in the middle of a function scope. */
27434 static inline tree
27435 def_builtin (HOST_WIDE_INT mask, const char *name,
27436 enum ix86_builtin_func_type tcode,
27437 enum ix86_builtins code)
27438 {
27439 tree decl = NULL_TREE;
27441 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27442 {
27443 ix86_builtins_isa[(int) code].isa = mask;
27445 mask &= ~OPTION_MASK_ISA_64BIT;
27446 if (mask == 0
27447 || (mask & ix86_isa_flags) != 0
27448 || (lang_hooks.builtin_function
27449 == lang_hooks.builtin_function_ext_scope))
27451 {
27452 tree type = ix86_get_builtin_func_type (tcode);
27453 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27454 NULL, NULL_TREE);
27455 ix86_builtins[(int) code] = decl;
27456 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27457 }
27458 else
27459 {
27460 ix86_builtins[(int) code] = NULL_TREE;
27461 ix86_builtins_isa[(int) code].tcode = tcode;
27462 ix86_builtins_isa[(int) code].name = name;
27463 ix86_builtins_isa[(int) code].const_p = false;
27464 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27465 }
27466 }
27468 return decl;
27469 }
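/* A minimal usage sketch of def_builtin; the particular mask, name, type
   and code below are only illustrative (most registrations in this file are
   driven from the bdesc_* tables further down):

     def_builtin (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_movntdqa",
                  V2DI_FTYPE_PV2DI, IX86_BUILTIN_MOVNTDQA);

   If OPTION_MASK_ISA_SSE4_1 is already in ix86_isa_flags (or the front end
   uses the ext-scope hook), the decl is built right away; otherwise the
   name, type code and builtin code are parked in ix86_builtins_isa and the
   decl is created later by ix86_add_new_builtins.  */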
27471 /* Like def_builtin, but also marks the function decl "const". */
27473 static inline tree
27474 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27475 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27476 {
27477 tree decl = def_builtin (mask, name, tcode, code);
27478 if (decl)
27479 TREE_READONLY (decl) = 1;
27480 else
27481 ix86_builtins_isa[(int) code].const_p = true;
27483 return decl;
27484 }
27486 /* Add any new builtin functions for a given ISA that may not have been
27487 declared. This saves a bit of space compared to adding all of the
27488 declarations to the tree, including ones that are never used. */
27490 static void
27491 ix86_add_new_builtins (HOST_WIDE_INT isa)
27492 {
27493 int i;
27495 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27496 {
27497 if ((ix86_builtins_isa[i].isa & isa) != 0
27498 && ix86_builtins_isa[i].set_and_not_built_p)
27499 {
27500 tree decl, type;
27502 /* Don't define the builtin again. */
27503 ix86_builtins_isa[i].set_and_not_built_p = false;
27505 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27506 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27507 type, i, BUILT_IN_MD, NULL,
27508 NULL_TREE);
27510 ix86_builtins[i] = decl;
27511 if (ix86_builtins_isa[i].const_p)
27512 TREE_READONLY (decl) = 1;
27513 }
27514 }
27515 }
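/* A rough sketch of how the deferral handled above typically plays out;
   the user code is only an example trigger and is not part of this file:

     __attribute__((target ("sse4.2")))
     unsigned int crc8 (unsigned int c, unsigned char v)
     {
       return __builtin_ia32_crc32qi (c, v);
     }

   If the default ISA lacks SSE4.2, def_builtin parks the crc32 entries
   instead of building them; processing the target attribute (or the
   equivalent pragma) is expected to enable the extra isa_flags and call
   ix86_add_new_builtins, after which the parked decls are available to the
   function above.  */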
27517 /* Bits for builtin_description.flag. */
27519 /* Set when we don't support the comparison natively, and should
27520 swap the comparison operands in order to support it. */
27521 #define BUILTIN_DESC_SWAP_OPERANDS 1
27523 struct builtin_description
27524 {
27525 const HOST_WIDE_INT mask;
27526 const enum insn_code icode;
27527 const char *const name;
27528 const enum ix86_builtins code;
27529 const enum rtx_code comparison;
27530 const int flag;
27531 };
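/* The bdesc_* tables that follow are pure data; a simplified sketch of how
   an entry is consumed during builtin initialization later in this file
   (shown for orientation only):

     const struct builtin_description *d;
     enum ix86_builtin_func_type ftype;
     size_t i;

     for (i = 0, d = bdesc_args; i < ARRAY_SIZE (bdesc_args); i++, d++)
       {
         if (d->name == 0)
           continue;
         ftype = (enum ix86_builtin_func_type) d->flag;
         def_builtin_const (d->mask, d->name, ftype, d->code);
       }

   In bdesc_args and bdesc_special_args the flag field carries the
   (int)-cast function type, as the entries show; in bdesc_comi,
   bdesc_pcmpestr and bdesc_pcmpistr it holds zero or a condition-code
   mode instead.  */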
27533 static const struct builtin_description bdesc_comi[] =
27534 {
27535 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27536 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27537 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27538 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27539 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27540 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27541 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27542 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27543 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27544 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27545 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27546 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27547 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27548 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27549 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27550 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27551 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27552 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27553 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27554 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27555 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27556 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27557 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27558 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27559 };
27561 static const struct builtin_description bdesc_pcmpestr[] =
27562 {
27563 /* SSE4.2 */
27564 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27565 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27566 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27567 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27568 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27569 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27570 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27571 };
27573 static const struct builtin_description bdesc_pcmpistr[] =
27574 {
27575 /* SSE4.2 */
27576 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27577 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27578 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27579 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27580 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27581 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27582 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27583 };
27585 /* Special builtins with a variable number of arguments. */
27586 static const struct builtin_description bdesc_special_args[] =
27587 {
27588 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27589 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27590 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27592 /* MMX */
27593 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27595 /* 3DNow! */
27596 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27598 /* FXSR, XSAVE and XSAVEOPT */
27599 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27600 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27601 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27602 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27603 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27605 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27606 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27607 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27608 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27609 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27611 /* SSE */
27612 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27613 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27614 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27616 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27617 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27618 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27619 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27621 /* SSE or 3DNow!A */
27622 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27623 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27625 /* SSE2 */
27626 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27627 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27628 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27629 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27630 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27631 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27632 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27633 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27634 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27635 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27637 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27638 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27640 /* SSE3 */
27641 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27643 /* SSE4.1 */
27644 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27646 /* SSE4A */
27647 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27648 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27650 /* AVX */
27651 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27654 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27655 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27656 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27662 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27663 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27664 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27672 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27675 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27677 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27678 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27681 /* AVX2 */
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27692 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27693 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27694 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27695 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27696 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27697 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27699 /* FSGSBASE */
27700 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27701 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27702 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27703 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27704 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27705 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27706 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27707 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27709 /* RTM */
27710 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27711 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27712 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
27713 };
27715 /* Builtins with a variable number of arguments. */
27716 static const struct builtin_description bdesc_args[] =
27717 {
27718 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27719 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27720 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27721 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27722 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27723 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27724 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27726 /* MMX */
27727 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27728 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27729 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27730 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27731 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27732 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27734 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27735 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27736 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27737 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27738 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27739 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27740 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27741 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27743 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27744 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27746 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27747 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27748 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27749 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27751 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27752 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27753 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27754 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27755 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27756 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27758 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27759 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27760 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27761 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27762 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27763 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27765 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27766 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27767 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27769 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27771 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27772 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27773 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27774 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27775 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27776 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27778 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27779 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27780 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27781 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27782 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27783 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27785 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27786 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27787 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27788 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27790 /* 3DNow! */
27791 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27792 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27793 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27794 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27796 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27797 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27798 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27799 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27800 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27801 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27802 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27803 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27804 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27805 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27806 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27807 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27808 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27809 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27810 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27812 /* 3DNow!A */
27813 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27814 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27815 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27816 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27817 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27818 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27820 /* SSE */
27821 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27822 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27823 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27824 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27825 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27826 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27827 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27828 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27829 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27830 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27831 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27832 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27834 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27836 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27837 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27838 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27839 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27840 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27841 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27842 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27847 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27848 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27849 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27850 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27851 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27852 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27853 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27854 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27855 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27856 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27857 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27858 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27859 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27860 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27861 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27862 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27863 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27864 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
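/* The cmp* rows above reuse the comparison slot to carry the rtx code that
   the expander materializes.  cmpgt/cmpge map to LT/LE with a _SWAP function
   type (the operands are exchanged, since the SSE compare predicates only
   provide EQ/LT/LE and their negations), and the negated forms use unordered
   codes (e.g. cmpnlt -> UNGE) because "not less than" must also hold when an
   operand is a NaN.  Hypothetical user-level illustration, not part of this
   table:

     __m128 a, b;
     __m128 gt = __builtin_ia32_cmpgtps (a, b);   // same mask as ...
     __m128 lt = __builtin_ia32_cmpltps (b, a);   // ... with operands swapped
*/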
27866 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27867 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27868 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27869 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27871 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27872 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27873 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27874 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27876 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27878 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27879 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27880 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27881 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27882 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27884 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27885 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27886 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27888 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27894 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27895 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27897 /* SSE MMX or 3Dnow!A */
27898 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27899 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27900 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27902 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27903 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27904 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27905 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27907 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27908 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27910 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27912 /* SSE2 */
27913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27917 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27918 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27919 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27921 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27922 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27923 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27924 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27925 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27927 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27929 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27930 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27931 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27932 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27934 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27935 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27936 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27938 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27939 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27940 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27941 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27942 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27943 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27944 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27945 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27947 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27948 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27949 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27950 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27951 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27952 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27953 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27954 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27955 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27956 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27957 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27958 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27959 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27960 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27961 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27962 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27963 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27964 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27965 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27966 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27968 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27969 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27973 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27975 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27976 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27978 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27980 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27981 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27982 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27984 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27986 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27987 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27988 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27989 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27990 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27991 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27992 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27993 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27995 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27996 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27997 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27998 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27999 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28000 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28001 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28002 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28004 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28005 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28007 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28008 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28009 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28010 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28012 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28013 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28015 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28016 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28017 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28018 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28022 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28023 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28024 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28027 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28028 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28029 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28030 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28031 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28032 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28033 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28034 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28036 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28037 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28038 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28040 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28041 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28043 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28044 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28046 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28048 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28049 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28050 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28051 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28053 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28054 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28055 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28056 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28057 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28058 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28059 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28061 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28062 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28063 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28064 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28065 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28066 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28067 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28069 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28070 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28071 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28072 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
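/* In the shift rows above, the _COUNT suffix on the function type marks the
   last operand as a shift count (a scalar int for the *i immediate forms, or
   a vector whose low quadword supplies the count for the register forms),
   while _INT_CONVERT marks rows such as pslldqi128/psrldqi128 whose V2DI
   operands have to be viewed in the V1TI mode of the underlying pattern;
   both cases are sorted out in ix86_expand_args_builtin.  Note that the
   pslldqi128/psrldqi128 count is in bits, which is why <emmintrin.h>
   multiplies the byte count of _mm_slli_si128/_mm_srli_si128 by 8.  */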
28074 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28075 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28076 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28080 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28082 /* SSE2 MMX */
28083 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28084 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28086 /* SSE3 */
28087 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28088 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28090 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28091 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28092 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28093 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28094 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28095 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28097 /* SSSE3 */
28098 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28099 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28100 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28101 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28102 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28103 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28105 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28106 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28107 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28108 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28109 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28110 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28111 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28112 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28113 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28114 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28115 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28116 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28117 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28118 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28119 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28120 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28121 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28122 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28123 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28124 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28125 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28126 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28127 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28128 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28130 /* SSSE3 (palignr) */
28131 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28132 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28134 /* SSE4.1 */
28135 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28136 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28137 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28138 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28139 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28140 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28141 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28142 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28143 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28144 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28146 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28147 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28148 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28149 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28150 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28151 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28152 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28153 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28154 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28155 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28156 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28157 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28158 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28160 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28161 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28162 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28163 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28164 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28165 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28166 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28167 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28168 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28169 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28170 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28171 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28173 /* SSE4.1 (rounding and ptest) */
28174 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28175 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28176 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28177 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
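/* In the floor/ceil/trunc/rint rows below, the comparison slot does not hold
   a comparison at all: it carries one of the ROUND_* constants (ROUND_FLOOR,
   ROUND_CEIL, ROUND_TRUNC, ROUND_MXCSR) cast to rtx_code, and the expander
   turns that value into the immediate rounding-control operand of the
   roundpd/roundps patterns.  __builtin_ia32_floorpd, for example, is simply
   roundpd with a fixed round-toward-negative-infinity immediate.  */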
28179 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28180 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28181 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28182 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28184 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28185 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28187 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28188 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28190 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28191 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28192 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28193 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28195 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28196 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28198 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28199 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28201 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28202 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28203 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
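/* The three ptest rows above share a single insn; the comparison code picks
   which PTEST flag the builtin returns: EQ reads ZF (ptestz), LTU reads CF
   (ptestc), and GTU tests CF == 0 && ZF == 0 (ptestnzc).  The vtestps/vtestpd
   rows in the AVX block further down follow the same convention.  */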
28205 /* SSE4.2 */
28206 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28207 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28208 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28209 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28210 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28212 /* SSE4A */
28213 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28214 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28215 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28216 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28218 /* AES */
28219 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28220 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28222 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28223 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28224 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28225 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28227 /* PCLMUL */
28228 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
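/* The AES and PCLMUL rows above (like the FABSQ/COPYSIGNQ rows earlier) have
   a null name: the table still provides the insn code and function type used
   at expansion time, but the user-visible builtin is registered separately so
   that it can be gated on the dedicated OPTION_MASK_ISA_AES /
   OPTION_MASK_ISA_PCLMUL masks rather than on the SSE2 mask listed here.  */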
28230 /* AVX */
28231 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28232 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28234 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28235 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28236 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28237 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28238 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28239 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28241 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28242 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28243 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28244 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28245 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28246 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28247 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28248 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28249 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28250 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28251 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28252 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28253 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28254 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28255 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28256 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28258 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28259 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28260 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28261 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28263 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28264 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28265 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28266 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28267 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28268 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28269 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28270 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28271 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28272 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28273 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28274 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28275 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28276 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28277 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28278 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28279 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28280 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28281 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28282 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28283 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28284 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28285 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28286 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28287 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28288 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28289 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28290 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28291 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28292 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28293 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28294 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28295 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28296 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28298 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28299 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28300 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28302 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28303 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28304 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28305 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28306 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28308 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28310 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28311 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28313 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28314 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28315 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28316 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28318 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28319 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28321 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28322 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28324 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28325 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28326 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28327 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28329 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28330 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28332 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28333 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28335 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28336 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28337 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28338 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28340 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28341 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28342 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28343 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28344 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28345 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28347 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28348 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28349 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28350 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28351 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28352 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28353 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28354 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28355 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28356 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28357 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28358 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28359 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28360 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28361 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28363 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28364 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28366 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28367 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28369 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28371 /* AVX2 */
28372 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28373 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28374 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28375 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28376 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28377 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28378 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28379 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28380 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28381 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28382 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28383 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28384 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28385 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28386 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28387 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28388 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28389 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28390 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28391 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28392 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28393 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28394 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28395 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28396 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28397 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28398 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28399 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28400 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28401 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28402 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28403 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28404 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28405 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28406 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28407 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28408 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28409 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28410 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28411 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28412 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28413 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28414 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28415 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28416 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28417 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28418 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28419 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28420 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28421 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28422 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28423 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28424 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28425 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28426 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28427 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28428 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28429 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28430 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28431 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28432 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28433 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28434 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28435 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28436 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28437 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28438 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28439 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28440 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28441 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28442 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28443 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28444 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28445 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28446 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28447 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28448 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28449 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28450 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28451 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28452 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28453 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28454 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28455 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28456 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28457 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28458 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28459 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28460 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28461 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28462 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28463 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28464 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28465 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28466 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28467 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28468 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28469 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28470 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28471 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28472 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28473 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28474 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28475 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28476 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28477 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28478 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28479 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28480 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28481 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28482 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28483 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28484 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28485 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28486 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28487 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28488 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28489 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28490 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28491 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28492 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28493 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28494 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28495 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28496 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28497 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28498 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28499 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28500 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28501 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28502 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28503 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28504 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28505 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28506 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28507 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28508 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28509 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28510 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28511 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28512 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28513 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28514 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28515 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28516 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28517 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
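/* LZCNT */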
28519 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28521 /* BMI */
28522 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28523 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28524 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28526 /* TBM */
28527 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28528 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28530 /* F16C */
28531 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28532 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28533 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28534 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28536 /* BMI2 */
28537 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28538 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28539 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28540 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28541 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28542 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28543 };
28545 /* FMA4 and XOP. */
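/* The MULTI_ARG_* macros below are shorthand for the ix86_builtin_func_type
   enumerators that describe each builtin's operand signature; they keep the
   bdesc_multi_arg table that follows readable.  */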
28546 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28547 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28548 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28549 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28550 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28551 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28552 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28553 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28554 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28555 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28556 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28557 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28558 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28559 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28560 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28561 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28562 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28563 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28564 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28565 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28566 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28567 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28568 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28569 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28570 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28571 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28572 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28573 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28574 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28575 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28576 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28577 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28578 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28579 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28580 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28581 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28582 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28583 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28584 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28585 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28586 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28587 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28588 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28589 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28590 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28591 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28592 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28593 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28594 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28595 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28596 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28597 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
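/* Multi-operand (FMA4, FMA and XOP) builtins.  Each entry gives the
   enabling ISA mask, the insn pattern, the builtin's name and enumerator,
   an optional comparison code, and its argument signature.  */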
28599 static const struct builtin_description bdesc_multi_arg[] =
28600 {
28601 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28602 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28603 UNKNOWN, (int)MULTI_ARG_3_SF },
28604 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28605 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28606 UNKNOWN, (int)MULTI_ARG_3_DF },
28608 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28609 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28610 UNKNOWN, (int)MULTI_ARG_3_SF },
28611 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28612 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28613 UNKNOWN, (int)MULTI_ARG_3_DF },
28615 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28616 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28617 UNKNOWN, (int)MULTI_ARG_3_SF },
28618 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28619 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28620 UNKNOWN, (int)MULTI_ARG_3_DF },
28621 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28622 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28623 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28624 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28625 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28626 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28628 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28629 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28630 UNKNOWN, (int)MULTI_ARG_3_SF },
28631 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28632 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28633 UNKNOWN, (int)MULTI_ARG_3_DF },
28634 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28635 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28636 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28637 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28638 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28639 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28641 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28642 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28643 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28644 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28645 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28646 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28647 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28649 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28650 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28651 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28652 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28653 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28654 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28655 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28657 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28659 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28660 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28661 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28662 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28663 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28664 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28665 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28666 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28667 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28668 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28669 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28670 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28672 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28673 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28674 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28675 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28676 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28677 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28678 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28679 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28680 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28681 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28682 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28683 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28684 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28685 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28686 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28687 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28689 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28690 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28691 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28692 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28693 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28694 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28696 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28697 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28698 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28699 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28700 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28701 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28702 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28703 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28704 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28705 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28706 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28707 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28708 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28709 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28710 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28712 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28713 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28714 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28715 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28716 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28717 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28718 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28720 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28721 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28722 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28723 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28724 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28725 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28726 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28728 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28729 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28730 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28731 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28732 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28733 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28734 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28736 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28737 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28738 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28739 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28740 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28741 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28742 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28744 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28745 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28746 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28747 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28748 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28749 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28750 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28752 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28753 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28754 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28755 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28756 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28757 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28758 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28760 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28761 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28762 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28763 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28764 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28765 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28766 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28768 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28769 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28770 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28771 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28772 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28773 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28774 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28776 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28777 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28778 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28779 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28780 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28781 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28782 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28783 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28785 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28786 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28787 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28788 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28789 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28790 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28791 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28792 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28794 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28795 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28796 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28797 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28798 };
28801 /* TM vector builtins. */
28803 /* Reuse the existing x86-specific `struct builtin_description' because
28804 we're lazy.  Add casts to make them fit.  */
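/* The MMX, SSE and AVX rows below provide the 64-, 128- and 256-bit
   variants of the TM load, store and log builtins, respectively.  */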
28805 static const struct builtin_description bdesc_tm[] =
28806 {
28807 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28808 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28809 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28810 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28811 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28812 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28813 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28815 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28816 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28817 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28818 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28819 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28820 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28821 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28823 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28824 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28825 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28826 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28827 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28828 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28829 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28831 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28832 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28833 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28836 /* TM callbacks. */
28838 /* Return the builtin decl needed to load a vector of TYPE. */
28840 static tree
28841 ix86_builtin_tm_load (tree type)
28843 if (TREE_CODE (type) == VECTOR_TYPE)
28845 switch (tree_low_cst (TYPE_SIZE (type), 1))
28847 case 64:
28848 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28849 case 128:
28850 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28851 case 256:
28852 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28855 return NULL_TREE;
28858 /* Return the builtin decl needed to store a vector of TYPE. */
28860 static tree
28861 ix86_builtin_tm_store (tree type)
28863 if (TREE_CODE (type) == VECTOR_TYPE)
28865 switch (tree_low_cst (TYPE_SIZE (type), 1))
28867 case 64:
28868 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28869 case 128:
28870 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28871 case 256:
28872 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28875 return NULL_TREE;
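/* Illustration of the mapping implemented by the two helpers above (a
   sketch, not used elsewhere in this file): for a vector type whose
   TYPE_SIZE is 128 bits,

     ix86_builtin_tm_load (type)   => BUILT_IN_TM_LOAD_M128   i.e. __builtin__ITM_RM128
     ix86_builtin_tm_store (type)  => BUILT_IN_TM_STORE_M128  i.e. __builtin__ITM_WM128

   while non-vector types and unhandled sizes yield NULL_TREE.  */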
28878 /* Initialize the transactional memory vector load/store builtins. */
28880 static void
28881 ix86_init_tm_builtins (void)
28883 enum ix86_builtin_func_type ftype;
28884 const struct builtin_description *d;
28885 size_t i;
28886 tree decl;
28887 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28888 tree attrs_log, attrs_type_log;
28890 if (!flag_tm)
28891 return;
28893 /* If there are no builtins defined, we must be compiling in a
28894 language without trans-mem support. */
28895 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28896 return;
28898 /* Use whatever attributes a normal TM load has. */
28899 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28900 attrs_load = DECL_ATTRIBUTES (decl);
28901 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28902 /* Use whatever attributes a normal TM store has. */
28903 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28904 attrs_store = DECL_ATTRIBUTES (decl);
28905 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28906 /* Use whatever attributes a normal TM log has. */
28907 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28908 attrs_log = DECL_ATTRIBUTES (decl);
28909 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28911 for (i = 0, d = bdesc_tm;
28912 i < ARRAY_SIZE (bdesc_tm);
28913 i++, d++)
28915 if ((d->mask & ix86_isa_flags) != 0
28916 || (lang_hooks.builtin_function
28917 == lang_hooks.builtin_function_ext_scope))
28919 tree type, attrs, attrs_type;
28920 enum built_in_function code = (enum built_in_function) d->code;
28922 ftype = (enum ix86_builtin_func_type) d->flag;
28923 type = ix86_get_builtin_func_type (ftype);
28925 if (BUILTIN_TM_LOAD_P (code))
28927 attrs = attrs_load;
28928 attrs_type = attrs_type_load;
28930 else if (BUILTIN_TM_STORE_P (code))
28932 attrs = attrs_store;
28933 attrs_type = attrs_type_store;
28935 else
28937 attrs = attrs_log;
28938 attrs_type = attrs_type_log;
28940 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28941 /* The builtin without the prefix for
28942 calling it directly. */
28943 d->name + strlen ("__builtin_"),
28944 attrs);
28945 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28946 set the TYPE_ATTRIBUTES. */
28947 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28949 set_builtin_decl (code, decl, false);
28954 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
28955 in the current target ISA, to allow the user to compile particular modules
28956 with target-specific options that differ from the command-line
28957 options. */
28958 static void
28959 ix86_init_mmx_sse_builtins (void)
28961 const struct builtin_description * d;
28962 enum ix86_builtin_func_type ftype;
28963 size_t i;
28965 /* Add all special builtins with variable number of operands. */
28966 for (i = 0, d = bdesc_special_args;
28967 i < ARRAY_SIZE (bdesc_special_args);
28968 i++, d++)
28970 if (d->name == 0)
28971 continue;
28973 ftype = (enum ix86_builtin_func_type) d->flag;
28974 def_builtin (d->mask, d->name, ftype, d->code);
28977 /* Add all builtins with variable number of operands. */
28978 for (i = 0, d = bdesc_args;
28979 i < ARRAY_SIZE (bdesc_args);
28980 i++, d++)
28982 if (d->name == 0)
28983 continue;
28985 ftype = (enum ix86_builtin_func_type) d->flag;
28986 def_builtin_const (d->mask, d->name, ftype, d->code);
28989 /* pcmpestr[im] insns. */
28990 for (i = 0, d = bdesc_pcmpestr;
28991 i < ARRAY_SIZE (bdesc_pcmpestr);
28992 i++, d++)
28994 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28995 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28996 else
28997 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28998 def_builtin_const (d->mask, d->name, ftype, d->code);
29001 /* pcmpistr[im] insns. */
29002 for (i = 0, d = bdesc_pcmpistr;
29003 i < ARRAY_SIZE (bdesc_pcmpistr);
29004 i++, d++)
29006 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29007 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29008 else
29009 ftype = INT_FTYPE_V16QI_V16QI_INT;
29010 def_builtin_const (d->mask, d->name, ftype, d->code);
29013 /* comi/ucomi insns. */
29014 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29016 if (d->mask == OPTION_MASK_ISA_SSE2)
29017 ftype = INT_FTYPE_V2DF_V2DF;
29018 else
29019 ftype = INT_FTYPE_V4SF_V4SF;
29020 def_builtin_const (d->mask, d->name, ftype, d->code);
29023 /* SSE */
29024 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29025 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29026 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29027 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29029 /* SSE or 3DNow!A */
29030 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29031 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29032 IX86_BUILTIN_MASKMOVQ);
29034 /* SSE2 */
29035 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29036 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29038 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29039 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29040 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29041 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29043 /* SSE3. */
29044 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29045 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29046 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29047 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29049 /* AES */
29050 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29051 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29052 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29053 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29054 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29055 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29056 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29057 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29058 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29059 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29060 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29061 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29063 /* PCLMUL */
29064 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29065 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29067 /* RDRND */
29068 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29069 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29070 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29071 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29072 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29073 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29074 IX86_BUILTIN_RDRAND64_STEP);
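/* A minimal user-level sketch of the RDRND step builtins defined above
   (illustrative only; the documented entry points are the _rdrand*_step
   intrinsics in <immintrin.h>, compiled with -mrdrnd).  Each builtin
   stores a random value through its pointer argument and returns nonzero
   on success:  */
#if 0
unsigned int
get_random_u32 (void)
{
  unsigned int val = 0;
  /* Retry until the hardware random number generator reports success.  */
  while (!__builtin_ia32_rdrand32_step (&val))
    ;
  return val;
}
#endif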
29076 /* AVX2 */
29077 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29078 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29079 IX86_BUILTIN_GATHERSIV2DF);
29081 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29082 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29083 IX86_BUILTIN_GATHERSIV4DF);
29085 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29086 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29087 IX86_BUILTIN_GATHERDIV2DF);
29089 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29090 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29091 IX86_BUILTIN_GATHERDIV4DF);
29093 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29094 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29095 IX86_BUILTIN_GATHERSIV4SF);
29097 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29098 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29099 IX86_BUILTIN_GATHERSIV8SF);
29101 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29102 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29103 IX86_BUILTIN_GATHERDIV4SF);
29105 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29106 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29107 IX86_BUILTIN_GATHERDIV8SF);
29109 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29110 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29111 IX86_BUILTIN_GATHERSIV2DI);
29113 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29114 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29115 IX86_BUILTIN_GATHERSIV4DI);
29117 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29118 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29119 IX86_BUILTIN_GATHERDIV2DI);
29121 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29122 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29123 IX86_BUILTIN_GATHERDIV4DI);
29125 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29126 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29127 IX86_BUILTIN_GATHERSIV4SI);
29129 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29130 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29131 IX86_BUILTIN_GATHERSIV8SI);
29133 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29134 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29135 IX86_BUILTIN_GATHERDIV4SI);
29137 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29138 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29139 IX86_BUILTIN_GATHERDIV8SI);
29141 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
29142 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29143 IX86_BUILTIN_GATHERALTSIV4DF);
29145 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
29146 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29147 IX86_BUILTIN_GATHERALTDIV8SF);
29149 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
29150 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29151 IX86_BUILTIN_GATHERALTSIV4DI);
29153 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
29154 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29155 IX86_BUILTIN_GATHERALTDIV8SI);
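/* The gather builtins above take (src, base, index, mask, scale) operands
   and are normally reached through the <immintrin.h> wrappers defined in
   avx2intrin.h.  A hedged sketch of such a use (requires -mavx2):  */
#if 0
#include <immintrin.h>

/* Gather four doubles from base[idx[0]] .. base[idx[3]].  */
__m256d
gather4_doubles (const double *base, __m128i idx)
{
  return _mm256_i32gather_pd (base, idx, sizeof (double));
}
#endif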
29157 /* RTM. */
29158 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29159 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29161 /* MMX access to the vec_init patterns. */
29162 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29163 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29165 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29166 V4HI_FTYPE_HI_HI_HI_HI,
29167 IX86_BUILTIN_VEC_INIT_V4HI);
29169 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29170 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29171 IX86_BUILTIN_VEC_INIT_V8QI);
29173 /* Access to the vec_extract patterns. */
29174 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29175 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29176 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29177 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29178 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29179 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29180 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29181 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29182 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29183 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29185 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29186 "__builtin_ia32_vec_ext_v4hi",
29187 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29189 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29190 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29192 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29193 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29195 /* Access to the vec_set patterns. */
29196 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29197 "__builtin_ia32_vec_set_v2di",
29198 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29200 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29201 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29203 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29204 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29206 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29207 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29209 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29210 "__builtin_ia32_vec_set_v4hi",
29211 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29213 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29214 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29216 /* RDSEED */
29217 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29218 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29219 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29220 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29221 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29222 "__builtin_ia32_rdseed_di_step",
29223 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29225 /* ADCX */
29226 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29227 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29228 def_builtin (OPTION_MASK_ISA_64BIT,
29229 "__builtin_ia32_addcarryx_u64",
29230 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29231 IX86_BUILTIN_ADDCARRYX64);
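/* Sketch of chaining the add-with-carry builtins defined above for a
   multi-word addition (illustrative only; the documented entry point is
   _addcarryx_u32 in <immintrin.h>).  Each call consumes the carry produced
   by the previous one:  */
#if 0
void
add_128bit (const unsigned int a[4], const unsigned int b[4], unsigned int r[4])
{
  unsigned char carry = 0;
  carry = __builtin_ia32_addcarryx_u32 (carry, a[0], b[0], &r[0]);
  carry = __builtin_ia32_addcarryx_u32 (carry, a[1], b[1], &r[1]);
  carry = __builtin_ia32_addcarryx_u32 (carry, a[2], b[2], &r[2]);
  carry = __builtin_ia32_addcarryx_u32 (carry, a[3], b[3], &r[3]);
}
#endif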
29233 /* Add FMA4 multi-arg instructions. */
29234 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29236 if (d->name == 0)
29237 continue;
29239 ftype = (enum ix86_builtin_func_type) d->flag;
29240 def_builtin_const (d->mask, d->name, ftype, d->code);
29244 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29245 to return a pointer to VERSION_DECL if the outcome of the expression
29246 formed by PREDICATE_CHAIN is true. This function will be called during
29247 version dispatch to decide which function version to execute. It returns
29248 the basic block at the end, to which more conditions can be added. */
29250 static basic_block
29251 add_condition_to_bb (tree function_decl, tree version_decl,
29252 tree predicate_chain, basic_block new_bb)
29254 gimple return_stmt;
29255 tree convert_expr, result_var;
29256 gimple convert_stmt;
29257 gimple call_cond_stmt;
29258 gimple if_else_stmt;
29260 basic_block bb1, bb2, bb3;
29261 edge e12, e23;
29263 tree cond_var, and_expr_var = NULL_TREE;
29264 gimple_seq gseq;
29266 tree predicate_decl, predicate_arg;
29268 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29270 gcc_assert (new_bb != NULL);
29271 gseq = bb_seq (new_bb);
29274 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29275 build_fold_addr_expr (version_decl));
29276 result_var = create_tmp_var (ptr_type_node, NULL);
29277 convert_stmt = gimple_build_assign (result_var, convert_expr);
29278 return_stmt = gimple_build_return (result_var);
29280 if (predicate_chain == NULL_TREE)
29282 gimple_seq_add_stmt (&gseq, convert_stmt);
29283 gimple_seq_add_stmt (&gseq, return_stmt);
29284 set_bb_seq (new_bb, gseq);
29285 gimple_set_bb (convert_stmt, new_bb);
29286 gimple_set_bb (return_stmt, new_bb);
29287 pop_cfun ();
29288 return new_bb;
29291 while (predicate_chain != NULL)
29293 cond_var = create_tmp_var (integer_type_node, NULL);
29294 predicate_decl = TREE_PURPOSE (predicate_chain);
29295 predicate_arg = TREE_VALUE (predicate_chain);
29296 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29297 gimple_call_set_lhs (call_cond_stmt, cond_var);
29299 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29300 gimple_set_bb (call_cond_stmt, new_bb);
29301 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29303 predicate_chain = TREE_CHAIN (predicate_chain);
29305 if (and_expr_var == NULL)
29306 and_expr_var = cond_var;
29307 else
29309 gimple assign_stmt;
29310 /* Use MIN_EXPR to check whether any of the integers is zero, i.e.
29311 and_expr_var = min_expr <cond_var, and_expr_var>. */
29312 assign_stmt = gimple_build_assign (and_expr_var,
29313 build2 (MIN_EXPR, integer_type_node,
29314 cond_var, and_expr_var));
29316 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29317 gimple_set_bb (assign_stmt, new_bb);
29318 gimple_seq_add_stmt (&gseq, assign_stmt);
29322 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29323 integer_zero_node,
29324 NULL_TREE, NULL_TREE);
29325 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29326 gimple_set_bb (if_else_stmt, new_bb);
29327 gimple_seq_add_stmt (&gseq, if_else_stmt);
29329 gimple_seq_add_stmt (&gseq, convert_stmt);
29330 gimple_seq_add_stmt (&gseq, return_stmt);
29331 set_bb_seq (new_bb, gseq);
29333 bb1 = new_bb;
29334 e12 = split_block (bb1, if_else_stmt);
29335 bb2 = e12->dest;
29336 e12->flags &= ~EDGE_FALLTHRU;
29337 e12->flags |= EDGE_TRUE_VALUE;
29339 e23 = split_block (bb2, return_stmt);
29341 gimple_set_bb (convert_stmt, bb2);
29342 gimple_set_bb (return_stmt, bb2);
29344 bb3 = e23->dest;
29345 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29347 remove_edge (e23);
29348 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29350 pop_cfun ();
29352 return bb3;
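/* Roughly, for a version guarded by a single predicate the code appended
   to NEW_BB by the function above has this shape (pseudo-GIMPLE; the
   symbol names are invented for illustration):

     cond_var = __builtin_cpu_supports ("avx");
     if (cond_var > 0)
       {
         result_var = (void *) &foo.avx;
         return result_var;
       }
     <fall through to the next, lower-priority condition>

   When PREDICATE_CHAIN holds several predicates, their results are folded
   together with MIN_EXPR, so the branch is taken only if all of them are
   nonzero.  */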
29355 /* This parses the attribute arguments to target in DECL and determines
29356 the right builtin to use to match the platform specification.
29357 It returns the priority value for this version decl. If PREDICATE_LIST
29358 is not NULL, it stores the list of cpu features that need to be checked
29359 before dispatching this function. */
29361 static unsigned int
29362 get_builtin_code_for_version (tree decl, tree *predicate_list)
29364 tree attrs;
29365 struct cl_target_option cur_target;
29366 tree target_node;
29367 struct cl_target_option *new_target;
29368 const char *arg_str = NULL;
29369 const char *attrs_str = NULL;
29370 char *tok_str = NULL;
29371 char *token;
29373 /* Priority of i386 features, greater value is higher priority. This is
29374 used to decide the order in which function dispatch must happen. For
29375 instance, a version specialized for SSE4.2 should be checked for dispatch
29376 before a version for SSE3, as SSE4.2 implies SSE3. */
29377 enum feature_priority
29379 P_ZERO = 0,
29380 P_MMX,
29381 P_SSE,
29382 P_SSE2,
29383 P_SSE3,
29384 P_SSSE3,
29385 P_PROC_SSSE3,
29386 P_SSE4_a,
29387 P_PROC_SSE4_a,
29388 P_SSE4_1,
29389 P_SSE4_2,
29390 P_PROC_SSE4_2,
29391 P_POPCNT,
29392 P_AVX,
29393 P_AVX2,
29394 P_FMA,
29395 P_PROC_FMA
29398 enum feature_priority priority = P_ZERO;
29400 /* These are the target attribute strings for which a dispatcher is
29401 available, from fold_builtin_cpu. */
29403 static struct _feature_list
29405 const char *const name;
29406 const enum feature_priority priority;
29408 const feature_list[] =
29410 {"mmx", P_MMX},
29411 {"sse", P_SSE},
29412 {"sse2", P_SSE2},
29413 {"sse3", P_SSE3},
29414 {"ssse3", P_SSSE3},
29415 {"sse4.1", P_SSE4_1},
29416 {"sse4.2", P_SSE4_2},
29417 {"popcnt", P_POPCNT},
29418 {"avx", P_AVX},
29419 {"avx2", P_AVX2}
29423 static unsigned int NUM_FEATURES
29424 = sizeof (feature_list) / sizeof (struct _feature_list);
29426 unsigned int i;
29428 tree predicate_chain = NULL_TREE;
29429 tree predicate_decl, predicate_arg;
29431 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29432 gcc_assert (attrs != NULL);
29434 attrs = TREE_VALUE (TREE_VALUE (attrs));
29436 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29437 attrs_str = TREE_STRING_POINTER (attrs);
29439 /* Return priority zero for default function. */
29440 if (strcmp (attrs_str, "default") == 0)
29441 return 0;
29443 /* Handle arch= if specified. For priority, set it to be 1 more than
29444 the best instruction set the processor can handle. For instance, if
29445 there is a version for atom and a version for ssse3 (the highest ISA
29446 priority for atom), the atom version must be checked for dispatch
29447 before the ssse3 version. */
29448 if (strstr (attrs_str, "arch=") != NULL)
29450 cl_target_option_save (&cur_target, &global_options);
29451 target_node = ix86_valid_target_attribute_tree (attrs);
29453 gcc_assert (target_node);
29454 new_target = TREE_TARGET_OPTION (target_node);
29455 gcc_assert (new_target);
29457 if (new_target->arch_specified && new_target->arch > 0)
29459 switch (new_target->arch)
29461 case PROCESSOR_CORE2:
29462 arg_str = "core2";
29463 priority = P_PROC_SSSE3;
29464 break;
29465 case PROCESSOR_COREI7:
29466 arg_str = "corei7";
29467 priority = P_PROC_SSE4_2;
29468 break;
29469 case PROCESSOR_ATOM:
29470 arg_str = "atom";
29471 priority = P_PROC_SSSE3;
29472 break;
29473 case PROCESSOR_AMDFAM10:
29474 arg_str = "amdfam10h";
29475 priority = P_PROC_SSE4_a;
29476 break;
29477 case PROCESSOR_BDVER1:
29478 arg_str = "bdver1";
29479 priority = P_PROC_FMA;
29480 break;
29481 case PROCESSOR_BDVER2:
29482 arg_str = "bdver2";
29483 priority = P_PROC_FMA;
29484 break;
29488 cl_target_option_restore (&global_options, &cur_target);
29490 if (predicate_list && arg_str == NULL)
29492 error_at (DECL_SOURCE_LOCATION (decl),
29493 "No dispatcher found for the versioning attributes");
29494 return 0;
29497 if (predicate_list)
29499 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29500 /* For a C string literal the length includes the trailing NULL. */
29501 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29502 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29503 predicate_chain);
29507 /* Process feature name. */
29508 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29509 strcpy (tok_str, attrs_str);
29510 token = strtok (tok_str, ",");
29511 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29513 while (token != NULL)
29515 /* Do not process "arch=" */
29516 if (strncmp (token, "arch=", 5) == 0)
29518 token = strtok (NULL, ",");
29519 continue;
29521 for (i = 0; i < NUM_FEATURES; ++i)
29523 if (strcmp (token, feature_list[i].name) == 0)
29525 if (predicate_list)
29527 predicate_arg = build_string_literal (
29528 strlen (feature_list[i].name) + 1,
29529 feature_list[i].name);
29530 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29531 predicate_chain);
29533 /* Find the maximum priority feature. */
29534 if (feature_list[i].priority > priority)
29535 priority = feature_list[i].priority;
29537 break;
29540 if (predicate_list && i == NUM_FEATURES)
29542 error_at (DECL_SOURCE_LOCATION (decl),
29543 "No dispatcher found for %s", token);
29544 return 0;
29546 token = strtok (NULL, ",");
29548 free (tok_str);
29550 if (predicate_list && predicate_chain == NULL_TREE)
29552 error_at (DECL_SOURCE_LOCATION (decl),
29553 "No dispatcher found for the versioning attributes : %s",
29554 attrs_str);
29555 return 0;
29557 else if (predicate_list)
29559 predicate_chain = nreverse (predicate_chain);
29560 *predicate_list = predicate_chain;
29563 return priority;
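/* Example of the priorities computed above (a sketch): given versions
   tagged target("arch=corei7"), target("sse4.2") and target("sse3"), the
   arch=corei7 version gets P_PROC_SSE4_2, which outranks P_SSE4_2, which
   in turn outranks P_SSE3.  The dispatcher therefore tests the corei7
   version first, then the sse4.2 version, then the sse3 one, before
   falling back to the default.  */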
29566 /* This compares the priority of target features in function DECL1
29567 and DECL2. It returns positive value if DECL1 is higher priority,
29568 negative value if DECL2 is higher priority and 0 if they are the
29569 same. */
29571 static int
29572 ix86_compare_version_priority (tree decl1, tree decl2)
29574 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29575 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29577 return (int)priority1 - (int)priority2;
29580 /* V1 and V2 point to function versions with different priorities
29581 based on the target ISA. This function compares their priorities. */
29583 static int
29584 feature_compare (const void *v1, const void *v2)
29586 typedef struct _function_version_info
29588 tree version_decl;
29589 tree predicate_chain;
29590 unsigned int dispatch_priority;
29591 } function_version_info;
29593 const function_version_info c1 = *(const function_version_info *)v1;
29594 const function_version_info c2 = *(const function_version_info *)v2;
29595 return (c2.dispatch_priority - c1.dispatch_priority);
29598 /* This function generates the dispatch function for
29599 multi-versioned functions. DISPATCH_DECL is the function which will
29600 contain the dispatch logic. FNDECLS are the function choices for
29601 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29602 in DISPATCH_DECL in which the dispatch code is generated. */
29604 static int
29605 dispatch_function_versions (tree dispatch_decl,
29606 void *fndecls_p,
29607 basic_block *empty_bb)
29609 tree default_decl;
29610 gimple ifunc_cpu_init_stmt;
29611 gimple_seq gseq;
29612 int ix;
29613 tree ele;
29614 vec<tree> *fndecls;
29615 unsigned int num_versions = 0;
29616 unsigned int actual_versions = 0;
29617 unsigned int i;
29619 struct _function_version_info
29621 tree version_decl;
29622 tree predicate_chain;
29623 unsigned int dispatch_priority;
29624 }*function_version_info;
29626 gcc_assert (dispatch_decl != NULL
29627 && fndecls_p != NULL
29628 && empty_bb != NULL);
29630 /* fndecls_p is actually a vector. */
29631 fndecls = static_cast<vec<tree> *> (fndecls_p);
29633 /* At least one more version other than the default. */
29634 num_versions = fndecls->length ();
29635 gcc_assert (num_versions >= 2);
29637 function_version_info = (struct _function_version_info *)
29638 XNEWVEC (struct _function_version_info, (num_versions - 1));
29640 /* The first version in the vector is the default decl. */
29641 default_decl = (*fndecls)[0];
29643 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29645 gseq = bb_seq (*empty_bb);
29646 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29647 constructors, so explicitly call __builtin_cpu_init here. */
29648 ifunc_cpu_init_stmt = gimple_build_call_vec (
29649 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29650 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29651 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29652 set_bb_seq (*empty_bb, gseq);
29654 pop_cfun ();
29657 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29659 tree version_decl = ele;
29660 tree predicate_chain = NULL_TREE;
29661 unsigned int priority;
29662 /* Get attribute string, parse it and find the right predicate decl.
29663 The predicate function could be a lengthy combination of many
29664 features, like arch-type and various isa-variants. */
29665 priority = get_builtin_code_for_version (version_decl,
29666 &predicate_chain);
29668 if (predicate_chain == NULL_TREE)
29669 continue;
29671 function_version_info [actual_versions].version_decl = version_decl;
29672 function_version_info [actual_versions].predicate_chain
29673 = predicate_chain;
29674 function_version_info [actual_versions].dispatch_priority = priority;
29675 actual_versions++;
29678 /* Sort the versions according to descending order of dispatch priority. The
29679 priority is based on the ISA. This is not a perfect solution. There
29680 could still be ambiguity. If more than one function version is suitable
29681 to execute, which one should be dispatched? In future, allow the user
29682 to specify a dispatch priority next to the version. */
29683 qsort (function_version_info, actual_versions,
29684 sizeof (struct _function_version_info), feature_compare);
29686 for (i = 0; i < actual_versions; ++i)
29687 *empty_bb = add_condition_to_bb (dispatch_decl,
29688 function_version_info[i].version_decl,
29689 function_version_info[i].predicate_chain,
29690 *empty_bb);
29692 /* Dispatch the default version at the end. */
29693 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29694 NULL, *empty_bb);
29696 free (function_version_info);
29697 return 0;
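/* Putting the pieces together: for versions of foo tagged "default",
   "sse4.2" and "avx2", the resolver body generated above is roughly
   (a sketch, with invented version names):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       return &foo.avx2;
     if (__builtin_cpu_supports ("sse4.2"))
       return &foo.sse4_2;
     return &foo;   i.e. the default version

   so the conditions appear in descending dispatch priority, with the
   default version last.  */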
29700 /* Comparator function used by qsort to sort the attribute
29701 specification strings of "target". */
29703 static int
29704 attr_strcmp (const void *v1, const void *v2)
29706 const char *c1 = *(char *const*)v1;
29707 const char *c2 = *(char *const*)v2;
29708 return strcmp (c1, c2);
29711 /* ARGLIST is the argument to target attribute. This function tokenizes
29712 the comma separated arguments, sorts them and returns a string which
29713 is a unique identifier for the comma separated arguments. It also
29714 replaces non-identifier characters "=,-" with "_". */
29716 static char *
29717 sorted_attr_string (tree arglist)
29719 tree arg;
29720 size_t str_len_sum = 0;
29721 char **args = NULL;
29722 char *attr_str, *ret_str;
29723 char *attr = NULL;
29724 unsigned int argnum = 1;
29725 unsigned int i;
29727 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29729 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29730 size_t len = strlen (str);
29731 str_len_sum += len + 1;
29732 if (arg != arglist)
29733 argnum++;
29734 for (i = 0; i < strlen (str); i++)
29735 if (str[i] == ',')
29736 argnum++;
29739 attr_str = XNEWVEC (char, str_len_sum);
29740 str_len_sum = 0;
29741 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29743 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29744 size_t len = strlen (str);
29745 memcpy (attr_str + str_len_sum, str, len);
29746 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29747 str_len_sum += len + 1;
29750 /* Replace "=,-" with "_". */
29751 for (i = 0; i < strlen (attr_str); i++)
29752 if (attr_str[i] == '=' || attr_str[i]== '-')
29753 attr_str[i] = '_';
29755 if (argnum == 1)
29756 return attr_str;
29758 args = XNEWVEC (char *, argnum);
29760 i = 0;
29761 attr = strtok (attr_str, ",");
29762 while (attr != NULL)
29764 args[i] = attr;
29765 i++;
29766 attr = strtok (NULL, ",");
29769 qsort (args, argnum, sizeof (char *), attr_strcmp);
29771 ret_str = XNEWVEC (char, str_len_sum);
29772 str_len_sum = 0;
29773 for (i = 0; i < argnum; i++)
29775 size_t len = strlen (args[i]);
29776 memcpy (ret_str + str_len_sum, args[i], len);
29777 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29778 str_len_sum += len + 1;
29781 XDELETEVEC (args);
29782 XDELETEVEC (attr_str);
29783 return ret_str;
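/* Example of the normalization above (illustrative): the arguments of
   target("avx,arch=corei7") first have '=' and '-' rewritten to '_',
   giving "avx,arch_corei7"; the string is then split on ',', sorted and
   rejoined with '_', yielding the identifier "arch_corei7_avx".  A single
   argument such as "sse4.2" is returned unchanged apart from the character
   rewriting.  */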
29786 /* This function changes the assembler name for functions that are
29787 versions. If DECL is a function version and has a "target"
29788 attribute, it appends the attribute string to its assembler name. */
29790 static tree
29791 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29793 tree version_attr;
29794 const char *orig_name, *version_string;
29795 char *attr_str, *assembler_name;
29797 if (DECL_DECLARED_INLINE_P (decl)
29798 && lookup_attribute ("gnu_inline",
29799 DECL_ATTRIBUTES (decl)))
29800 error_at (DECL_SOURCE_LOCATION (decl),
29801 "Function versions cannot be marked as gnu_inline,"
29802 " bodies have to be generated");
29804 if (DECL_VIRTUAL_P (decl)
29805 || DECL_VINDEX (decl))
29806 sorry ("Virtual function multiversioning not supported");
29808 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29810 /* target attribute string cannot be NULL. */
29811 gcc_assert (version_attr != NULL_TREE);
29813 orig_name = IDENTIFIER_POINTER (id);
29814 version_string
29815 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29817 if (strcmp (version_string, "default") == 0)
29818 return id;
29820 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29821 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29823 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29825 /* Allow assembler name to be modified if already set. */
29826 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29827 SET_DECL_RTL (decl, NULL);
29829 tree ret = get_identifier (assembler_name);
29830 XDELETEVEC (attr_str);
29831 XDELETEVEC (assembler_name);
29832 return ret;
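/* So, for instance (ignoring the C++ mangling of the base name itself),
   a version of foo declared with __attribute__ ((target ("avx"))) gets
   the assembler name "foo.avx", while the version tagged "default" keeps
   the original assembler name.  */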
29835 /* This function returns true if FN1 and FN2 are versions of the same function,
29836 that is, the target strings of the function decls are different. This assumes
29837 that FN1 and FN2 have the same signature. */
29839 static bool
29840 ix86_function_versions (tree fn1, tree fn2)
29842 tree attr1, attr2;
29843 char *target1, *target2;
29844 bool result;
29846 if (TREE_CODE (fn1) != FUNCTION_DECL
29847 || TREE_CODE (fn2) != FUNCTION_DECL)
29848 return false;
29850 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29851 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29853 /* At least one function decl should have the target attribute specified. */
29854 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29855 return false;
29857 /* Diagnose missing target attribute if one of the decls is already
29858 multi-versioned. */
29859 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29861 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29863 if (attr2 != NULL_TREE)
29865 tree tem = fn1;
29866 fn1 = fn2;
29867 fn2 = tem;
29868 attr1 = attr2;
29870 error_at (DECL_SOURCE_LOCATION (fn2),
29871 "missing %<target%> attribute for multi-versioned %D",
29872 fn2);
29873 error_at (DECL_SOURCE_LOCATION (fn1),
29874 "previous declaration of %D", fn1);
29875 /* Prevent diagnosing of the same error multiple times. */
29876 DECL_ATTRIBUTES (fn2)
29877 = tree_cons (get_identifier ("target"),
29878 copy_node (TREE_VALUE (attr1)),
29879 DECL_ATTRIBUTES (fn2));
29881 return false;
29884 target1 = sorted_attr_string (TREE_VALUE (attr1));
29885 target2 = sorted_attr_string (TREE_VALUE (attr2));
29887 /* The sorted target strings must be different for fn1 and fn2
29888 to be versions. */
29889 if (strcmp (target1, target2) == 0)
29890 result = false;
29891 else
29892 result = true;
29894 XDELETEVEC (target1);
29895 XDELETEVEC (target2);
29897 return result;
29900 static tree
29901 ix86_mangle_decl_assembler_name (tree decl, tree id)
29903 /* For function version, add the target suffix to the assembler name. */
29904 if (TREE_CODE (decl) == FUNCTION_DECL
29905 && DECL_FUNCTION_VERSIONED (decl))
29906 id = ix86_mangle_function_version_assembler_name (decl, id);
29907 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29908 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29909 #endif
29911 return id;
29914 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29915 is true, append the full path name of the source file. */
29917 static char *
29918 make_name (tree decl, const char *suffix, bool make_unique)
29920 char *global_var_name;
29921 int name_len;
29922 const char *name;
29923 const char *unique_name = NULL;
29925 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29927 /* Get a unique name that can be used globally without any chances
29928 of collision at link time. */
29929 if (make_unique)
29930 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29932 name_len = strlen (name) + strlen (suffix) + 2;
29934 if (make_unique)
29935 name_len += strlen (unique_name) + 1;
29936 global_var_name = XNEWVEC (char, name_len);
29938 /* Use '.' to concatenate names as it is demangler friendly. */
29939 if (make_unique)
29940 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29941 suffix);
29942 else
29943 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29945 return global_var_name;
29948 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29950 /* Make a dispatcher declaration for the multi-versioned function DECL.
29951 Calls to DECL function will be replaced with calls to the dispatcher
29952 by the front-end. Return the decl created. */
29954 static tree
29955 make_dispatcher_decl (const tree decl)
29957 tree func_decl;
29958 char *func_name;
29959 tree fn_type, func_type;
29960 bool is_uniq = false;
29962 if (TREE_PUBLIC (decl) == 0)
29963 is_uniq = true;
29965 func_name = make_name (decl, "ifunc", is_uniq);
29967 fn_type = TREE_TYPE (decl);
29968 func_type = build_function_type (TREE_TYPE (fn_type),
29969 TYPE_ARG_TYPES (fn_type));
29971 func_decl = build_fn_decl (func_name, func_type);
29972 XDELETEVEC (func_name);
29973 TREE_USED (func_decl) = 1;
29974 DECL_CONTEXT (func_decl) = NULL_TREE;
29975 DECL_INITIAL (func_decl) = error_mark_node;
29976 DECL_ARTIFICIAL (func_decl) = 1;
29977 /* Mark this function as external; the resolver will flip it again if
29978 it gets generated. */
29979 DECL_EXTERNAL (func_decl) = 1;
29980 /* IFUNCs have to be externally visible. */
29981 TREE_PUBLIC (func_decl) = 1;
29983 return func_decl;
29986 #endif
29988 /* Returns true if DECL is multi-versioned and is the default function,
29989 that is, it is not tagged with a target-specific optimization. */
29991 static bool
29992 is_function_default_version (const tree decl)
29994 if (TREE_CODE (decl) != FUNCTION_DECL
29995 || !DECL_FUNCTION_VERSIONED (decl))
29996 return false;
29997 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29998 gcc_assert (attr);
29999 attr = TREE_VALUE (TREE_VALUE (attr));
30000 return (TREE_CODE (attr) == STRING_CST
30001 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30004 /* Make a dispatcher declaration for the multi-versioned function DECL.
30005 Calls to DECL function will be replaced with calls to the dispatcher
30006 by the front-end. Returns the decl of the dispatcher function. */
30008 static tree
30009 ix86_get_function_versions_dispatcher (void *decl)
30011 tree fn = (tree) decl;
30012 struct cgraph_node *node = NULL;
30013 struct cgraph_node *default_node = NULL;
30014 struct cgraph_function_version_info *node_v = NULL;
30015 struct cgraph_function_version_info *first_v = NULL;
30017 tree dispatch_decl = NULL;
30019 struct cgraph_function_version_info *default_version_info = NULL;
30021 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30023 node = cgraph_get_node (fn);
30024 gcc_assert (node != NULL);
30026 node_v = get_cgraph_node_version (node);
30027 gcc_assert (node_v != NULL);
30029 if (node_v->dispatcher_resolver != NULL)
30030 return node_v->dispatcher_resolver;
30032 /* Find the default version and make it the first node. */
30033 first_v = node_v;
30034 /* Go to the beginning of the chain. */
30035 while (first_v->prev != NULL)
30036 first_v = first_v->prev;
30037 default_version_info = first_v;
30038 while (default_version_info != NULL)
30040 if (is_function_default_version
30041 (default_version_info->this_node->symbol.decl))
30042 break;
30043 default_version_info = default_version_info->next;
30046 /* If there is no default node, just return NULL. */
30047 if (default_version_info == NULL)
30048 return NULL;
30050 /* Make default info the first node. */
30051 if (first_v != default_version_info)
30053 default_version_info->prev->next = default_version_info->next;
30054 if (default_version_info->next)
30055 default_version_info->next->prev = default_version_info->prev;
30056 first_v->prev = default_version_info;
30057 default_version_info->next = first_v;
30058 default_version_info->prev = NULL;
30061 default_node = default_version_info->this_node;
30063 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30064 if (targetm.has_ifunc_p ())
30066 struct cgraph_function_version_info *it_v = NULL;
30067 struct cgraph_node *dispatcher_node = NULL;
30068 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30070 /* Right now, the dispatching is done via ifunc. */
30071 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
30073 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30074 gcc_assert (dispatcher_node != NULL);
30075 dispatcher_node->dispatcher_function = 1;
30076 dispatcher_version_info
30077 = insert_new_cgraph_node_version (dispatcher_node);
30078 dispatcher_version_info->next = default_version_info;
30079 dispatcher_node->symbol.definition = 1;
30081 /* Set the dispatcher for all the versions. */
30082 it_v = default_version_info;
30083 while (it_v != NULL)
30085 it_v->dispatcher_resolver = dispatch_decl;
30086 it_v = it_v->next;
30089 else
30090 #endif
30092 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
30093 "multiversioning needs ifunc which is not supported "
30094 "on this target");
30097 return dispatch_decl;
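/* User-visible picture of the machinery above (a hedged sketch; function
   multiversioning with the "target" attribute is driven by the C++ front
   end, which invokes this hook):  */
#if 0
__attribute__ ((target ("default"))) int foo (void) { return 0; }
__attribute__ ((target ("avx2")))    int foo (void) { return 2; }

int
call_foo (void)
{
  /* Calls to foo are redirected to the IFUNC dispatcher returned by
     ix86_get_function_versions_dispatcher; its resolver selects a
     version at load time.  */
  return foo ();
}
#endif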
30100 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30101 it to CHAIN. */
30103 static tree
30104 make_attribute (const char *name, const char *arg_name, tree chain)
30106 tree attr_name;
30107 tree attr_arg_name;
30108 tree attr_args;
30109 tree attr;
30111 attr_name = get_identifier (name);
30112 attr_arg_name = build_string (strlen (arg_name), arg_name);
30113 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30114 attr = tree_cons (attr_name, attr_args, chain);
30115 return attr;
30118 /* Make the resolver function decl to dispatch the versions of
30119 a multi-versioned function, DEFAULT_DECL. Create an
30120 empty basic block in the resolver and store the pointer in
30121 EMPTY_BB. Return the decl of the resolver function. */
30123 static tree
30124 make_resolver_func (const tree default_decl,
30125 const tree dispatch_decl,
30126 basic_block *empty_bb)
30128 char *resolver_name;
30129 tree decl, type, decl_name, t;
30130 bool is_uniq = false;
30132 /* IFUNC's have to be globally visible. So, if the default_decl is
30133 not, then the name of the IFUNC should be made unique. */
30134 if (TREE_PUBLIC (default_decl) == 0)
30135 is_uniq = true;
30137 /* Append the filename to the resolver function if the versions are
30138 not externally visible. This is because the resolver function has
30139 to be externally visible for the loader to find it. So, appending
30140 the filename will prevent conflicts with a resolver function from
30141 another module which is based on the same version name. */
30142 resolver_name = make_name (default_decl, "resolver", is_uniq);
30144 /* The resolver function should return a (void *). */
30145 type = build_function_type_list (ptr_type_node, NULL_TREE);
30147 decl = build_fn_decl (resolver_name, type);
30148 decl_name = get_identifier (resolver_name);
30149 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30151 DECL_NAME (decl) = decl_name;
30152 TREE_USED (decl) = 1;
30153 DECL_ARTIFICIAL (decl) = 1;
30154 DECL_IGNORED_P (decl) = 0;
30155 /* IFUNC resolvers have to be externally visible. */
30156 TREE_PUBLIC (decl) = 1;
30157 DECL_UNINLINABLE (decl) = 0;
30159 /* Resolver is not external, body is generated. */
30160 DECL_EXTERNAL (decl) = 0;
30161 DECL_EXTERNAL (dispatch_decl) = 0;
30163 DECL_CONTEXT (decl) = NULL_TREE;
30164 DECL_INITIAL (decl) = make_node (BLOCK);
30165 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30167 if (DECL_COMDAT_GROUP (default_decl)
30168 || TREE_PUBLIC (default_decl))
30170 /* In this case, each translation unit with a call to this
30171 versioned function will put out a resolver. Ensure it
30172 is comdat to keep just one copy. */
30173 DECL_COMDAT (decl) = 1;
30174 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30176 /* Build result decl and add to function_decl. */
30177 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30178 DECL_ARTIFICIAL (t) = 1;
30179 DECL_IGNORED_P (t) = 1;
30180 DECL_RESULT (decl) = t;
30182 gimplify_function_tree (decl);
30183 push_cfun (DECL_STRUCT_FUNCTION (decl));
30184 *empty_bb = init_lowered_empty_function (decl, false);
30186 cgraph_add_new_function (decl, true);
30187 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30189 pop_cfun ();
30191 gcc_assert (dispatch_decl != NULL);
30192 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30193 DECL_ATTRIBUTES (dispatch_decl)
30194 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30196 /* Create the alias for dispatch to resolver here. */
30197 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30198 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30199 XDELETEVEC (resolver_name);
30200 return decl;
30203 /* Generate the dispatching code body to dispatch multi-versioned function
30204 DECL. The target hook is called to process the "target" attributes and
30205 provide the code to dispatch the right function at run-time. NODE points
30206 to the dispatcher decl whose body will be created. */
30208 static tree
30209 ix86_generate_version_dispatcher_body (void *node_p)
30211 tree resolver_decl;
30212 basic_block empty_bb;
30213 vec<tree> fn_ver_vec = vNULL;
30214 tree default_ver_decl;
30215 struct cgraph_node *versn;
30216 struct cgraph_node *node;
30218 struct cgraph_function_version_info *node_version_info = NULL;
30219 struct cgraph_function_version_info *versn_info = NULL;
30221 node = (cgraph_node *)node_p;
30223 node_version_info = get_cgraph_node_version (node);
30224 gcc_assert (node->dispatcher_function
30225 && node_version_info != NULL);
30227 if (node_version_info->dispatcher_resolver)
30228 return node_version_info->dispatcher_resolver;
30230 /* The first version in the chain corresponds to the default version. */
30231 default_ver_decl = node_version_info->next->this_node->symbol.decl;
30233 /* node is going to be an alias, so remove the finalized bit. */
30234 node->symbol.definition = false;
30236 resolver_decl = make_resolver_func (default_ver_decl,
30237 node->symbol.decl, &empty_bb);
30239 node_version_info->dispatcher_resolver = resolver_decl;
30241 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30243 fn_ver_vec.create (2);
30245 for (versn_info = node_version_info->next; versn_info;
30246 versn_info = versn_info->next)
30248 versn = versn_info->this_node;
30249 /* Check for virtual functions here again, as by this time it should
30250 have been determined if this function needs a vtable index or
30251 not. This happens for methods in derived classes that override
30252 virtual methods in base classes but are not explicitly marked as
30253 virtual. */
30254 if (DECL_VINDEX (versn->symbol.decl))
30255 sorry ("Virtual function multiversioning not supported");
30257 fn_ver_vec.safe_push (versn->symbol.decl);
30260 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30261 fn_ver_vec.release ();
30262 rebuild_cgraph_edges ();
30263 pop_cfun ();
30264 return resolver_decl;
30266 /* This builds the processor_model struct type defined in
30267 libgcc/config/i386/cpuinfo.c */
30269 static tree
30270 build_processor_model_struct (void)
30272 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30273 "__cpu_features"};
30274 tree field = NULL_TREE, field_chain = NULL_TREE;
30275 int i;
30276 tree type = make_node (RECORD_TYPE);
30278 /* The first 3 fields are unsigned int. */
30279 for (i = 0; i < 3; ++i)
30281 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30282 get_identifier (field_name[i]), unsigned_type_node);
30283 if (field_chain != NULL_TREE)
30284 DECL_CHAIN (field) = field_chain;
30285 field_chain = field;
30288 /* The last field is an array of unsigned integers of size one. */
30289 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30290 get_identifier (field_name[3]),
30291 build_array_type (unsigned_type_node,
30292 build_index_type (size_one_node)));
30293 if (field_chain != NULL_TREE)
30294 DECL_CHAIN (field) = field_chain;
30295 field_chain = field;
30297 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30298 return type;
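/* For reference, the record built above is meant to match this definition
   from libgcc/config/i386/cpuinfo.c (layout as of this writing):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     } __cpu_model;
*/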
30301 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30303 static tree
30304 make_var_decl (tree type, const char *name)
30306 tree new_decl;
30308 new_decl = build_decl (UNKNOWN_LOCATION,
30309 VAR_DECL,
30310 get_identifier(name),
30311 type);
30313 DECL_EXTERNAL (new_decl) = 1;
30314 TREE_STATIC (new_decl) = 1;
30315 TREE_PUBLIC (new_decl) = 1;
30316 DECL_INITIAL (new_decl) = 0;
30317 DECL_ARTIFICIAL (new_decl) = 0;
30318 DECL_PRESERVE_P (new_decl) = 1;
30320 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30321 assemble_variable (new_decl, 0, 0, 0);
30323 return new_decl;
30326 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30327 into a check against the data defined in libgcc/config/i386/cpuinfo.c. */
30329 static tree
30330 fold_builtin_cpu (tree fndecl, tree *args)
30332 unsigned int i;
30333 enum ix86_builtins fn_code = (enum ix86_builtins)
30334 DECL_FUNCTION_CODE (fndecl);
30335 tree param_string_cst = NULL;
30337 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30338 enum processor_features
30340 F_CMOV = 0,
30341 F_MMX,
30342 F_POPCNT,
30343 F_SSE,
30344 F_SSE2,
30345 F_SSE3,
30346 F_SSSE3,
30347 F_SSE4_1,
30348 F_SSE4_2,
30349 F_AVX,
30350 F_AVX2,
30351 F_MAX
30354 /* These are the values for vendor types and cpu types and subtypes
30355 in cpuinfo.c. Cpu types and subtypes should be subtracted by
30356 the corresponding start value. */
30357 enum processor_model
30359 M_INTEL = 1,
30360 M_AMD,
30361 M_CPU_TYPE_START,
30362 M_INTEL_ATOM,
30363 M_INTEL_CORE2,
30364 M_INTEL_COREI7,
30365 M_AMDFAM10H,
30366 M_AMDFAM15H,
30367 M_INTEL_SLM,
30368 M_CPU_SUBTYPE_START,
30369 M_INTEL_COREI7_NEHALEM,
30370 M_INTEL_COREI7_WESTMERE,
30371 M_INTEL_COREI7_SANDYBRIDGE,
30372 M_AMDFAM10H_BARCELONA,
30373 M_AMDFAM10H_SHANGHAI,
30374 M_AMDFAM10H_ISTANBUL,
30375 M_AMDFAM15H_BDVER1,
30376 M_AMDFAM15H_BDVER2,
30377 M_AMDFAM15H_BDVER3
30380 static struct _arch_names_table
30382 const char *const name;
30383 const enum processor_model model;
30385 const arch_names_table[] =
30387 {"amd", M_AMD},
30388 {"intel", M_INTEL},
30389 {"atom", M_INTEL_ATOM},
30390 {"slm", M_INTEL_SLM},
30391 {"core2", M_INTEL_CORE2},
30392 {"corei7", M_INTEL_COREI7},
30393 {"nehalem", M_INTEL_COREI7_NEHALEM},
30394 {"westmere", M_INTEL_COREI7_WESTMERE},
30395 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30396 {"amdfam10h", M_AMDFAM10H},
30397 {"barcelona", M_AMDFAM10H_BARCELONA},
30398 {"shanghai", M_AMDFAM10H_SHANGHAI},
30399 {"istanbul", M_AMDFAM10H_ISTANBUL},
30400 {"amdfam15h", M_AMDFAM15H},
30401 {"bdver1", M_AMDFAM15H_BDVER1},
30402 {"bdver2", M_AMDFAM15H_BDVER2},
30403 {"bdver3", M_AMDFAM15H_BDVER3},
30406 static struct _isa_names_table
30408 const char *const name;
30409 const enum processor_features feature;
30411 const isa_names_table[] =
30413 {"cmov", F_CMOV},
30414 {"mmx", F_MMX},
30415 {"popcnt", F_POPCNT},
30416 {"sse", F_SSE},
30417 {"sse2", F_SSE2},
30418 {"sse3", F_SSE3},
30419 {"ssse3", F_SSSE3},
30420 {"sse4.1", F_SSE4_1},
30421 {"sse4.2", F_SSE4_2},
30422 {"avx", F_AVX},
30423 {"avx2", F_AVX2}
30426 tree __processor_model_type = build_processor_model_struct ();
30427 tree __cpu_model_var = make_var_decl (__processor_model_type,
30428 "__cpu_model");
30431 varpool_add_new_variable (__cpu_model_var);
30433 gcc_assert ((args != NULL) && (*args != NULL));
30435 param_string_cst = *args;
30436 while (param_string_cst
30437 && TREE_CODE (param_string_cst) != STRING_CST)
30439 /* *args must be an expr that can contain other EXPRs leading to a
30440 STRING_CST. */
30441 if (!EXPR_P (param_string_cst))
30443 error ("Parameter to builtin must be a string constant or literal");
30444 return integer_zero_node;
30446 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30449 gcc_assert (param_string_cst);
30451 if (fn_code == IX86_BUILTIN_CPU_IS)
30453 tree ref;
30454 tree field;
30455 tree final;
30457 unsigned int field_val = 0;
30458 unsigned int NUM_ARCH_NAMES
30459 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30461 for (i = 0; i < NUM_ARCH_NAMES; i++)
30462 if (strcmp (arch_names_table[i].name,
30463 TREE_STRING_POINTER (param_string_cst)) == 0)
30464 break;
30466 if (i == NUM_ARCH_NAMES)
30468 error ("Parameter to builtin not valid: %s",
30469 TREE_STRING_POINTER (param_string_cst));
30470 return integer_zero_node;
30473 field = TYPE_FIELDS (__processor_model_type);
30474 field_val = arch_names_table[i].model;
30476 /* CPU types are stored in the next field. */
30477 if (field_val > M_CPU_TYPE_START
30478 && field_val < M_CPU_SUBTYPE_START)
30480 field = DECL_CHAIN (field);
30481 field_val -= M_CPU_TYPE_START;
30484 /* CPU subtypes are stored in the next field. */
30485 if (field_val > M_CPU_SUBTYPE_START)
30487 field = DECL_CHAIN (DECL_CHAIN (field));
30488 field_val -= M_CPU_SUBTYPE_START;
30491 /* Get the appropriate field in __cpu_model. */
30492 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30493 field, NULL_TREE);
30495 /* Check the value. */
30496 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30497 build_int_cstu (unsigned_type_node, field_val));
30498 return build1 (CONVERT_EXPR, integer_type_node, final);
30500 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30502 tree ref;
30503 tree array_elt;
30504 tree field;
30505 tree final;
30507 unsigned int field_val = 0;
30508 unsigned int NUM_ISA_NAMES
30509 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30511 for (i = 0; i < NUM_ISA_NAMES; i++)
30512 if (strcmp (isa_names_table[i].name,
30513 TREE_STRING_POINTER (param_string_cst)) == 0)
30514 break;
30516 if (i == NUM_ISA_NAMES)
30518 error ("Parameter to builtin not valid: %s",
30519 TREE_STRING_POINTER (param_string_cst));
30520 return integer_zero_node;
30523 field = TYPE_FIELDS (__processor_model_type);
30524 /* Get the last field, which is __cpu_features. */
30525 while (DECL_CHAIN (field))
30526 field = DECL_CHAIN (field);
30528 /* Get the appropriate field: __cpu_model.__cpu_features */
30529 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30530 field, NULL_TREE);
30532 /* Access the 0th element of __cpu_features array. */
30533 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30534 integer_zero_node, NULL_TREE, NULL_TREE);
30536 field_val = (1 << isa_names_table[i].feature);
30537 /* Return __cpu_model.__cpu_features[0] & field_val */
30538 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30539 build_int_cstu (unsigned_type_node, field_val));
30540 return build1 (CONVERT_EXPR, integer_type_node, final);
30542 gcc_unreachable ();
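/* Concretely, the folded forms produced above look like this (a sketch
   using the enum values defined in this function):

     __builtin_cpu_is ("amd")
       => (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_is ("corei7")
       => (int) (__cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START)
     __builtin_cpu_supports ("avx2")
       => (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

   so each builtin costs a load plus a compare or mask of the __cpu_model
   variable that libgcc initializes.  */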
30545 static tree
30546 ix86_fold_builtin (tree fndecl, int n_args,
30547 tree *args, bool ignore ATTRIBUTE_UNUSED)
30549 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30551 enum ix86_builtins fn_code = (enum ix86_builtins)
30552 DECL_FUNCTION_CODE (fndecl);
30553 if (fn_code == IX86_BUILTIN_CPU_IS
30554 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30556 gcc_assert (n_args == 1);
30557 return fold_builtin_cpu (fndecl, args);
30561 #ifdef SUBTARGET_FOLD_BUILTIN
30562 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30563 #endif
30565 return NULL_TREE;
30568 /* Make builtins to detect cpu type and features supported. NAME is
30569 the builtin name, CODE is the builtin code, and FTYPE is the function
30570 type of the builtin. */
30572 static void
30573 make_cpu_type_builtin (const char* name, int code,
30574 enum ix86_builtin_func_type ftype, bool is_const)
30576 tree decl;
30577 tree type;
30579 type = ix86_get_builtin_func_type (ftype);
30580 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30581 NULL, NULL_TREE);
30582 gcc_assert (decl != NULL_TREE);
30583 ix86_builtins[(int) code] = decl;
30584 TREE_READONLY (decl) = is_const;
30587 /* Make builtins to get CPU type and features supported. The created
30588 builtins are :
30590 __builtin_cpu_init (), to detect cpu type and features,
30591 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30592 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30595 static void
30596 ix86_init_platform_type_builtins (void)
30598 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30599 INT_FTYPE_VOID, false);
30600 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30601 INT_FTYPE_PCCHAR, true);
30602 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30603 INT_FTYPE_PCCHAR, true);
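/* Usage sketch (hypothetical user-level code, shown only to illustrate the
   builtins registered above):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       run_corei7_path ();
     if (__builtin_cpu_supports ("ssse3"))
       run_ssse3_path ();

   run_corei7_path and run_ssse3_path are made-up user functions; the
   string arguments must be string literals so that fold_builtin_cpu can
   resolve them at compile time.  */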
30606 /* Internal method for ix86_init_builtins. */
30608 static void
30609 ix86_init_builtins_va_builtins_abi (void)
30611 tree ms_va_ref, sysv_va_ref;
30612 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30613 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30614 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30615 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30617 if (!TARGET_64BIT)
30618 return;
30619 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30620 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30621 ms_va_ref = build_reference_type (ms_va_list_type_node);
30622 sysv_va_ref =
30623 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30625 fnvoid_va_end_ms =
30626 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30627 fnvoid_va_start_ms =
30628 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30629 fnvoid_va_end_sysv =
30630 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30631 fnvoid_va_start_sysv =
30632 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30633 NULL_TREE);
30634 fnvoid_va_copy_ms =
30635 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30636 NULL_TREE);
30637 fnvoid_va_copy_sysv =
30638 build_function_type_list (void_type_node, sysv_va_ref,
30639 sysv_va_ref, NULL_TREE);
30641 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30642 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30643 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30644 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30645 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30646 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30647 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30648 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30649 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30650 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30651 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30652 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
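/* Illustration (hypothetical user code, not part of this file): inside a
   64-bit function declared with the ms_abi attribute, the va_* machinery
   resolves to the __builtin_ms_va_* entry points registered above:

     void f (int n, ...) __attribute__ ((ms_abi));
     void f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       // ... __builtin_va_arg (ap, T) ...
       __builtin_ms_va_end (ap);
     }

   A sysv_abi function uses the __builtin_sysv_va_* set instead.  */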
30655 static void
30656 ix86_init_builtin_types (void)
30658 tree float128_type_node, float80_type_node;
30660 /* The __float80 type. */
30661 float80_type_node = long_double_type_node;
30662 if (TYPE_MODE (float80_type_node) != XFmode)
30664 /* long double is not the 80-bit XFmode type here; build __float80 as a distinct type. */
30665 float80_type_node = make_node (REAL_TYPE);
30667 TYPE_PRECISION (float80_type_node) = 80;
30668 layout_type (float80_type_node);
30670 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30672 /* The __float128 type. */
30673 float128_type_node = make_node (REAL_TYPE);
30674 TYPE_PRECISION (float128_type_node) = 128;
30675 layout_type (float128_type_node);
30676 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30678 /* This macro is built by i386-builtin-types.awk. */
30679 DEFINE_BUILTIN_PRIMITIVE_TYPES;
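/* Usage sketch (illustration only): once registered, the types are visible
   to user code, e.g.

     __float80  x = 1.0w;    // XFmode, 80-bit extended precision
     __float128 y = 1.0q;    // TFmode, 128 bits

   The 'w' and 'q' literal suffixes are the GCC extensions usually paired
   with these types; __builtin_fabsq and __builtin_copysignq (created just
   below) operate on __float128 values.  */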
30682 static void
30683 ix86_init_builtins (void)
30685 tree t;
30687 ix86_init_builtin_types ();
30689 /* Builtins to get CPU type and features. */
30690 ix86_init_platform_type_builtins ();
30692 /* TFmode support builtins. */
30693 def_builtin_const (0, "__builtin_infq",
30694 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30695 def_builtin_const (0, "__builtin_huge_valq",
30696 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30698 /* We will expand them to a normal call if SSE isn't available, since
30699 they are used by libgcc. */
30700 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30701 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30702 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30703 TREE_READONLY (t) = 1;
30704 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30706 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30707 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30708 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30709 TREE_READONLY (t) = 1;
30710 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30712 ix86_init_tm_builtins ();
30713 ix86_init_mmx_sse_builtins ();
30715 if (TARGET_LP64)
30716 ix86_init_builtins_va_builtins_abi ();
30718 #ifdef SUBTARGET_INIT_BUILTINS
30719 SUBTARGET_INIT_BUILTINS;
30720 #endif
30723 /* Return the ix86 builtin for CODE. */
30725 static tree
30726 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30728 if (code >= IX86_BUILTIN_MAX)
30729 return error_mark_node;
30731 return ix86_builtins[code];
30734 /* Errors in the source file can cause expand_expr to return const0_rtx
30735 where we expect a vector. To avoid crashing, use one of the vector
30736 clear instructions. */
30737 static rtx
30738 safe_vector_operand (rtx x, enum machine_mode mode)
30740 if (x == const0_rtx)
30741 x = CONST0_RTX (mode);
30742 return x;
30745 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30747 static rtx
30748 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30750 rtx pat;
30751 tree arg0 = CALL_EXPR_ARG (exp, 0);
30752 tree arg1 = CALL_EXPR_ARG (exp, 1);
30753 rtx op0 = expand_normal (arg0);
30754 rtx op1 = expand_normal (arg1);
30755 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30756 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30757 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30759 if (VECTOR_MODE_P (mode0))
30760 op0 = safe_vector_operand (op0, mode0);
30761 if (VECTOR_MODE_P (mode1))
30762 op1 = safe_vector_operand (op1, mode1);
30764 if (optimize || !target
30765 || GET_MODE (target) != tmode
30766 || !insn_data[icode].operand[0].predicate (target, tmode))
30767 target = gen_reg_rtx (tmode);
30769 if (GET_MODE (op1) == SImode && mode1 == TImode)
30771 rtx x = gen_reg_rtx (V4SImode);
30772 emit_insn (gen_sse2_loadd (x, op1));
30773 op1 = gen_lowpart (TImode, x);
30776 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30777 op0 = copy_to_mode_reg (mode0, op0);
30778 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30779 op1 = copy_to_mode_reg (mode1, op1);
30781 pat = GEN_FCN (icode) (target, op0, op1);
30782 if (! pat)
30783 return 0;
30785 emit_insn (pat);
30787 return target;
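/* Expansion sketch (illustrative, not authoritative): a two-operand
   builtin such as __builtin_ia32_paddw128 (a, b) reaches this routine
   through ix86_expand_args_builtin and ends up emitting, in RTL terms,
   roughly

     (set (reg:V8HI tgt) (plus:V8HI (reg:V8HI op0) (reg:V8HI op1)))

   after any operand that fails the insn predicate has been copied into a
   register of the expected mode.  */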
30790 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30792 static rtx
30793 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30794 enum ix86_builtin_func_type m_type,
30795 enum rtx_code sub_code)
30797 rtx pat;
30798 int i;
30799 int nargs;
30800 bool comparison_p = false;
30801 bool tf_p = false;
30802 bool last_arg_constant = false;
30803 int num_memory = 0;
30804 struct {
30805 rtx op;
30806 enum machine_mode mode;
30807 } args[4];
30809 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30811 switch (m_type)
30813 case MULTI_ARG_4_DF2_DI_I:
30814 case MULTI_ARG_4_DF2_DI_I1:
30815 case MULTI_ARG_4_SF2_SI_I:
30816 case MULTI_ARG_4_SF2_SI_I1:
30817 nargs = 4;
30818 last_arg_constant = true;
30819 break;
30821 case MULTI_ARG_3_SF:
30822 case MULTI_ARG_3_DF:
30823 case MULTI_ARG_3_SF2:
30824 case MULTI_ARG_3_DF2:
30825 case MULTI_ARG_3_DI:
30826 case MULTI_ARG_3_SI:
30827 case MULTI_ARG_3_SI_DI:
30828 case MULTI_ARG_3_HI:
30829 case MULTI_ARG_3_HI_SI:
30830 case MULTI_ARG_3_QI:
30831 case MULTI_ARG_3_DI2:
30832 case MULTI_ARG_3_SI2:
30833 case MULTI_ARG_3_HI2:
30834 case MULTI_ARG_3_QI2:
30835 nargs = 3;
30836 break;
30838 case MULTI_ARG_2_SF:
30839 case MULTI_ARG_2_DF:
30840 case MULTI_ARG_2_DI:
30841 case MULTI_ARG_2_SI:
30842 case MULTI_ARG_2_HI:
30843 case MULTI_ARG_2_QI:
30844 nargs = 2;
30845 break;
30847 case MULTI_ARG_2_DI_IMM:
30848 case MULTI_ARG_2_SI_IMM:
30849 case MULTI_ARG_2_HI_IMM:
30850 case MULTI_ARG_2_QI_IMM:
30851 nargs = 2;
30852 last_arg_constant = true;
30853 break;
30855 case MULTI_ARG_1_SF:
30856 case MULTI_ARG_1_DF:
30857 case MULTI_ARG_1_SF2:
30858 case MULTI_ARG_1_DF2:
30859 case MULTI_ARG_1_DI:
30860 case MULTI_ARG_1_SI:
30861 case MULTI_ARG_1_HI:
30862 case MULTI_ARG_1_QI:
30863 case MULTI_ARG_1_SI_DI:
30864 case MULTI_ARG_1_HI_DI:
30865 case MULTI_ARG_1_HI_SI:
30866 case MULTI_ARG_1_QI_DI:
30867 case MULTI_ARG_1_QI_SI:
30868 case MULTI_ARG_1_QI_HI:
30869 nargs = 1;
30870 break;
30872 case MULTI_ARG_2_DI_CMP:
30873 case MULTI_ARG_2_SI_CMP:
30874 case MULTI_ARG_2_HI_CMP:
30875 case MULTI_ARG_2_QI_CMP:
30876 nargs = 2;
30877 comparison_p = true;
30878 break;
30880 case MULTI_ARG_2_SF_TF:
30881 case MULTI_ARG_2_DF_TF:
30882 case MULTI_ARG_2_DI_TF:
30883 case MULTI_ARG_2_SI_TF:
30884 case MULTI_ARG_2_HI_TF:
30885 case MULTI_ARG_2_QI_TF:
30886 nargs = 2;
30887 tf_p = true;
30888 break;
30890 default:
30891 gcc_unreachable ();
30894 if (optimize || !target
30895 || GET_MODE (target) != tmode
30896 || !insn_data[icode].operand[0].predicate (target, tmode))
30897 target = gen_reg_rtx (tmode);
30899 gcc_assert (nargs <= 4);
30901 for (i = 0; i < nargs; i++)
30903 tree arg = CALL_EXPR_ARG (exp, i);
30904 rtx op = expand_normal (arg);
30905 int adjust = (comparison_p) ? 1 : 0;
30906 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30908 if (last_arg_constant && i == nargs - 1)
30910 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30912 enum insn_code new_icode = icode;
30913 switch (icode)
30915 case CODE_FOR_xop_vpermil2v2df3:
30916 case CODE_FOR_xop_vpermil2v4sf3:
30917 case CODE_FOR_xop_vpermil2v4df3:
30918 case CODE_FOR_xop_vpermil2v8sf3:
30919 error ("the last argument must be a 2-bit immediate");
30920 return gen_reg_rtx (tmode);
30921 case CODE_FOR_xop_rotlv2di3:
30922 new_icode = CODE_FOR_rotlv2di3;
30923 goto xop_rotl;
30924 case CODE_FOR_xop_rotlv4si3:
30925 new_icode = CODE_FOR_rotlv4si3;
30926 goto xop_rotl;
30927 case CODE_FOR_xop_rotlv8hi3:
30928 new_icode = CODE_FOR_rotlv8hi3;
30929 goto xop_rotl;
30930 case CODE_FOR_xop_rotlv16qi3:
30931 new_icode = CODE_FOR_rotlv16qi3;
30932 xop_rotl:
30933 if (CONST_INT_P (op))
30935 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30936 op = GEN_INT (INTVAL (op) & mask);
30937 gcc_checking_assert
30938 (insn_data[icode].operand[i + 1].predicate (op, mode));
30940 else
30942 gcc_checking_assert
30943 (nargs == 2
30944 && insn_data[new_icode].operand[0].mode == tmode
30945 && insn_data[new_icode].operand[1].mode == tmode
30946 && insn_data[new_icode].operand[2].mode == mode
30947 && insn_data[new_icode].operand[0].predicate
30948 == insn_data[icode].operand[0].predicate
30949 && insn_data[new_icode].operand[1].predicate
30950 == insn_data[icode].operand[1].predicate);
30951 icode = new_icode;
30952 goto non_constant;
30954 break;
30955 default:
30956 gcc_unreachable ();
30960 else
30962 non_constant:
30963 if (VECTOR_MODE_P (mode))
30964 op = safe_vector_operand (op, mode);
30966 /* If we aren't optimizing, only allow one memory operand to be
30967 generated. */
30968 if (memory_operand (op, mode))
30969 num_memory++;
30971 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30973 if (optimize
30974 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30975 || num_memory > 1)
30976 op = force_reg (mode, op);
30979 args[i].op = op;
30980 args[i].mode = mode;
30983 switch (nargs)
30985 case 1:
30986 pat = GEN_FCN (icode) (target, args[0].op);
30987 break;
30989 case 2:
30990 if (tf_p)
30991 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30992 GEN_INT ((int)sub_code));
30993 else if (! comparison_p)
30994 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30995 else
30997 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30998 args[0].op,
30999 args[1].op);
31001 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31003 break;
31005 case 3:
31006 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31007 break;
31009 case 4:
31010 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31011 break;
31013 default:
31014 gcc_unreachable ();
31017 if (! pat)
31018 return 0;
31020 emit_insn (pat);
31021 return target;
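/* Illustration: the XOP rotate-by-immediate builtins are typical clients
   of the constant-operand handling above.  __builtin_ia32_vprotwi (x, 3)
   is believed to arrive here as MULTI_ARG_2_HI_IMM with icode
   CODE_FOR_xop_rotlv8hi3; a constant count is masked to the element
   width, while a variable count is rerouted to the generic rotate
   pattern through the new_icode switch.  */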
31024 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31025 insns with vec_merge. */
31027 static rtx
31028 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31029 rtx target)
31031 rtx pat;
31032 tree arg0 = CALL_EXPR_ARG (exp, 0);
31033 rtx op1, op0 = expand_normal (arg0);
31034 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31035 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31037 if (optimize || !target
31038 || GET_MODE (target) != tmode
31039 || !insn_data[icode].operand[0].predicate (target, tmode))
31040 target = gen_reg_rtx (tmode);
31042 if (VECTOR_MODE_P (mode0))
31043 op0 = safe_vector_operand (op0, mode0);
31045 if ((optimize && !register_operand (op0, mode0))
31046 || !insn_data[icode].operand[1].predicate (op0, mode0))
31047 op0 = copy_to_mode_reg (mode0, op0);
31049 op1 = op0;
31050 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31051 op1 = copy_to_mode_reg (mode0, op1);
31053 pat = GEN_FCN (icode) (target, op0, op1);
31054 if (! pat)
31055 return 0;
31056 emit_insn (pat);
31057 return target;
31060 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31062 static rtx
31063 ix86_expand_sse_compare (const struct builtin_description *d,
31064 tree exp, rtx target, bool swap)
31066 rtx pat;
31067 tree arg0 = CALL_EXPR_ARG (exp, 0);
31068 tree arg1 = CALL_EXPR_ARG (exp, 1);
31069 rtx op0 = expand_normal (arg0);
31070 rtx op1 = expand_normal (arg1);
31071 rtx op2;
31072 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31073 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31074 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31075 enum rtx_code comparison = d->comparison;
31077 if (VECTOR_MODE_P (mode0))
31078 op0 = safe_vector_operand (op0, mode0);
31079 if (VECTOR_MODE_P (mode1))
31080 op1 = safe_vector_operand (op1, mode1);
31082 /* Swap operands if we have a comparison that isn't available in
31083 hardware. */
31084 if (swap)
31086 rtx tmp = gen_reg_rtx (mode1);
31087 emit_move_insn (tmp, op1);
31088 op1 = op0;
31089 op0 = tmp;
31092 if (optimize || !target
31093 || GET_MODE (target) != tmode
31094 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31095 target = gen_reg_rtx (tmode);
31097 if ((optimize && !register_operand (op0, mode0))
31098 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31099 op0 = copy_to_mode_reg (mode0, op0);
31100 if ((optimize && !register_operand (op1, mode1))
31101 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31102 op1 = copy_to_mode_reg (mode1, op1);
31104 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31105 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31106 if (! pat)
31107 return 0;
31108 emit_insn (pat);
31109 return target;
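/* Illustration: the swap path above serves comparisons that exist in only
   one direction in hardware.  __builtin_ia32_cmpgtps (a, b), for example,
   is described as V4SF_FTYPE_V4SF_V4SF_SWAP with comparison LT, so it is
   emitted as a less-than compare with the operands interchanged.  */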
31112 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31114 static rtx
31115 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31116 rtx target)
31118 rtx pat;
31119 tree arg0 = CALL_EXPR_ARG (exp, 0);
31120 tree arg1 = CALL_EXPR_ARG (exp, 1);
31121 rtx op0 = expand_normal (arg0);
31122 rtx op1 = expand_normal (arg1);
31123 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31124 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31125 enum rtx_code comparison = d->comparison;
31127 if (VECTOR_MODE_P (mode0))
31128 op0 = safe_vector_operand (op0, mode0);
31129 if (VECTOR_MODE_P (mode1))
31130 op1 = safe_vector_operand (op1, mode1);
31132 /* Swap operands if we have a comparison that isn't available in
31133 hardware. */
31134 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31136 rtx tmp = op1;
31137 op1 = op0;
31138 op0 = tmp;
31141 target = gen_reg_rtx (SImode);
31142 emit_move_insn (target, const0_rtx);
31143 target = gen_rtx_SUBREG (QImode, target, 0);
31145 if ((optimize && !register_operand (op0, mode0))
31146 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31147 op0 = copy_to_mode_reg (mode0, op0);
31148 if ((optimize && !register_operand (op1, mode1))
31149 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31150 op1 = copy_to_mode_reg (mode1, op1);
31152 pat = GEN_FCN (d->icode) (op0, op1);
31153 if (! pat)
31154 return 0;
31155 emit_insn (pat);
31156 emit_insn (gen_rtx_SET (VOIDmode,
31157 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31158 gen_rtx_fmt_ee (comparison, QImode,
31159 SET_DEST (pat),
31160 const0_rtx)));
31162 return SUBREG_REG (target);
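/* Illustration: a comi-style builtin such as __builtin_ia32_comige (a, b)
   expands to the comparison insn followed by a store of the requested
   condition into the low byte of a zeroed SImode pseudo; the SImode
   register underneath the QImode subreg is what the caller receives.  */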
31165 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31167 static rtx
31168 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31169 rtx target)
31171 rtx pat;
31172 tree arg0 = CALL_EXPR_ARG (exp, 0);
31173 rtx op1, op0 = expand_normal (arg0);
31174 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31175 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31177 if (optimize || target == 0
31178 || GET_MODE (target) != tmode
31179 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31180 target = gen_reg_rtx (tmode);
31182 if (VECTOR_MODE_P (mode0))
31183 op0 = safe_vector_operand (op0, mode0);
31185 if ((optimize && !register_operand (op0, mode0))
31186 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31187 op0 = copy_to_mode_reg (mode0, op0);
31189 op1 = GEN_INT (d->comparison);
31191 pat = GEN_FCN (d->icode) (target, op0, op1);
31192 if (! pat)
31193 return 0;
31194 emit_insn (pat);
31195 return target;
31198 static rtx
31199 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31200 tree exp, rtx target)
31202 rtx pat;
31203 tree arg0 = CALL_EXPR_ARG (exp, 0);
31204 tree arg1 = CALL_EXPR_ARG (exp, 1);
31205 rtx op0 = expand_normal (arg0);
31206 rtx op1 = expand_normal (arg1);
31207 rtx op2;
31208 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31209 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31210 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31212 if (optimize || target == 0
31213 || GET_MODE (target) != tmode
31214 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31215 target = gen_reg_rtx (tmode);
31217 op0 = safe_vector_operand (op0, mode0);
31218 op1 = safe_vector_operand (op1, mode1);
31220 if ((optimize && !register_operand (op0, mode0))
31221 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31222 op0 = copy_to_mode_reg (mode0, op0);
31223 if ((optimize && !register_operand (op1, mode1))
31224 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31225 op1 = copy_to_mode_reg (mode1, op1);
31227 op2 = GEN_INT (d->comparison);
31229 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31230 if (! pat)
31231 return 0;
31232 emit_insn (pat);
31233 return target;
31236 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31238 static rtx
31239 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31240 rtx target)
31242 rtx pat;
31243 tree arg0 = CALL_EXPR_ARG (exp, 0);
31244 tree arg1 = CALL_EXPR_ARG (exp, 1);
31245 rtx op0 = expand_normal (arg0);
31246 rtx op1 = expand_normal (arg1);
31247 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31248 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31249 enum rtx_code comparison = d->comparison;
31251 if (VECTOR_MODE_P (mode0))
31252 op0 = safe_vector_operand (op0, mode0);
31253 if (VECTOR_MODE_P (mode1))
31254 op1 = safe_vector_operand (op1, mode1);
31256 target = gen_reg_rtx (SImode);
31257 emit_move_insn (target, const0_rtx);
31258 target = gen_rtx_SUBREG (QImode, target, 0);
31260 if ((optimize && !register_operand (op0, mode0))
31261 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31262 op0 = copy_to_mode_reg (mode0, op0);
31263 if ((optimize && !register_operand (op1, mode1))
31264 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31265 op1 = copy_to_mode_reg (mode1, op1);
31267 pat = GEN_FCN (d->icode) (op0, op1);
31268 if (! pat)
31269 return 0;
31270 emit_insn (pat);
31271 emit_insn (gen_rtx_SET (VOIDmode,
31272 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31273 gen_rtx_fmt_ee (comparison, QImode,
31274 SET_DEST (pat),
31275 const0_rtx)));
31277 return SUBREG_REG (target);
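/* Illustration: __builtin_ia32_ptestz128, __builtin_ia32_ptestc128 and
   __builtin_ia32_ptestnzc128 all come through here with the same ptest
   insn; only d->comparison differs, selecting which condition on the
   resulting flags is materialized as the 0/1 return value.  */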
31280 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31282 static rtx
31283 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31284 tree exp, rtx target)
31286 rtx pat;
31287 tree arg0 = CALL_EXPR_ARG (exp, 0);
31288 tree arg1 = CALL_EXPR_ARG (exp, 1);
31289 tree arg2 = CALL_EXPR_ARG (exp, 2);
31290 tree arg3 = CALL_EXPR_ARG (exp, 3);
31291 tree arg4 = CALL_EXPR_ARG (exp, 4);
31292 rtx scratch0, scratch1;
31293 rtx op0 = expand_normal (arg0);
31294 rtx op1 = expand_normal (arg1);
31295 rtx op2 = expand_normal (arg2);
31296 rtx op3 = expand_normal (arg3);
31297 rtx op4 = expand_normal (arg4);
31298 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31300 tmode0 = insn_data[d->icode].operand[0].mode;
31301 tmode1 = insn_data[d->icode].operand[1].mode;
31302 modev2 = insn_data[d->icode].operand[2].mode;
31303 modei3 = insn_data[d->icode].operand[3].mode;
31304 modev4 = insn_data[d->icode].operand[4].mode;
31305 modei5 = insn_data[d->icode].operand[5].mode;
31306 modeimm = insn_data[d->icode].operand[6].mode;
31308 if (VECTOR_MODE_P (modev2))
31309 op0 = safe_vector_operand (op0, modev2);
31310 if (VECTOR_MODE_P (modev4))
31311 op2 = safe_vector_operand (op2, modev4);
31313 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31314 op0 = copy_to_mode_reg (modev2, op0);
31315 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31316 op1 = copy_to_mode_reg (modei3, op1);
31317 if ((optimize && !register_operand (op2, modev4))
31318 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31319 op2 = copy_to_mode_reg (modev4, op2);
31320 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31321 op3 = copy_to_mode_reg (modei5, op3);
31323 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31325 error ("the fifth argument must be an 8-bit immediate");
31326 return const0_rtx;
31329 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31331 if (optimize || !target
31332 || GET_MODE (target) != tmode0
31333 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31334 target = gen_reg_rtx (tmode0);
31336 scratch1 = gen_reg_rtx (tmode1);
31338 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31340 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31342 if (optimize || !target
31343 || GET_MODE (target) != tmode1
31344 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31345 target = gen_reg_rtx (tmode1);
31347 scratch0 = gen_reg_rtx (tmode0);
31349 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31351 else
31353 gcc_assert (d->flag);
31355 scratch0 = gen_reg_rtx (tmode0);
31356 scratch1 = gen_reg_rtx (tmode1);
31358 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31361 if (! pat)
31362 return 0;
31364 emit_insn (pat);
31366 if (d->flag)
31368 target = gen_reg_rtx (SImode);
31369 emit_move_insn (target, const0_rtx);
31370 target = gen_rtx_SUBREG (QImode, target, 0);
31372 emit_insn
31373 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31374 gen_rtx_fmt_ee (EQ, QImode,
31375 gen_rtx_REG ((enum machine_mode) d->flag,
31376 FLAGS_REG),
31377 const0_rtx)));
31378 return SUBREG_REG (target);
31380 else
31381 return target;
31385 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31387 static rtx
31388 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31389 tree exp, rtx target)
31391 rtx pat;
31392 tree arg0 = CALL_EXPR_ARG (exp, 0);
31393 tree arg1 = CALL_EXPR_ARG (exp, 1);
31394 tree arg2 = CALL_EXPR_ARG (exp, 2);
31395 rtx scratch0, scratch1;
31396 rtx op0 = expand_normal (arg0);
31397 rtx op1 = expand_normal (arg1);
31398 rtx op2 = expand_normal (arg2);
31399 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31401 tmode0 = insn_data[d->icode].operand[0].mode;
31402 tmode1 = insn_data[d->icode].operand[1].mode;
31403 modev2 = insn_data[d->icode].operand[2].mode;
31404 modev3 = insn_data[d->icode].operand[3].mode;
31405 modeimm = insn_data[d->icode].operand[4].mode;
31407 if (VECTOR_MODE_P (modev2))
31408 op0 = safe_vector_operand (op0, modev2);
31409 if (VECTOR_MODE_P (modev3))
31410 op1 = safe_vector_operand (op1, modev3);
31412 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31413 op0 = copy_to_mode_reg (modev2, op0);
31414 if ((optimize && !register_operand (op1, modev3))
31415 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31416 op1 = copy_to_mode_reg (modev3, op1);
31418 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31420 error ("the third argument must be an 8-bit immediate");
31421 return const0_rtx;
31424 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31426 if (optimize || !target
31427 || GET_MODE (target) != tmode0
31428 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31429 target = gen_reg_rtx (tmode0);
31431 scratch1 = gen_reg_rtx (tmode1);
31433 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31435 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31437 if (optimize || !target
31438 || GET_MODE (target) != tmode1
31439 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31440 target = gen_reg_rtx (tmode1);
31442 scratch0 = gen_reg_rtx (tmode0);
31444 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31446 else
31448 gcc_assert (d->flag);
31450 scratch0 = gen_reg_rtx (tmode0);
31451 scratch1 = gen_reg_rtx (tmode1);
31453 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31456 if (! pat)
31457 return 0;
31459 emit_insn (pat);
31461 if (d->flag)
31463 target = gen_reg_rtx (SImode);
31464 emit_move_insn (target, const0_rtx);
31465 target = gen_rtx_SUBREG (QImode, target, 0);
31467 emit_insn
31468 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31469 gen_rtx_fmt_ee (EQ, QImode,
31470 gen_rtx_REG ((enum machine_mode) d->flag,
31471 FLAGS_REG),
31472 const0_rtx)));
31473 return SUBREG_REG (target);
31475 else
31476 return target;
31479 /* Subroutine of ix86_expand_builtin to take care of insns with
31480 a variable number of operands. */
31482 static rtx
31483 ix86_expand_args_builtin (const struct builtin_description *d,
31484 tree exp, rtx target)
31486 rtx pat, real_target;
31487 unsigned int i, nargs;
31488 unsigned int nargs_constant = 0;
31489 int num_memory = 0;
31490 struct
31492 rtx op;
31493 enum machine_mode mode;
31494 } args[4];
31495 bool last_arg_count = false;
31496 enum insn_code icode = d->icode;
31497 const struct insn_data_d *insn_p = &insn_data[icode];
31498 enum machine_mode tmode = insn_p->operand[0].mode;
31499 enum machine_mode rmode = VOIDmode;
31500 bool swap = false;
31501 enum rtx_code comparison = d->comparison;
31503 switch ((enum ix86_builtin_func_type) d->flag)
31505 case V2DF_FTYPE_V2DF_ROUND:
31506 case V4DF_FTYPE_V4DF_ROUND:
31507 case V4SF_FTYPE_V4SF_ROUND:
31508 case V8SF_FTYPE_V8SF_ROUND:
31509 case V4SI_FTYPE_V4SF_ROUND:
31510 case V8SI_FTYPE_V8SF_ROUND:
31511 return ix86_expand_sse_round (d, exp, target);
31512 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31513 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31514 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31515 case INT_FTYPE_V8SF_V8SF_PTEST:
31516 case INT_FTYPE_V4DI_V4DI_PTEST:
31517 case INT_FTYPE_V4DF_V4DF_PTEST:
31518 case INT_FTYPE_V4SF_V4SF_PTEST:
31519 case INT_FTYPE_V2DI_V2DI_PTEST:
31520 case INT_FTYPE_V2DF_V2DF_PTEST:
31521 return ix86_expand_sse_ptest (d, exp, target);
31522 case FLOAT128_FTYPE_FLOAT128:
31523 case FLOAT_FTYPE_FLOAT:
31524 case INT_FTYPE_INT:
31525 case UINT64_FTYPE_INT:
31526 case UINT16_FTYPE_UINT16:
31527 case INT64_FTYPE_INT64:
31528 case INT64_FTYPE_V4SF:
31529 case INT64_FTYPE_V2DF:
31530 case INT_FTYPE_V16QI:
31531 case INT_FTYPE_V8QI:
31532 case INT_FTYPE_V8SF:
31533 case INT_FTYPE_V4DF:
31534 case INT_FTYPE_V4SF:
31535 case INT_FTYPE_V2DF:
31536 case INT_FTYPE_V32QI:
31537 case V16QI_FTYPE_V16QI:
31538 case V8SI_FTYPE_V8SF:
31539 case V8SI_FTYPE_V4SI:
31540 case V8HI_FTYPE_V8HI:
31541 case V8HI_FTYPE_V16QI:
31542 case V8QI_FTYPE_V8QI:
31543 case V8SF_FTYPE_V8SF:
31544 case V8SF_FTYPE_V8SI:
31545 case V8SF_FTYPE_V4SF:
31546 case V8SF_FTYPE_V8HI:
31547 case V4SI_FTYPE_V4SI:
31548 case V4SI_FTYPE_V16QI:
31549 case V4SI_FTYPE_V4SF:
31550 case V4SI_FTYPE_V8SI:
31551 case V4SI_FTYPE_V8HI:
31552 case V4SI_FTYPE_V4DF:
31553 case V4SI_FTYPE_V2DF:
31554 case V4HI_FTYPE_V4HI:
31555 case V4DF_FTYPE_V4DF:
31556 case V4DF_FTYPE_V4SI:
31557 case V4DF_FTYPE_V4SF:
31558 case V4DF_FTYPE_V2DF:
31559 case V4SF_FTYPE_V4SF:
31560 case V4SF_FTYPE_V4SI:
31561 case V4SF_FTYPE_V8SF:
31562 case V4SF_FTYPE_V4DF:
31563 case V4SF_FTYPE_V8HI:
31564 case V4SF_FTYPE_V2DF:
31565 case V2DI_FTYPE_V2DI:
31566 case V2DI_FTYPE_V16QI:
31567 case V2DI_FTYPE_V8HI:
31568 case V2DI_FTYPE_V4SI:
31569 case V2DF_FTYPE_V2DF:
31570 case V2DF_FTYPE_V4SI:
31571 case V2DF_FTYPE_V4DF:
31572 case V2DF_FTYPE_V4SF:
31573 case V2DF_FTYPE_V2SI:
31574 case V2SI_FTYPE_V2SI:
31575 case V2SI_FTYPE_V4SF:
31576 case V2SI_FTYPE_V2SF:
31577 case V2SI_FTYPE_V2DF:
31578 case V2SF_FTYPE_V2SF:
31579 case V2SF_FTYPE_V2SI:
31580 case V32QI_FTYPE_V32QI:
31581 case V32QI_FTYPE_V16QI:
31582 case V16HI_FTYPE_V16HI:
31583 case V16HI_FTYPE_V8HI:
31584 case V8SI_FTYPE_V8SI:
31585 case V16HI_FTYPE_V16QI:
31586 case V8SI_FTYPE_V16QI:
31587 case V4DI_FTYPE_V16QI:
31588 case V8SI_FTYPE_V8HI:
31589 case V4DI_FTYPE_V8HI:
31590 case V4DI_FTYPE_V4SI:
31591 case V4DI_FTYPE_V2DI:
31592 nargs = 1;
31593 break;
31594 case V4SF_FTYPE_V4SF_VEC_MERGE:
31595 case V2DF_FTYPE_V2DF_VEC_MERGE:
31596 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31597 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31598 case V16QI_FTYPE_V16QI_V16QI:
31599 case V16QI_FTYPE_V8HI_V8HI:
31600 case V8QI_FTYPE_V8QI_V8QI:
31601 case V8QI_FTYPE_V4HI_V4HI:
31602 case V8HI_FTYPE_V8HI_V8HI:
31603 case V8HI_FTYPE_V16QI_V16QI:
31604 case V8HI_FTYPE_V4SI_V4SI:
31605 case V8SF_FTYPE_V8SF_V8SF:
31606 case V8SF_FTYPE_V8SF_V8SI:
31607 case V4SI_FTYPE_V4SI_V4SI:
31608 case V4SI_FTYPE_V8HI_V8HI:
31609 case V4SI_FTYPE_V4SF_V4SF:
31610 case V4SI_FTYPE_V2DF_V2DF:
31611 case V4HI_FTYPE_V4HI_V4HI:
31612 case V4HI_FTYPE_V8QI_V8QI:
31613 case V4HI_FTYPE_V2SI_V2SI:
31614 case V4DF_FTYPE_V4DF_V4DF:
31615 case V4DF_FTYPE_V4DF_V4DI:
31616 case V4SF_FTYPE_V4SF_V4SF:
31617 case V4SF_FTYPE_V4SF_V4SI:
31618 case V4SF_FTYPE_V4SF_V2SI:
31619 case V4SF_FTYPE_V4SF_V2DF:
31620 case V4SF_FTYPE_V4SF_DI:
31621 case V4SF_FTYPE_V4SF_SI:
31622 case V2DI_FTYPE_V2DI_V2DI:
31623 case V2DI_FTYPE_V16QI_V16QI:
31624 case V2DI_FTYPE_V4SI_V4SI:
31625 case V2UDI_FTYPE_V4USI_V4USI:
31626 case V2DI_FTYPE_V2DI_V16QI:
31627 case V2DI_FTYPE_V2DF_V2DF:
31628 case V2SI_FTYPE_V2SI_V2SI:
31629 case V2SI_FTYPE_V4HI_V4HI:
31630 case V2SI_FTYPE_V2SF_V2SF:
31631 case V2DF_FTYPE_V2DF_V2DF:
31632 case V2DF_FTYPE_V2DF_V4SF:
31633 case V2DF_FTYPE_V2DF_V2DI:
31634 case V2DF_FTYPE_V2DF_DI:
31635 case V2DF_FTYPE_V2DF_SI:
31636 case V2SF_FTYPE_V2SF_V2SF:
31637 case V1DI_FTYPE_V1DI_V1DI:
31638 case V1DI_FTYPE_V8QI_V8QI:
31639 case V1DI_FTYPE_V2SI_V2SI:
31640 case V32QI_FTYPE_V16HI_V16HI:
31641 case V16HI_FTYPE_V8SI_V8SI:
31642 case V32QI_FTYPE_V32QI_V32QI:
31643 case V16HI_FTYPE_V32QI_V32QI:
31644 case V16HI_FTYPE_V16HI_V16HI:
31645 case V8SI_FTYPE_V4DF_V4DF:
31646 case V8SI_FTYPE_V8SI_V8SI:
31647 case V8SI_FTYPE_V16HI_V16HI:
31648 case V4DI_FTYPE_V4DI_V4DI:
31649 case V4DI_FTYPE_V8SI_V8SI:
31650 case V4UDI_FTYPE_V8USI_V8USI:
31651 if (comparison == UNKNOWN)
31652 return ix86_expand_binop_builtin (icode, exp, target);
31653 nargs = 2;
31654 break;
31655 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31656 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31657 gcc_assert (comparison != UNKNOWN);
31658 nargs = 2;
31659 swap = true;
31660 break;
31661 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31662 case V16HI_FTYPE_V16HI_SI_COUNT:
31663 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31664 case V8SI_FTYPE_V8SI_SI_COUNT:
31665 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31666 case V4DI_FTYPE_V4DI_INT_COUNT:
31667 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31668 case V8HI_FTYPE_V8HI_SI_COUNT:
31669 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31670 case V4SI_FTYPE_V4SI_SI_COUNT:
31671 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31672 case V4HI_FTYPE_V4HI_SI_COUNT:
31673 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31674 case V2DI_FTYPE_V2DI_SI_COUNT:
31675 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31676 case V2SI_FTYPE_V2SI_SI_COUNT:
31677 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31678 case V1DI_FTYPE_V1DI_SI_COUNT:
31679 nargs = 2;
31680 last_arg_count = true;
31681 break;
31682 case UINT64_FTYPE_UINT64_UINT64:
31683 case UINT_FTYPE_UINT_UINT:
31684 case UINT_FTYPE_UINT_USHORT:
31685 case UINT_FTYPE_UINT_UCHAR:
31686 case UINT16_FTYPE_UINT16_INT:
31687 case UINT8_FTYPE_UINT8_INT:
31688 nargs = 2;
31689 break;
31690 case V2DI_FTYPE_V2DI_INT_CONVERT:
31691 nargs = 2;
31692 rmode = V1TImode;
31693 nargs_constant = 1;
31694 break;
31695 case V4DI_FTYPE_V4DI_INT_CONVERT:
31696 nargs = 2;
31697 rmode = V2TImode;
31698 nargs_constant = 1;
31699 break;
31700 case V8HI_FTYPE_V8HI_INT:
31701 case V8HI_FTYPE_V8SF_INT:
31702 case V8HI_FTYPE_V4SF_INT:
31703 case V8SF_FTYPE_V8SF_INT:
31704 case V4SI_FTYPE_V4SI_INT:
31705 case V4SI_FTYPE_V8SI_INT:
31706 case V4HI_FTYPE_V4HI_INT:
31707 case V4DF_FTYPE_V4DF_INT:
31708 case V4SF_FTYPE_V4SF_INT:
31709 case V4SF_FTYPE_V8SF_INT:
31710 case V2DI_FTYPE_V2DI_INT:
31711 case V2DF_FTYPE_V2DF_INT:
31712 case V2DF_FTYPE_V4DF_INT:
31713 case V16HI_FTYPE_V16HI_INT:
31714 case V8SI_FTYPE_V8SI_INT:
31715 case V4DI_FTYPE_V4DI_INT:
31716 case V2DI_FTYPE_V4DI_INT:
31717 nargs = 2;
31718 nargs_constant = 1;
31719 break;
31720 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31721 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31722 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31723 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31724 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31725 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31726 nargs = 3;
31727 break;
31728 case V32QI_FTYPE_V32QI_V32QI_INT:
31729 case V16HI_FTYPE_V16HI_V16HI_INT:
31730 case V16QI_FTYPE_V16QI_V16QI_INT:
31731 case V4DI_FTYPE_V4DI_V4DI_INT:
31732 case V8HI_FTYPE_V8HI_V8HI_INT:
31733 case V8SI_FTYPE_V8SI_V8SI_INT:
31734 case V8SI_FTYPE_V8SI_V4SI_INT:
31735 case V8SF_FTYPE_V8SF_V8SF_INT:
31736 case V8SF_FTYPE_V8SF_V4SF_INT:
31737 case V4SI_FTYPE_V4SI_V4SI_INT:
31738 case V4DF_FTYPE_V4DF_V4DF_INT:
31739 case V4DF_FTYPE_V4DF_V2DF_INT:
31740 case V4SF_FTYPE_V4SF_V4SF_INT:
31741 case V2DI_FTYPE_V2DI_V2DI_INT:
31742 case V4DI_FTYPE_V4DI_V2DI_INT:
31743 case V2DF_FTYPE_V2DF_V2DF_INT:
31744 nargs = 3;
31745 nargs_constant = 1;
31746 break;
31747 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31748 nargs = 3;
31749 rmode = V4DImode;
31750 nargs_constant = 1;
31751 break;
31752 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31753 nargs = 3;
31754 rmode = V2DImode;
31755 nargs_constant = 1;
31756 break;
31757 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31758 nargs = 3;
31759 rmode = DImode;
31760 nargs_constant = 1;
31761 break;
31762 case V2DI_FTYPE_V2DI_UINT_UINT:
31763 nargs = 3;
31764 nargs_constant = 2;
31765 break;
31766 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31767 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31768 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31769 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31770 nargs = 4;
31771 nargs_constant = 1;
31772 break;
31773 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31774 nargs = 4;
31775 nargs_constant = 2;
31776 break;
31777 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31778 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31779 nargs = 4;
31780 break;
31781 default:
31782 gcc_unreachable ();
31785 gcc_assert (nargs <= ARRAY_SIZE (args));
31787 if (comparison != UNKNOWN)
31789 gcc_assert (nargs == 2);
31790 return ix86_expand_sse_compare (d, exp, target, swap);
31793 if (rmode == VOIDmode || rmode == tmode)
31795 if (optimize
31796 || target == 0
31797 || GET_MODE (target) != tmode
31798 || !insn_p->operand[0].predicate (target, tmode))
31799 target = gen_reg_rtx (tmode);
31800 real_target = target;
31802 else
31804 target = gen_reg_rtx (rmode);
31805 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31808 for (i = 0; i < nargs; i++)
31810 tree arg = CALL_EXPR_ARG (exp, i);
31811 rtx op = expand_normal (arg);
31812 enum machine_mode mode = insn_p->operand[i + 1].mode;
31813 bool match = insn_p->operand[i + 1].predicate (op, mode);
31815 if (last_arg_count && (i + 1) == nargs)
31817 /* SIMD shift insns take either an 8-bit immediate or a
31818 register as the count. But builtin functions take int as
31819 the count. If the count doesn't match, we put it in a register. */
31820 if (!match)
31822 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31823 if (!insn_p->operand[i + 1].predicate (op, mode))
31824 op = copy_to_reg (op);
31827 else if ((nargs - i) <= nargs_constant)
31829 if (!match)
31830 switch (icode)
31832 case CODE_FOR_avx2_inserti128:
31833 case CODE_FOR_avx2_extracti128:
31834 error ("the last argument must be an 1-bit immediate");
31835 return const0_rtx;
31837 case CODE_FOR_sse4_1_roundsd:
31838 case CODE_FOR_sse4_1_roundss:
31840 case CODE_FOR_sse4_1_roundpd:
31841 case CODE_FOR_sse4_1_roundps:
31842 case CODE_FOR_avx_roundpd256:
31843 case CODE_FOR_avx_roundps256:
31845 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31846 case CODE_FOR_sse4_1_roundps_sfix:
31847 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31848 case CODE_FOR_avx_roundps_sfix256:
31850 case CODE_FOR_sse4_1_blendps:
31851 case CODE_FOR_avx_blendpd256:
31852 case CODE_FOR_avx_vpermilv4df:
31853 error ("the last argument must be a 4-bit immediate");
31854 return const0_rtx;
31856 case CODE_FOR_sse4_1_blendpd:
31857 case CODE_FOR_avx_vpermilv2df:
31858 case CODE_FOR_xop_vpermil2v2df3:
31859 case CODE_FOR_xop_vpermil2v4sf3:
31860 case CODE_FOR_xop_vpermil2v4df3:
31861 case CODE_FOR_xop_vpermil2v8sf3:
31862 error ("the last argument must be a 2-bit immediate");
31863 return const0_rtx;
31865 case CODE_FOR_avx_vextractf128v4df:
31866 case CODE_FOR_avx_vextractf128v8sf:
31867 case CODE_FOR_avx_vextractf128v8si:
31868 case CODE_FOR_avx_vinsertf128v4df:
31869 case CODE_FOR_avx_vinsertf128v8sf:
31870 case CODE_FOR_avx_vinsertf128v8si:
31871 error ("the last argument must be a 1-bit immediate");
31872 return const0_rtx;
31874 case CODE_FOR_avx_vmcmpv2df3:
31875 case CODE_FOR_avx_vmcmpv4sf3:
31876 case CODE_FOR_avx_cmpv2df3:
31877 case CODE_FOR_avx_cmpv4sf3:
31878 case CODE_FOR_avx_cmpv4df3:
31879 case CODE_FOR_avx_cmpv8sf3:
31880 error ("the last argument must be a 5-bit immediate");
31881 return const0_rtx;
31883 default:
31884 switch (nargs_constant)
31886 case 2:
31887 if ((nargs - i) == nargs_constant)
31889 error ("the next to last argument must be an 8-bit immediate");
31890 break;
31892 case 1:
31893 error ("the last argument must be an 8-bit immediate");
31894 break;
31895 default:
31896 gcc_unreachable ();
31898 return const0_rtx;
31901 else
31903 if (VECTOR_MODE_P (mode))
31904 op = safe_vector_operand (op, mode);
31906 /* If we aren't optimizing, only allow one memory operand to
31907 be generated. */
31908 if (memory_operand (op, mode))
31909 num_memory++;
31911 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31913 if (optimize || !match || num_memory > 1)
31914 op = copy_to_mode_reg (mode, op);
31916 else
31918 op = copy_to_reg (op);
31919 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31923 args[i].op = op;
31924 args[i].mode = mode;
31927 switch (nargs)
31929 case 1:
31930 pat = GEN_FCN (icode) (real_target, args[0].op);
31931 break;
31932 case 2:
31933 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31934 break;
31935 case 3:
31936 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31937 args[2].op);
31938 break;
31939 case 4:
31940 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31941 args[2].op, args[3].op);
31942 break;
31943 default:
31944 gcc_unreachable ();
31947 if (! pat)
31948 return 0;
31950 emit_insn (pat);
31951 return target;
31954 /* Subroutine of ix86_expand_builtin to take care of special insns
31955 with a variable number of operands. */
31957 static rtx
31958 ix86_expand_special_args_builtin (const struct builtin_description *d,
31959 tree exp, rtx target)
31961 tree arg;
31962 rtx pat, op;
31963 unsigned int i, nargs, arg_adjust, memory;
31964 struct
31966 rtx op;
31967 enum machine_mode mode;
31968 } args[3];
31969 enum insn_code icode = d->icode;
31970 bool last_arg_constant = false;
31971 const struct insn_data_d *insn_p = &insn_data[icode];
31972 enum machine_mode tmode = insn_p->operand[0].mode;
31973 enum { load, store } klass;
31975 switch ((enum ix86_builtin_func_type) d->flag)
31977 case VOID_FTYPE_VOID:
31978 emit_insn (GEN_FCN (icode) (target));
31979 return 0;
31980 case VOID_FTYPE_UINT64:
31981 case VOID_FTYPE_UNSIGNED:
31982 nargs = 0;
31983 klass = store;
31984 memory = 0;
31985 break;
31987 case INT_FTYPE_VOID:
31988 case UINT64_FTYPE_VOID:
31989 case UNSIGNED_FTYPE_VOID:
31990 nargs = 0;
31991 klass = load;
31992 memory = 0;
31993 break;
31994 case UINT64_FTYPE_PUNSIGNED:
31995 case V2DI_FTYPE_PV2DI:
31996 case V4DI_FTYPE_PV4DI:
31997 case V32QI_FTYPE_PCCHAR:
31998 case V16QI_FTYPE_PCCHAR:
31999 case V8SF_FTYPE_PCV4SF:
32000 case V8SF_FTYPE_PCFLOAT:
32001 case V4SF_FTYPE_PCFLOAT:
32002 case V4DF_FTYPE_PCV2DF:
32003 case V4DF_FTYPE_PCDOUBLE:
32004 case V2DF_FTYPE_PCDOUBLE:
32005 case VOID_FTYPE_PVOID:
32006 nargs = 1;
32007 klass = load;
32008 memory = 0;
32009 break;
32010 case VOID_FTYPE_PV2SF_V4SF:
32011 case VOID_FTYPE_PV4DI_V4DI:
32012 case VOID_FTYPE_PV2DI_V2DI:
32013 case VOID_FTYPE_PCHAR_V32QI:
32014 case VOID_FTYPE_PCHAR_V16QI:
32015 case VOID_FTYPE_PFLOAT_V8SF:
32016 case VOID_FTYPE_PFLOAT_V4SF:
32017 case VOID_FTYPE_PDOUBLE_V4DF:
32018 case VOID_FTYPE_PDOUBLE_V2DF:
32019 case VOID_FTYPE_PLONGLONG_LONGLONG:
32020 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32021 case VOID_FTYPE_PINT_INT:
32022 nargs = 1;
32023 klass = store;
32024 /* Reserve memory operand for target. */
32025 memory = ARRAY_SIZE (args);
32026 break;
32027 case V4SF_FTYPE_V4SF_PCV2SF:
32028 case V2DF_FTYPE_V2DF_PCDOUBLE:
32029 nargs = 2;
32030 klass = load;
32031 memory = 1;
32032 break;
32033 case V8SF_FTYPE_PCV8SF_V8SI:
32034 case V4DF_FTYPE_PCV4DF_V4DI:
32035 case V4SF_FTYPE_PCV4SF_V4SI:
32036 case V2DF_FTYPE_PCV2DF_V2DI:
32037 case V8SI_FTYPE_PCV8SI_V8SI:
32038 case V4DI_FTYPE_PCV4DI_V4DI:
32039 case V4SI_FTYPE_PCV4SI_V4SI:
32040 case V2DI_FTYPE_PCV2DI_V2DI:
32041 nargs = 2;
32042 klass = load;
32043 memory = 0;
32044 break;
32045 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32046 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32047 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32048 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32049 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32050 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32051 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32052 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32053 nargs = 2;
32054 klass = store;
32055 /* Reserve memory operand for target. */
32056 memory = ARRAY_SIZE (args);
32057 break;
32058 case VOID_FTYPE_UINT_UINT_UINT:
32059 case VOID_FTYPE_UINT64_UINT_UINT:
32060 case UCHAR_FTYPE_UINT_UINT_UINT:
32061 case UCHAR_FTYPE_UINT64_UINT_UINT:
32062 nargs = 3;
32063 klass = load;
32064 memory = ARRAY_SIZE (args);
32065 last_arg_constant = true;
32066 break;
32067 default:
32068 gcc_unreachable ();
32071 gcc_assert (nargs <= ARRAY_SIZE (args));
32073 if (klass == store)
32075 arg = CALL_EXPR_ARG (exp, 0);
32076 op = expand_normal (arg);
32077 gcc_assert (target == 0);
32078 if (memory)
32080 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32081 target = gen_rtx_MEM (tmode, op);
32083 else
32084 target = force_reg (tmode, op);
32085 arg_adjust = 1;
32087 else
32089 arg_adjust = 0;
32090 if (optimize
32091 || target == 0
32092 || !register_operand (target, tmode)
32093 || GET_MODE (target) != tmode)
32094 target = gen_reg_rtx (tmode);
32097 for (i = 0; i < nargs; i++)
32099 enum machine_mode mode = insn_p->operand[i + 1].mode;
32100 bool match;
32102 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32103 op = expand_normal (arg);
32104 match = insn_p->operand[i + 1].predicate (op, mode);
32106 if (last_arg_constant && (i + 1) == nargs)
32108 if (!match)
32110 if (icode == CODE_FOR_lwp_lwpvalsi3
32111 || icode == CODE_FOR_lwp_lwpinssi3
32112 || icode == CODE_FOR_lwp_lwpvaldi3
32113 || icode == CODE_FOR_lwp_lwpinsdi3)
32114 error ("the last argument must be a 32-bit immediate");
32115 else
32116 error ("the last argument must be an 8-bit immediate");
32117 return const0_rtx;
32120 else
32122 if (i == memory)
32124 /* This must be the memory operand. */
32125 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32126 op = gen_rtx_MEM (mode, op);
32127 gcc_assert (GET_MODE (op) == mode
32128 || GET_MODE (op) == VOIDmode);
32130 else
32132 /* This must be a register. */
32133 if (VECTOR_MODE_P (mode))
32134 op = safe_vector_operand (op, mode);
32136 gcc_assert (GET_MODE (op) == mode
32137 || GET_MODE (op) == VOIDmode);
32138 op = copy_to_mode_reg (mode, op);
32142 args[i].op = op;
32143 args[i].mode = mode;
32146 switch (nargs)
32148 case 0:
32149 pat = GEN_FCN (icode) (target);
32150 break;
32151 case 1:
32152 pat = GEN_FCN (icode) (target, args[0].op);
32153 break;
32154 case 2:
32155 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32156 break;
32157 case 3:
32158 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32159 break;
32160 default:
32161 gcc_unreachable ();
32164 if (! pat)
32165 return 0;
32166 emit_insn (pat);
32167 return klass == store ? 0 : target;
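/* Illustration: unaligned loads and stores are typical users of this
   routine.  __builtin_ia32_loadupd (p) is classified as
   V2DF_FTYPE_PCDOUBLE (klass == load, one pointer argument turned into a
   MEM), while __builtin_ia32_storeupd (p, v) is VOID_FTYPE_PDOUBLE_V2DF
   (klass == store, the MEM becomes the target and 0 is returned).  */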
32170 /* Return the integer constant in ARG. Constrain it to be in the range
32171 of the subparts of VEC_TYPE; issue an error if not. */
32173 static int
32174 get_element_number (tree vec_type, tree arg)
32176 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32178 if (!host_integerp (arg, 1)
32179 || (elt = tree_low_cst (arg, 1), elt > max))
32181 error ("selector must be an integer constant in the range 0..%wi", max);
32182 return 0;
32185 return elt;
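/* Illustration: for a V4SF argument TYPE_VECTOR_SUBPARTS is 4, so valid
   selectors are 0..3.  A call like __builtin_ia32_vec_ext_v4sf (x, 7)
   triggers the error above, and element 0 is used instead so that
   expansion can still proceed.  */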
32188 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32189 ix86_expand_vector_init. We DO have language-level syntax for this, in
32190 the form of (type){ init-list }. Except that since we can't place emms
32191 instructions from inside the compiler, we can't allow the use of MMX
32192 registers unless the user explicitly asks for it. So we do *not* define
32193 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32194 we have builtins invoked by mmintrin.h that give us license to emit
32195 these sorts of instructions. */
32197 static rtx
32198 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32200 enum machine_mode tmode = TYPE_MODE (type);
32201 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32202 int i, n_elt = GET_MODE_NUNITS (tmode);
32203 rtvec v = rtvec_alloc (n_elt);
32205 gcc_assert (VECTOR_MODE_P (tmode));
32206 gcc_assert (call_expr_nargs (exp) == n_elt);
32208 for (i = 0; i < n_elt; ++i)
32210 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32211 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32214 if (!target || !register_operand (target, tmode))
32215 target = gen_reg_rtx (tmode);
32217 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32218 return target;
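/* Illustration: mmintrin.h maps the MMX "set" intrinsics onto these
   builtins; _mm_set_pi32, for instance, is implemented in terms of
   __builtin_ia32_vec_init_v2si, which lands here and is expanded through
   ix86_expand_vector_init.  */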
32221 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32222 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32223 had a language-level syntax for referencing vector elements. */
32225 static rtx
32226 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32228 enum machine_mode tmode, mode0;
32229 tree arg0, arg1;
32230 int elt;
32231 rtx op0;
32233 arg0 = CALL_EXPR_ARG (exp, 0);
32234 arg1 = CALL_EXPR_ARG (exp, 1);
32236 op0 = expand_normal (arg0);
32237 elt = get_element_number (TREE_TYPE (arg0), arg1);
32239 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32240 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32241 gcc_assert (VECTOR_MODE_P (mode0));
32243 op0 = force_reg (mode0, op0);
32245 if (optimize || !target || !register_operand (target, tmode))
32246 target = gen_reg_rtx (tmode);
32248 ix86_expand_vector_extract (true, target, op0, elt);
32250 return target;
32253 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32254 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32255 a language-level syntax for referencing vector elements. */
32257 static rtx
32258 ix86_expand_vec_set_builtin (tree exp)
32260 enum machine_mode tmode, mode1;
32261 tree arg0, arg1, arg2;
32262 int elt;
32263 rtx op0, op1, target;
32265 arg0 = CALL_EXPR_ARG (exp, 0);
32266 arg1 = CALL_EXPR_ARG (exp, 1);
32267 arg2 = CALL_EXPR_ARG (exp, 2);
32269 tmode = TYPE_MODE (TREE_TYPE (arg0));
32270 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32271 gcc_assert (VECTOR_MODE_P (tmode));
32273 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32274 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32275 elt = get_element_number (TREE_TYPE (arg0), arg2);
32277 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32278 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32280 op0 = force_reg (tmode, op0);
32281 op1 = force_reg (mode1, op1);
32283 /* OP0 is the source of these builtin functions and shouldn't be
32284 modified. Create a copy, use it, and return it as the target. */
32285 target = gen_reg_rtx (tmode);
32286 emit_move_insn (target, op0);
32287 ix86_expand_vector_set (true, target, op1, elt);
32289 return target;
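/* Illustration: _mm_insert_epi16 from emmintrin.h is implemented with
   __builtin_ia32_vec_set_v8hi; the expansion above copies the source
   vector, overwrites the selected element via ix86_expand_vector_set,
   and returns the copy, leaving the original operand untouched.  */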
32292 /* Expand an expression EXP that calls a built-in function,
32293 with result going to TARGET if that's convenient
32294 (and in mode MODE if that's convenient).
32295 SUBTARGET may be used as the target for computing one of EXP's operands.
32296 IGNORE is nonzero if the value is to be ignored. */
32298 static rtx
32299 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
32300 enum machine_mode mode ATTRIBUTE_UNUSED,
32301 int ignore ATTRIBUTE_UNUSED)
32303 const struct builtin_description *d;
32304 size_t i;
32305 enum insn_code icode;
32306 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32307 tree arg0, arg1, arg2, arg3, arg4;
32308 rtx op0, op1, op2, op3, op4, pat, insn;
32309 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32310 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32312 /* For CPU builtins that can be folded, fold first and expand the fold. */
32313 switch (fcode)
32315 case IX86_BUILTIN_CPU_INIT:
32317 /* Make it call __cpu_indicator_init in libgcc. */
32318 tree call_expr, fndecl, type;
32319 type = build_function_type_list (integer_type_node, NULL_TREE);
32320 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32321 call_expr = build_call_expr (fndecl, 0);
32322 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32324 case IX86_BUILTIN_CPU_IS:
32325 case IX86_BUILTIN_CPU_SUPPORTS:
32327 tree arg0 = CALL_EXPR_ARG (exp, 0);
32328 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32329 gcc_assert (fold_expr != NULL_TREE);
32330 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32334 /* Determine whether the builtin function is available under the current ISA.
32335 Originally the builtin was not created if it wasn't applicable to the
32336 current ISA based on the command-line switches. With function-specific
32337 options, we need to check in the context of the function making the call
32338 whether it is supported. */
32339 if (ix86_builtins_isa[fcode].isa
32340 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32342 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32343 NULL, (enum fpmath_unit) 0, false);
32345 if (!opts)
32346 error ("%qE needs unknown isa option", fndecl);
32347 else
32349 gcc_assert (opts != NULL);
32350 error ("%qE needs isa option %s", fndecl, opts);
32351 free (opts);
32353 return const0_rtx;
32356 switch (fcode)
32358 case IX86_BUILTIN_MASKMOVQ:
32359 case IX86_BUILTIN_MASKMOVDQU:
32360 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32361 ? CODE_FOR_mmx_maskmovq
32362 : CODE_FOR_sse2_maskmovdqu);
32363 /* Note the arg order is different from the operand order. */
32364 arg1 = CALL_EXPR_ARG (exp, 0);
32365 arg2 = CALL_EXPR_ARG (exp, 1);
32366 arg0 = CALL_EXPR_ARG (exp, 2);
32367 op0 = expand_normal (arg0);
32368 op1 = expand_normal (arg1);
32369 op2 = expand_normal (arg2);
32370 mode0 = insn_data[icode].operand[0].mode;
32371 mode1 = insn_data[icode].operand[1].mode;
32372 mode2 = insn_data[icode].operand[2].mode;
32374 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32375 op0 = gen_rtx_MEM (mode1, op0);
32377 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32378 op0 = copy_to_mode_reg (mode0, op0);
32379 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32380 op1 = copy_to_mode_reg (mode1, op1);
32381 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32382 op2 = copy_to_mode_reg (mode2, op2);
32383 pat = GEN_FCN (icode) (op0, op1, op2);
32384 if (! pat)
32385 return 0;
32386 emit_insn (pat);
32387 return 0;
32389 case IX86_BUILTIN_LDMXCSR:
32390 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32391 target = assign_386_stack_local (SImode, SLOT_TEMP);
32392 emit_move_insn (target, op0);
32393 emit_insn (gen_sse_ldmxcsr (target));
32394 return 0;
32396 case IX86_BUILTIN_STMXCSR:
32397 target = assign_386_stack_local (SImode, SLOT_TEMP);
32398 emit_insn (gen_sse_stmxcsr (target));
32399 return copy_to_mode_reg (SImode, target);
32401 case IX86_BUILTIN_CLFLUSH:
32402 arg0 = CALL_EXPR_ARG (exp, 0);
32403 op0 = expand_normal (arg0);
32404 icode = CODE_FOR_sse2_clflush;
32405 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32406 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32408 emit_insn (gen_sse2_clflush (op0));
32409 return 0;
32411 case IX86_BUILTIN_MONITOR:
32412 arg0 = CALL_EXPR_ARG (exp, 0);
32413 arg1 = CALL_EXPR_ARG (exp, 1);
32414 arg2 = CALL_EXPR_ARG (exp, 2);
32415 op0 = expand_normal (arg0);
32416 op1 = expand_normal (arg1);
32417 op2 = expand_normal (arg2);
32418 if (!REG_P (op0))
32419 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32420 if (!REG_P (op1))
32421 op1 = copy_to_mode_reg (SImode, op1);
32422 if (!REG_P (op2))
32423 op2 = copy_to_mode_reg (SImode, op2);
32424 emit_insn (ix86_gen_monitor (op0, op1, op2));
32425 return 0;
32427 case IX86_BUILTIN_MWAIT:
32428 arg0 = CALL_EXPR_ARG (exp, 0);
32429 arg1 = CALL_EXPR_ARG (exp, 1);
32430 op0 = expand_normal (arg0);
32431 op1 = expand_normal (arg1);
32432 if (!REG_P (op0))
32433 op0 = copy_to_mode_reg (SImode, op0);
32434 if (!REG_P (op1))
32435 op1 = copy_to_mode_reg (SImode, op1);
32436 emit_insn (gen_sse3_mwait (op0, op1));
32437 return 0;
32439 case IX86_BUILTIN_VEC_INIT_V2SI:
32440 case IX86_BUILTIN_VEC_INIT_V4HI:
32441 case IX86_BUILTIN_VEC_INIT_V8QI:
32442 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32444 case IX86_BUILTIN_VEC_EXT_V2DF:
32445 case IX86_BUILTIN_VEC_EXT_V2DI:
32446 case IX86_BUILTIN_VEC_EXT_V4SF:
32447 case IX86_BUILTIN_VEC_EXT_V4SI:
32448 case IX86_BUILTIN_VEC_EXT_V8HI:
32449 case IX86_BUILTIN_VEC_EXT_V2SI:
32450 case IX86_BUILTIN_VEC_EXT_V4HI:
32451 case IX86_BUILTIN_VEC_EXT_V16QI:
32452 return ix86_expand_vec_ext_builtin (exp, target);
32454 case IX86_BUILTIN_VEC_SET_V2DI:
32455 case IX86_BUILTIN_VEC_SET_V4SF:
32456 case IX86_BUILTIN_VEC_SET_V4SI:
32457 case IX86_BUILTIN_VEC_SET_V8HI:
32458 case IX86_BUILTIN_VEC_SET_V4HI:
32459 case IX86_BUILTIN_VEC_SET_V16QI:
32460 return ix86_expand_vec_set_builtin (exp);
32462 case IX86_BUILTIN_INFQ:
32463 case IX86_BUILTIN_HUGE_VALQ:
32465 REAL_VALUE_TYPE inf;
32466 rtx tmp;
32468 real_inf (&inf);
32469 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32471 tmp = validize_mem (force_const_mem (mode, tmp));
32473 if (target == 0)
32474 target = gen_reg_rtx (mode);
32476 emit_move_insn (target, tmp);
32477 return target;
32480 case IX86_BUILTIN_RDPMC:
32481 case IX86_BUILTIN_RDTSC:
32482 case IX86_BUILTIN_RDTSCP:
32484 op0 = gen_reg_rtx (DImode);
32485 op1 = gen_reg_rtx (DImode);
32487 if (fcode == IX86_BUILTIN_RDPMC)
32489 arg0 = CALL_EXPR_ARG (exp, 0);
32490 op2 = expand_normal (arg0);
32491 if (!register_operand (op2, SImode))
32492 op2 = copy_to_mode_reg (SImode, op2);
32494 insn = (TARGET_64BIT
32495 ? gen_rdpmc_rex64 (op0, op1, op2)
32496 : gen_rdpmc (op0, op2));
32497 emit_insn (insn);
32499 else if (fcode == IX86_BUILTIN_RDTSC)
32501 insn = (TARGET_64BIT
32502 ? gen_rdtsc_rex64 (op0, op1)
32503 : gen_rdtsc (op0));
32504 emit_insn (insn);
32506 else
32508 op2 = gen_reg_rtx (SImode);
32510 insn = (TARGET_64BIT
32511 ? gen_rdtscp_rex64 (op0, op1, op2)
32512 : gen_rdtscp (op0, op2));
32513 emit_insn (insn);
32515 arg0 = CALL_EXPR_ARG (exp, 0);
32516 op4 = expand_normal (arg0);
32517 if (!address_operand (op4, VOIDmode))
32519 op4 = convert_memory_address (Pmode, op4);
32520 op4 = copy_addr_to_reg (op4);
32522 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32525 if (target == 0)
32527 /* mode is VOIDmode if __builtin_rd* has been called
32528 without lhs. */
32529 if (mode == VOIDmode)
32530 return target;
32531 target = gen_reg_rtx (mode);
32534 if (TARGET_64BIT)
32536 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32537 op1, 1, OPTAB_DIRECT);
32538 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32539 op0, 1, OPTAB_DIRECT);
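/* The two operations above merge the result halves: op1 holds the high
   32 bits and op0 the low 32 bits, so shifting op1 left by 32 and ORing
   it into op0 leaves the full 64-bit value in op0.  */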
32542 emit_move_insn (target, op0);
32543 return target;
32545 case IX86_BUILTIN_FXSAVE:
32546 case IX86_BUILTIN_FXRSTOR:
32547 case IX86_BUILTIN_FXSAVE64:
32548 case IX86_BUILTIN_FXRSTOR64:
32549 switch (fcode)
32551 case IX86_BUILTIN_FXSAVE:
32552 icode = CODE_FOR_fxsave;
32553 break;
32554 case IX86_BUILTIN_FXRSTOR:
32555 icode = CODE_FOR_fxrstor;
32556 break;
32557 case IX86_BUILTIN_FXSAVE64:
32558 icode = CODE_FOR_fxsave64;
32559 break;
32560 case IX86_BUILTIN_FXRSTOR64:
32561 icode = CODE_FOR_fxrstor64;
32562 break;
32563 default:
32564 gcc_unreachable ();
32567 arg0 = CALL_EXPR_ARG (exp, 0);
32568 op0 = expand_normal (arg0);
32570 if (!address_operand (op0, VOIDmode))
32572 op0 = convert_memory_address (Pmode, op0);
32573 op0 = copy_addr_to_reg (op0);
32575 op0 = gen_rtx_MEM (BLKmode, op0);
32577 pat = GEN_FCN (icode) (op0);
32578 if (pat)
32579 emit_insn (pat);
32580 return 0;
32582 case IX86_BUILTIN_XSAVE:
32583 case IX86_BUILTIN_XRSTOR:
32584 case IX86_BUILTIN_XSAVE64:
32585 case IX86_BUILTIN_XRSTOR64:
32586 case IX86_BUILTIN_XSAVEOPT:
32587 case IX86_BUILTIN_XSAVEOPT64:
32588 arg0 = CALL_EXPR_ARG (exp, 0);
32589 arg1 = CALL_EXPR_ARG (exp, 1);
32590 op0 = expand_normal (arg0);
32591 op1 = expand_normal (arg1);
32593 if (!address_operand (op0, VOIDmode))
32595 op0 = convert_memory_address (Pmode, op0);
32596 op0 = copy_addr_to_reg (op0);
32598 op0 = gen_rtx_MEM (BLKmode, op0);
32600 op1 = force_reg (DImode, op1);
32602 if (TARGET_64BIT)
32604 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32605 NULL, 1, OPTAB_DIRECT);
32606 switch (fcode)
32608 case IX86_BUILTIN_XSAVE:
32609 icode = CODE_FOR_xsave_rex64;
32610 break;
32611 case IX86_BUILTIN_XRSTOR:
32612 icode = CODE_FOR_xrstor_rex64;
32613 break;
32614 case IX86_BUILTIN_XSAVE64:
32615 icode = CODE_FOR_xsave64;
32616 break;
32617 case IX86_BUILTIN_XRSTOR64:
32618 icode = CODE_FOR_xrstor64;
32619 break;
32620 case IX86_BUILTIN_XSAVEOPT:
32621 icode = CODE_FOR_xsaveopt_rex64;
32622 break;
32623 case IX86_BUILTIN_XSAVEOPT64:
32624 icode = CODE_FOR_xsaveopt64;
32625 break;
32626 default:
32627 gcc_unreachable ();
32630 op2 = gen_lowpart (SImode, op2);
32631 op1 = gen_lowpart (SImode, op1);
32632 pat = GEN_FCN (icode) (op0, op1, op2);
32634 else
32636 switch (fcode)
32638 case IX86_BUILTIN_XSAVE:
32639 icode = CODE_FOR_xsave;
32640 break;
32641 case IX86_BUILTIN_XRSTOR:
32642 icode = CODE_FOR_xrstor;
32643 break;
32644 case IX86_BUILTIN_XSAVEOPT:
32645 icode = CODE_FOR_xsaveopt;
32646 break;
32647 default:
32648 gcc_unreachable ();
32650 pat = GEN_FCN (icode) (op0, op1);
32653 if (pat)
32654 emit_insn (pat);
32655 return 0;
32657 case IX86_BUILTIN_LLWPCB:
32658 arg0 = CALL_EXPR_ARG (exp, 0);
32659 op0 = expand_normal (arg0);
32660 icode = CODE_FOR_lwp_llwpcb;
32661 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32662 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32663 emit_insn (gen_lwp_llwpcb (op0));
32664 return 0;
32666 case IX86_BUILTIN_SLWPCB:
32667 icode = CODE_FOR_lwp_slwpcb;
32668 if (!target
32669 || !insn_data[icode].operand[0].predicate (target, Pmode))
32670 target = gen_reg_rtx (Pmode);
32671 emit_insn (gen_lwp_slwpcb (target));
32672 return target;
32674 case IX86_BUILTIN_BEXTRI32:
32675 case IX86_BUILTIN_BEXTRI64:
32676 arg0 = CALL_EXPR_ARG (exp, 0);
32677 arg1 = CALL_EXPR_ARG (exp, 1);
32678 op0 = expand_normal (arg0);
32679 op1 = expand_normal (arg1);
32680 icode = (fcode == IX86_BUILTIN_BEXTRI32
32681 ? CODE_FOR_tbm_bextri_si
32682 : CODE_FOR_tbm_bextri_di);
32683 if (!CONST_INT_P (op1))
32685 error ("last argument must be an immediate");
32686 return const0_rtx;
32688 else
32690 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32691 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32692 op1 = GEN_INT (length);
32693 op2 = GEN_INT (lsb_index);
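/* For example, an immediate of 0x0508 decodes to length == 5 and
   lsb_index == 8, i.e. extract the 5-bit field that starts at bit 8
   of the first operand.  */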
32694 pat = GEN_FCN (icode) (target, op0, op1, op2);
32695 if (pat)
32696 emit_insn (pat);
32697 return target;
32700 case IX86_BUILTIN_RDRAND16_STEP:
32701 icode = CODE_FOR_rdrandhi_1;
32702 mode0 = HImode;
32703 goto rdrand_step;
32705 case IX86_BUILTIN_RDRAND32_STEP:
32706 icode = CODE_FOR_rdrandsi_1;
32707 mode0 = SImode;
32708 goto rdrand_step;
32710 case IX86_BUILTIN_RDRAND64_STEP:
32711 icode = CODE_FOR_rdranddi_1;
32712 mode0 = DImode;
32714 rdrand_step:
32715 op0 = gen_reg_rtx (mode0);
32716 emit_insn (GEN_FCN (icode) (op0));
32718 arg0 = CALL_EXPR_ARG (exp, 0);
32719 op1 = expand_normal (arg0);
32720 if (!address_operand (op1, VOIDmode))
32722 op1 = convert_memory_address (Pmode, op1);
32723 op1 = copy_addr_to_reg (op1);
32725 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32727 op1 = gen_reg_rtx (SImode);
32728 emit_move_insn (op1, CONST1_RTX (SImode));
32730 /* Emit SImode conditional move. */
32731 if (mode0 == HImode)
32733 op2 = gen_reg_rtx (SImode);
32734 emit_insn (gen_zero_extendhisi2 (op2, op0));
32736 else if (mode0 == SImode)
32737 op2 = op0;
32738 else
32739 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32741 if (target == 0)
32742 target = gen_reg_rtx (SImode);
32744 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32745 const0_rtx);
32746 emit_insn (gen_rtx_SET (VOIDmode, target,
32747 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
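/* Note on intent: the conditional move above makes the builtin return 1
   when RDRAND reported success and 0 otherwise.  It relies on the
   documented RDRAND behaviour of zeroing the destination register when
   no random number was available, so selecting the zero-extended
   destination in the failure case yields 0.  */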
32748 return target;
32750 case IX86_BUILTIN_RDSEED16_STEP:
32751 icode = CODE_FOR_rdseedhi_1;
32752 mode0 = HImode;
32753 goto rdseed_step;
32755 case IX86_BUILTIN_RDSEED32_STEP:
32756 icode = CODE_FOR_rdseedsi_1;
32757 mode0 = SImode;
32758 goto rdseed_step;
32760 case IX86_BUILTIN_RDSEED64_STEP:
32761 icode = CODE_FOR_rdseeddi_1;
32762 mode0 = DImode;
32764 rdseed_step:
32765 op0 = gen_reg_rtx (mode0);
32766 emit_insn (GEN_FCN (icode) (op0));
32768 arg0 = CALL_EXPR_ARG (exp, 0);
32769 op1 = expand_normal (arg0);
32770 if (!address_operand (op1, VOIDmode))
32772 op1 = convert_memory_address (Pmode, op1);
32773 op1 = copy_addr_to_reg (op1);
32775 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32777 op2 = gen_reg_rtx (QImode);
32779 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32780 const0_rtx);
32781 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32783 if (target == 0)
32784 target = gen_reg_rtx (SImode);
32786 emit_insn (gen_zero_extendqisi2 (target, op2));
32787 return target;
32789 case IX86_BUILTIN_ADDCARRYX32:
32790 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32791 mode0 = SImode;
32792 goto addcarryx;
32794 case IX86_BUILTIN_ADDCARRYX64:
32795 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32796 mode0 = DImode;
32798 addcarryx:
32799 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32800 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32801 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32802 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32804 op0 = gen_reg_rtx (QImode);
32806 /* Generate CF from input operand. */
32807 op1 = expand_normal (arg0);
32808 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32809 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
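/* Adding 0xff (constm1_rtx) to the carry-in byte sets the hardware carry
   flag exactly when the argument is nonzero, i.e. it transfers the c_in
   argument into CF for the add-with-carry emitted below.  */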
32811 /* Generate an ADCX (or plain ADC) instruction to compute src1 + src2 + CF. */
32812 op2 = expand_normal (arg1);
32813 op3 = expand_normal (arg2);
32815 if (!REG_P (op2))
32816 op2 = copy_to_mode_reg (mode0, op2);
32817 if (!REG_P (op3))
32818 op3 = copy_to_mode_reg (mode0, op3);
32820 op0 = gen_reg_rtx (mode0);
32822 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32823 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32824 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32826 /* Store the result. */
32827 op4 = expand_normal (arg3);
32828 if (!address_operand (op4, VOIDmode))
32830 op4 = convert_memory_address (Pmode, op4);
32831 op4 = copy_addr_to_reg (op4);
32833 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32835 /* Return current CF value. */
32836 if (target == 0)
32837 target = gen_reg_rtx (QImode);
32839 PUT_MODE (pat, QImode);
32840 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32841 return target;
32843 case IX86_BUILTIN_GATHERSIV2DF:
32844 icode = CODE_FOR_avx2_gathersiv2df;
32845 goto gather_gen;
32846 case IX86_BUILTIN_GATHERSIV4DF:
32847 icode = CODE_FOR_avx2_gathersiv4df;
32848 goto gather_gen;
32849 case IX86_BUILTIN_GATHERDIV2DF:
32850 icode = CODE_FOR_avx2_gatherdiv2df;
32851 goto gather_gen;
32852 case IX86_BUILTIN_GATHERDIV4DF:
32853 icode = CODE_FOR_avx2_gatherdiv4df;
32854 goto gather_gen;
32855 case IX86_BUILTIN_GATHERSIV4SF:
32856 icode = CODE_FOR_avx2_gathersiv4sf;
32857 goto gather_gen;
32858 case IX86_BUILTIN_GATHERSIV8SF:
32859 icode = CODE_FOR_avx2_gathersiv8sf;
32860 goto gather_gen;
32861 case IX86_BUILTIN_GATHERDIV4SF:
32862 icode = CODE_FOR_avx2_gatherdiv4sf;
32863 goto gather_gen;
32864 case IX86_BUILTIN_GATHERDIV8SF:
32865 icode = CODE_FOR_avx2_gatherdiv8sf;
32866 goto gather_gen;
32867 case IX86_BUILTIN_GATHERSIV2DI:
32868 icode = CODE_FOR_avx2_gathersiv2di;
32869 goto gather_gen;
32870 case IX86_BUILTIN_GATHERSIV4DI:
32871 icode = CODE_FOR_avx2_gathersiv4di;
32872 goto gather_gen;
32873 case IX86_BUILTIN_GATHERDIV2DI:
32874 icode = CODE_FOR_avx2_gatherdiv2di;
32875 goto gather_gen;
32876 case IX86_BUILTIN_GATHERDIV4DI:
32877 icode = CODE_FOR_avx2_gatherdiv4di;
32878 goto gather_gen;
32879 case IX86_BUILTIN_GATHERSIV4SI:
32880 icode = CODE_FOR_avx2_gathersiv4si;
32881 goto gather_gen;
32882 case IX86_BUILTIN_GATHERSIV8SI:
32883 icode = CODE_FOR_avx2_gathersiv8si;
32884 goto gather_gen;
32885 case IX86_BUILTIN_GATHERDIV4SI:
32886 icode = CODE_FOR_avx2_gatherdiv4si;
32887 goto gather_gen;
32888 case IX86_BUILTIN_GATHERDIV8SI:
32889 icode = CODE_FOR_avx2_gatherdiv8si;
32890 goto gather_gen;
32891 case IX86_BUILTIN_GATHERALTSIV4DF:
32892 icode = CODE_FOR_avx2_gathersiv4df;
32893 goto gather_gen;
32894 case IX86_BUILTIN_GATHERALTDIV8SF:
32895 icode = CODE_FOR_avx2_gatherdiv8sf;
32896 goto gather_gen;
32897 case IX86_BUILTIN_GATHERALTSIV4DI:
32898 icode = CODE_FOR_avx2_gathersiv4di;
32899 goto gather_gen;
32900 case IX86_BUILTIN_GATHERALTDIV8SI:
32901 icode = CODE_FOR_avx2_gatherdiv8si;
32902 goto gather_gen;
32904 gather_gen:
32905 arg0 = CALL_EXPR_ARG (exp, 0);
32906 arg1 = CALL_EXPR_ARG (exp, 1);
32907 arg2 = CALL_EXPR_ARG (exp, 2);
32908 arg3 = CALL_EXPR_ARG (exp, 3);
32909 arg4 = CALL_EXPR_ARG (exp, 4);
32910 op0 = expand_normal (arg0);
32911 op1 = expand_normal (arg1);
32912 op2 = expand_normal (arg2);
32913 op3 = expand_normal (arg3);
32914 op4 = expand_normal (arg4);
32915 /* Note the arg order is different from the operand order. */
32916 mode0 = insn_data[icode].operand[1].mode;
32917 mode2 = insn_data[icode].operand[3].mode;
32918 mode3 = insn_data[icode].operand[4].mode;
32919 mode4 = insn_data[icode].operand[5].mode;
32921 if (target == NULL_RTX
32922 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32923 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32924 else
32925 subtarget = target;
32927 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32928 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
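/* The GATHERALTSIV4DF/GATHERALTSIV4DI builtins take a V8SI index, but
   the gathersiv4df/gathersiv4di patterns consume only four indices, so
   extract and use the low half of the index vector.  */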
32930 rtx half = gen_reg_rtx (V4SImode);
32931 if (!nonimmediate_operand (op2, V8SImode))
32932 op2 = copy_to_mode_reg (V8SImode, op2);
32933 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32934 op2 = half;
32936 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32937 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32939 rtx (*gen) (rtx, rtx);
32940 rtx half = gen_reg_rtx (mode0);
32941 if (mode0 == V4SFmode)
32942 gen = gen_vec_extract_lo_v8sf;
32943 else
32944 gen = gen_vec_extract_lo_v8si;
32945 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32946 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32947 emit_insn (gen (half, op0));
32948 op0 = half;
32949 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32950 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32951 emit_insn (gen (half, op3));
32952 op3 = half;
32955 /* Force the memory operand to use only a base register here. We
32956 don't want to do this to the memory operands of other builtin
32957 functions. */
32958 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32960 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32961 op0 = copy_to_mode_reg (mode0, op0);
32962 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32963 op1 = copy_to_mode_reg (Pmode, op1);
32964 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32965 op2 = copy_to_mode_reg (mode2, op2);
32966 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32967 op3 = copy_to_mode_reg (mode3, op3);
32968 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32970 error ("last argument must be scale 1, 2, 4, 8");
32971 return const0_rtx;
32974 /* Optimize. If mask is known to have all high bits set,
32975 replace op0 with pc_rtx to signal that the instruction
32976 overwrites the whole destination and doesn't use its
32977 previous contents. */
32978 if (optimize)
32980 if (TREE_CODE (arg3) == VECTOR_CST)
32982 unsigned int negative = 0;
32983 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32985 tree cst = VECTOR_CST_ELT (arg3, i);
32986 if (TREE_CODE (cst) == INTEGER_CST
32987 && tree_int_cst_sign_bit (cst))
32988 negative++;
32989 else if (TREE_CODE (cst) == REAL_CST
32990 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32991 negative++;
32993 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32994 op0 = pc_rtx;
32996 else if (TREE_CODE (arg3) == SSA_NAME)
32998 /* Recognize also when mask is like:
32999 __v2df src = _mm_setzero_pd ();
33000 __v2df mask = _mm_cmpeq_pd (src, src);
33002 __v8sf src = _mm256_setzero_ps ();
33003 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33004 as that is a cheaper way to load all ones into
33005 a register than having to load a constant from
33006 memory. */
33007 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33008 if (is_gimple_call (def_stmt))
33010 tree fndecl = gimple_call_fndecl (def_stmt);
33011 if (fndecl
33012 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33013 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33015 case IX86_BUILTIN_CMPPD:
33016 case IX86_BUILTIN_CMPPS:
33017 case IX86_BUILTIN_CMPPD256:
33018 case IX86_BUILTIN_CMPPS256:
33019 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33020 break;
33021 /* FALLTHRU */
33022 case IX86_BUILTIN_CMPEQPD:
33023 case IX86_BUILTIN_CMPEQPS:
33024 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33025 && initializer_zerop (gimple_call_arg (def_stmt,
33026 1)))
33027 op0 = pc_rtx;
33028 break;
33029 default:
33030 break;
33036 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33037 if (! pat)
33038 return const0_rtx;
33039 emit_insn (pat);
33041 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33042 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33044 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33045 ? V4SFmode : V4SImode;
33046 if (target == NULL_RTX)
33047 target = gen_reg_rtx (tmode);
33048 if (tmode == V4SFmode)
33049 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33050 else
33051 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33053 else
33054 target = subtarget;
33056 return target;
33058 case IX86_BUILTIN_XABORT:
33059 icode = CODE_FOR_xabort;
33060 arg0 = CALL_EXPR_ARG (exp, 0);
33061 op0 = expand_normal (arg0);
33062 mode0 = insn_data[icode].operand[0].mode;
33063 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33065 error ("the xabort's argument must be an 8-bit immediate");
33066 return const0_rtx;
33068 emit_insn (gen_xabort (op0));
33069 return 0;
33071 default:
33072 break;
33075 for (i = 0, d = bdesc_special_args;
33076 i < ARRAY_SIZE (bdesc_special_args);
33077 i++, d++)
33078 if (d->code == fcode)
33079 return ix86_expand_special_args_builtin (d, exp, target);
33081 for (i = 0, d = bdesc_args;
33082 i < ARRAY_SIZE (bdesc_args);
33083 i++, d++)
33084 if (d->code == fcode)
33085 switch (fcode)
33087 case IX86_BUILTIN_FABSQ:
33088 case IX86_BUILTIN_COPYSIGNQ:
33089 if (!TARGET_SSE)
33090 /* Emit a normal call if SSE isn't available. */
33091 return expand_call (exp, target, ignore);
33092 default:
33093 return ix86_expand_args_builtin (d, exp, target);
33096 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33097 if (d->code == fcode)
33098 return ix86_expand_sse_comi (d, exp, target);
33100 for (i = 0, d = bdesc_pcmpestr;
33101 i < ARRAY_SIZE (bdesc_pcmpestr);
33102 i++, d++)
33103 if (d->code == fcode)
33104 return ix86_expand_sse_pcmpestr (d, exp, target);
33106 for (i = 0, d = bdesc_pcmpistr;
33107 i < ARRAY_SIZE (bdesc_pcmpistr);
33108 i++, d++)
33109 if (d->code == fcode)
33110 return ix86_expand_sse_pcmpistr (d, exp, target);
33112 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33113 if (d->code == fcode)
33114 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33115 (enum ix86_builtin_func_type)
33116 d->flag, d->comparison);
33118 gcc_unreachable ();
33121 /* Returns a function decl for a vectorized version of the builtin function
33122 FNDECL, taking input vector type TYPE_IN and producing result vector type
33123 TYPE_OUT, or NULL_TREE if it is not available. */
33125 static tree
33126 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33127 tree type_in)
33129 enum machine_mode in_mode, out_mode;
33130 int in_n, out_n;
33131 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33133 if (TREE_CODE (type_out) != VECTOR_TYPE
33134 || TREE_CODE (type_in) != VECTOR_TYPE
33135 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33136 return NULL_TREE;
33138 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33139 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33140 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33141 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33143 switch (fn)
33145 case BUILT_IN_SQRT:
33146 if (out_mode == DFmode && in_mode == DFmode)
33148 if (out_n == 2 && in_n == 2)
33149 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33150 else if (out_n == 4 && in_n == 4)
33151 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33153 break;
33155 case BUILT_IN_SQRTF:
33156 if (out_mode == SFmode && in_mode == SFmode)
33158 if (out_n == 4 && in_n == 4)
33159 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33160 else if (out_n == 8 && in_n == 8)
33161 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33163 break;
33165 case BUILT_IN_IFLOOR:
33166 case BUILT_IN_LFLOOR:
33167 case BUILT_IN_LLFLOOR:
33168 /* The round insn does not trap on denormals. */
33169 if (flag_trapping_math || !TARGET_ROUND)
33170 break;
33172 if (out_mode == SImode && in_mode == DFmode)
33174 if (out_n == 4 && in_n == 2)
33175 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33176 else if (out_n == 8 && in_n == 4)
33177 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33179 break;
33181 case BUILT_IN_IFLOORF:
33182 case BUILT_IN_LFLOORF:
33183 case BUILT_IN_LLFLOORF:
33184 /* The round insn does not trap on denormals. */
33185 if (flag_trapping_math || !TARGET_ROUND)
33186 break;
33188 if (out_mode == SImode && in_mode == SFmode)
33190 if (out_n == 4 && in_n == 4)
33191 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33192 else if (out_n == 8 && in_n == 8)
33193 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33195 break;
33197 case BUILT_IN_ICEIL:
33198 case BUILT_IN_LCEIL:
33199 case BUILT_IN_LLCEIL:
33200 /* The round insn does not trap on denormals. */
33201 if (flag_trapping_math || !TARGET_ROUND)
33202 break;
33204 if (out_mode == SImode && in_mode == DFmode)
33206 if (out_n == 4 && in_n == 2)
33207 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33208 else if (out_n == 8 && in_n == 4)
33209 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33211 break;
33213 case BUILT_IN_ICEILF:
33214 case BUILT_IN_LCEILF:
33215 case BUILT_IN_LLCEILF:
33216 /* The round insn does not trap on denormals. */
33217 if (flag_trapping_math || !TARGET_ROUND)
33218 break;
33220 if (out_mode == SImode && in_mode == SFmode)
33222 if (out_n == 4 && in_n == 4)
33223 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33224 else if (out_n == 8 && in_n == 8)
33225 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33227 break;
33229 case BUILT_IN_IRINT:
33230 case BUILT_IN_LRINT:
33231 case BUILT_IN_LLRINT:
33232 if (out_mode == SImode && in_mode == DFmode)
33234 if (out_n == 4 && in_n == 2)
33235 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33236 else if (out_n == 8 && in_n == 4)
33237 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33239 break;
33241 case BUILT_IN_IRINTF:
33242 case BUILT_IN_LRINTF:
33243 case BUILT_IN_LLRINTF:
33244 if (out_mode == SImode && in_mode == SFmode)
33246 if (out_n == 4 && in_n == 4)
33247 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33248 else if (out_n == 8 && in_n == 8)
33249 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33251 break;
33253 case BUILT_IN_IROUND:
33254 case BUILT_IN_LROUND:
33255 case BUILT_IN_LLROUND:
33256 /* The round insn does not trap on denormals. */
33257 if (flag_trapping_math || !TARGET_ROUND)
33258 break;
33260 if (out_mode == SImode && in_mode == DFmode)
33262 if (out_n == 4 && in_n == 2)
33263 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33264 else if (out_n == 8 && in_n == 4)
33265 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33267 break;
33269 case BUILT_IN_IROUNDF:
33270 case BUILT_IN_LROUNDF:
33271 case BUILT_IN_LLROUNDF:
33272 /* The round insn does not trap on denormals. */
33273 if (flag_trapping_math || !TARGET_ROUND)
33274 break;
33276 if (out_mode == SImode && in_mode == SFmode)
33278 if (out_n == 4 && in_n == 4)
33279 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33280 else if (out_n == 8 && in_n == 8)
33281 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33283 break;
33285 case BUILT_IN_COPYSIGN:
33286 if (out_mode == DFmode && in_mode == DFmode)
33288 if (out_n == 2 && in_n == 2)
33289 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33290 else if (out_n == 4 && in_n == 4)
33291 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33293 break;
33295 case BUILT_IN_COPYSIGNF:
33296 if (out_mode == SFmode && in_mode == SFmode)
33298 if (out_n == 4 && in_n == 4)
33299 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33300 else if (out_n == 8 && in_n == 8)
33301 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33303 break;
33305 case BUILT_IN_FLOOR:
33306 /* The round insn does not trap on denormals. */
33307 if (flag_trapping_math || !TARGET_ROUND)
33308 break;
33310 if (out_mode == DFmode && in_mode == DFmode)
33312 if (out_n == 2 && in_n == 2)
33313 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33314 else if (out_n == 4 && in_n == 4)
33315 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33317 break;
33319 case BUILT_IN_FLOORF:
33320 /* The round insn does not trap on denormals. */
33321 if (flag_trapping_math || !TARGET_ROUND)
33322 break;
33324 if (out_mode == SFmode && in_mode == SFmode)
33326 if (out_n == 4 && in_n == 4)
33327 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33328 else if (out_n == 8 && in_n == 8)
33329 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33331 break;
33333 case BUILT_IN_CEIL:
33334 /* The round insn does not trap on denormals. */
33335 if (flag_trapping_math || !TARGET_ROUND)
33336 break;
33338 if (out_mode == DFmode && in_mode == DFmode)
33340 if (out_n == 2 && in_n == 2)
33341 return ix86_builtins[IX86_BUILTIN_CEILPD];
33342 else if (out_n == 4 && in_n == 4)
33343 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33345 break;
33347 case BUILT_IN_CEILF:
33348 /* The round insn does not trap on denormals. */
33349 if (flag_trapping_math || !TARGET_ROUND)
33350 break;
33352 if (out_mode == SFmode && in_mode == SFmode)
33354 if (out_n == 4 && in_n == 4)
33355 return ix86_builtins[IX86_BUILTIN_CEILPS];
33356 else if (out_n == 8 && in_n == 8)
33357 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33359 break;
33361 case BUILT_IN_TRUNC:
33362 /* The round insn does not trap on denormals. */
33363 if (flag_trapping_math || !TARGET_ROUND)
33364 break;
33366 if (out_mode == DFmode && in_mode == DFmode)
33368 if (out_n == 2 && in_n == 2)
33369 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33370 else if (out_n == 4 && in_n == 4)
33371 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33373 break;
33375 case BUILT_IN_TRUNCF:
33376 /* The round insn does not trap on denormals. */
33377 if (flag_trapping_math || !TARGET_ROUND)
33378 break;
33380 if (out_mode == SFmode && in_mode == SFmode)
33382 if (out_n == 4 && in_n == 4)
33383 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33384 else if (out_n == 8 && in_n == 8)
33385 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33387 break;
33389 case BUILT_IN_RINT:
33390 /* The round insn does not trap on denormals. */
33391 if (flag_trapping_math || !TARGET_ROUND)
33392 break;
33394 if (out_mode == DFmode && in_mode == DFmode)
33396 if (out_n == 2 && in_n == 2)
33397 return ix86_builtins[IX86_BUILTIN_RINTPD];
33398 else if (out_n == 4 && in_n == 4)
33399 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33401 break;
33403 case BUILT_IN_RINTF:
33404 /* The round insn does not trap on denormals. */
33405 if (flag_trapping_math || !TARGET_ROUND)
33406 break;
33408 if (out_mode == SFmode && in_mode == SFmode)
33410 if (out_n == 4 && in_n == 4)
33411 return ix86_builtins[IX86_BUILTIN_RINTPS];
33412 else if (out_n == 8 && in_n == 8)
33413 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33415 break;
33417 case BUILT_IN_ROUND:
33418 /* The round insn does not trap on denormals. */
33419 if (flag_trapping_math || !TARGET_ROUND)
33420 break;
33422 if (out_mode == DFmode && in_mode == DFmode)
33424 if (out_n == 2 && in_n == 2)
33425 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33426 else if (out_n == 4 && in_n == 4)
33427 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33429 break;
33431 case BUILT_IN_ROUNDF:
33432 /* The round insn does not trap on denormals. */
33433 if (flag_trapping_math || !TARGET_ROUND)
33434 break;
33436 if (out_mode == SFmode && in_mode == SFmode)
33438 if (out_n == 4 && in_n == 4)
33439 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33440 else if (out_n == 8 && in_n == 8)
33441 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33443 break;
33445 case BUILT_IN_FMA:
33446 if (out_mode == DFmode && in_mode == DFmode)
33448 if (out_n == 2 && in_n == 2)
33449 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33450 if (out_n == 4 && in_n == 4)
33451 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33453 break;
33455 case BUILT_IN_FMAF:
33456 if (out_mode == SFmode && in_mode == SFmode)
33458 if (out_n == 4 && in_n == 4)
33459 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33460 if (out_n == 8 && in_n == 8)
33461 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33463 break;
33465 default:
33466 break;
33469 /* Dispatch to a handler for a vectorization library. */
33470 if (ix86_veclib_handler)
33471 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33472 type_in);
33474 return NULL_TREE;
33477 /* Handler for an SVML-style interface to
33478 a library with vectorized intrinsics. */
33480 static tree
33481 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33483 char name[20];
33484 tree fntype, new_fndecl, args;
33485 unsigned arity;
33486 const char *bname;
33487 enum machine_mode el_mode, in_mode;
33488 int n, in_n;
33490 /* The SVML is suitable for unsafe math only. */
33491 if (!flag_unsafe_math_optimizations)
33492 return NULL_TREE;
33494 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33495 n = TYPE_VECTOR_SUBPARTS (type_out);
33496 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33497 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33498 if (el_mode != in_mode
33499 || n != in_n)
33500 return NULL_TREE;
33502 switch (fn)
33504 case BUILT_IN_EXP:
33505 case BUILT_IN_LOG:
33506 case BUILT_IN_LOG10:
33507 case BUILT_IN_POW:
33508 case BUILT_IN_TANH:
33509 case BUILT_IN_TAN:
33510 case BUILT_IN_ATAN:
33511 case BUILT_IN_ATAN2:
33512 case BUILT_IN_ATANH:
33513 case BUILT_IN_CBRT:
33514 case BUILT_IN_SINH:
33515 case BUILT_IN_SIN:
33516 case BUILT_IN_ASINH:
33517 case BUILT_IN_ASIN:
33518 case BUILT_IN_COSH:
33519 case BUILT_IN_COS:
33520 case BUILT_IN_ACOSH:
33521 case BUILT_IN_ACOS:
33522 if (el_mode != DFmode || n != 2)
33523 return NULL_TREE;
33524 break;
33526 case BUILT_IN_EXPF:
33527 case BUILT_IN_LOGF:
33528 case BUILT_IN_LOG10F:
33529 case BUILT_IN_POWF:
33530 case BUILT_IN_TANHF:
33531 case BUILT_IN_TANF:
33532 case BUILT_IN_ATANF:
33533 case BUILT_IN_ATAN2F:
33534 case BUILT_IN_ATANHF:
33535 case BUILT_IN_CBRTF:
33536 case BUILT_IN_SINHF:
33537 case BUILT_IN_SINF:
33538 case BUILT_IN_ASINHF:
33539 case BUILT_IN_ASINF:
33540 case BUILT_IN_COSHF:
33541 case BUILT_IN_COSF:
33542 case BUILT_IN_ACOSHF:
33543 case BUILT_IN_ACOSF:
33544 if (el_mode != SFmode || n != 4)
33545 return NULL_TREE;
33546 break;
33548 default:
33549 return NULL_TREE;
33552 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33554 if (fn == BUILT_IN_LOGF)
33555 strcpy (name, "vmlsLn4");
33556 else if (fn == BUILT_IN_LOG)
33557 strcpy (name, "vmldLn2");
33558 else if (n == 4)
33560 sprintf (name, "vmls%s", bname+10);
33561 name[strlen (name)-1] = '4';
33563 else
33564 sprintf (name, "vmld%s2", bname+10);
33566 /* Convert to uppercase. */
33567 name[4] &= ~0x20;
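/* For illustration: with the mangling above, BUILT_IN_SINF becomes
   "vmlsSin4" and BUILT_IN_SIN becomes "vmldSin2"; BUILT_IN_LOGF and
   BUILT_IN_LOG were special-cased to "vmlsLn4" and "vmldLn2".  */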
33569 arity = 0;
33570 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33571 args;
33572 args = TREE_CHAIN (args))
33573 arity++;
33575 if (arity == 1)
33576 fntype = build_function_type_list (type_out, type_in, NULL);
33577 else
33578 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33580 /* Build a function declaration for the vectorized function. */
33581 new_fndecl = build_decl (BUILTINS_LOCATION,
33582 FUNCTION_DECL, get_identifier (name), fntype);
33583 TREE_PUBLIC (new_fndecl) = 1;
33584 DECL_EXTERNAL (new_fndecl) = 1;
33585 DECL_IS_NOVOPS (new_fndecl) = 1;
33586 TREE_READONLY (new_fndecl) = 1;
33588 return new_fndecl;
33591 /* Handler for an ACML-style interface to
33592 a library with vectorized intrinsics. */
33594 static tree
33595 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33597 char name[20] = "__vr.._";
33598 tree fntype, new_fndecl, args;
33599 unsigned arity;
33600 const char *bname;
33601 enum machine_mode el_mode, in_mode;
33602 int n, in_n;
33604 /* The ACML is 64-bit only and suitable for unsafe math only, as
33605 it does not correctly support parts of IEEE with the required
33606 precision such as denormals. */
33607 if (!TARGET_64BIT
33608 || !flag_unsafe_math_optimizations)
33609 return NULL_TREE;
33611 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33612 n = TYPE_VECTOR_SUBPARTS (type_out);
33613 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33614 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33615 if (el_mode != in_mode
33616 || n != in_n)
33617 return NULL_TREE;
33619 switch (fn)
33621 case BUILT_IN_SIN:
33622 case BUILT_IN_COS:
33623 case BUILT_IN_EXP:
33624 case BUILT_IN_LOG:
33625 case BUILT_IN_LOG2:
33626 case BUILT_IN_LOG10:
33627 name[4] = 'd';
33628 name[5] = '2';
33629 if (el_mode != DFmode
33630 || n != 2)
33631 return NULL_TREE;
33632 break;
33634 case BUILT_IN_SINF:
33635 case BUILT_IN_COSF:
33636 case BUILT_IN_EXPF:
33637 case BUILT_IN_POWF:
33638 case BUILT_IN_LOGF:
33639 case BUILT_IN_LOG2F:
33640 case BUILT_IN_LOG10F:
33641 name[4] = 's';
33642 name[5] = '4';
33643 if (el_mode != SFmode
33644 || n != 4)
33645 return NULL_TREE;
33646 break;
33648 default:
33649 return NULL_TREE;
33652 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33653 sprintf (name + 7, "%s", bname+10);
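/* For illustration: filling in the "__vr.._" template above yields
   e.g. "__vrd2_sin" for BUILT_IN_SIN and "__vrs4_sinf" for
   BUILT_IN_SINF.  */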
33655 arity = 0;
33656 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33657 args;
33658 args = TREE_CHAIN (args))
33659 arity++;
33661 if (arity == 1)
33662 fntype = build_function_type_list (type_out, type_in, NULL);
33663 else
33664 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33666 /* Build a function declaration for the vectorized function. */
33667 new_fndecl = build_decl (BUILTINS_LOCATION,
33668 FUNCTION_DECL, get_identifier (name), fntype);
33669 TREE_PUBLIC (new_fndecl) = 1;
33670 DECL_EXTERNAL (new_fndecl) = 1;
33671 DECL_IS_NOVOPS (new_fndecl) = 1;
33672 TREE_READONLY (new_fndecl) = 1;
33674 return new_fndecl;
33677 /* Returns a decl of a function that implements gather load with
33678 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
33679 Return NULL_TREE if it is not available. */
33681 static tree
33682 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33683 const_tree index_type, int scale)
33685 bool si;
33686 enum ix86_builtins code;
33688 if (! TARGET_AVX2)
33689 return NULL_TREE;
33691 if ((TREE_CODE (index_type) != INTEGER_TYPE
33692 && !POINTER_TYPE_P (index_type))
33693 || (TYPE_MODE (index_type) != SImode
33694 && TYPE_MODE (index_type) != DImode))
33695 return NULL_TREE;
33697 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33698 return NULL_TREE;
33700 /* v*gather* insn sign extends index to pointer mode. */
33701 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33702 && TYPE_UNSIGNED (index_type))
33703 return NULL_TREE;
33705 if (scale <= 0
33706 || scale > 8
33707 || (scale & (scale - 1)) != 0)
33708 return NULL_TREE;
33710 si = TYPE_MODE (index_type) == SImode;
33711 switch (TYPE_MODE (mem_vectype))
33713 case V2DFmode:
33714 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33715 break;
33716 case V4DFmode:
33717 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33718 break;
33719 case V2DImode:
33720 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33721 break;
33722 case V4DImode:
33723 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33724 break;
33725 case V4SFmode:
33726 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33727 break;
33728 case V8SFmode:
33729 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33730 break;
33731 case V4SImode:
33732 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33733 break;
33734 case V8SImode:
33735 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33736 break;
33737 default:
33738 return NULL_TREE;
33741 return ix86_builtins[code];
33744 /* Returns a decl for a target-specific builtin that implements the
33745 reciprocal of the function FN, or NULL_TREE if not available. */
33747 static tree
33748 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33749 bool sqrt ATTRIBUTE_UNUSED)
33751 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33752 && flag_finite_math_only && !flag_trapping_math
33753 && flag_unsafe_math_optimizations))
33754 return NULL_TREE;
33756 if (md_fn)
33757 /* Machine dependent builtins. */
33758 switch (fn)
33760 /* Vectorized version of sqrt to rsqrt conversion. */
33761 case IX86_BUILTIN_SQRTPS_NR:
33762 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33764 case IX86_BUILTIN_SQRTPS_NR256:
33765 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33767 default:
33768 return NULL_TREE;
33770 else
33771 /* Normal builtins. */
33772 switch (fn)
33774 /* Sqrt to rsqrt conversion. */
33775 case BUILT_IN_SQRTF:
33776 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33778 default:
33779 return NULL_TREE;
33783 /* Helper for avx_vpermilps256_operand et al. This is also used by
33784 the expansion functions to turn the parallel back into a mask.
33785 The return value is 0 for no match and the imm8+1 for a match. */
33788 int avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33790 unsigned i, nelt = GET_MODE_NUNITS (mode);
33791 unsigned mask = 0;
33792 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33794 if (XVECLEN (par, 0) != (int) nelt)
33795 return 0;
33797 /* Validate that all of the elements are constants, and not totally
33798 out of range. Copy the data into an integral array to make the
33799 subsequent checks easier. */
33800 for (i = 0; i < nelt; ++i)
33802 rtx er = XVECEXP (par, 0, i);
33803 unsigned HOST_WIDE_INT ei;
33805 if (!CONST_INT_P (er))
33806 return 0;
33807 ei = INTVAL (er);
33808 if (ei >= nelt)
33809 return 0;
33810 ipar[i] = ei;
33813 switch (mode)
33815 case V4DFmode:
33816 /* In the 256-bit DFmode case, we can only move elements within
33817 a 128-bit lane. */
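/* Illustration: a selector of (1, 0, 3, 2), i.e. a swap within each
   128-bit lane, produces mask 0x5 below, and the function returns
   mask + 1 == 6.  */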
33818 for (i = 0; i < 2; ++i)
33820 if (ipar[i] >= 2)
33821 return 0;
33822 mask |= ipar[i] << i;
33824 for (i = 2; i < 4; ++i)
33826 if (ipar[i] < 2)
33827 return 0;
33828 mask |= (ipar[i] - 2) << i;
33830 break;
33832 case V8SFmode:
33833 /* In the 256-bit SFmode case, we have full freedom of movement
33834 within the low 128-bit lane, but the high 128-bit lane must
33835 mirror the exact same pattern. */
33836 for (i = 0; i < 4; ++i)
33837 if (ipar[i] + 4 != ipar[i + 4])
33838 return 0;
33839 nelt = 4;
33840 /* FALLTHRU */
33842 case V2DFmode:
33843 case V4SFmode:
33844 /* In the 128-bit case, we've full freedom in the placement of
33845 the elements from the source operand. */
33846 for (i = 0; i < nelt; ++i)
33847 mask |= ipar[i] << (i * (nelt / 2));
33848 break;
33850 default:
33851 gcc_unreachable ();
33854 /* Make sure success has a non-zero value by adding one. */
33855 return mask + 1;
33858 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33859 the expansion functions to turn the parallel back into a mask.
33860 The return value is 0 for no match and the imm8+1 for a match. */
33863 int avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33865 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33866 unsigned mask = 0;
33867 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33869 if (XVECLEN (par, 0) != (int) nelt)
33870 return 0;
33872 /* Validate that all of the elements are constants, and not totally
33873 out of range. Copy the data into an integral array to make the
33874 subsequent checks easier. */
33875 for (i = 0; i < nelt; ++i)
33877 rtx er = XVECEXP (par, 0, i);
33878 unsigned HOST_WIDE_INT ei;
33880 if (!CONST_INT_P (er))
33881 return 0;
33882 ei = INTVAL (er);
33883 if (ei >= 2 * nelt)
33884 return 0;
33885 ipar[i] = ei;
33888 /* Validate that the halves of the permute are halves. */
33889 for (i = 0; i < nelt2 - 1; ++i)
33890 if (ipar[i] + 1 != ipar[i + 1])
33891 return 0;
33892 for (i = nelt2; i < nelt - 1; ++i)
33893 if (ipar[i] + 1 != ipar[i + 1])
33894 return 0;
33896 /* Reconstruct the mask. */
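/* Illustration: for V8SFmode a selector of (4 5 6 7 0 1 2 3), i.e. the
   two 128-bit lanes swapped, gives e == 1 for the low half and e == 0
   for the high half, so mask == 0x1 and the return value is 2.  */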
33897 for (i = 0; i < 2; ++i)
33899 unsigned e = ipar[i * nelt2];
33900 if (e % nelt2)
33901 return 0;
33902 e /= nelt2;
33903 mask |= e << (i * 4);
33906 /* Make sure success has a non-zero value by adding one. */
33907 return mask + 1;
33910 /* Store OPERAND to the memory after reload is completed. This means
33911 that we can't easily use assign_stack_local. */
33913 rtx ix86_force_to_memory (enum machine_mode mode, rtx operand)
33915 rtx result;
33917 gcc_assert (reload_completed);
33918 if (ix86_using_red_zone ())
33920 result = gen_rtx_MEM (mode,
33921 gen_rtx_PLUS (Pmode,
33922 stack_pointer_rtx,
33923 GEN_INT (-RED_ZONE_SIZE)));
33924 emit_move_insn (result, operand);
33926 else if (TARGET_64BIT)
33928 switch (mode)
33930 case HImode:
33931 case SImode:
33932 operand = gen_lowpart (DImode, operand);
33933 /* FALLTHRU */
33934 case DImode:
33935 emit_insn (
33936 gen_rtx_SET (VOIDmode,
33937 gen_rtx_MEM (DImode,
33938 gen_rtx_PRE_DEC (DImode,
33939 stack_pointer_rtx)),
33940 operand));
33941 break;
33942 default:
33943 gcc_unreachable ();
33945 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33947 else
33949 switch (mode)
33951 case DImode:
33953 rtx operands[2];
33954 split_double_mode (mode, &operand, 1, operands, operands + 1);
33955 emit_insn (
33956 gen_rtx_SET (VOIDmode,
33957 gen_rtx_MEM (SImode,
33958 gen_rtx_PRE_DEC (Pmode,
33959 stack_pointer_rtx)),
33960 operands[1]));
33961 emit_insn (
33962 gen_rtx_SET (VOIDmode,
33963 gen_rtx_MEM (SImode,
33964 gen_rtx_PRE_DEC (Pmode,
33965 stack_pointer_rtx)),
33966 operands[0]));
33968 break;
33969 case HImode:
33970 /* Store HImodes as SImodes. */
33971 operand = gen_lowpart (SImode, operand);
33972 /* FALLTHRU */
33973 case SImode:
33974 emit_insn (
33975 gen_rtx_SET (VOIDmode,
33976 gen_rtx_MEM (GET_MODE (operand),
33977 gen_rtx_PRE_DEC (SImode,
33978 stack_pointer_rtx)),
33979 operand));
33980 break;
33981 default:
33982 gcc_unreachable ();
33984 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33986 return result;
33989 /* Free operand from the memory. */
33990 void
33991 ix86_free_from_memory (enum machine_mode mode)
33993 if (!ix86_using_red_zone ())
33995 int size;
33997 if (mode == DImode || TARGET_64BIT)
33998 size = 8;
33999 else
34000 size = 4;
34001 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34002 to a pop or add instruction if registers are available. */
34003 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34004 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34005 GEN_INT (size))));
34009 /* Return a register priority for hard reg REGNO. */
34010 static int
34011 ix86_register_priority (int hard_regno)
34013 /* ebp and r13 as the base always want a displacement, r12 as the
34014 base always wants an index. So discourage their usage in an
34015 address. */
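/* The resulting order of preference (higher is better), as assigned
   below, is: AX (4), all other registers (3), REX integer and REX SSE
   registers (2), BP (1), R12/R13 (0).  */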
34016 if (hard_regno == R12_REG || hard_regno == R13_REG)
34017 return 0;
34018 if (hard_regno == BP_REG)
34019 return 1;
34020 /* New x86-64 int registers result in bigger code size. Discourage
34021 them. */
34022 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34023 return 2;
34024 /* New x86-64 SSE registers result in bigger code size. Discourage
34025 them. */
34026 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34027 return 2;
34028 /* Usage of AX register results in smaller code. Prefer it. */
34029 if (hard_regno == 0)
34030 return 4;
34031 return 3;
34034 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34036 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34037 QImode must go into class Q_REGS.
34038 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34039 movdf to do mem-to-mem moves through integer regs. */
34041 static reg_class_t
34042 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34044 enum machine_mode mode = GET_MODE (x);
34046 /* We're only allowed to return a subclass of CLASS. Many of the
34047 following checks fail for NO_REGS, so eliminate that early. */
34048 if (regclass == NO_REGS)
34049 return NO_REGS;
34051 /* All classes can load zeros. */
34052 if (x == CONST0_RTX (mode))
34053 return regclass;
34055 /* Force constants into memory if we are loading a (nonzero) constant into
34056 an MMX or SSE register. This is because there are no MMX/SSE instructions
34057 to load from a constant. */
34058 if (CONSTANT_P (x)
34059 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
34060 return NO_REGS;
34062 /* Prefer SSE regs only, if we can use them for math. */
34063 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34064 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34066 /* Floating-point constants need more complex checks. */
34067 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34069 /* General regs can load everything. */
34070 if (reg_class_subset_p (regclass, GENERAL_REGS))
34071 return regclass;
34073 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34074 zero above. We only want to wind up preferring 80387 registers if
34075 we plan on doing computation with them. */
34076 if (TARGET_80387
34077 && standard_80387_constant_p (x) > 0)
34079 /* Limit class to non-sse. */
34080 if (regclass == FLOAT_SSE_REGS)
34081 return FLOAT_REGS;
34082 if (regclass == FP_TOP_SSE_REGS)
34083 return FP_TOP_REG;
34084 if (regclass == FP_SECOND_SSE_REGS)
34085 return FP_SECOND_REG;
34086 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34087 return regclass;
34090 return NO_REGS;
34093 /* Generally when we see PLUS here, it's the function invariant
34094 (plus soft-fp const_int), which can only be computed into general
34095 regs. */
34096 if (GET_CODE (x) == PLUS)
34097 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34099 /* QImode constants are easy to load, but non-constant QImode data
34100 must go into Q_REGS. */
34101 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34103 if (reg_class_subset_p (regclass, Q_REGS))
34104 return regclass;
34105 if (reg_class_subset_p (Q_REGS, regclass))
34106 return Q_REGS;
34107 return NO_REGS;
34110 return regclass;
34113 /* Discourage putting floating-point values in SSE registers unless
34114 SSE math is being used, and likewise for the 387 registers. */
34115 static reg_class_t
34116 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34118 enum machine_mode mode = GET_MODE (x);
34120 /* Restrict the output reload class to the register bank that we are doing
34121 math on. If we would like not to return a subset of CLASS, reject this
34122 alternative: if reload cannot do this, it will still use its choice. */
34123 mode = GET_MODE (x);
34124 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34125 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
34127 if (X87_FLOAT_MODE_P (mode))
34129 if (regclass == FP_TOP_SSE_REGS)
34130 return FP_TOP_REG;
34131 else if (regclass == FP_SECOND_SSE_REGS)
34132 return FP_SECOND_REG;
34133 else
34134 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34137 return regclass;
34140 static reg_class_t
34141 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34142 enum machine_mode mode, secondary_reload_info *sri)
34144 /* Double-word spills from general registers to non-offsettable memory
34145 references (zero-extended addresses) require special handling. */
34146 if (TARGET_64BIT
34147 && MEM_P (x)
34148 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34149 && INTEGER_CLASS_P (rclass)
34150 && !offsettable_memref_p (x))
34152 sri->icode = (in_p
34153 ? CODE_FOR_reload_noff_load
34154 : CODE_FOR_reload_noff_store);
34155 /* Add the cost of moving address to a temporary. */
34156 sri->extra_cost = 1;
34158 return NO_REGS;
34161 /* QImode spills from non-QI registers require an
34162 intermediate register on 32-bit targets. */
34163 if (!TARGET_64BIT
34164 && !in_p && mode == QImode
34165 && INTEGER_CLASS_P (rclass)
34166 && MAYBE_NON_Q_CLASS_P (rclass))
34168 int regno;
34170 if (REG_P (x))
34171 regno = REGNO (x);
34172 else
34173 regno = -1;
34175 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34176 regno = true_regnum (x);
34178 /* Return Q_REGS if the operand is in memory. */
34179 if (regno == -1)
34180 return Q_REGS;
34183 /* This condition handles the corner case where an expression involving
34184 pointers gets vectorized. We're trying to use the address of a
34185 stack slot as a vector initializer.
34187 (set (reg:V2DI 74 [ vect_cst_.2 ])
34188 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34190 Eventually frame gets turned into sp+offset like this:
34192 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34193 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34194 (const_int 392 [0x188]))))
34196 That later gets turned into:
34198 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34199 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34200 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34202 We'll have the following reload recorded:
34204 Reload 0: reload_in (DI) =
34205 (plus:DI (reg/f:DI 7 sp)
34206 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34207 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34208 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34209 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34210 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34211 reload_reg_rtx: (reg:V2DI 22 xmm1)
34213 Which isn't going to work since SSE instructions can't handle scalar
34214 additions. Returning GENERAL_REGS forces the addition into integer
34215 register and reload can handle subsequent reloads without problems. */
34217 if (in_p && GET_CODE (x) == PLUS
34218 && SSE_CLASS_P (rclass)
34219 && SCALAR_INT_MODE_P (mode))
34220 return GENERAL_REGS;
34222 return NO_REGS;
34225 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34227 static bool
34228 ix86_class_likely_spilled_p (reg_class_t rclass)
34230 switch (rclass)
34232 case AREG:
34233 case DREG:
34234 case CREG:
34235 case BREG:
34236 case AD_REGS:
34237 case SIREG:
34238 case DIREG:
34239 case SSE_FIRST_REG:
34240 case FP_TOP_REG:
34241 case FP_SECOND_REG:
34242 return true;
34244 default:
34245 break;
34248 return false;
34251 /* If we are copying between general and FP registers, we need a memory
34252 location. The same is true for SSE and MMX registers.
34254 To optimize register_move_cost performance, allow inline variant.
34256 The macro can't work reliably when one of the CLASSES is a class containing
34257 registers from multiple units (SSE, MMX, integer). We avoid this by never
34258 combining those units in a single alternative in the machine description.
34259 Ensure that this constraint holds to avoid unexpected surprises.
34261 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34262 enforce these sanity checks. */
34264 static inline bool
34265 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34266 enum machine_mode mode, int strict)
34268 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34269 return false;
34270 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34271 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34272 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34273 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34274 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34275 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34277 gcc_assert (!strict || lra_in_progress);
34278 return true;
34281 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34282 return true;
34284 /* ??? This is a lie. We do have moves between mmx/general, and for
34285 mmx/sse2. But by saying we need secondary memory we discourage the
34286 register allocator from using the mmx registers unless needed. */
34287 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34288 return true;
34290 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34292 /* SSE1 doesn't have any direct moves from other classes. */
34293 if (!TARGET_SSE2)
34294 return true;
34296 /* If the target says that inter-unit moves are more expensive
34297 than moving through memory, then don't generate them. */
34298 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34299 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34300 return true;
34302 /* Between SSE and general, we have moves no larger than word size. */
34303 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34304 return true;
34307 return false;
34310 bool
34311 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34312 enum machine_mode mode, int strict)
34314 return inline_secondary_memory_needed (class1, class2, mode, strict);
34317 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34319 On the 80386, this is the size of MODE in words,
34320 except in the FP regs, where a single reg is always enough. */
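/* The constants below reflect that XFmode occupies 12 bytes (three
   4-byte words) on ia32 and 16 bytes (two 8-byte words) on x86-64,
   with XCmode being the corresponding complex pair.  */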
34322 static unsigned char
34323 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34325 if (MAYBE_INTEGER_CLASS_P (rclass))
34327 if (mode == XFmode)
34328 return (TARGET_64BIT ? 2 : 3);
34329 else if (mode == XCmode)
34330 return (TARGET_64BIT ? 4 : 6);
34331 else
34332 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34334 else
34336 if (COMPLEX_MODE_P (mode))
34337 return 2;
34338 else
34339 return 1;
34343 /* Return true if the registers in CLASS cannot represent the change from
34344 modes FROM to TO. */
34346 bool
34347 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34348 enum reg_class regclass)
34350 if (from == to)
34351 return false;
34353 /* x87 registers can't do subreg at all, as all values are reformatted
34354 to extended precision. */
34355 if (MAYBE_FLOAT_CLASS_P (regclass))
34356 return true;
34358 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34360 /* Vector registers do not support QI or HImode loads. If we don't
34361 disallow a change to these modes, reload will assume it's ok to
34362 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34363 the vec_dupv4hi pattern. */
34364 if (GET_MODE_SIZE (from) < 4)
34365 return true;
34367 /* Vector registers do not support subreg with nonzero offsets, which
34368 are otherwise valid for integer registers. Since we can't see
34369 whether we have a nonzero offset from here, prohibit all
34370 nonparadoxical subregs changing size. */
34371 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34372 return true;
34375 return false;
34378 /* Return the cost of moving data of mode M between a
34379 register and memory. A value of 2 is the default; this cost is
34380 relative to those in `REGISTER_MOVE_COST'.
34382 This function is used extensively by register_move_cost that is used to
34383 build tables at startup. Make it inline in this case.
34384 When IN is 2, return maximum of in and out move cost.
34386 If moving between registers and memory is more expensive than
34387 between two registers, you should define this macro to express the
34388 relative cost.
34390 Also model the increased cost of moving QImode registers in
34391 non-Q_REGS classes. */
34393 static inline int
34394 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34395 int in)
34397 int cost;
34398 if (FLOAT_CLASS_P (regclass))
34400 int index;
34401 switch (mode)
34403 case SFmode:
34404 index = 0;
34405 break;
34406 case DFmode:
34407 index = 1;
34408 break;
34409 case XFmode:
34410 index = 2;
34411 break;
34412 default:
34413 return 100;
34415 if (in == 2)
34416 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34417 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34419 if (SSE_CLASS_P (regclass))
34421 int index;
34422 switch (GET_MODE_SIZE (mode))
34424 case 4:
34425 index = 0;
34426 break;
34427 case 8:
34428 index = 1;
34429 break;
34430 case 16:
34431 index = 2;
34432 break;
34433 default:
34434 return 100;
34436 if (in == 2)
34437 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34438 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34440 if (MMX_CLASS_P (regclass))
34442 int index;
34443 switch (GET_MODE_SIZE (mode))
34445 case 4:
34446 index = 0;
34447 break;
34448 case 8:
34449 index = 1;
34450 break;
34451 default:
34452 return 100;
34454 if (in == 2)
34455 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34456 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34458 switch (GET_MODE_SIZE (mode))
34460 case 1:
34461 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34463 if (!in)
34464 return ix86_cost->int_store[0];
34465 if (TARGET_PARTIAL_REG_DEPENDENCY
34466 && optimize_function_for_speed_p (cfun))
34467 cost = ix86_cost->movzbl_load;
34468 else
34469 cost = ix86_cost->int_load[0];
34470 if (in == 2)
34471 return MAX (cost, ix86_cost->int_store[0]);
34472 return cost;
34474 else
34476 if (in == 2)
34477 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34478 if (in)
34479 return ix86_cost->movzbl_load;
34480 else
34481 return ix86_cost->int_store[0] + 4;
34483 break;
34484 case 2:
34485 if (in == 2)
34486 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34487 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34488 default:
34489 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34490 if (mode == TFmode)
34491 mode = XFmode;
34492 if (in == 2)
34493 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34494 else if (in)
34495 cost = ix86_cost->int_load[2];
34496 else
34497 cost = ix86_cost->int_store[2];
34498 return (cost * (((int) GET_MODE_SIZE (mode)
34499 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34503 static int
34504 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34505 bool in)
34507 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34511 /* Return the cost of moving data from a register in class CLASS1 to
34512 one in class CLASS2.
34514 It is not required that the cost always equal 2 when FROM is the same as TO;
34515 on some machines it is expensive to move between registers if they are not
34516 general registers. */
34518 static int
34519 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34520 reg_class_t class2_i)
34522 enum reg_class class1 = (enum reg_class) class1_i;
34523 enum reg_class class2 = (enum reg_class) class2_i;
34525 /* In case we require secondary memory, compute the cost of the store
34526 followed by a load. In order to avoid bad register allocation choices, we need
34527 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34529 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34531 int cost = 1;
34533 cost += inline_memory_move_cost (mode, class1, 2);
34534 cost += inline_memory_move_cost (mode, class2, 2);
34536 /* In the case of copying from a general purpose register we may emit
34537 multiple stores followed by a single load, causing a memory size
34538 mismatch stall. Count this as an arbitrarily high cost of 20. */
34539 if (targetm.class_max_nregs (class1, mode)
34540 > targetm.class_max_nregs (class2, mode))
34541 cost += 20;
34543 /* In the case of FP/MMX moves, the registers actually overlap, and we
34544 have to switch modes in order to treat them differently. */
34545 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34546 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34547 cost += 20;
34549 return cost;
34552 /* Moves between SSE/MMX and integer unit are expensive. */
34553 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34554 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34556 /* ??? By keeping the returned value relatively high, we limit the number
34557 of moves between integer and MMX/SSE registers for all targets.
34558 Additionally, a high value prevents problems with x86_modes_tieable_p(),
34559 where integer modes in MMX/SSE registers are not tieable
34560 because of missing QImode and HImode moves to, from or between
34561 MMX/SSE registers. */
34562 return MAX (8, ix86_cost->mmxsse_to_integer);
34564 if (MAYBE_FLOAT_CLASS_P (class1))
34565 return ix86_cost->fp_move;
34566 if (MAYBE_SSE_CLASS_P (class1))
34567 return ix86_cost->sse_move;
34568 if (MAYBE_MMX_CLASS_P (class1))
34569 return ix86_cost->mmx_move;
34570 return 2;
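/* Example: when no secondary memory reload is required, an SImode move
   between SSE_REGS and GENERAL_REGS is charged
   MAX (8, ix86_cost->mmxsse_to_integer), while a move within
   GENERAL_REGS costs just 2.  */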
34573 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34574 MODE. */
34576 bool
34577 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34579 /* Flags and only flags can only hold CCmode values. */
34580 if (CC_REGNO_P (regno))
34581 return GET_MODE_CLASS (mode) == MODE_CC;
34582 if (GET_MODE_CLASS (mode) == MODE_CC
34583 || GET_MODE_CLASS (mode) == MODE_RANDOM
34584 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34585 return false;
34586 if (STACK_REGNO_P (regno))
34587 return VALID_FP_MODE_P (mode);
34588 if (SSE_REGNO_P (regno))
34590 /* We implement the move patterns for all vector modes into and
34591 out of SSE registers, even when no operation instructions
34592 are available. OImode move is available only when AVX is
34593 enabled. */
34594 return ((TARGET_AVX && mode == OImode)
34595 || VALID_AVX256_REG_MODE (mode)
34596 || VALID_SSE_REG_MODE (mode)
34597 || VALID_SSE2_REG_MODE (mode)
34598 || VALID_MMX_REG_MODE (mode)
34599 || VALID_MMX_REG_MODE_3DNOW (mode));
34601 if (MMX_REGNO_P (regno))
34603 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34604 so if the register is available at all, then we can move data of
34605 the given mode into or out of it. */
34606 return (VALID_MMX_REG_MODE (mode)
34607 || VALID_MMX_REG_MODE_3DNOW (mode));
34610 if (mode == QImode)
34612 /* Take care for QImode values - they can be in non-QI regs,
34613 but then they do cause partial register stalls. */
34614 if (ANY_QI_REGNO_P (regno))
34615 return true;
34616 if (!TARGET_PARTIAL_REG_STALL)
34617 return true;
34618 /* LRA checks if the hard register is OK for the given mode.
34619 QImode values can live in non-QI regs, so we allow all
34620 registers here. */
34621 if (lra_in_progress)
34622 return true;
34623 return !can_create_pseudo_p ();
34625 /* We handle both integer and floats in the general purpose registers. */
34626 else if (VALID_INT_MODE_P (mode))
34627 return true;
34628 else if (VALID_FP_MODE_P (mode))
34629 return true;
34630 else if (VALID_DFP_MODE_P (mode))
34631 return true;
34632 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34633 on to use that value in smaller contexts, this can easily force a
34634 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34635 supporting DImode, allow it. */
34636 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34637 return true;
34639 return false;
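/* Example: ix86_hard_regno_mode_ok (FLAGS_REG, CCZmode) is true because
   the flags register holds MODE_CC values, while
   ix86_hard_regno_mode_ok (FLAGS_REG, SImode) is false.  */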
34642 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34643 tieable integer mode. */
34645 static bool
34646 ix86_tieable_integer_mode_p (enum machine_mode mode)
34648 switch (mode)
34650 case HImode:
34651 case SImode:
34652 return true;
34654 case QImode:
34655 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34657 case DImode:
34658 return TARGET_64BIT;
34660 default:
34661 return false;
34665 /* Return true if MODE1 is accessible in a register that can hold MODE2
34666 without copying. That is, all register classes that can hold MODE2
34667 can also hold MODE1. */
34669 bool
34670 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34672 if (mode1 == mode2)
34673 return true;
34675 if (ix86_tieable_integer_mode_p (mode1)
34676 && ix86_tieable_integer_mode_p (mode2))
34677 return true;
34679 /* MODE2 being XFmode implies fp stack or general regs, which means we
34680 can tie any smaller floating point modes to it. Note that we do not
34681 tie this with TFmode. */
34682 if (mode2 == XFmode)
34683 return mode1 == SFmode || mode1 == DFmode;
34685 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34686 that we can tie it with SFmode. */
34687 if (mode2 == DFmode)
34688 return mode1 == SFmode;
34690 /* If MODE2 is only appropriate for an SSE register, then tie with
34691 any other mode acceptable to SSE registers. */
34692 if (GET_MODE_SIZE (mode2) == 32
34693 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34694 return (GET_MODE_SIZE (mode1) == 32
34695 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34696 if (GET_MODE_SIZE (mode2) == 16
34697 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34698 return (GET_MODE_SIZE (mode1) == 16
34699 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34701 /* If MODE2 is appropriate for an MMX register, then tie
34702 with any other mode acceptable to MMX registers. */
34703 if (GET_MODE_SIZE (mode2) == 8
34704 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34705 return (GET_MODE_SIZE (mode1) == 8
34706 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34708 return false;
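/* Example: V4SFmode and V2DImode are tieable whenever both are valid in
   SSE registers, since both are 16 bytes wide; SFmode ties with XFmode,
   but TFmode deliberately does not (see the comment above).  */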
34711 /* Return the cost of moving between two registers of mode MODE. */
34713 static int
34714 ix86_set_reg_reg_cost (enum machine_mode mode)
34716 unsigned int units = UNITS_PER_WORD;
34718 switch (GET_MODE_CLASS (mode))
34720 default:
34721 break;
34723 case MODE_CC:
34724 units = GET_MODE_SIZE (CCmode);
34725 break;
34727 case MODE_FLOAT:
34728 if ((TARGET_SSE && mode == TFmode)
34729 || (TARGET_80387 && mode == XFmode)
34730 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34731 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34732 units = GET_MODE_SIZE (mode);
34733 break;
34735 case MODE_COMPLEX_FLOAT:
34736 if ((TARGET_SSE && mode == TCmode)
34737 || (TARGET_80387 && mode == XCmode)
34738 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34739 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34740 units = GET_MODE_SIZE (mode);
34741 break;
34743 case MODE_VECTOR_INT:
34744 case MODE_VECTOR_FLOAT:
34745 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34746 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34747 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34748 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34749 units = GET_MODE_SIZE (mode);
34752 /* Return the cost of moving between two registers of mode MODE,
34753 assuming that the move will be in pieces of at most UNITS bytes. */
34754 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
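/* Example: with AVX enabled, a V8SFmode register-to-register copy moves
   all 32 bytes in one piece and is costed as COSTS_N_INSNS (1).  */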
34757 /* Compute a (partial) cost for rtx X. Return true if the complete
34758 cost has been computed, and false if subexpressions should be
34759 scanned. In either case, *TOTAL contains the cost result. */
34761 static bool
34762 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34763 bool speed)
34765 enum rtx_code code = (enum rtx_code) code_i;
34766 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34767 enum machine_mode mode = GET_MODE (x);
34768 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34770 switch (code)
34772 case SET:
34773 if (register_operand (SET_DEST (x), VOIDmode)
34774 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34776 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34777 return true;
34779 return false;
34781 case CONST_INT:
34782 case CONST:
34783 case LABEL_REF:
34784 case SYMBOL_REF:
34785 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34786 *total = 3;
34787 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34788 *total = 2;
34789 else if (flag_pic && SYMBOLIC_CONST (x)
34790 && (!TARGET_64BIT
34791 || (GET_CODE (x) != LABEL_REF
34792 && (GET_CODE (x) != SYMBOL_REF
34793 || !SYMBOL_REF_LOCAL_P (x)))))
34794 *total = 1;
34795 else
34796 *total = 0;
34797 return true;
34799 case CONST_DOUBLE:
34800 if (mode == VOIDmode)
34802 *total = 0;
34803 return true;
34805 switch (standard_80387_constant_p (x))
34807 case 1: /* 0.0 */
34808 *total = 1;
34809 return true;
34810 default: /* Other constants */
34811 *total = 2;
34812 return true;
34813 case 0:
34814 case -1:
34815 break;
34817 if (SSE_FLOAT_MODE_P (mode))
34819 case CONST_VECTOR:
34820 switch (standard_sse_constant_p (x))
34822 case 0:
34823 break;
34824 case 1: /* 0: xor eliminates false dependency */
34825 *total = 0;
34826 return true;
34827 default: /* -1: cmp contains false dependency */
34828 *total = 1;
34829 return true;
34832 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34833 it'll probably end up. Add a penalty for size. */
34834 *total = (COSTS_N_INSNS (1)
34835 + (flag_pic != 0 && !TARGET_64BIT)
34836 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34837 return true;
34839 case ZERO_EXTEND:
34840 /* The zero extension is often completely free on x86_64, so make
34841 it as cheap as possible. */
34842 if (TARGET_64BIT && mode == DImode
34843 && GET_MODE (XEXP (x, 0)) == SImode)
34844 *total = 1;
34845 else if (TARGET_ZERO_EXTEND_WITH_AND)
34846 *total = cost->add;
34847 else
34848 *total = cost->movzx;
34849 return false;
34851 case SIGN_EXTEND:
34852 *total = cost->movsx;
34853 return false;
34855 case ASHIFT:
34856 if (SCALAR_INT_MODE_P (mode)
34857 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34858 && CONST_INT_P (XEXP (x, 1)))
34860 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34861 if (value == 1)
34863 *total = cost->add;
34864 return false;
34866 if ((value == 2 || value == 3)
34867 && cost->lea <= cost->shift_const)
34869 *total = cost->lea;
34870 return false;
34873 /* FALLTHRU */
34875 case ROTATE:
34876 case ASHIFTRT:
34877 case LSHIFTRT:
34878 case ROTATERT:
34879 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34881 /* ??? Should be SSE vector operation cost. */
34882 /* At least for published AMD latencies, this really is the same
34883 as the latency for a simple fpu operation like fabs. */
34884 /* V*QImode is emulated with 1-11 insns. */
34885 if (mode == V16QImode || mode == V32QImode)
34887 int count = 11;
34888 if (TARGET_XOP && mode == V16QImode)
34890 /* For XOP we use vpshab, which requires a broadcast of the
34891 value to the variable shift insn. For constants this
34892 means a V16QI const in mem; even when we can perform the
34893 shift with one insn set the cost to prefer paddb. */
34894 if (CONSTANT_P (XEXP (x, 1)))
34896 *total = (cost->fabs
34897 + rtx_cost (XEXP (x, 0), code, 0, speed)
34898 + (speed ? 2 : COSTS_N_BYTES (16)));
34899 return true;
34901 count = 3;
34903 else if (TARGET_SSSE3)
34904 count = 7;
34905 *total = cost->fabs * count;
34907 else
34908 *total = cost->fabs;
34910 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34912 if (CONST_INT_P (XEXP (x, 1)))
34914 if (INTVAL (XEXP (x, 1)) > 32)
34915 *total = cost->shift_const + COSTS_N_INSNS (2);
34916 else
34917 *total = cost->shift_const * 2;
34919 else
34921 if (GET_CODE (XEXP (x, 1)) == AND)
34922 *total = cost->shift_var * 2;
34923 else
34924 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34927 else
34929 if (CONST_INT_P (XEXP (x, 1)))
34930 *total = cost->shift_const;
34931 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34932 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34934 /* Return the cost after shift-and truncation. */
34935 *total = cost->shift_var;
34936 return true;
34938 else
34939 *total = cost->shift_var;
34941 return false;
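/* Example: for a DImode shift on a 32-bit target (mode wider than
   UNITS_PER_WORD), a constant shift count above 32 is costed as
   shift_const + COSTS_N_INSNS (2), and a non-masked variable count as
   shift_var * 6 + COSTS_N_INSNS (2), matching the branches above.  */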
34943 case FMA:
34945 rtx sub;
34947 gcc_assert (FLOAT_MODE_P (mode));
34948 gcc_assert (TARGET_FMA || TARGET_FMA4);
34950 /* ??? SSE scalar/vector cost should be used here. */
34951 /* ??? Bald assumption that fma has the same cost as fmul. */
34952 *total = cost->fmul;
34953 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34955 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34956 sub = XEXP (x, 0);
34957 if (GET_CODE (sub) == NEG)
34958 sub = XEXP (sub, 0);
34959 *total += rtx_cost (sub, FMA, 0, speed);
34961 sub = XEXP (x, 2);
34962 if (GET_CODE (sub) == NEG)
34963 sub = XEXP (sub, 0);
34964 *total += rtx_cost (sub, FMA, 2, speed);
34965 return true;
34968 case MULT:
34969 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34971 /* ??? SSE scalar cost should be used here. */
34972 *total = cost->fmul;
34973 return false;
34975 else if (X87_FLOAT_MODE_P (mode))
34977 *total = cost->fmul;
34978 return false;
34980 else if (FLOAT_MODE_P (mode))
34982 /* ??? SSE vector cost should be used here. */
34983 *total = cost->fmul;
34984 return false;
34986 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34988 /* V*QImode is emulated with 7-13 insns. */
34989 if (mode == V16QImode || mode == V32QImode)
34991 int extra = 11;
34992 if (TARGET_XOP && mode == V16QImode)
34993 extra = 5;
34994 else if (TARGET_SSSE3)
34995 extra = 6;
34996 *total = cost->fmul * 2 + cost->fabs * extra;
34998 /* V*DImode is emulated with 5-8 insns. */
34999 else if (mode == V2DImode || mode == V4DImode)
35001 if (TARGET_XOP && mode == V2DImode)
35002 *total = cost->fmul * 2 + cost->fabs * 3;
35003 else
35004 *total = cost->fmul * 3 + cost->fabs * 5;
35006 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35007 insns, including two PMULUDQ. */
35008 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35009 *total = cost->fmul * 2 + cost->fabs * 5;
35010 else
35011 *total = cost->fmul;
35012 return false;
35014 else
35016 rtx op0 = XEXP (x, 0);
35017 rtx op1 = XEXP (x, 1);
35018 int nbits;
35019 if (CONST_INT_P (XEXP (x, 1)))
35021 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35022 for (nbits = 0; value != 0; value &= value - 1)
35023 nbits++;
35025 else
35026 /* This is arbitrary. */
35027 nbits = 7;
35029 /* Compute costs correctly for widening multiplication. */
35030 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35031 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35032 == GET_MODE_SIZE (mode))
35034 int is_mulwiden = 0;
35035 enum machine_mode inner_mode = GET_MODE (op0);
35037 if (GET_CODE (op0) == GET_CODE (op1))
35038 is_mulwiden = 1, op1 = XEXP (op1, 0);
35039 else if (CONST_INT_P (op1))
35041 if (GET_CODE (op0) == SIGN_EXTEND)
35042 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35043 == INTVAL (op1);
35044 else
35045 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35048 if (is_mulwiden)
35049 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35052 *total = (cost->mult_init[MODE_INDEX (mode)]
35053 + nbits * cost->mult_bit
35054 + rtx_cost (op0, outer_code, opno, speed)
35055 + rtx_cost (op1, outer_code, opno, speed));
35057 return true;
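/* Example: multiplying by the constant 10 (binary 1010) yields
   nbits == 2, so the cost is mult_init[MODE_INDEX (mode)]
   + 2 * mult_bit plus the rtx costs of both operands.  */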
35060 case DIV:
35061 case UDIV:
35062 case MOD:
35063 case UMOD:
35064 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35065 /* ??? SSE cost should be used here. */
35066 *total = cost->fdiv;
35067 else if (X87_FLOAT_MODE_P (mode))
35068 *total = cost->fdiv;
35069 else if (FLOAT_MODE_P (mode))
35070 /* ??? SSE vector cost should be used here. */
35071 *total = cost->fdiv;
35072 else
35073 *total = cost->divide[MODE_INDEX (mode)];
35074 return false;
35076 case PLUS:
35077 if (GET_MODE_CLASS (mode) == MODE_INT
35078 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35080 if (GET_CODE (XEXP (x, 0)) == PLUS
35081 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35082 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35083 && CONSTANT_P (XEXP (x, 1)))
35085 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35086 if (val == 2 || val == 4 || val == 8)
35088 *total = cost->lea;
35089 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35090 outer_code, opno, speed);
35091 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35092 outer_code, opno, speed);
35093 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35094 return true;
35097 else if (GET_CODE (XEXP (x, 0)) == MULT
35098 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35100 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35101 if (val == 2 || val == 4 || val == 8)
35103 *total = cost->lea;
35104 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35105 outer_code, opno, speed);
35106 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35107 return true;
35110 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35112 *total = cost->lea;
35113 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35114 outer_code, opno, speed);
35115 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35116 outer_code, opno, speed);
35117 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35118 return true;
35121 /* FALLTHRU */
35123 case MINUS:
35124 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35126 /* ??? SSE cost should be used here. */
35127 *total = cost->fadd;
35128 return false;
35130 else if (X87_FLOAT_MODE_P (mode))
35132 *total = cost->fadd;
35133 return false;
35135 else if (FLOAT_MODE_P (mode))
35137 /* ??? SSE vector cost should be used here. */
35138 *total = cost->fadd;
35139 return false;
35141 /* FALLTHRU */
35143 case AND:
35144 case IOR:
35145 case XOR:
35146 if (GET_MODE_CLASS (mode) == MODE_INT
35147 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35149 *total = (cost->add * 2
35150 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35151 << (GET_MODE (XEXP (x, 0)) != DImode))
35152 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35153 << (GET_MODE (XEXP (x, 1)) != DImode)));
35154 return true;
35156 /* FALLTHRU */
35158 case NEG:
35159 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35161 /* ??? SSE cost should be used here. */
35162 *total = cost->fchs;
35163 return false;
35165 else if (X87_FLOAT_MODE_P (mode))
35167 *total = cost->fchs;
35168 return false;
35170 else if (FLOAT_MODE_P (mode))
35172 /* ??? SSE vector cost should be used here. */
35173 *total = cost->fchs;
35174 return false;
35176 /* FALLTHRU */
35178 case NOT:
35179 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35181 /* ??? Should be SSE vector operation cost. */
35182 /* At least for published AMD latencies, this really is the same
35183 as the latency for a simple fpu operation like fabs. */
35184 *total = cost->fabs;
35186 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35187 *total = cost->add * 2;
35188 else
35189 *total = cost->add;
35190 return false;
35192 case COMPARE:
35193 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35194 && XEXP (XEXP (x, 0), 1) == const1_rtx
35195 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35196 && XEXP (x, 1) == const0_rtx)
35198 /* This kind of construct is implemented using test[bwl].
35199 Treat it as if we had an AND. */
35200 *total = (cost->add
35201 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35202 + rtx_cost (const1_rtx, outer_code, opno, speed));
35203 return true;
35205 return false;
35207 case FLOAT_EXTEND:
35208 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35209 *total = 0;
35210 return false;
35212 case ABS:
35213 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35214 /* ??? SSE cost should be used here. */
35215 *total = cost->fabs;
35216 else if (X87_FLOAT_MODE_P (mode))
35217 *total = cost->fabs;
35218 else if (FLOAT_MODE_P (mode))
35219 /* ??? SSE vector cost should be used here. */
35220 *total = cost->fabs;
35221 return false;
35223 case SQRT:
35224 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35225 /* ??? SSE cost should be used here. */
35226 *total = cost->fsqrt;
35227 else if (X87_FLOAT_MODE_P (mode))
35228 *total = cost->fsqrt;
35229 else if (FLOAT_MODE_P (mode))
35230 /* ??? SSE vector cost should be used here. */
35231 *total = cost->fsqrt;
35232 return false;
35234 case UNSPEC:
35235 if (XINT (x, 1) == UNSPEC_TP)
35236 *total = 0;
35237 return false;
35239 case VEC_SELECT:
35240 case VEC_CONCAT:
35241 case VEC_MERGE:
35242 case VEC_DUPLICATE:
35243 /* ??? Assume all of these vector manipulation patterns are
35244 recognizable. In which case they all pretty much have the
35245 same cost. */
35246 *total = cost->fabs;
35247 return true;
35249 default:
35250 return false;
35254 #if TARGET_MACHO
35256 static int current_machopic_label_num;
35258 /* Given a symbol name and its associated stub, write out the
35259 definition of the stub. */
35261 void
35262 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35264 unsigned int length;
35265 char *binder_name, *symbol_name, lazy_ptr_name[32];
35266 int label = ++current_machopic_label_num;
35268 /* For 64-bit we shouldn't get here. */
35269 gcc_assert (!TARGET_64BIT);
35271 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35272 symb = targetm.strip_name_encoding (symb);
35274 length = strlen (stub);
35275 binder_name = XALLOCAVEC (char, length + 32);
35276 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35278 length = strlen (symb);
35279 symbol_name = XALLOCAVEC (char, length + 32);
35280 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35282 sprintf (lazy_ptr_name, "L%d$lz", label);
35284 if (MACHOPIC_ATT_STUB)
35285 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35286 else if (MACHOPIC_PURE)
35287 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35288 else
35289 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35291 fprintf (file, "%s:\n", stub);
35292 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35294 if (MACHOPIC_ATT_STUB)
35296 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35298 else if (MACHOPIC_PURE)
35300 /* PIC stub. */
35301 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35302 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35303 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35304 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35305 label, lazy_ptr_name, label);
35306 fprintf (file, "\tjmp\t*%%ecx\n");
35308 else
35309 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35311 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35312 it needs no stub-binding-helper. */
35313 if (MACHOPIC_ATT_STUB)
35314 return;
35316 fprintf (file, "%s:\n", binder_name);
35318 if (MACHOPIC_PURE)
35320 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35321 fprintf (file, "\tpushl\t%%ecx\n");
35323 else
35324 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35326 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35328 /* N.B. Keep the correspondence of these
35329 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35330 old-pic/new-pic/non-pic stubs; altering this will break
35331 compatibility with existing dylibs. */
35332 if (MACHOPIC_PURE)
35334 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35335 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35337 else
35338 /* 16-byte -mdynamic-no-pic stub. */
35339 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35341 fprintf (file, "%s:\n", lazy_ptr_name);
35342 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35343 fprintf (file, ASM_LONG "%s\n", binder_name);
35345 #endif /* TARGET_MACHO */
35347 /* Order the registers for register allocator. */
35349 void
35350 x86_order_regs_for_local_alloc (void)
35352 int pos = 0;
35353 int i;
35355 /* First allocate the local general purpose registers. */
35356 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35357 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35358 reg_alloc_order [pos++] = i;
35360 /* Global general purpose registers. */
35361 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35362 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35363 reg_alloc_order [pos++] = i;
35365 /* x87 registers come first in case we are doing FP math
35366 using them. */
35367 if (!TARGET_SSE_MATH)
35368 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35369 reg_alloc_order [pos++] = i;
35371 /* SSE registers. */
35372 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35373 reg_alloc_order [pos++] = i;
35374 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35375 reg_alloc_order [pos++] = i;
35377 /* x87 registers. */
35378 if (TARGET_SSE_MATH)
35379 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35380 reg_alloc_order [pos++] = i;
35382 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35383 reg_alloc_order [pos++] = i;
35385 /* Initialize the rest of the array, as we do not allocate some registers
35386 at all. */
35387 while (pos < FIRST_PSEUDO_REGISTER)
35388 reg_alloc_order [pos++] = 0;
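/* Example: on ia32 the call-clobbered %eax, %ecx and %edx therefore come
   before the call-saved %ebx, %esi and %edi in reg_alloc_order.  */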
35391 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35392 in struct attribute_spec handler. */
35393 static tree
35394 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35395 tree args,
35396 int flags ATTRIBUTE_UNUSED,
35397 bool *no_add_attrs)
35399 if (TREE_CODE (*node) != FUNCTION_TYPE
35400 && TREE_CODE (*node) != METHOD_TYPE
35401 && TREE_CODE (*node) != FIELD_DECL
35402 && TREE_CODE (*node) != TYPE_DECL)
35404 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35405 name);
35406 *no_add_attrs = true;
35407 return NULL_TREE;
35409 if (TARGET_64BIT)
35411 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35412 name);
35413 *no_add_attrs = true;
35414 return NULL_TREE;
35416 if (is_attribute_p ("callee_pop_aggregate_return", name))
35418 tree cst;
35420 cst = TREE_VALUE (args);
35421 if (TREE_CODE (cst) != INTEGER_CST)
35423 warning (OPT_Wattributes,
35424 "%qE attribute requires an integer constant argument",
35425 name);
35426 *no_add_attrs = true;
35428 else if (compare_tree_int (cst, 0) != 0
35429 && compare_tree_int (cst, 1) != 0)
35431 warning (OPT_Wattributes,
35432 "argument to %qE attribute is neither zero, nor one",
35433 name);
35434 *no_add_attrs = true;
35437 return NULL_TREE;
35440 return NULL_TREE;
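/* Example use of the attribute checked above (32-bit only; the argument
   must be the integer constant 0 or 1):

     struct big { int a, b, c; };
     struct big __attribute__ ((callee_pop_aggregate_return (1)))
     make_big (void);
*/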
35443 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
35444 struct attribute_spec.handler. */
35445 static tree
35446 ix86_handle_abi_attribute (tree *node, tree name,
35447 tree args ATTRIBUTE_UNUSED,
35448 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35450 if (TREE_CODE (*node) != FUNCTION_TYPE
35451 && TREE_CODE (*node) != METHOD_TYPE
35452 && TREE_CODE (*node) != FIELD_DECL
35453 && TREE_CODE (*node) != TYPE_DECL)
35455 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35456 name);
35457 *no_add_attrs = true;
35458 return NULL_TREE;
35461 /* Can combine regparm with all attributes but fastcall. */
35462 if (is_attribute_p ("ms_abi", name))
35464 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35466 error ("ms_abi and sysv_abi attributes are not compatible");
35469 return NULL_TREE;
35471 else if (is_attribute_p ("sysv_abi", name))
35473 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35475 error ("ms_abi and sysv_abi attributes are not compatible");
35478 return NULL_TREE;
35481 return NULL_TREE;
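/* Example use of the attributes checked above:

     void f (void) __attribute__ ((ms_abi));
     void g (void) __attribute__ ((sysv_abi));

   Putting both on the same function type is rejected with the
   "not compatible" error above.  */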
35484 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35485 struct attribute_spec.handler. */
35486 static tree
35487 ix86_handle_struct_attribute (tree *node, tree name,
35488 tree args ATTRIBUTE_UNUSED,
35489 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35491 tree *type = NULL;
35492 if (DECL_P (*node))
35494 if (TREE_CODE (*node) == TYPE_DECL)
35495 type = &TREE_TYPE (*node);
35497 else
35498 type = node;
35500 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35502 warning (OPT_Wattributes, "%qE attribute ignored",
35503 name);
35504 *no_add_attrs = true;
35507 else if ((is_attribute_p ("ms_struct", name)
35508 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35509 || ((is_attribute_p ("gcc_struct", name)
35510 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35512 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35513 name);
35514 *no_add_attrs = true;
35517 return NULL_TREE;
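/* Example use of the attributes checked above:

     struct __attribute__ ((ms_struct)) s1 { char c; int i : 7; };
     struct __attribute__ ((gcc_struct)) s2 { char c; int i : 7; };

   Combining ms_struct and gcc_struct on one type gets the
   "incompatible attribute" warning above.  */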
35520 static tree
35521 ix86_handle_fndecl_attribute (tree *node, tree name,
35522 tree args ATTRIBUTE_UNUSED,
35523 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35525 if (TREE_CODE (*node) != FUNCTION_DECL)
35527 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35528 name);
35529 *no_add_attrs = true;
35531 return NULL_TREE;
35534 static bool
35535 ix86_ms_bitfield_layout_p (const_tree record_type)
35537 return ((TARGET_MS_BITFIELD_LAYOUT
35538 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35539 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35542 /* Returns an expression indicating where the this parameter is
35543 located on entry to the FUNCTION. */
35545 static rtx
35546 x86_this_parameter (tree function)
35548 tree type = TREE_TYPE (function);
35549 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35550 int nregs;
35552 if (TARGET_64BIT)
35554 const int *parm_regs;
35556 if (ix86_function_type_abi (type) == MS_ABI)
35557 parm_regs = x86_64_ms_abi_int_parameter_registers;
35558 else
35559 parm_regs = x86_64_int_parameter_registers;
35560 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35563 nregs = ix86_function_regparm (type, function);
35565 if (nregs > 0 && !stdarg_p (type))
35567 int regno;
35568 unsigned int ccvt = ix86_get_callcvt (type);
35570 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35571 regno = aggr ? DX_REG : CX_REG;
35572 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35574 regno = CX_REG;
35575 if (aggr)
35576 return gen_rtx_MEM (SImode,
35577 plus_constant (Pmode, stack_pointer_rtx, 4));
35579 else
35581 regno = AX_REG;
35582 if (aggr)
35584 regno = DX_REG;
35585 if (nregs == 1)
35586 return gen_rtx_MEM (SImode,
35587 plus_constant (Pmode,
35588 stack_pointer_rtx, 4));
35591 return gen_rtx_REG (SImode, regno);
35594 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35595 aggr ? 8 : 4));
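/* Example: for a 32-bit fastcall method that returns an aggregate in
   memory, the code above yields %edx (DX_REG) for `this'; for an
   ordinary stdarg method with no register parameters it yields the
   stack slot at 4(%esp).  */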
35598 /* Determine whether x86_output_mi_thunk can succeed. */
35600 static bool
35601 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35602 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35603 HOST_WIDE_INT vcall_offset, const_tree function)
35605 /* 64-bit can handle anything. */
35606 if (TARGET_64BIT)
35607 return true;
35609 /* For 32-bit, everything's fine if we have one free register. */
35610 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35611 return true;
35613 /* Need a free register for vcall_offset. */
35614 if (vcall_offset)
35615 return false;
35617 /* Need a free register for GOT references. */
35618 if (flag_pic && !targetm.binds_local_p (function))
35619 return false;
35621 /* Otherwise ok. */
35622 return true;
35625 /* Output the assembler code for a thunk function. THUNK_DECL is the
35626 declaration for the thunk function itself, FUNCTION is the decl for
35627 the target function. DELTA is an immediate constant offset to be
35628 added to THIS. If VCALL_OFFSET is nonzero, the word at
35629 *(*this + vcall_offset) should be added to THIS. */
35631 static void
35632 x86_output_mi_thunk (FILE *file,
35633 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35634 HOST_WIDE_INT vcall_offset, tree function)
35636 rtx this_param = x86_this_parameter (function);
35637 rtx this_reg, tmp, fnaddr;
35638 unsigned int tmp_regno;
35640 if (TARGET_64BIT)
35641 tmp_regno = R10_REG;
35642 else
35644 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35645 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35646 tmp_regno = AX_REG;
35647 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35648 tmp_regno = DX_REG;
35649 else
35650 tmp_regno = CX_REG;
35653 emit_note (NOTE_INSN_PROLOGUE_END);
35655 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35656 pull it in now and let DELTA benefit. */
35657 if (REG_P (this_param))
35658 this_reg = this_param;
35659 else if (vcall_offset)
35661 /* Put the this parameter into %eax. */
35662 this_reg = gen_rtx_REG (Pmode, AX_REG);
35663 emit_move_insn (this_reg, this_param);
35665 else
35666 this_reg = NULL_RTX;
35668 /* Adjust the this parameter by a fixed constant. */
35669 if (delta)
35671 rtx delta_rtx = GEN_INT (delta);
35672 rtx delta_dst = this_reg ? this_reg : this_param;
35674 if (TARGET_64BIT)
35676 if (!x86_64_general_operand (delta_rtx, Pmode))
35678 tmp = gen_rtx_REG (Pmode, tmp_regno);
35679 emit_move_insn (tmp, delta_rtx);
35680 delta_rtx = tmp;
35684 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35687 /* Adjust the this parameter by a value stored in the vtable. */
35688 if (vcall_offset)
35690 rtx vcall_addr, vcall_mem, this_mem;
35692 tmp = gen_rtx_REG (Pmode, tmp_regno);
35694 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35695 if (Pmode != ptr_mode)
35696 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35697 emit_move_insn (tmp, this_mem);
35699 /* Adjust the this parameter. */
35700 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35701 if (TARGET_64BIT
35702 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35704 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35705 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35706 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35709 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35710 if (Pmode != ptr_mode)
35711 emit_insn (gen_addsi_1_zext (this_reg,
35712 gen_rtx_REG (ptr_mode,
35713 REGNO (this_reg)),
35714 vcall_mem));
35715 else
35716 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35719 /* If necessary, drop THIS back to its stack slot. */
35720 if (this_reg && this_reg != this_param)
35721 emit_move_insn (this_param, this_reg);
35723 fnaddr = XEXP (DECL_RTL (function), 0);
35724 if (TARGET_64BIT)
35726 if (!flag_pic || targetm.binds_local_p (function)
35727 || TARGET_PECOFF)
35729 else
35731 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35732 tmp = gen_rtx_CONST (Pmode, tmp);
35733 fnaddr = gen_rtx_MEM (Pmode, tmp);
35736 else
35738 if (!flag_pic || targetm.binds_local_p (function))
35740 #if TARGET_MACHO
35741 else if (TARGET_MACHO)
35743 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35744 fnaddr = XEXP (fnaddr, 0);
35746 #endif /* TARGET_MACHO */
35747 else
35749 tmp = gen_rtx_REG (Pmode, CX_REG);
35750 output_set_got (tmp, NULL_RTX);
35752 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35753 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35754 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35758 /* Our sibling call patterns do not allow memories, because we have no
35759 predicate that can distinguish between frame and non-frame memory.
35760 For our purposes here, we can get away with (ab)using a jump pattern,
35761 because we're going to do no optimization. */
35762 if (MEM_P (fnaddr))
35763 emit_jump_insn (gen_indirect_jump (fnaddr));
35764 else
35766 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35767 fnaddr = legitimize_pic_address (fnaddr,
35768 gen_rtx_REG (Pmode, tmp_regno));
35770 if (!sibcall_insn_operand (fnaddr, word_mode))
35772 tmp = gen_rtx_REG (word_mode, tmp_regno);
35773 if (GET_MODE (fnaddr) != word_mode)
35774 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35775 emit_move_insn (tmp, fnaddr);
35776 fnaddr = tmp;
35779 tmp = gen_rtx_MEM (QImode, fnaddr);
35780 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35781 tmp = emit_call_insn (tmp);
35782 SIBLING_CALL_P (tmp) = 1;
35784 emit_barrier ();
35786 /* Emit just enough of rest_of_compilation to get the insns emitted.
35787 Note that use_thunk calls assemble_start_function et al. */
35788 tmp = get_insns ();
35789 shorten_branches (tmp);
35790 final_start_function (tmp, file, 1);
35791 final (tmp, file, 1);
35792 final_end_function ();
35795 static void
35796 x86_file_start (void)
35798 default_file_start ();
35799 #if TARGET_MACHO
35800 darwin_file_start ();
35801 #endif
35802 if (X86_FILE_START_VERSION_DIRECTIVE)
35803 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35804 if (X86_FILE_START_FLTUSED)
35805 fputs ("\t.global\t__fltused\n", asm_out_file);
35806 if (ix86_asm_dialect == ASM_INTEL)
35807 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35810 int
35811 x86_field_alignment (tree field, int computed)
35813 enum machine_mode mode;
35814 tree type = TREE_TYPE (field);
35816 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35817 return computed;
35818 mode = TYPE_MODE (strip_array_types (type));
35819 if (mode == DFmode || mode == DCmode
35820 || GET_MODE_CLASS (mode) == MODE_INT
35821 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35822 return MIN (32, computed);
35823 return computed;
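/* Example: on ia32 without -malign-double, a structure field of type
   double (DFmode) is limited to 32-bit alignment by the code above,
   whereas TARGET_64BIT keeps the natural alignment.  */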
35826 /* Output assembler code to FILE to increment profiler label # LABELNO
35827 for profiling a function entry. */
35828 void
35829 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35831 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35832 : MCOUNT_NAME);
35834 if (TARGET_64BIT)
35836 #ifndef NO_PROFILE_COUNTERS
35837 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35838 #endif
35840 if (!TARGET_PECOFF && flag_pic)
35841 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35842 else
35843 fprintf (file, "\tcall\t%s\n", mcount_name);
35845 else if (flag_pic)
35847 #ifndef NO_PROFILE_COUNTERS
35848 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35849 LPREFIX, labelno);
35850 #endif
35851 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35853 else
35855 #ifndef NO_PROFILE_COUNTERS
35856 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35857 LPREFIX, labelno);
35858 #endif
35859 fprintf (file, "\tcall\t%s\n", mcount_name);
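/* Example of the output: with -pg -fPIC on x86_64 the call above is
     call *mcount_name@GOTPCREL(%rip)
   (with the configured mcount_name substituted), while non-PIC ia32
   emits a direct call to the profiling routine.  */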
35863 /* We don't have exact information about the insn sizes, but we may assume
35864 quite safely that we are informed about all 1 byte insns and memory
35865 address sizes. This is enough to eliminate unnecessary padding in
35866 99% of cases. */
35868 static int
35869 min_insn_size (rtx insn)
35871 int l = 0, len;
35873 if (!INSN_P (insn) || !active_insn_p (insn))
35874 return 0;
35876 /* Discard alignments we've emitted and jump instructions. */
35877 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35878 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35879 return 0;
35881 /* Important case - calls are always 5 bytes.
35882 It is common to have many calls in a row. */
35883 if (CALL_P (insn)
35884 && symbolic_reference_mentioned_p (PATTERN (insn))
35885 && !SIBLING_CALL_P (insn))
35886 return 5;
35887 len = get_attr_length (insn);
35888 if (len <= 1)
35889 return 1;
35891 /* For normal instructions we rely on get_attr_length being exact,
35892 with a few exceptions. */
35893 if (!JUMP_P (insn))
35895 enum attr_type type = get_attr_type (insn);
35897 switch (type)
35899 case TYPE_MULTI:
35900 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35901 || asm_noperands (PATTERN (insn)) >= 0)
35902 return 0;
35903 break;
35904 case TYPE_OTHER:
35905 case TYPE_FCMP:
35906 break;
35907 default:
35908 /* Otherwise trust get_attr_length. */
35909 return len;
35912 l = get_attr_length_address (insn);
35913 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35914 l = 4;
35916 if (l)
35917 return 1+l;
35918 else
35919 return 2;
35922 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35924 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
35925 window. */
35927 static void
35928 ix86_avoid_jump_mispredicts (void)
35930 rtx insn, start = get_insns ();
35931 int nbytes = 0, njumps = 0;
35932 int isjump = 0;
35934 /* Look for all minimal intervals of instructions containing 4 jumps.
35935 The intervals are bounded by START and INSN. NBYTES is the total
35936 size of instructions in the interval including INSN and not including
35937 START. When NBYTES is smaller than 16 bytes, it is possible
35938 that the end of START and INSN end up in the same 16 byte page.
35940 The smallest offset in the page INSN can start is the case where START
35941 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
35942 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
35944 for (insn = start; insn; insn = NEXT_INSN (insn))
35946 int min_size;
35948 if (LABEL_P (insn))
35950 int align = label_to_alignment (insn);
35951 int max_skip = label_to_max_skip (insn);
35953 if (max_skip > 15)
35954 max_skip = 15;
35955 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35956 already in the current 16 byte page, because otherwise
35957 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35958 bytes to reach 16 byte boundary. */
35959 if (align <= 0
35960 || (align <= 3 && max_skip != (1 << align) - 1))
35961 max_skip = 0;
35962 if (dump_file)
35963 fprintf (dump_file, "Label %i with max_skip %i\n",
35964 INSN_UID (insn), max_skip);
35965 if (max_skip)
35967 while (nbytes + max_skip >= 16)
35969 start = NEXT_INSN (start);
35970 if (JUMP_P (start) || CALL_P (start))
35971 njumps--, isjump = 1;
35972 else
35973 isjump = 0;
35974 nbytes -= min_insn_size (start);
35977 continue;
35980 min_size = min_insn_size (insn);
35981 nbytes += min_size;
35982 if (dump_file)
35983 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35984 INSN_UID (insn), min_size);
35985 if (JUMP_P (insn) || CALL_P (insn))
35986 njumps++;
35987 else
35988 continue;
35990 while (njumps > 3)
35992 start = NEXT_INSN (start);
35993 if (JUMP_P (start) || CALL_P (start))
35994 njumps--, isjump = 1;
35995 else
35996 isjump = 0;
35997 nbytes -= min_insn_size (start);
35999 gcc_assert (njumps >= 0);
36000 if (dump_file)
36001 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36002 INSN_UID (start), INSN_UID (insn), nbytes);
36004 if (njumps == 3 && isjump && nbytes < 16)
36006 int padsize = 15 - nbytes + min_insn_size (insn);
36008 if (dump_file)
36009 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36010 INSN_UID (insn), padsize);
36011 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36015 #endif
36017 /* AMD Athlon works faster
36018 when RET is not the destination of a conditional jump or directly preceded
36019 by another jump instruction. We avoid the penalty by inserting a NOP just
36020 before the RET instruction in such cases. */
36021 static void
36022 ix86_pad_returns (void)
36024 edge e;
36025 edge_iterator ei;
36027 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36029 basic_block bb = e->src;
36030 rtx ret = BB_END (bb);
36031 rtx prev;
36032 bool replace = false;
36034 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36035 || optimize_bb_for_size_p (bb))
36036 continue;
36037 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36038 if (active_insn_p (prev) || LABEL_P (prev))
36039 break;
36040 if (prev && LABEL_P (prev))
36042 edge e;
36043 edge_iterator ei;
36045 FOR_EACH_EDGE (e, ei, bb->preds)
36046 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36047 && !(e->flags & EDGE_FALLTHRU))
36049 replace = true;
36050 break;
36053 if (!replace)
36055 prev = prev_active_insn (ret);
36056 if (prev
36057 && ((JUMP_P (prev) && any_condjump_p (prev))
36058 || CALL_P (prev)))
36059 replace = true;
36060 /* Empty functions get a branch mispredict even when
36061 the jump destination is not visible to us. */
36062 if (!prev && !optimize_function_for_size_p (cfun))
36063 replace = true;
36065 if (replace)
36067 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36068 delete_insn (ret);
36073 /* Count the minimum number of instructions in BB. Return 4 if the
36074 number of instructions >= 4. */
36076 static int
36077 ix86_count_insn_bb (basic_block bb)
36079 rtx insn;
36080 int insn_count = 0;
36082 /* Count number of instructions in this block. Return 4 if the number
36083 of instructions >= 4. */
36084 FOR_BB_INSNS (bb, insn)
36086 /* This only happens in exit blocks. */
36087 if (JUMP_P (insn)
36088 && ANY_RETURN_P (PATTERN (insn)))
36089 break;
36091 if (NONDEBUG_INSN_P (insn)
36092 && GET_CODE (PATTERN (insn)) != USE
36093 && GET_CODE (PATTERN (insn)) != CLOBBER)
36095 insn_count++;
36096 if (insn_count >= 4)
36097 return insn_count;
36101 return insn_count;
36105 /* Count the minimum number of instructions in code path in BB.
36106 Return 4 if the number of instructions >= 4. */
36108 static int
36109 ix86_count_insn (basic_block bb)
36111 edge e;
36112 edge_iterator ei;
36113 int min_prev_count;
36115 /* Only bother counting instructions along paths with no
36116 more than 2 basic blocks between entry and exit. Given
36117 that BB has an edge to exit, determine if a predecessor
36118 of BB has an edge from entry. If so, compute the number
36119 of instructions in the predecessor block. If there
36120 happen to be multiple such blocks, compute the minimum. */
36121 min_prev_count = 4;
36122 FOR_EACH_EDGE (e, ei, bb->preds)
36124 edge prev_e;
36125 edge_iterator prev_ei;
36127 if (e->src == ENTRY_BLOCK_PTR)
36129 min_prev_count = 0;
36130 break;
36132 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36134 if (prev_e->src == ENTRY_BLOCK_PTR)
36136 int count = ix86_count_insn_bb (e->src);
36137 if (count < min_prev_count)
36138 min_prev_count = count;
36139 break;
36144 if (min_prev_count < 4)
36145 min_prev_count += ix86_count_insn_bb (bb);
36147 return min_prev_count;
36150 /* Pad short function to 4 instructions. */
36152 static void
36153 ix86_pad_short_function (void)
36155 edge e;
36156 edge_iterator ei;
36158 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36160 rtx ret = BB_END (e->src);
36161 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36163 int insn_count = ix86_count_insn (e->src);
36165 /* Pad short function. */
36166 if (insn_count < 4)
36168 rtx insn = ret;
36170 /* Find epilogue. */
36171 while (insn
36172 && (!NOTE_P (insn)
36173 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36174 insn = PREV_INSN (insn);
36176 if (!insn)
36177 insn = ret;
36179 /* Two NOPs count as one instruction. */
36180 insn_count = 2 * (4 - insn_count);
36181 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
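/* Example: if the only path to this return contains two real
   instructions, insn_count becomes 2 * (4 - 2) == 4 and four NOPs are
   emitted before the epilogue, which count as the two missing
   instructions.  */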
36187 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36188 the epilogue, the Windows system unwinder will apply epilogue logic and
36189 produce incorrect offsets. This can be avoided by adding a nop between
36190 the last insn that can throw and the first insn of the epilogue. */
36192 static void
36193 ix86_seh_fixup_eh_fallthru (void)
36195 edge e;
36196 edge_iterator ei;
36198 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36200 rtx insn, next;
36202 /* Find the beginning of the epilogue. */
36203 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36204 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36205 break;
36206 if (insn == NULL)
36207 continue;
36209 /* We only care about preceding insns that can throw. */
36210 insn = prev_active_insn (insn);
36211 if (insn == NULL || !can_throw_internal (insn))
36212 continue;
36214 /* Do not separate calls from their debug information. */
36215 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36216 if (NOTE_P (next)
36217 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36218 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36219 insn = next;
36220 else
36221 break;
36223 emit_insn_after (gen_nops (const1_rtx), insn);
36227 /* Implement machine specific optimizations. We implement padding of returns
36228 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
36229 static void
36230 ix86_reorg (void)
36232 /* We are freeing block_for_insn in the toplev to keep compatibility
36233 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36234 compute_bb_for_insn ();
36236 if (TARGET_SEH && current_function_has_exception_handlers ())
36237 ix86_seh_fixup_eh_fallthru ();
36239 if (optimize && optimize_function_for_speed_p (cfun))
36241 if (TARGET_PAD_SHORT_FUNCTION)
36242 ix86_pad_short_function ();
36243 else if (TARGET_PAD_RETURNS)
36244 ix86_pad_returns ();
36245 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36246 if (TARGET_FOUR_JUMP_LIMIT)
36247 ix86_avoid_jump_mispredicts ();
36248 #endif
36252 /* Return nonzero when a QImode register that must be represented via a REX
36253 prefix is used. */
36254 bool
36255 x86_extended_QIreg_mentioned_p (rtx insn)
36257 int i;
36258 extract_insn_cached (insn);
36259 for (i = 0; i < recog_data.n_operands; i++)
36260 if (GENERAL_REG_P (recog_data.operand[i])
36261 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36262 return true;
36263 return false;
36266 /* Return nonzero when P points to a register encoded via a REX prefix.
36267 Called via for_each_rtx. */
36268 static int
36269 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36271 unsigned int regno;
36272 if (!REG_P (*p))
36273 return 0;
36274 regno = REGNO (*p);
36275 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36278 /* Return true when INSN mentions a register that must be encoded using a
36279 REX prefix. */
36280 bool
36281 x86_extended_reg_mentioned_p (rtx insn)
36283 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36284 extended_reg_mentioned_1, NULL);
36287 /* If profitable, negate (without causing overflow) the integer constant
36288 of mode MODE at location LOC. Return true in this case. */
36289 bool
36290 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36292 HOST_WIDE_INT val;
36294 if (!CONST_INT_P (*loc))
36295 return false;
36297 switch (mode)
36299 case DImode:
36300 /* DImode x86_64 constants must fit in 32 bits. */
36301 gcc_assert (x86_64_immediate_operand (*loc, mode));
36303 mode = SImode;
36304 break;
36306 case SImode:
36307 case HImode:
36308 case QImode:
36309 break;
36311 default:
36312 gcc_unreachable ();
36315 /* Avoid overflows. */
36316 if (mode_signbit_p (mode, *loc))
36317 return false;
36319 val = INTVAL (*loc);
36321 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36322 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
36323 if ((val < 0 && val != -128)
36324 || val == 128)
36326 *loc = GEN_INT (-val);
36327 return true;
36330 return false;
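/* Example: for (plus (reg) (const_int -4)) the constant is rewritten to
   4 so that "subl $4,%eax" can be emitted instead of "addl $-4,%eax";
   conversely 128 is rewritten to -128, which has the shorter encoding.  */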
36333 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36334 optabs would emit if we didn't have TFmode patterns. */
36336 void
36337 x86_emit_floatuns (rtx operands[2])
36339 rtx neglab, donelab, i0, i1, f0, in, out;
36340 enum machine_mode mode, inmode;
36342 inmode = GET_MODE (operands[1]);
36343 gcc_assert (inmode == SImode || inmode == DImode);
36345 out = operands[0];
36346 in = force_reg (inmode, operands[1]);
36347 mode = GET_MODE (out);
36348 neglab = gen_label_rtx ();
36349 donelab = gen_label_rtx ();
36350 f0 = gen_reg_rtx (mode);
36352 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36354 expand_float (out, in, 0);
36356 emit_jump_insn (gen_jump (donelab));
36357 emit_barrier ();
36359 emit_label (neglab);
36361 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36362 1, OPTAB_DIRECT);
36363 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36364 1, OPTAB_DIRECT);
36365 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36367 expand_float (f0, i0, 0);
36369 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36371 emit_label (donelab);
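/* The RTL emitted above implements the standard unsigned-to-FP idiom.
   A rough C sketch for a DImode input:

     if ((long long) u >= 0)
       result = (double) (long long) u;
     else
       {
         unsigned long long i = (u >> 1) | (u & 1);
         result = (double) (long long) i;
         result = result + result;
       }

   The low bit is folded back in before halving so that the final
   doubling rounds correctly.  */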
36374 /* AVX2 does support 32-byte integer vector operations,
36375 thus the longest vector we are faced with is V32QImode. */
36376 #define MAX_VECT_LEN 32
36378 struct expand_vec_perm_d
36380 rtx target, op0, op1;
36381 unsigned char perm[MAX_VECT_LEN];
36382 enum machine_mode vmode;
36383 unsigned char nelt;
36384 bool one_operand_p;
36385 bool testing_p;
36388 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36389 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36390 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36392 /* Get a vector mode of the same size as the original but with elements
36393 twice as wide. This is only guaranteed to apply to integral vectors. */
36395 static inline enum machine_mode
36396 get_mode_wider_vector (enum machine_mode o)
36398 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36399 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36400 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36401 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36402 return n;
36405 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36406 with all elements equal to VAR. Return true if successful. */
36408 static bool
36409 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36410 rtx target, rtx val)
36412 bool ok;
36414 switch (mode)
36416 case V2SImode:
36417 case V2SFmode:
36418 if (!mmx_ok)
36419 return false;
36420 /* FALLTHRU */
36422 case V4DFmode:
36423 case V4DImode:
36424 case V8SFmode:
36425 case V8SImode:
36426 case V2DFmode:
36427 case V2DImode:
36428 case V4SFmode:
36429 case V4SImode:
36431 rtx insn, dup;
36433 /* First attempt to recognize VAL as-is. */
36434 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36435 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36436 if (recog_memoized (insn) < 0)
36438 rtx seq;
36439 /* If that fails, force VAL into a register. */
36441 start_sequence ();
36442 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36443 seq = get_insns ();
36444 end_sequence ();
36445 if (seq)
36446 emit_insn_before (seq, insn);
36448 ok = recog_memoized (insn) >= 0;
36449 gcc_assert (ok);
36452 return true;
36454 case V4HImode:
36455 if (!mmx_ok)
36456 return false;
36457 if (TARGET_SSE || TARGET_3DNOW_A)
36459 rtx x;
36461 val = gen_lowpart (SImode, val);
36462 x = gen_rtx_TRUNCATE (HImode, val);
36463 x = gen_rtx_VEC_DUPLICATE (mode, x);
36464 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36465 return true;
36467 goto widen;
36469 case V8QImode:
36470 if (!mmx_ok)
36471 return false;
36472 goto widen;
36474 case V8HImode:
36475 if (TARGET_SSE2)
36477 struct expand_vec_perm_d dperm;
36478 rtx tmp1, tmp2;
36480 permute:
36481 memset (&dperm, 0, sizeof (dperm));
36482 dperm.target = target;
36483 dperm.vmode = mode;
36484 dperm.nelt = GET_MODE_NUNITS (mode);
36485 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36486 dperm.one_operand_p = true;
36488 /* Extend to SImode using a paradoxical SUBREG. */
36489 tmp1 = gen_reg_rtx (SImode);
36490 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36492 /* Insert the SImode value as low element of a V4SImode vector. */
36493 tmp2 = gen_lowpart (V4SImode, dperm.op0);
36494 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36496 ok = (expand_vec_perm_1 (&dperm)
36497 || expand_vec_perm_broadcast_1 (&dperm));
36498 gcc_assert (ok);
36499 return ok;
36501 goto widen;
36503 case V16QImode:
36504 if (TARGET_SSE2)
36505 goto permute;
36506 goto widen;
36508 widen:
36509 /* Replicate the value once into the next wider mode and recurse. */
36511 enum machine_mode smode, wsmode, wvmode;
36512 rtx x;
36514 smode = GET_MODE_INNER (mode);
36515 wvmode = get_mode_wider_vector (mode);
36516 wsmode = GET_MODE_INNER (wvmode);
36518 val = convert_modes (wsmode, smode, val, true);
36519 x = expand_simple_binop (wsmode, ASHIFT, val,
36520 GEN_INT (GET_MODE_BITSIZE (smode)),
36521 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36522 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36524 x = gen_lowpart (wvmode, target);
36525 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36526 gcc_assert (ok);
36527 return ok;
36530 case V16HImode:
36531 case V32QImode:
36533 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36534 rtx x = gen_reg_rtx (hvmode);
36536 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36537 gcc_assert (ok);
36539 x = gen_rtx_VEC_CONCAT (mode, x, x);
36540 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36542 return true;
36544 default:
36545 return false;
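/* Example of the widen path above: to broadcast a QImode value into
   V8QImode, the value is first replicated into HImode as
   val | (val << 8), and the function recurses with V4HImode, repeating
   until a directly supported duplicate pattern is reached.  */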
36549 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36550 whose ONE_VAR element is VAR, and other elements are zero. Return true
36551 if successful. */
36553 static bool
36554 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36555 rtx target, rtx var, int one_var)
36557 enum machine_mode vsimode;
36558 rtx new_target;
36559 rtx x, tmp;
36560 bool use_vector_set = false;
36562 switch (mode)
36564 case V2DImode:
36565 /* For SSE4.1, we normally use vector set. But if the second
36566 element is zero and inter-unit moves are OK, we use movq
36567 instead. */
36568 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36569 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36570 && one_var == 0));
36571 break;
36572 case V16QImode:
36573 case V4SImode:
36574 case V4SFmode:
36575 use_vector_set = TARGET_SSE4_1;
36576 break;
36577 case V8HImode:
36578 use_vector_set = TARGET_SSE2;
36579 break;
36580 case V4HImode:
36581 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36582 break;
36583 case V32QImode:
36584 case V16HImode:
36585 case V8SImode:
36586 case V8SFmode:
36587 case V4DFmode:
36588 use_vector_set = TARGET_AVX;
36589 break;
36590 case V4DImode:
36591 /* Use ix86_expand_vector_set in 64bit mode only. */
36592 use_vector_set = TARGET_AVX && TARGET_64BIT;
36593 break;
36594 default:
36595 break;
36598 if (use_vector_set)
36600 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36601 var = force_reg (GET_MODE_INNER (mode), var);
36602 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36603 return true;
36606 switch (mode)
36608 case V2SFmode:
36609 case V2SImode:
36610 if (!mmx_ok)
36611 return false;
36612 /* FALLTHRU */
36614 case V2DFmode:
36615 case V2DImode:
36616 if (one_var != 0)
36617 return false;
36618 var = force_reg (GET_MODE_INNER (mode), var);
36619 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36620 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36621 return true;
36623 case V4SFmode:
36624 case V4SImode:
36625 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36626 new_target = gen_reg_rtx (mode);
36627 else
36628 new_target = target;
36629 var = force_reg (GET_MODE_INNER (mode), var);
36630 x = gen_rtx_VEC_DUPLICATE (mode, var);
36631 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36632 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36633 if (one_var != 0)
36635 /* We need to shuffle the value to the correct position, so
36636 create a new pseudo to store the intermediate result. */
36638 /* With SSE2, we can use the integer shuffle insns. */
36639 if (mode != V4SFmode && TARGET_SSE2)
36641 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36642 const1_rtx,
36643 GEN_INT (one_var == 1 ? 0 : 1),
36644 GEN_INT (one_var == 2 ? 0 : 1),
36645 GEN_INT (one_var == 3 ? 0 : 1)));
36646 if (target != new_target)
36647 emit_move_insn (target, new_target);
36648 return true;
36651 /* Otherwise convert the intermediate result to V4SFmode and
36652 use the SSE1 shuffle instructions. */
36653 if (mode != V4SFmode)
36655 tmp = gen_reg_rtx (V4SFmode);
36656 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36658 else
36659 tmp = new_target;
36661 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36662 const1_rtx,
36663 GEN_INT (one_var == 1 ? 0 : 1),
36664 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36665 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36667 if (mode != V4SFmode)
36668 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36669 else if (tmp != target)
36670 emit_move_insn (target, tmp);
36672 else if (target != new_target)
36673 emit_move_insn (target, new_target);
36674 return true;
36676 case V8HImode:
36677 case V16QImode:
36678 vsimode = V4SImode;
36679 goto widen;
36680 case V4HImode:
36681 case V8QImode:
36682 if (!mmx_ok)
36683 return false;
36684 vsimode = V2SImode;
36685 goto widen;
36686 widen:
36687 if (one_var != 0)
36688 return false;
36690 /* Zero extend the variable element to SImode and recurse. */
36691 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36693 x = gen_reg_rtx (vsimode);
36694 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36695 var, one_var))
36696 gcc_unreachable ();
36698 emit_move_insn (target, gen_lowpart (mode, x));
36699 return true;
36701 default:
36702 return false;
36706 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36707 consisting of the values in VALS. It is known that all elements
36708 except ONE_VAR are constants. Return true if successful. */
36710 static bool
36711 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36712 rtx target, rtx vals, int one_var)
36714 rtx var = XVECEXP (vals, 0, one_var);
36715 enum machine_mode wmode;
36716 rtx const_vec, x;
36718 const_vec = copy_rtx (vals);
36719 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36720 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36722 switch (mode)
36724 case V2DFmode:
36725 case V2DImode:
36726 case V2SFmode:
36727 case V2SImode:
36728 /* For the two element vectors, it's just as easy to use
36729 the general case. */
36730 return false;
36732 case V4DImode:
36733 /* Use ix86_expand_vector_set in 64bit mode only. */
36734 if (!TARGET_64BIT)
36735 return false;
36736 case V4DFmode:
36737 case V8SFmode:
36738 case V8SImode:
36739 case V16HImode:
36740 case V32QImode:
36741 case V4SFmode:
36742 case V4SImode:
36743 case V8HImode:
36744 case V4HImode:
36745 break;
36747 case V16QImode:
36748 if (TARGET_SSE4_1)
36749 break;
36750 wmode = V8HImode;
36751 goto widen;
36752 case V8QImode:
36753 wmode = V4HImode;
36754 goto widen;
36755 widen:
36756 /* There's no way to set one QImode entry easily. Combine
36757 the variable value with its adjacent constant value, and
36758 promote to an HImode set. */
36759 x = XVECEXP (vals, 0, one_var ^ 1);
36760 if (one_var & 1)
36762 var = convert_modes (HImode, QImode, var, true);
36763 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36764 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36765 x = GEN_INT (INTVAL (x) & 0xff);
36767 else
36769 var = convert_modes (HImode, QImode, var, true);
36770 x = gen_int_mode (INTVAL (x) << 8, HImode);
36772 if (x != const0_rtx)
36773 var = expand_simple_binop (HImode, IOR, var, x, var,
36774 1, OPTAB_LIB_WIDEN);
36776 x = gen_reg_rtx (wmode);
36777 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36778 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36780 emit_move_insn (target, gen_lowpart (mode, x));
36781 return true;
36783 default:
36784 return false;
36787 emit_move_insn (target, const_vec);
36788 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36789 return true;
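/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the QImode "widen" path above: since one byte cannot be set in isolation,
   the variable byte and its adjacent constant byte are packed into a single
   16-bit value so that one HImode element store suffices.  Little-endian
   byte order within the HImode element is assumed, as on x86; the helper
   name is hypothetical.  */
#include <stdint.h>
#include <stdio.h>

static uint16_t
pack_byte_pair (uint8_t var, uint8_t adjacent_const, int one_var)
{
  if (one_var & 1)
    /* Variable byte occupies the high half; constant supplies the low.  */
    return (uint16_t) (((uint16_t) var << 8) | adjacent_const);
  else
    /* Variable byte occupies the low half; constant supplies the high.  */
    return (uint16_t) (((uint16_t) adjacent_const << 8) | var);
}

int
main (void)
{
  printf ("%#x\n", pack_byte_pair (0x12, 0x34, 5)); /* var in high byte: 0x1234 */
  printf ("%#x\n", pack_byte_pair (0x12, 0x34, 4)); /* var in low byte:  0x3412 */
  return 0;
}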
36792 /* A subroutine of ix86_expand_vector_init_general. Use vector
36793 concatenate to handle the most general case: all values variable,
36794 and none identical. */
36796 static void
36797 ix86_expand_vector_init_concat (enum machine_mode mode,
36798 rtx target, rtx *ops, int n)
36800 enum machine_mode cmode, hmode = VOIDmode;
36801 rtx first[8], second[4];
36802 rtvec v;
36803 int i, j;
36805 switch (n)
36807 case 2:
36808 switch (mode)
36810 case V8SImode:
36811 cmode = V4SImode;
36812 break;
36813 case V8SFmode:
36814 cmode = V4SFmode;
36815 break;
36816 case V4DImode:
36817 cmode = V2DImode;
36818 break;
36819 case V4DFmode:
36820 cmode = V2DFmode;
36821 break;
36822 case V4SImode:
36823 cmode = V2SImode;
36824 break;
36825 case V4SFmode:
36826 cmode = V2SFmode;
36827 break;
36828 case V2DImode:
36829 cmode = DImode;
36830 break;
36831 case V2SImode:
36832 cmode = SImode;
36833 break;
36834 case V2DFmode:
36835 cmode = DFmode;
36836 break;
36837 case V2SFmode:
36838 cmode = SFmode;
36839 break;
36840 default:
36841 gcc_unreachable ();
36844 if (!register_operand (ops[1], cmode))
36845 ops[1] = force_reg (cmode, ops[1]);
36846 if (!register_operand (ops[0], cmode))
36847 ops[0] = force_reg (cmode, ops[0]);
36848 emit_insn (gen_rtx_SET (VOIDmode, target,
36849 gen_rtx_VEC_CONCAT (mode, ops[0],
36850 ops[1])));
36851 break;
36853 case 4:
36854 switch (mode)
36856 case V4DImode:
36857 cmode = V2DImode;
36858 break;
36859 case V4DFmode:
36860 cmode = V2DFmode;
36861 break;
36862 case V4SImode:
36863 cmode = V2SImode;
36864 break;
36865 case V4SFmode:
36866 cmode = V2SFmode;
36867 break;
36868 default:
36869 gcc_unreachable ();
36871 goto half;
36873 case 8:
36874 switch (mode)
36876 case V8SImode:
36877 cmode = V2SImode;
36878 hmode = V4SImode;
36879 break;
36880 case V8SFmode:
36881 cmode = V2SFmode;
36882 hmode = V4SFmode;
36883 break;
36884 default:
36885 gcc_unreachable ();
36887 goto half;
36889 half:
36890 /* FIXME: We process inputs backward to help RA. PR 36222. */
36891 i = n - 1;
36892 j = (n >> 1) - 1;
36893 for (; i > 0; i -= 2, j--)
36895 first[j] = gen_reg_rtx (cmode);
36896 v = gen_rtvec (2, ops[i - 1], ops[i]);
36897 ix86_expand_vector_init (false, first[j],
36898 gen_rtx_PARALLEL (cmode, v));
36901 n >>= 1;
36902 if (n > 2)
36904 gcc_assert (hmode != VOIDmode);
36905 for (i = j = 0; i < n; i += 2, j++)
36907 second[j] = gen_reg_rtx (hmode);
36908 ix86_expand_vector_init_concat (hmode, second [j],
36909 &first [i], 2);
36911 n >>= 1;
36912 ix86_expand_vector_init_concat (mode, target, second, n);
36914 else
36915 ix86_expand_vector_init_concat (mode, target, first, n);
36916 break;
36918 default:
36919 gcc_unreachable ();
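/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the pairing order used by the concat expander above for n == 8 inputs:
   four 2-element vectors are built first (processing the inputs backward,
   per the FIXME about register allocation), then two 4-element vectors, then
   the final concatenation.  The "vectors" here are just printed groupings.  */
#include <stdio.h>

int
main (void)
{
  int n = 8;
  /* First level: pairs are formed from the end toward the front.  */
  for (int i = n - 1, j = (n >> 1) - 1; i > 0; i -= 2, j--)
    printf ("first[%d] = concat (op%d, op%d)\n", j, i - 1, i);
  /* Second level: pairs of the half-vectors, front to back.  */
  for (int i = 0, j = 0; i < (n >> 1); i += 2, j++)
    printf ("second[%d] = concat (first[%d], first[%d])\n", j, i, i + 1);
  printf ("target = concat (second[0], second[1])\n");
  return 0;
}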
36923 /* A subroutine of ix86_expand_vector_init_general. Use vector
36924 interleave to handle the most general case: all values variable,
36925 and none identical. */
36927 static void
36928 ix86_expand_vector_init_interleave (enum machine_mode mode,
36929 rtx target, rtx *ops, int n)
36931 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36932 int i, j;
36933 rtx op0, op1;
36934 rtx (*gen_load_even) (rtx, rtx, rtx);
36935 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36936 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36938 switch (mode)
36940 case V8HImode:
36941 gen_load_even = gen_vec_setv8hi;
36942 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36943 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36944 inner_mode = HImode;
36945 first_imode = V4SImode;
36946 second_imode = V2DImode;
36947 third_imode = VOIDmode;
36948 break;
36949 case V16QImode:
36950 gen_load_even = gen_vec_setv16qi;
36951 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36952 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36953 inner_mode = QImode;
36954 first_imode = V8HImode;
36955 second_imode = V4SImode;
36956 third_imode = V2DImode;
36957 break;
36958 default:
36959 gcc_unreachable ();
36962 for (i = 0; i < n; i++)
36964 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36965 op0 = gen_reg_rtx (SImode);
36966 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36968 /* Insert the SImode value as low element of V4SImode vector. */
36969 op1 = gen_reg_rtx (V4SImode);
36970 op0 = gen_rtx_VEC_MERGE (V4SImode,
36971 gen_rtx_VEC_DUPLICATE (V4SImode,
36972 op0),
36973 CONST0_RTX (V4SImode),
36974 const1_rtx);
36975 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36977 /* Cast the V4SImode vector back to a vector in original mode. */
36978 op0 = gen_reg_rtx (mode);
36979 emit_move_insn (op0, gen_lowpart (mode, op1));
36981 /* Load even elements into the second position. */
36982 emit_insn (gen_load_even (op0,
36983 force_reg (inner_mode,
36984 ops [i + i + 1]),
36985 const1_rtx));
36987 /* Cast vector to FIRST_IMODE vector. */
36988 ops[i] = gen_reg_rtx (first_imode);
36989 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36992 /* Interleave low FIRST_IMODE vectors. */
36993 for (i = j = 0; i < n; i += 2, j++)
36995 op0 = gen_reg_rtx (first_imode);
36996 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36998 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36999 ops[j] = gen_reg_rtx (second_imode);
37000 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37003 /* Interleave low SECOND_IMODE vectors. */
37004 switch (second_imode)
37006 case V4SImode:
37007 for (i = j = 0; i < n / 2; i += 2, j++)
37009 op0 = gen_reg_rtx (second_imode);
37010 emit_insn (gen_interleave_second_low (op0, ops[i],
37011 ops[i + 1]));
37013 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37014 vector. */
37015 ops[j] = gen_reg_rtx (third_imode);
37016 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37018 second_imode = V2DImode;
37019 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37020 /* FALLTHRU */
37022 case V2DImode:
37023 op0 = gen_reg_rtx (second_imode);
37024 emit_insn (gen_interleave_second_low (op0, ops[0],
37025 ops[1]));
37027 /* Cast the SECOND_IMODE vector back to a vector in the original
37028 mode. */
37029 emit_insn (gen_rtx_SET (VOIDmode, target,
37030 gen_lowpart (mode, op0)));
37031 break;
37033 default:
37034 gcc_unreachable ();
37038 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37039 all values variable, and none identical. */
37041 static void
37042 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37043 rtx target, rtx vals)
37045 rtx ops[32], op0, op1;
37046 enum machine_mode half_mode = VOIDmode;
37047 int n, i;
37049 switch (mode)
37051 case V2SFmode:
37052 case V2SImode:
37053 if (!mmx_ok && !TARGET_SSE)
37054 break;
37055 /* FALLTHRU */
37057 case V8SFmode:
37058 case V8SImode:
37059 case V4DFmode:
37060 case V4DImode:
37061 case V4SFmode:
37062 case V4SImode:
37063 case V2DFmode:
37064 case V2DImode:
37065 n = GET_MODE_NUNITS (mode);
37066 for (i = 0; i < n; i++)
37067 ops[i] = XVECEXP (vals, 0, i);
37068 ix86_expand_vector_init_concat (mode, target, ops, n);
37069 return;
37071 case V32QImode:
37072 half_mode = V16QImode;
37073 goto half;
37075 case V16HImode:
37076 half_mode = V8HImode;
37077 goto half;
37079 half:
37080 n = GET_MODE_NUNITS (mode);
37081 for (i = 0; i < n; i++)
37082 ops[i] = XVECEXP (vals, 0, i);
37083 op0 = gen_reg_rtx (half_mode);
37084 op1 = gen_reg_rtx (half_mode);
37085 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37086 n >> 2);
37087 ix86_expand_vector_init_interleave (half_mode, op1,
37088 &ops [n >> 1], n >> 2);
37089 emit_insn (gen_rtx_SET (VOIDmode, target,
37090 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37091 return;
37093 case V16QImode:
37094 if (!TARGET_SSE4_1)
37095 break;
37096 /* FALLTHRU */
37098 case V8HImode:
37099 if (!TARGET_SSE2)
37100 break;
37102 /* Don't use ix86_expand_vector_init_interleave if we can't
37103 move from GPR to SSE register directly. */
37104 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37105 break;
37107 n = GET_MODE_NUNITS (mode);
37108 for (i = 0; i < n; i++)
37109 ops[i] = XVECEXP (vals, 0, i);
37110 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37111 return;
37113 case V4HImode:
37114 case V8QImode:
37115 break;
37117 default:
37118 gcc_unreachable ();
37122 int i, j, n_elts, n_words, n_elt_per_word;
37123 enum machine_mode inner_mode;
37124 rtx words[4], shift;
37126 inner_mode = GET_MODE_INNER (mode);
37127 n_elts = GET_MODE_NUNITS (mode);
37128 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37129 n_elt_per_word = n_elts / n_words;
37130 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37132 for (i = 0; i < n_words; ++i)
37134 rtx word = NULL_RTX;
37136 for (j = 0; j < n_elt_per_word; ++j)
37138 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37139 elt = convert_modes (word_mode, inner_mode, elt, true);
37141 if (j == 0)
37142 word = elt;
37143 else
37145 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37146 word, 1, OPTAB_LIB_WIDEN);
37147 word = expand_simple_binop (word_mode, IOR, word, elt,
37148 word, 1, OPTAB_LIB_WIDEN);
37152 words[i] = word;
37155 if (n_words == 1)
37156 emit_move_insn (target, gen_lowpart (mode, words[0]));
37157 else if (n_words == 2)
37159 rtx tmp = gen_reg_rtx (mode);
37160 emit_clobber (tmp);
37161 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37162 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37163 emit_move_insn (target, tmp);
37165 else if (n_words == 4)
37167 rtx tmp = gen_reg_rtx (V4SImode);
37168 gcc_assert (word_mode == SImode);
37169 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37170 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37171 emit_move_insn (target, gen_lowpart (mode, tmp));
37173 else
37174 gcc_unreachable ();
37178 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37179 instructions unless MMX_OK is true. */
37181 void
37182 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37184 enum machine_mode mode = GET_MODE (target);
37185 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37186 int n_elts = GET_MODE_NUNITS (mode);
37187 int n_var = 0, one_var = -1;
37188 bool all_same = true, all_const_zero = true;
37189 int i;
37190 rtx x;
37192 for (i = 0; i < n_elts; ++i)
37194 x = XVECEXP (vals, 0, i);
37195 if (!(CONST_INT_P (x)
37196 || GET_CODE (x) == CONST_DOUBLE
37197 || GET_CODE (x) == CONST_FIXED))
37198 n_var++, one_var = i;
37199 else if (x != CONST0_RTX (inner_mode))
37200 all_const_zero = false;
37201 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37202 all_same = false;
37205 /* Constants are best loaded from the constant pool. */
37206 if (n_var == 0)
37208 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37209 return;
37212 /* If all values are identical, broadcast the value. */
37213 if (all_same
37214 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37215 XVECEXP (vals, 0, 0)))
37216 return;
37218 /* Values where only one field is non-constant are best loaded from
37219 the pool and overwritten via move later. */
37220 if (n_var == 1)
37222 if (all_const_zero
37223 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37224 XVECEXP (vals, 0, one_var),
37225 one_var))
37226 return;
37228 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37229 return;
37232 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37235 void
37236 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37238 enum machine_mode mode = GET_MODE (target);
37239 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37240 enum machine_mode half_mode;
37241 bool use_vec_merge = false;
37242 rtx tmp;
37243 static rtx (*gen_extract[6][2]) (rtx, rtx)
37245 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37246 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37247 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37248 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37249 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37250 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37252 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37254 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37255 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37256 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37257 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37258 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37259 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37261 int i, j, n;
37263 switch (mode)
37265 case V2SFmode:
37266 case V2SImode:
37267 if (mmx_ok)
37269 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37270 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37271 if (elt == 0)
37272 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37273 else
37274 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37275 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37276 return;
37278 break;
37280 case V2DImode:
37281 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37282 if (use_vec_merge)
37283 break;
37285 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37286 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37287 if (elt == 0)
37288 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37289 else
37290 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37291 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37292 return;
37294 case V2DFmode:
37296 rtx op0, op1;
37298 /* For the two element vectors, we implement a VEC_CONCAT with
37299 the extraction of the other element. */
37301 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37302 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37304 if (elt == 0)
37305 op0 = val, op1 = tmp;
37306 else
37307 op0 = tmp, op1 = val;
37309 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37310 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37312 return;
37314 case V4SFmode:
37315 use_vec_merge = TARGET_SSE4_1;
37316 if (use_vec_merge)
37317 break;
37319 switch (elt)
37321 case 0:
37322 use_vec_merge = true;
37323 break;
37325 case 1:
37326 /* tmp = target = A B C D */
37327 tmp = copy_to_reg (target);
37328 /* target = A A B B */
37329 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37330 /* target = X A B B */
37331 ix86_expand_vector_set (false, target, val, 0);
37332 /* target = A X C D */
37333 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37334 const1_rtx, const0_rtx,
37335 GEN_INT (2+4), GEN_INT (3+4)));
37336 return;
37338 case 2:
37339 /* tmp = target = A B C D */
37340 tmp = copy_to_reg (target);
37341 /* tmp = X B C D */
37342 ix86_expand_vector_set (false, tmp, val, 0);
37343 /* target = A B X D */
37344 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37345 const0_rtx, const1_rtx,
37346 GEN_INT (0+4), GEN_INT (3+4)));
37347 return;
37349 case 3:
37350 /* tmp = target = A B C D */
37351 tmp = copy_to_reg (target);
37352 /* tmp = X B C D */
37353 ix86_expand_vector_set (false, tmp, val, 0);
37354 /* target = A B C X */
37355 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37356 const0_rtx, const1_rtx,
37357 GEN_INT (2+4), GEN_INT (0+4)));
37358 return;
37360 default:
37361 gcc_unreachable ();
37363 break;
37365 case V4SImode:
37366 use_vec_merge = TARGET_SSE4_1;
37367 if (use_vec_merge)
37368 break;
37370 /* Element 0 handled by vec_merge below. */
37371 if (elt == 0)
37373 use_vec_merge = true;
37374 break;
37377 if (TARGET_SSE2)
37379 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37380 store into element 0, then shuffle them back. */
37382 rtx order[4];
37384 order[0] = GEN_INT (elt);
37385 order[1] = const1_rtx;
37386 order[2] = const2_rtx;
37387 order[3] = GEN_INT (3);
37388 order[elt] = const0_rtx;
37390 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37391 order[1], order[2], order[3]));
37393 ix86_expand_vector_set (false, target, val, 0);
37395 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37396 order[1], order[2], order[3]));
37398 else
37400 /* For SSE1, we have to reuse the V4SF code. */
37401 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
37402 gen_lowpart (SFmode, val), elt);
37404 return;
37406 case V8HImode:
37407 use_vec_merge = TARGET_SSE2;
37408 break;
37409 case V4HImode:
37410 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37411 break;
37413 case V16QImode:
37414 use_vec_merge = TARGET_SSE4_1;
37415 break;
37417 case V8QImode:
37418 break;
37420 case V32QImode:
37421 half_mode = V16QImode;
37422 j = 0;
37423 n = 16;
37424 goto half;
37426 case V16HImode:
37427 half_mode = V8HImode;
37428 j = 1;
37429 n = 8;
37430 goto half;
37432 case V8SImode:
37433 half_mode = V4SImode;
37434 j = 2;
37435 n = 4;
37436 goto half;
37438 case V4DImode:
37439 half_mode = V2DImode;
37440 j = 3;
37441 n = 2;
37442 goto half;
37444 case V8SFmode:
37445 half_mode = V4SFmode;
37446 j = 4;
37447 n = 4;
37448 goto half;
37450 case V4DFmode:
37451 half_mode = V2DFmode;
37452 j = 5;
37453 n = 2;
37454 goto half;
37456 half:
37457 /* Compute offset. */
37458 i = elt / n;
37459 elt %= n;
37461 gcc_assert (i <= 1);
37463 /* Extract the half. */
37464 tmp = gen_reg_rtx (half_mode);
37465 emit_insn (gen_extract[j][i] (tmp, target));
37467 /* Put val in tmp at elt. */
37468 ix86_expand_vector_set (false, tmp, val, elt);
37470 /* Put it back. */
37471 emit_insn (gen_insert[j][i] (target, target, tmp));
37472 return;
37474 default:
37475 break;
37478 if (use_vec_merge)
37480 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37481 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37482 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37484 else
37486 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37488 emit_move_insn (mem, target);
37490 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37491 emit_move_insn (tmp, val);
37493 emit_move_insn (target, mem);
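/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the generic fallback above when no suitable insertion instruction exists:
   spill the vector to a stack slot, store the scalar at the element's byte
   offset, and reload the whole vector.  The V4SF-like layout below is just
   an example.  */
#include <string.h>
#include <stdio.h>

static void
set_elt_via_memory (float *vec4, int elt, float val)
{
  float mem[4];
  memcpy (mem, vec4, sizeof mem);   /* emit_move_insn (mem, target) */
  mem[elt] = val;                   /* adjust_address + store of VAL */
  memcpy (vec4, mem, sizeof mem);   /* emit_move_insn (target, mem) */
}

int
main (void)
{
  float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
  set_elt_via_memory (v, 2, 9.0f);
  printf ("%g %g %g %g\n", v[0], v[1], v[2], v[3]);  /* 1 2 9 4 */
  return 0;
}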
37497 void
37498 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37500 enum machine_mode mode = GET_MODE (vec);
37501 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37502 bool use_vec_extr = false;
37503 rtx tmp;
37505 switch (mode)
37507 case V2SImode:
37508 case V2SFmode:
37509 if (!mmx_ok)
37510 break;
37511 /* FALLTHRU */
37513 case V2DFmode:
37514 case V2DImode:
37515 use_vec_extr = true;
37516 break;
37518 case V4SFmode:
37519 use_vec_extr = TARGET_SSE4_1;
37520 if (use_vec_extr)
37521 break;
37523 switch (elt)
37525 case 0:
37526 tmp = vec;
37527 break;
37529 case 1:
37530 case 3:
37531 tmp = gen_reg_rtx (mode);
37532 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37533 GEN_INT (elt), GEN_INT (elt),
37534 GEN_INT (elt+4), GEN_INT (elt+4)));
37535 break;
37537 case 2:
37538 tmp = gen_reg_rtx (mode);
37539 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37540 break;
37542 default:
37543 gcc_unreachable ();
37545 vec = tmp;
37546 use_vec_extr = true;
37547 elt = 0;
37548 break;
37550 case V4SImode:
37551 use_vec_extr = TARGET_SSE4_1;
37552 if (use_vec_extr)
37553 break;
37555 if (TARGET_SSE2)
37557 switch (elt)
37559 case 0:
37560 tmp = vec;
37561 break;
37563 case 1:
37564 case 3:
37565 tmp = gen_reg_rtx (mode);
37566 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37567 GEN_INT (elt), GEN_INT (elt),
37568 GEN_INT (elt), GEN_INT (elt)));
37569 break;
37571 case 2:
37572 tmp = gen_reg_rtx (mode);
37573 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37574 break;
37576 default:
37577 gcc_unreachable ();
37579 vec = tmp;
37580 use_vec_extr = true;
37581 elt = 0;
37583 else
37585 /* For SSE1, we have to reuse the V4SF code. */
37586 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37587 gen_lowpart (V4SFmode, vec), elt);
37588 return;
37590 break;
37592 case V8HImode:
37593 use_vec_extr = TARGET_SSE2;
37594 break;
37595 case V4HImode:
37596 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37597 break;
37599 case V16QImode:
37600 use_vec_extr = TARGET_SSE4_1;
37601 break;
37603 case V8SFmode:
37604 if (TARGET_AVX)
37606 tmp = gen_reg_rtx (V4SFmode);
37607 if (elt < 4)
37608 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37609 else
37610 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37611 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37612 return;
37614 break;
37616 case V4DFmode:
37617 if (TARGET_AVX)
37619 tmp = gen_reg_rtx (V2DFmode);
37620 if (elt < 2)
37621 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37622 else
37623 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37624 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37625 return;
37627 break;
37629 case V32QImode:
37630 if (TARGET_AVX)
37632 tmp = gen_reg_rtx (V16QImode);
37633 if (elt < 16)
37634 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37635 else
37636 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37637 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37638 return;
37640 break;
37642 case V16HImode:
37643 if (TARGET_AVX)
37645 tmp = gen_reg_rtx (V8HImode);
37646 if (elt < 8)
37647 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37648 else
37649 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37650 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37651 return;
37653 break;
37655 case V8SImode:
37656 if (TARGET_AVX)
37658 tmp = gen_reg_rtx (V4SImode);
37659 if (elt < 4)
37660 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37661 else
37662 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37663 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37664 return;
37666 break;
37668 case V4DImode:
37669 if (TARGET_AVX)
37671 tmp = gen_reg_rtx (V2DImode);
37672 if (elt < 2)
37673 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37674 else
37675 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37676 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37677 return;
37679 break;
37681 case V8QImode:
37682 /* ??? Could extract the appropriate HImode element and shift. */
37683 default:
37684 break;
37687 if (use_vec_extr)
37689 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37690 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37692 /* Let the rtl optimizers know about the zero extension performed. */
37693 if (inner_mode == QImode || inner_mode == HImode)
37695 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37696 target = gen_lowpart (SImode, target);
37699 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37701 else
37703 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37705 emit_move_insn (mem, vec);
37707 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37708 emit_move_insn (target, tmp);
37712 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37713 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37714 The upper bits of DEST are undefined, though they shouldn't cause
37715 exceptions (some bits from src or all zeros are ok). */
37717 static void
37718 emit_reduc_half (rtx dest, rtx src, int i)
37720 rtx tem;
37721 switch (GET_MODE (src))
37723 case V4SFmode:
37724 if (i == 128)
37725 tem = gen_sse_movhlps (dest, src, src);
37726 else
37727 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37728 GEN_INT (1 + 4), GEN_INT (1 + 4));
37729 break;
37730 case V2DFmode:
37731 tem = gen_vec_interleave_highv2df (dest, src, src);
37732 break;
37733 case V16QImode:
37734 case V8HImode:
37735 case V4SImode:
37736 case V2DImode:
37737 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37738 gen_lowpart (V1TImode, src),
37739 GEN_INT (i / 2));
37740 break;
37741 case V8SFmode:
37742 if (i == 256)
37743 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37744 else
37745 tem = gen_avx_shufps256 (dest, src, src,
37746 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37747 break;
37748 case V4DFmode:
37749 if (i == 256)
37750 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37751 else
37752 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37753 break;
37754 case V32QImode:
37755 case V16HImode:
37756 case V8SImode:
37757 case V4DImode:
37758 if (i == 256)
37759 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37760 gen_lowpart (V4DImode, src),
37761 gen_lowpart (V4DImode, src),
37762 const1_rtx);
37763 else
37764 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37765 gen_lowpart (V2TImode, src),
37766 GEN_INT (i / 2));
37767 break;
37768 default:
37769 gcc_unreachable ();
37771 emit_insn (tem);
37774 /* Expand a vector reduction. FN is the binary pattern to reduce;
37775 DEST is the destination; IN is the input vector. */
37777 void
37778 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37780 rtx half, dst, vec = in;
37781 enum machine_mode mode = GET_MODE (in);
37782 int i;
37784 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37785 if (TARGET_SSE4_1
37786 && mode == V8HImode
37787 && fn == gen_uminv8hi3)
37789 emit_insn (gen_sse4_1_phminposuw (dest, in));
37790 return;
37793 for (i = GET_MODE_BITSIZE (mode);
37794 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37795 i >>= 1)
37797 half = gen_reg_rtx (mode);
37798 emit_reduc_half (half, vec, i);
37799 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37800 dst = dest;
37801 else
37802 dst = gen_reg_rtx (mode);
37803 emit_insn (fn (dst, half, vec));
37804 vec = dst;
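/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the reduction scheme above: the vector is repeatedly folded in half (the
   upper half shifted down by emit_reduc_half and combined with the lower
   half by FN), so an n-element reduction takes log2(n) combining steps and
   the result ends up in element 0.  Shown here as a MAX reduction over eight
   scalar "lanes".  */
#include <stdio.h>

int
main (void)
{
  int lanes[8] = { 3, 7, 1, 9, 4, 2, 8, 5 };
  for (int width = 8; width > 1; width >>= 1)      /* halving loop */
    for (int i = 0; i < width / 2; i++)            /* fn (dst, half, vec) */
      lanes[i] = lanes[i] > lanes[i + width / 2]
		 ? lanes[i] : lanes[i + width / 2];
  printf ("max = %d\n", lanes[0]);                 /* 9 */
  return 0;
}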
37808 /* Target hook for scalar_mode_supported_p. */
37809 static bool
37810 ix86_scalar_mode_supported_p (enum machine_mode mode)
37812 if (DECIMAL_FLOAT_MODE_P (mode))
37813 return default_decimal_float_supported_p ();
37814 else if (mode == TFmode)
37815 return true;
37816 else
37817 return default_scalar_mode_supported_p (mode);
37820 /* Implements target hook vector_mode_supported_p. */
37821 static bool
37822 ix86_vector_mode_supported_p (enum machine_mode mode)
37824 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37825 return true;
37826 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37827 return true;
37828 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37829 return true;
37830 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37831 return true;
37832 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37833 return true;
37834 return false;
37837 /* Target hook for c_mode_for_suffix. */
37838 static enum machine_mode
37839 ix86_c_mode_for_suffix (char suffix)
37841 if (suffix == 'q')
37842 return TFmode;
37843 if (suffix == 'w')
37844 return XFmode;
37846 return VOIDmode;
37849 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37851 We do this in the new i386 backend to maintain source compatibility
37852 with the old cc0-based compiler. */
37854 static tree
37855 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37856 tree inputs ATTRIBUTE_UNUSED,
37857 tree clobbers)
37859 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37860 clobbers);
37861 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37862 clobbers);
37863 return clobbers;
37866 /* Implements target vector targetm.asm.encode_section_info. */
37868 static void ATTRIBUTE_UNUSED
37869 ix86_encode_section_info (tree decl, rtx rtl, int first)
37871 default_encode_section_info (decl, rtl, first);
37873 if (TREE_CODE (decl) == VAR_DECL
37874 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37875 && ix86_in_large_data_p (decl))
37876 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37879 /* Worker function for REVERSE_CONDITION. */
37881 enum rtx_code
37882 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37884 return (mode != CCFPmode && mode != CCFPUmode
37885 ? reverse_condition (code)
37886 : reverse_condition_maybe_unordered (code));
37889 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37890 to OPERANDS[0]. */
37892 const char *
37893 output_387_reg_move (rtx insn, rtx *operands)
37895 if (REG_P (operands[0]))
37897 if (REG_P (operands[1])
37898 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37900 if (REGNO (operands[0]) == FIRST_STACK_REG)
37901 return output_387_ffreep (operands, 0);
37902 return "fstp\t%y0";
37904 if (STACK_TOP_P (operands[0]))
37905 return "fld%Z1\t%y1";
37906 return "fst\t%y0";
37908 else if (MEM_P (operands[0]))
37910 gcc_assert (REG_P (operands[1]));
37911 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37912 return "fstp%Z0\t%y0";
37913 else
37915 /* There is no non-popping store to memory for XFmode.
37916 So if we need one, follow the store with a load. */
37917 if (GET_MODE (operands[0]) == XFmode)
37918 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37919 else
37920 return "fst%Z0\t%y0";
37923 else
37924 gcc_unreachable();
37927 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37928 FP status register is set. */
37930 void
37931 ix86_emit_fp_unordered_jump (rtx label)
37933 rtx reg = gen_reg_rtx (HImode);
37934 rtx temp;
37936 emit_insn (gen_x86_fnstsw_1 (reg));
37938 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37940 emit_insn (gen_x86_sahf_1 (reg));
37942 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37943 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37945 else
37947 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37949 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37950 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37953 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37954 gen_rtx_LABEL_REF (VOIDmode, label),
37955 pc_rtx);
37956 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37958 emit_jump_insn (temp);
37959 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37962 /* Output code to perform a log1p XFmode calculation. */
37964 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37966 rtx label1 = gen_label_rtx ();
37967 rtx label2 = gen_label_rtx ();
37969 rtx tmp = gen_reg_rtx (XFmode);
37970 rtx tmp2 = gen_reg_rtx (XFmode);
37971 rtx test;
37973 emit_insn (gen_absxf2 (tmp, op1));
37974 test = gen_rtx_GE (VOIDmode, tmp,
37975 CONST_DOUBLE_FROM_REAL_VALUE (
37976 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37977 XFmode));
37978 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37980 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37981 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37982 emit_jump (label2);
37984 emit_label (label1);
37985 emit_move_insn (tmp, CONST1_RTX (XFmode));
37986 emit_insn (gen_addxf3 (tmp, op1, tmp));
37987 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37988 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37990 emit_label (label2);
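/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the branch above.  The x87 fyl2xp1 instruction computes y * log2 (x + 1)
   but is only specified for |x| below roughly 1 - sqrt(2)/2 ~= 0.2928932,
   which is the constant tested above; outside that range the expander falls
   back to fyl2x on 1 + x.  With y set to ln(2) (the fldln2 constant) both
   branches yield the natural log1p, the split existing only for accuracy
   within the instruction's domain.  The libm calls below merely stand in
   for the two x87 sequences.  */
#include <math.h>
#include <stdio.h>

static double
log1p_like (double x)
{
  const double thresh = 0.29289321881345247560;   /* 1 - sqrt(2)/2 */
  if (fabs (x) < thresh)
    return log1p (x);        /* stands in for fldln2; fyl2xp1      */
  else
    return log (1.0 + x);    /* stands in for fldln2; fld1; fadd; fyl2x */
}

int
main (void)
{
  printf ("%.17g\n", log1p_like (1e-3));
  printf ("%.17g\n", log1p_like (2.5));
  return 0;
}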
37993 /* Emit code for round calculation. */
37994 void ix86_emit_i387_round (rtx op0, rtx op1)
37996 enum machine_mode inmode = GET_MODE (op1);
37997 enum machine_mode outmode = GET_MODE (op0);
37998 rtx e1, e2, res, tmp, tmp1, half;
37999 rtx scratch = gen_reg_rtx (HImode);
38000 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38001 rtx jump_label = gen_label_rtx ();
38002 rtx insn;
38003 rtx (*gen_abs) (rtx, rtx);
38004 rtx (*gen_neg) (rtx, rtx);
38006 switch (inmode)
38008 case SFmode:
38009 gen_abs = gen_abssf2;
38010 break;
38011 case DFmode:
38012 gen_abs = gen_absdf2;
38013 break;
38014 case XFmode:
38015 gen_abs = gen_absxf2;
38016 break;
38017 default:
38018 gcc_unreachable ();
38021 switch (outmode)
38023 case SFmode:
38024 gen_neg = gen_negsf2;
38025 break;
38026 case DFmode:
38027 gen_neg = gen_negdf2;
38028 break;
38029 case XFmode:
38030 gen_neg = gen_negxf2;
38031 break;
38032 case HImode:
38033 gen_neg = gen_neghi2;
38034 break;
38035 case SImode:
38036 gen_neg = gen_negsi2;
38037 break;
38038 case DImode:
38039 gen_neg = gen_negdi2;
38040 break;
38041 default:
38042 gcc_unreachable ();
38045 e1 = gen_reg_rtx (inmode);
38046 e2 = gen_reg_rtx (inmode);
38047 res = gen_reg_rtx (outmode);
38049 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38051 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38053 /* scratch = fxam(op1) */
38054 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38055 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38056 UNSPEC_FXAM)));
38057 /* e1 = fabs(op1) */
38058 emit_insn (gen_abs (e1, op1));
38060 /* e2 = e1 + 0.5 */
38061 half = force_reg (inmode, half);
38062 emit_insn (gen_rtx_SET (VOIDmode, e2,
38063 gen_rtx_PLUS (inmode, e1, half)));
38065 /* res = floor(e2) */
38066 if (inmode != XFmode)
38068 tmp1 = gen_reg_rtx (XFmode);
38070 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38071 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38073 else
38074 tmp1 = e2;
38076 switch (outmode)
38078 case SFmode:
38079 case DFmode:
38081 rtx tmp0 = gen_reg_rtx (XFmode);
38083 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38085 emit_insn (gen_rtx_SET (VOIDmode, res,
38086 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38087 UNSPEC_TRUNC_NOOP)));
38089 break;
38090 case XFmode:
38091 emit_insn (gen_frndintxf2_floor (res, tmp1));
38092 break;
38093 case HImode:
38094 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38095 break;
38096 case SImode:
38097 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38098 break;
38099 case DImode:
38100 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38101 break;
38102 default:
38103 gcc_unreachable ();
38106 /* flags = signbit(a) */
38107 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38109 /* if (flags) then res = -res */
38110 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38111 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38112 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38113 pc_rtx);
38114 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38115 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38116 JUMP_LABEL (insn) = jump_label;
38118 emit_insn (gen_neg (res, res));
38120 emit_label (jump_label);
38121 LABEL_NUSES (jump_label) = 1;
38123 emit_move_insn (op0, res);
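/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the identity the expander above implements:
   round(a) = sgn(a) * floor(|a| + 0.5).  The real sequence reads the sign
   bit from the fxam status word and conditionally negates the floor result;
   the signbit() test below plays the same role (and, like fxam, also catches
   -0.0).  */
#include <math.h>
#include <stdio.h>

static double
round_like (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}

int
main (void)
{
  printf ("%g %g %g\n", round_like (2.5), round_like (-2.5), round_like (-0.3));
  /* 3 -3 -0 */
  return 0;
}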
38126 /* Output code to perform a Newton-Raphson approximation of a single precision
38127 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38129 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38131 rtx x0, x1, e0, e1;
38133 x0 = gen_reg_rtx (mode);
38134 e0 = gen_reg_rtx (mode);
38135 e1 = gen_reg_rtx (mode);
38136 x1 = gen_reg_rtx (mode);
38138 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
38140 b = force_reg (mode, b);
38142 /* x0 = rcp(b) estimate */
38143 emit_insn (gen_rtx_SET (VOIDmode, x0,
38144 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38145 UNSPEC_RCP)));
38146 /* e0 = x0 * b */
38147 emit_insn (gen_rtx_SET (VOIDmode, e0,
38148 gen_rtx_MULT (mode, x0, b)));
38150 /* e0 = x0 * e0 */
38151 emit_insn (gen_rtx_SET (VOIDmode, e0,
38152 gen_rtx_MULT (mode, x0, e0)));
38154 /* e1 = x0 + x0 */
38155 emit_insn (gen_rtx_SET (VOIDmode, e1,
38156 gen_rtx_PLUS (mode, x0, x0)));
38158 /* x1 = e1 - e0 */
38159 emit_insn (gen_rtx_SET (VOIDmode, x1,
38160 gen_rtx_MINUS (mode, e1, e0)));
38162 /* res = a * x1 */
38163 emit_insn (gen_rtx_SET (VOIDmode, res,
38164 gen_rtx_MULT (mode, a, x1)));
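/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the refinement emitted above.  Starting from a low-precision reciprocal
   estimate x0 ~ 1/b (rcpss supplies roughly 12 bits), one Newton-Raphson
   step x1 = 2*x0 - b*x0*x0 = x0 * (2 - b*x0) roughly doubles the number of
   correct bits, and a / b is then approximated as a * x1.  The perturbed
   initial estimate below merely stands in for rcpss.  */
#include <stdio.h>

int
main (void)
{
  float a = 355.0f, b = 113.0f;
  float x0 = 1.0f / b + 1e-4f;   /* pretend low-precision rcp (b) */
  float e0 = x0 * b;             /* e0 = x0 * b */
  e0 = x0 * e0;                  /* e0 = b * x0 * x0 */
  float e1 = x0 + x0;            /* e1 = 2 * x0 */
  float x1 = e1 - e0;            /* x1 = 2*x0 - b*x0*x0 */
  printf ("estimate only: %.7g\n", a * x0);
  printf ("one NR step:   %.7g\n", a * x1);
  printf ("exact:         %.7g\n", a / b);
  return 0;
}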
38167 /* Output code to perform a Newton-Raphson approximation of a
38168 single precision floating point [reciprocal] square root. */
38170 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38171 bool recip)
38173 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38174 REAL_VALUE_TYPE r;
38176 x0 = gen_reg_rtx (mode);
38177 e0 = gen_reg_rtx (mode);
38178 e1 = gen_reg_rtx (mode);
38179 e2 = gen_reg_rtx (mode);
38180 e3 = gen_reg_rtx (mode);
38182 real_from_integer (&r, VOIDmode, -3, -1, 0);
38183 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38185 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38186 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38188 if (VECTOR_MODE_P (mode))
38190 mthree = ix86_build_const_vector (mode, true, mthree);
38191 mhalf = ix86_build_const_vector (mode, true, mhalf);
38194 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38195 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
38197 a = force_reg (mode, a);
38199 /* x0 = rsqrt(a) estimate */
38200 emit_insn (gen_rtx_SET (VOIDmode, x0,
38201 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38202 UNSPEC_RSQRT)));
38204 /* If a == 0.0, zero out the rsqrt estimate (which would be infinity) to prevent NaN for sqrt (0.0). */
38205 if (!recip)
38207 rtx zero, mask;
38209 zero = gen_reg_rtx (mode);
38210 mask = gen_reg_rtx (mode);
38212 zero = force_reg (mode, CONST0_RTX(mode));
38213 emit_insn (gen_rtx_SET (VOIDmode, mask,
38214 gen_rtx_NE (mode, zero, a)));
38216 emit_insn (gen_rtx_SET (VOIDmode, x0,
38217 gen_rtx_AND (mode, x0, mask)));
38220 /* e0 = x0 * a */
38221 emit_insn (gen_rtx_SET (VOIDmode, e0,
38222 gen_rtx_MULT (mode, x0, a)));
38223 /* e1 = e0 * x0 */
38224 emit_insn (gen_rtx_SET (VOIDmode, e1,
38225 gen_rtx_MULT (mode, e0, x0)));
38227 /* e2 = e1 - 3. */
38228 mthree = force_reg (mode, mthree);
38229 emit_insn (gen_rtx_SET (VOIDmode, e2,
38230 gen_rtx_PLUS (mode, e1, mthree)));
38232 mhalf = force_reg (mode, mhalf);
38233 if (recip)
38234 /* e3 = -.5 * x0 */
38235 emit_insn (gen_rtx_SET (VOIDmode, e3,
38236 gen_rtx_MULT (mode, x0, mhalf)));
38237 else
38238 /* e3 = -.5 * e0 */
38239 emit_insn (gen_rtx_SET (VOIDmode, e3,
38240 gen_rtx_MULT (mode, e0, mhalf)));
38241 /* ret = e2 * e3 */
38242 emit_insn (gen_rtx_SET (VOIDmode, res,
38243 gen_rtx_MULT (mode, e2, e3)));
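/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the refinement emitted above.  From an rsqrtss-style estimate
   x0 ~ 1/sqrt(a), one Newton-Raphson step is -0.5 * x0 * (a*x0*x0 - 3);
   multiplying by e0 = a*x0 instead of x0 in the final product turns the
   reciprocal square root into the square root itself, which is the
   recip/!recip distinction above.  The perturbed estimate stands in for
   rsqrtss, and the zero-input masking is omitted.  */
#include <math.h>
#include <stdio.h>

int
main (void)
{
  float a = 2.0f;
  float x0 = 1.0f / sqrtf (a) + 2e-3f;  /* pretend low-precision rsqrt (a) */
  float e0 = x0 * a;                    /* e0 = a * x0 */
  float e1 = e0 * x0;                   /* e1 = a * x0 * x0 */
  float e2 = e1 - 3.0f;                 /* e2 = a*x0*x0 - 3 */
  float rsqrt_a = e2 * (x0 * -0.5f);    /* recip variant */
  float sqrt_a = e2 * (e0 * -0.5f);     /* !recip variant */
  printf ("rsqrt ~ %.7g (exact %.7g)\n", rsqrt_a, 1.0 / sqrt (2.0));
  printf ("sqrt  ~ %.7g (exact %.7g)\n", sqrt_a, sqrt (2.0));
  return 0;
}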
38246 #ifdef TARGET_SOLARIS
38247 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38249 static void
38250 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38251 tree decl)
38253 /* With Binutils 2.15, the "@unwind" marker must be specified on
38254 every occurrence of the ".eh_frame" section, not just the first
38255 one. */
38256 if (TARGET_64BIT
38257 && strcmp (name, ".eh_frame") == 0)
38259 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38260 flags & SECTION_WRITE ? "aw" : "a");
38261 return;
38264 #ifndef USE_GAS
38265 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38267 solaris_elf_asm_comdat_section (name, flags, decl);
38268 return;
38270 #endif
38272 default_elf_asm_named_section (name, flags, decl);
38274 #endif /* TARGET_SOLARIS */
38276 /* Return the mangling of TYPE if it is an extended fundamental type. */
38278 static const char *
38279 ix86_mangle_type (const_tree type)
38281 type = TYPE_MAIN_VARIANT (type);
38283 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38284 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38285 return NULL;
38287 switch (TYPE_MODE (type))
38289 case TFmode:
38290 /* __float128 is "g". */
38291 return "g";
38292 case XFmode:
38293 /* "long double" or __float80 is "e". */
38294 return "e";
38295 default:
38296 return NULL;
38300 /* For 32-bit code we can save PIC register setup by using
38301 the __stack_chk_fail_local hidden function instead of calling
38302 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38303 register, so it is better to call __stack_chk_fail directly. */
38305 static tree ATTRIBUTE_UNUSED
38306 ix86_stack_protect_fail (void)
38308 return TARGET_64BIT
38309 ? default_external_stack_protect_fail ()
38310 : default_hidden_stack_protect_fail ();
38313 /* Select a format to encode pointers in exception handling data. CODE
38314 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38315 true if the symbol may be affected by dynamic relocations.
38317 ??? All x86 object file formats are capable of representing this.
38318 After all, the relocation needed is the same as for the call insn.
38319 Whether or not a particular assembler allows us to enter such, I
38320 guess we'll have to see. */
38322 asm_preferred_eh_data_format (int code, int global)
38324 if (flag_pic)
38326 int type = DW_EH_PE_sdata8;
38327 if (!TARGET_64BIT
38328 || ix86_cmodel == CM_SMALL_PIC
38329 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38330 type = DW_EH_PE_sdata4;
38331 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38333 if (ix86_cmodel == CM_SMALL
38334 || (ix86_cmodel == CM_MEDIUM && code))
38335 return DW_EH_PE_udata4;
38336 return DW_EH_PE_absptr;
38339 /* Expand copysign from SIGN to the positive value ABS_VALUE
38340 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
38341 the sign-bit. */
38342 static void
38343 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38345 enum machine_mode mode = GET_MODE (sign);
38346 rtx sgn = gen_reg_rtx (mode);
38347 if (mask == NULL_RTX)
38349 enum machine_mode vmode;
38351 if (mode == SFmode)
38352 vmode = V4SFmode;
38353 else if (mode == DFmode)
38354 vmode = V2DFmode;
38355 else
38356 vmode = mode;
38358 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38359 if (!VECTOR_MODE_P (mode))
38361 /* We need to generate a scalar mode mask in this case. */
38362 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38363 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38364 mask = gen_reg_rtx (mode);
38365 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38368 else
38369 mask = gen_rtx_NOT (mode, mask);
38370 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38371 gen_rtx_AND (mode, mask, sign)));
38372 emit_insn (gen_rtx_SET (VOIDmode, result,
38373 gen_rtx_IOR (mode, abs_value, sgn)));
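/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the bit-level operation above: the sign bit is extracted from SIGN with an
   AND against a sign-bit mask and IOR-ed into the already non-negative
   ABS_VALUE, which is exactly copysign for IEEE formats.  Type punning
   through a union stands in here for the SSE bitwise operations.  */
#include <stdint.h>
#include <stdio.h>

static double
copysign_like (double abs_value, double sign)
{
  union { double d; uint64_t u; } a = { abs_value }, s = { sign };
  a.u |= s.u & 0x8000000000000000ull;   /* result = abs_value | (sign & signbit) */
  return a.d;
}

int
main (void)
{
  printf ("%g %g\n", copysign_like (2.5, -1.0), copysign_like (2.5, 3.0));
  /* -2.5 2.5 */
  return 0;
}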
38376 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38377 mask for masking out the sign-bit is stored in *SMASK, if that is
38378 non-null. */
38379 static rtx
38380 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38382 enum machine_mode vmode, mode = GET_MODE (op0);
38383 rtx xa, mask;
38385 xa = gen_reg_rtx (mode);
38386 if (mode == SFmode)
38387 vmode = V4SFmode;
38388 else if (mode == DFmode)
38389 vmode = V2DFmode;
38390 else
38391 vmode = mode;
38392 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38393 if (!VECTOR_MODE_P (mode))
38395 /* We need to generate a scalar mode mask in this case. */
38396 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38397 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38398 mask = gen_reg_rtx (mode);
38399 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38401 emit_insn (gen_rtx_SET (VOIDmode, xa,
38402 gen_rtx_AND (mode, op0, mask)));
38404 if (smask)
38405 *smask = mask;
38407 return xa;
38410 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38411 swapping the operands if SWAP_OPERANDS is true. The expanded
38412 code is a forward jump to a newly created label in case the
38413 comparison is true. The generated label rtx is returned. */
38414 static rtx
38415 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38416 bool swap_operands)
38418 rtx label, tmp;
38420 if (swap_operands)
38422 tmp = op0;
38423 op0 = op1;
38424 op1 = tmp;
38427 label = gen_label_rtx ();
38428 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
38429 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38430 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
38431 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38432 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38433 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38434 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38435 JUMP_LABEL (tmp) = label;
38437 return label;
38440 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38441 using comparison code CODE. Operands are swapped for the comparison if
38442 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38443 static rtx
38444 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38445 bool swap_operands)
38447 rtx (*insn)(rtx, rtx, rtx, rtx);
38448 enum machine_mode mode = GET_MODE (op0);
38449 rtx mask = gen_reg_rtx (mode);
38451 if (swap_operands)
38453 rtx tmp = op0;
38454 op0 = op1;
38455 op1 = tmp;
38458 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38460 emit_insn (insn (mask, op0, op1,
38461 gen_rtx_fmt_ee (code, mode, op0, op1)));
38462 return mask;
38465 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38466 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
38467 static rtx
38468 ix86_gen_TWO52 (enum machine_mode mode)
38470 REAL_VALUE_TYPE TWO52r;
38471 rtx TWO52;
38473 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38474 TWO52 = const_double_from_real_value (TWO52r, mode);
38475 TWO52 = force_reg (mode, TWO52);
38477 return TWO52;
38480 /* Expand SSE sequence for computing lround from OP1 storing
38481 into OP0. */
38482 void
38483 ix86_expand_lround (rtx op0, rtx op1)
38485 /* C code for the stuff we're doing below:
38486 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38487 return (long)tmp;
38489 enum machine_mode mode = GET_MODE (op1);
38490 const struct real_format *fmt;
38491 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38492 rtx adj;
38494 /* load nextafter (0.5, 0.0) */
38495 fmt = REAL_MODE_FORMAT (mode);
38496 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38497 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38499 /* adj = copysign (0.5, op1) */
38500 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38501 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38503 /* adj = op1 + adj */
38504 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38506 /* op0 = (imode)adj */
38507 expand_fix (op0, adj, 0);
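/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   why the expander above adds copysign (nextafter (0.5, 0.0), x) rather than
   copysign (0.5, x) before truncating.  With IEEE double arithmetic, for
   x = nextafter (0.5, 0.0) the sum x + 0.5 rounds up to exactly 1.0, so a
   plain "add 0.5 and truncate" would return 1 instead of 0; adding the
   predecessor of 0.5 avoids that while still rounding 0.5 itself up to 1.  */
#include <math.h>
#include <stdio.h>

static long
lround_like (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);    /* expand_fix: truncate toward zero */
}

int
main (void)
{
  double tricky = nextafter (0.5, 0.0);             /* 0.49999999999999994 */
  printf ("naive: %ld\n", (long) (tricky + 0.5));   /* 1 (wrong) */
  printf ("fixed: %ld\n", lround_like (tricky));    /* 0 */
  printf ("half:  %ld\n", lround_like (0.5));       /* 1 */
  return 0;
}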
38510 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
38511 into OPERAND0. */
38512 void
38513 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38515 /* C code for the stuff we're doing below (for do_floor):
38516 xi = (long)op1;
38517 xi -= (double)xi > op1 ? 1 : 0;
38518 return xi;
38520 enum machine_mode fmode = GET_MODE (op1);
38521 enum machine_mode imode = GET_MODE (op0);
38522 rtx ireg, freg, label, tmp;
38524 /* reg = (long)op1 */
38525 ireg = gen_reg_rtx (imode);
38526 expand_fix (ireg, op1, 0);
38528 /* freg = (double)reg */
38529 freg = gen_reg_rtx (fmode);
38530 expand_float (freg, ireg, 0);
38532 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38533 label = ix86_expand_sse_compare_and_jump (UNLE,
38534 freg, op1, !do_floor);
38535 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38536 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38537 emit_move_insn (ireg, tmp);
38539 emit_label (label);
38540 LABEL_NUSES (label) = 1;
38542 emit_move_insn (op0, ireg);
38545 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38546 result in OPERAND0. */
38547 void
38548 ix86_expand_rint (rtx operand0, rtx operand1)
38550 /* C code for the stuff we're doing below:
38551 xa = fabs (operand1);
38552 if (!isless (xa, 2**52))
38553 return operand1;
38554 xa = xa + 2**52 - 2**52;
38555 return copysign (xa, operand1);
38557 enum machine_mode mode = GET_MODE (operand0);
38558 rtx res, xa, label, TWO52, mask;
38560 res = gen_reg_rtx (mode);
38561 emit_move_insn (res, operand1);
38563 /* xa = abs (operand1) */
38564 xa = ix86_expand_sse_fabs (res, &mask);
38566 /* if (!isless (xa, TWO52)) goto label; */
38567 TWO52 = ix86_gen_TWO52 (mode);
38568 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38570 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38571 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38573 ix86_sse_copysign_to_positive (res, xa, res, mask);
38575 emit_label (label);
38576 LABEL_NUSES (label) = 1;
38578 emit_move_insn (operand0, res);
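/* Editor's note: a standalone, illustrative sketch (not part of this file) of
   the TWO52 trick used above.  For a non-negative double below 2^52, adding
   and then subtracting 2^52 pushes the value into a range whose unit in the
   last place is 1.0, so the intermediate addition rounds it to an integer in
   the current rounding mode; values >= 2^52 (and NaNs) are already handled
   by the early return.  The sign is reapplied with copysign so negative
   inputs and -0.0 come out right.  Compile without -ffast-math so the
   add/subtract pair is not folded away.  */
#include <math.h>
#include <stdio.h>

static double
rint_like (double x)
{
  const double two52 = 4503599627370496.0;   /* 2^52 */
  double xa = fabs (x);
  if (!(xa < two52))                         /* !isless: large or NaN */
    return x;
  xa = (xa + two52) - two52;                 /* rounds to an integer */
  return copysign (xa, x);
}

int
main (void)
{
  printf ("%g %g %g\n", rint_like (2.5), rint_like (-2.5), rint_like (3.7));
  /* 2 -2 4 under round-to-nearest-even */
  return 0;
}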
38581 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38582 into OPERAND0. */
38583 void
38584 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38586 /* C code for the stuff we expand below.
38587 double xa = fabs (x), x2;
38588 if (!isless (xa, TWO52))
38589 return x;
38590 xa = xa + TWO52 - TWO52;
38591 x2 = copysign (xa, x);
38592 Compensate. Floor:
38593 if (x2 > x)
38594 x2 -= 1;
38595 Compensate. Ceil:
38596 if (x2 < x)
38597 x2 -= -1;
38598 return x2;
38600 enum machine_mode mode = GET_MODE (operand0);
38601 rtx xa, TWO52, tmp, label, one, res, mask;
38603 TWO52 = ix86_gen_TWO52 (mode);
38605 /* Temporary for holding the result, initialized to the input
38606 operand to ease control flow. */
38607 res = gen_reg_rtx (mode);
38608 emit_move_insn (res, operand1);
38610 /* xa = abs (operand1) */
38611 xa = ix86_expand_sse_fabs (res, &mask);
38613 /* if (!isless (xa, TWO52)) goto label; */
38614 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38616 /* xa = xa + TWO52 - TWO52; */
38617 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38618 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38620 /* xa = copysign (xa, operand1) */
38621 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38623 /* generate 1.0 or -1.0 */
38624 one = force_reg (mode,
38625 const_double_from_real_value (do_floor
38626 ? dconst1 : dconstm1, mode));
38628 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38629 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38630 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38631 gen_rtx_AND (mode, one, tmp)));
38632 /* We always need to subtract here to preserve signed zero. */
38633 tmp = expand_simple_binop (mode, MINUS,
38634 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38635 emit_move_insn (res, tmp);
38637 emit_label (label);
38638 LABEL_NUSES (label) = 1;
38640 emit_move_insn (operand0, res);
38643 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38644 into OPERAND0. */
38645 void
38646 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38648 /* C code for the stuff we expand below.
38649 double xa = fabs (x), x2;
38650 if (!isless (xa, TWO52))
38651 return x;
38652 x2 = (double)(long)x;
38653 Compensate. Floor:
38654 if (x2 > x)
38655 x2 -= 1;
38656 Compensate. Ceil:
38657 if (x2 < x)
38658 x2 += 1;
38659 if (HONOR_SIGNED_ZEROS (mode))
38660 return copysign (x2, x);
38661 return x2;
38663 enum machine_mode mode = GET_MODE (operand0);
38664 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38666 TWO52 = ix86_gen_TWO52 (mode);
38668 /* Temporary for holding the result, initialized to the input
38669 operand to ease control flow. */
38670 res = gen_reg_rtx (mode);
38671 emit_move_insn (res, operand1);
38673 /* xa = abs (operand1) */
38674 xa = ix86_expand_sse_fabs (res, &mask);
38676 /* if (!isless (xa, TWO52)) goto label; */
38677 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38679 /* xa = (double)(long)x */
38680 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38681 expand_fix (xi, res, 0);
38682 expand_float (xa, xi, 0);
38684 /* generate 1.0 */
38685 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38687 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38688 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38689 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38690 gen_rtx_AND (mode, one, tmp)));
38691 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38692 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38693 emit_move_insn (res, tmp);
38695 if (HONOR_SIGNED_ZEROS (mode))
38696 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38698 emit_label (label);
38699 LABEL_NUSES (label) = 1;
38701 emit_move_insn (operand0, res);
38704 /* Expand SSE sequence for computing round from OPERAND1 storing
38706 into OPERAND0. This sequence works without relying on DImode truncation
38706 via cvttsd2siq, which is only available on 64-bit targets. */
38707 void
38708 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38710 /* C code for the stuff we expand below.
38711 double xa = fabs (x), xa2, x2;
38712 if (!isless (xa, TWO52))
38713 return x;
38714 Using the absolute value and copying back sign makes
38715 -0.0 -> -0.0 correct.
38716 xa2 = xa + TWO52 - TWO52;
38717 Compensate.
38718 dxa = xa2 - xa;
38719 if (dxa <= -0.5)
38720 xa2 += 1;
38721 else if (dxa > 0.5)
38722 xa2 -= 1;
38723 x2 = copysign (xa2, x);
38724 return x2;
38726 enum machine_mode mode = GET_MODE (operand0);
38727 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38729 TWO52 = ix86_gen_TWO52 (mode);
38731 /* Temporary for holding the result, initialized to the input
38732 operand to ease control flow. */
38733 res = gen_reg_rtx (mode);
38734 emit_move_insn (res, operand1);
38736 /* xa = abs (operand1) */
38737 xa = ix86_expand_sse_fabs (res, &mask);
38739 /* if (!isless (xa, TWO52)) goto label; */
38740 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38742 /* xa2 = xa + TWO52 - TWO52; */
38743 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38744 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38746 /* dxa = xa2 - xa; */
38747 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
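   /* dxa is the error introduced by rounding xa to an integer via the
      TWO52 trick: e.g. xa = 2.7 gives xa2 = 3.0 and dxa = 0.3.  Under the
      default round-to-nearest-even mode a tie such as xa = 2.5 gives
      xa2 = 2.0 and dxa = -0.5, which the compensation below pushes back
      up to 3.0, the round-half-away-from-zero result that round() needs.  */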
38749 /* generate 0.5, 1.0 and -0.5 */
38750 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38751 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38752 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38753 0, OPTAB_DIRECT);
38755 /* Compensate. */
38756 tmp = gen_reg_rtx (mode);
38757 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38758 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38759 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38760 gen_rtx_AND (mode, one, tmp)));
38761 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38762 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38763 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38764 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38765 gen_rtx_AND (mode, one, tmp)));
38766 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38768 /* res = copysign (xa2, operand1) */
38769 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38771 emit_label (label);
38772 LABEL_NUSES (label) = 1;
38774 emit_move_insn (operand0, res);
38777 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38778 into OPERAND0. */
38779 void
38780 ix86_expand_trunc (rtx operand0, rtx operand1)
38782 /* C code for SSE variant we expand below.
38783 double xa = fabs (x), x2;
38784 if (!isless (xa, TWO52))
38785 return x;
38786 x2 = (double)(long)x;
38787 if (HONOR_SIGNED_ZEROS (mode))
38788 return copysign (x2, x);
38789 return x2;
38791 enum machine_mode mode = GET_MODE (operand0);
38792 rtx xa, xi, TWO52, label, res, mask;
38794 TWO52 = ix86_gen_TWO52 (mode);
38796 /* Temporary for holding the result, initialized to the input
38797 operand to ease control flow. */
38798 res = gen_reg_rtx (mode);
38799 emit_move_insn (res, operand1);
38801 /* xa = abs (operand1) */
38802 xa = ix86_expand_sse_fabs (res, &mask);
38804 /* if (!isless (xa, TWO52)) goto label; */
38805 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38807 /* x = (double)(long)x */
38808 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38809 expand_fix (xi, res, 0);
38810 expand_float (res, xi, 0);
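   /* The fix/float conversion pair truncates towards zero, which is exactly
      trunc; only the sign of a negative zero may still need restoring.  */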
38812 if (HONOR_SIGNED_ZEROS (mode))
38813 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38815 emit_label (label);
38816 LABEL_NUSES (label) = 1;
38818 emit_move_insn (operand0, res);
38821 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38822 into OPERAND0. */
38823 void
38824 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38826 enum machine_mode mode = GET_MODE (operand0);
38827 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38829 /* C code for SSE variant we expand below.
38830 double xa = fabs (x), xa2, x2;
38831 if (!isless (xa, TWO52))
38832 return x;
38833 xa2 = xa + TWO52 - TWO52;
38834 Compensate:
38835 if (xa2 > xa)
38836 xa2 -= 1.0;
38837 x2 = copysign (xa2, x);
38838 return x2;
38841 TWO52 = ix86_gen_TWO52 (mode);
38843 /* Temporary for holding the result, initialized to the input
38844 operand to ease control flow. */
38845 res = gen_reg_rtx (mode);
38846 emit_move_insn (res, operand1);
38848 /* xa = abs (operand1) */
38849 xa = ix86_expand_sse_fabs (res, &smask);
38851 /* if (!isless (xa, TWO52)) goto label; */
38852 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38854 /* res = xa + TWO52 - TWO52; */
38855 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38856 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38857 emit_move_insn (res, tmp);
38859 /* generate 1.0 */
38860 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38862 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38863 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38864 emit_insn (gen_rtx_SET (VOIDmode, mask,
38865 gen_rtx_AND (mode, mask, one)));
38866 tmp = expand_simple_binop (mode, MINUS,
38867 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38868 emit_move_insn (res, tmp);
38870 /* res = copysign (res, operand1) */
38871 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38873 emit_label (label);
38874 LABEL_NUSES (label) = 1;
38876 emit_move_insn (operand0, res);
38879 /* Expand SSE sequence for computing round from OPERAND1 storing
38880 into OPERAND0. */
38881 void
38882 ix86_expand_round (rtx operand0, rtx operand1)
38884 /* C code for the stuff we're doing below:
38885 double xa = fabs (x);
38886 if (!isless (xa, TWO52))
38887 return x;
38888 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38889 return copysign (xa, x);
38891 enum machine_mode mode = GET_MODE (operand0);
38892 rtx res, TWO52, xa, label, xi, half, mask;
38893 const struct real_format *fmt;
38894 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38896 /* Temporary for holding the result, initialized to the input
38897 operand to ease control flow. */
38898 res = gen_reg_rtx (mode);
38899 emit_move_insn (res, operand1);
38901 TWO52 = ix86_gen_TWO52 (mode);
38902 xa = ix86_expand_sse_fabs (res, &mask);
38903 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38905 /* load nextafter (0.5, 0.0) */
38906 fmt = REAL_MODE_FORMAT (mode);
38907 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38908 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
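   /* Adding exactly 0.5 could round x + 0.5 up to the next integer for
      inputs just below 0.5 (the sum rounds up to 1.0), so the largest
      representable value strictly less than 0.5 is used instead.  */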
38910 /* xa = xa + 0.5 */
38911 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38912 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38914 /* xa = (double)(int64_t)xa */
38915 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38916 expand_fix (xi, xa, 0);
38917 expand_float (xa, xi, 0);
38919 /* res = copysign (xa, operand1) */
38920 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38922 emit_label (label);
38923 LABEL_NUSES (label) = 1;
38925 emit_move_insn (operand0, res);
38928 /* Expand SSE sequence for computing round
38929 from OP1 storing into OP0 using sse4 round insn. */
38930 void
38931 ix86_expand_round_sse4 (rtx op0, rtx op1)
38933 enum machine_mode mode = GET_MODE (op0);
38934 rtx e1, e2, res, half;
38935 const struct real_format *fmt;
38936 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38937 rtx (*gen_copysign) (rtx, rtx, rtx);
38938 rtx (*gen_round) (rtx, rtx, rtx);
38940 switch (mode)
38942 case SFmode:
38943 gen_copysign = gen_copysignsf3;
38944 gen_round = gen_sse4_1_roundsf2;
38945 break;
38946 case DFmode:
38947 gen_copysign = gen_copysigndf3;
38948 gen_round = gen_sse4_1_rounddf2;
38949 break;
38950 default:
38951 gcc_unreachable ();
38954 /* round (a) = trunc (a + copysign (0.5, a)) */
38956 /* load nextafter (0.5, 0.0) */
38957 fmt = REAL_MODE_FORMAT (mode);
38958 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38959 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38960 half = const_double_from_real_value (pred_half, mode);
38962 /* e1 = copysign (0.5, op1) */
38963 e1 = gen_reg_rtx (mode);
38964 emit_insn (gen_copysign (e1, half, op1));
38966 /* e2 = op1 + e1 */
38967 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38969 /* res = trunc (e2) */
38970 res = gen_reg_rtx (mode);
38971 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
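  /* ROUND_TRUNC encodes truncation in the round immediate itself, so the
     result does not depend on the MXCSR rounding mode.  */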
38973 emit_move_insn (op0, res);
38977 /* Table of valid machine attributes. */
38978 static const struct attribute_spec ix86_attribute_table[] =
38980 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38981 affects_type_identity } */
38982 /* Stdcall attribute says callee is responsible for popping arguments
38983 if they are not variable. */
38984 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38985 true },
38986 /* Fastcall attribute says callee is responsible for popping arguments
38987 if they are not variable. */
38988 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38989 true },
38990 /* Thiscall attribute says callee is responsible for popping arguments
38991 if they are not variable. */
38992 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38993 true },
38994 /* Cdecl attribute says the callee is a normal C declaration */
38995 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38996 true },
38997 /* Regparm attribute specifies how many integer arguments are to be
38998 passed in registers. */
38999 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39000 true },
39001 /* Sseregparm attribute says we are using x86_64 calling conventions
39002 for FP arguments. */
39003 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39004 true },
39005 /* The transactional memory builtins are implicitly regparm or fastcall
39006 depending on the ABI. Override the generic do-nothing attribute that
39007 these builtins were declared with. */
39008 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39009 true },
39010 /* force_align_arg_pointer says this function realigns the stack at entry. */
39011 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39012 false, true, true, ix86_handle_cconv_attribute, false },
39013 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39014 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39015 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39016 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39017 false },
39018 #endif
39019 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39020 false },
39021 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39022 false },
39023 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39024 SUBTARGET_ATTRIBUTE_TABLE,
39025 #endif
39026 /* ms_abi and sysv_abi calling convention function attributes. */
39027 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39028 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39029 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39030 false },
39031 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39032 ix86_handle_callee_pop_aggregate_return, true },
39033 /* End element. */
39034 { NULL, 0, 0, false, false, false, NULL, false }
39037 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39038 static int
39039 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39040 tree vectype,
39041 int misalign ATTRIBUTE_UNUSED)
39043 unsigned elements;
39045 switch (type_of_cost)
39047 case scalar_stmt:
39048 return ix86_cost->scalar_stmt_cost;
39050 case scalar_load:
39051 return ix86_cost->scalar_load_cost;
39053 case scalar_store:
39054 return ix86_cost->scalar_store_cost;
39056 case vector_stmt:
39057 return ix86_cost->vec_stmt_cost;
39059 case vector_load:
39060 return ix86_cost->vec_align_load_cost;
39062 case vector_store:
39063 return ix86_cost->vec_store_cost;
39065 case vec_to_scalar:
39066 return ix86_cost->vec_to_scalar_cost;
39068 case scalar_to_vec:
39069 return ix86_cost->scalar_to_vec_cost;
39071 case unaligned_load:
39072 case unaligned_store:
39073 return ix86_cost->vec_unalign_load_cost;
39075 case cond_branch_taken:
39076 return ix86_cost->cond_taken_branch_cost;
39078 case cond_branch_not_taken:
39079 return ix86_cost->cond_not_taken_branch_cost;
39081 case vec_perm:
39082 case vec_promote_demote:
39083 return ix86_cost->vec_stmt_cost;
39085 case vec_construct:
39086 elements = TYPE_VECTOR_SUBPARTS (vectype);
39087 return elements / 2 + 1;
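      /* Building a vector from scalars is costed at roughly one insert per
         pair of elements plus one, independently of the per-CPU cost tables.  */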
39089 default:
39090 gcc_unreachable ();
39094 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39095 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39096 insn every time. */
39098 static GTY(()) rtx vselect_insn;
39100 /* Initialize vselect_insn. */
39102 static void
39103 init_vselect_insn (void)
39105 unsigned i;
39106 rtx x;
39108 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39109 for (i = 0; i < MAX_VECT_LEN; ++i)
39110 XVECEXP (x, 0, i) = const0_rtx;
39111 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39112 const0_rtx), x);
39113 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39114 start_sequence ();
39115 vselect_insn = emit_insn (x);
39116 end_sequence ();
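  /* Emitting inside a discarded sequence keeps the dummy insn out of the
     real instruction stream; it serves only as reusable scratch RTL that
     expand_vselect rewrites and feeds to recog_memoized.  */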
39119 /* Construct (set target (vec_select op0 (parallel perm))) and
39120 return true if that's a valid instruction in the active ISA. */
39122 static bool
39123 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39124 unsigned nelt, bool testing_p)
39126 unsigned int i;
39127 rtx x, save_vconcat;
39128 int icode;
39130 if (vselect_insn == NULL_RTX)
39131 init_vselect_insn ();
39133 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39134 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39135 for (i = 0; i < nelt; ++i)
39136 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39137 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39138 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39139 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39140 SET_DEST (PATTERN (vselect_insn)) = target;
39141 icode = recog_memoized (vselect_insn);
39143 if (icode >= 0 && !testing_p)
39144 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39146 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39147 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39148 INSN_CODE (vselect_insn) = -1;
39150 return icode >= 0;
39153 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39155 static bool
39156 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39157 const unsigned char *perm, unsigned nelt,
39158 bool testing_p)
39160 enum machine_mode v2mode;
39161 rtx x;
39162 bool ok;
39164 if (vselect_insn == NULL_RTX)
39165 init_vselect_insn ();
39167 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39168 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39169 PUT_MODE (x, v2mode);
39170 XEXP (x, 0) = op0;
39171 XEXP (x, 1) = op1;
39172 ok = expand_vselect (target, x, perm, nelt, testing_p);
39173 XEXP (x, 0) = const0_rtx;
39174 XEXP (x, 1) = const0_rtx;
39175 return ok;
39178 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39179 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39181 static bool
39182 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39184 enum machine_mode vmode = d->vmode;
39185 unsigned i, mask, nelt = d->nelt;
39186 rtx target, op0, op1, x;
39187 rtx rperm[32], vperm;
39189 if (d->one_operand_p)
39190 return false;
39191 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39193 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39195 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39197 else
39198 return false;
39200 /* This is a blend, not a permute. Elements must stay in their
39201 respective lanes. */
39202 for (i = 0; i < nelt; ++i)
39204 unsigned e = d->perm[i];
39205 if (!(e == i || e == i + nelt))
39206 return false;
39209 if (d->testing_p)
39210 return true;
39212 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39213 decision should be extracted elsewhere, so that we only try that
39214 sequence once all budget==3 options have been tried. */
39215 target = d->target;
39216 op0 = d->op0;
39217 op1 = d->op1;
39218 mask = 0;
39220 switch (vmode)
39222 case V4DFmode:
39223 case V8SFmode:
39224 case V2DFmode:
39225 case V4SFmode:
39226 case V8HImode:
39227 case V8SImode:
39228 for (i = 0; i < nelt; ++i)
39229 mask |= (d->perm[i] >= nelt) << i;
39230 break;
39232 case V2DImode:
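      /* SSE4.1 has no integer blend with doubleword or quadword granularity,
         so V2DImode (and V4SImode below) blends are expressed as V8HImode
         pblendw: each DImode element maps to four mask bits, each SImode
         element to two.  */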
39233 for (i = 0; i < 2; ++i)
39234 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39235 vmode = V8HImode;
39236 goto do_subreg;
39238 case V4SImode:
39239 for (i = 0; i < 4; ++i)
39240 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39241 vmode = V8HImode;
39242 goto do_subreg;
39244 case V16QImode:
39245 /* See if bytes move in pairs so we can use pblendw with
39246 an immediate argument, rather than pblendvb with a vector
39247 argument. */
39248 for (i = 0; i < 16; i += 2)
39249 if (d->perm[i] + 1 != d->perm[i + 1])
39251 use_pblendvb:
39252 for (i = 0; i < nelt; ++i)
39253 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39255 finish_pblendvb:
39256 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39257 vperm = force_reg (vmode, vperm);
39259 if (GET_MODE_SIZE (vmode) == 16)
39260 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39261 else
39262 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39263 return true;
39266 for (i = 0; i < 8; ++i)
39267 mask |= (d->perm[i * 2] >= 16) << i;
39268 vmode = V8HImode;
39269 /* FALLTHRU */
39271 do_subreg:
39272 target = gen_lowpart (vmode, target);
39273 op0 = gen_lowpart (vmode, op0);
39274 op1 = gen_lowpart (vmode, op1);
39275 break;
39277 case V32QImode:
39278 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39279 for (i = 0; i < 32; i += 2)
39280 if (d->perm[i] + 1 != d->perm[i + 1])
39281 goto use_pblendvb;
39282 /* See if bytes move in quadruplets. If yes, vpblendd
39283 with immediate can be used. */
39284 for (i = 0; i < 32; i += 4)
39285 if (d->perm[i] + 2 != d->perm[i + 2])
39286 break;
39287 if (i < 32)
39289 /* See if bytes move the same in both lanes. If yes,
39290 vpblendw with immediate can be used. */
39291 for (i = 0; i < 16; i += 2)
39292 if (d->perm[i] + 16 != d->perm[i + 16])
39293 goto use_pblendvb;
39295 /* Use vpblendw. */
39296 for (i = 0; i < 16; ++i)
39297 mask |= (d->perm[i * 2] >= 32) << i;
39298 vmode = V16HImode;
39299 goto do_subreg;
39302 /* Use vpblendd. */
39303 for (i = 0; i < 8; ++i)
39304 mask |= (d->perm[i * 4] >= 32) << i;
39305 vmode = V8SImode;
39306 goto do_subreg;
39308 case V16HImode:
39309 /* See if words move in pairs. If yes, vpblendd can be used. */
39310 for (i = 0; i < 16; i += 2)
39311 if (d->perm[i] + 1 != d->perm[i + 1])
39312 break;
39313 if (i < 16)
39315 /* See if words move the same in both lanes. If not,
39316 vpblendvb must be used. */
39317 for (i = 0; i < 8; i++)
39318 if (d->perm[i] + 8 != d->perm[i + 8])
39320 /* Use vpblendvb. */
39321 for (i = 0; i < 32; ++i)
39322 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39324 vmode = V32QImode;
39325 nelt = 32;
39326 target = gen_lowpart (vmode, target);
39327 op0 = gen_lowpart (vmode, op0);
39328 op1 = gen_lowpart (vmode, op1);
39329 goto finish_pblendvb;
39332 /* Use vpblendw. */
39333 for (i = 0; i < 16; ++i)
39334 mask |= (d->perm[i] >= 16) << i;
39335 break;
39338 /* Use vpblendd. */
39339 for (i = 0; i < 8; ++i)
39340 mask |= (d->perm[i * 2] >= 16) << i;
39341 vmode = V8SImode;
39342 goto do_subreg;
39344 case V4DImode:
39345 /* Use vpblendd. */
39346 for (i = 0; i < 4; ++i)
39347 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39348 vmode = V8SImode;
39349 goto do_subreg;
39351 default:
39352 gcc_unreachable ();
39355 /* This matches five different patterns with the different modes. */
39356 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39357 x = gen_rtx_SET (VOIDmode, target, x);
39358 emit_insn (x);
39360 return true;
39363 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39364 in terms of the variable form of vpermilps.
39366 Note that we will have already failed the immediate input vpermilps,
39367 which requires that the high and low part shuffle be identical; the
39368 variable form doesn't require that. */
39370 static bool
39371 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39373 rtx rperm[8], vperm;
39374 unsigned i;
39376 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39377 return false;
39379 /* We can only permute within the 128-bit lane. */
39380 for (i = 0; i < 8; ++i)
39382 unsigned e = d->perm[i];
39383 if (i < 4 ? e >= 4 : e < 4)
39384 return false;
39387 if (d->testing_p)
39388 return true;
39390 for (i = 0; i < 8; ++i)
39392 unsigned e = d->perm[i];
39394 /* Within each 128-bit lane, the elements of op0 are numbered
39395 from 0 and the elements of op1 are numbered from 4. */
39396 if (e >= 8 + 4)
39397 e -= 8;
39398 else if (e >= 4)
39399 e -= 4;
39401 rperm[i] = GEN_INT (e);
39404 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39405 vperm = force_reg (V8SImode, vperm);
39406 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39408 return true;
39411 /* Return true if permutation D can be performed as VMODE permutation
39412 instead. */
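/* For example, a V16QImode permutation whose byte indices move only in
   aligned pairs (0,1), (6,7), ... can instead be performed as the
   corresponding V8HImode word permutation.  */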
39414 static bool
39415 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39417 unsigned int i, j, chunk;
39419 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39420 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39421 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39422 return false;
39424 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39425 return true;
39427 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39428 for (i = 0; i < d->nelt; i += chunk)
39429 if (d->perm[i] & (chunk - 1))
39430 return false;
39431 else
39432 for (j = 1; j < chunk; ++j)
39433 if (d->perm[i] + j != d->perm[i + j])
39434 return false;
39436 return true;
39439 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39440 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39442 static bool
39443 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39445 unsigned i, nelt, eltsz, mask;
39446 unsigned char perm[32];
39447 enum machine_mode vmode = V16QImode;
39448 rtx rperm[32], vperm, target, op0, op1;
39450 nelt = d->nelt;
39452 if (!d->one_operand_p)
39454 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39456 if (TARGET_AVX2
39457 && valid_perm_using_mode_p (V2TImode, d))
39459 if (d->testing_p)
39460 return true;
39462 /* Use vperm2i128 insn. The pattern uses
39463 V4DImode instead of V2TImode. */
39464 target = gen_lowpart (V4DImode, d->target);
39465 op0 = gen_lowpart (V4DImode, d->op0);
39466 op1 = gen_lowpart (V4DImode, d->op1);
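  /* The vperm2i128 immediate selects the 128-bit source chunk for each half
     of the result: bits 1:0 pick the low half and bits 5:4 the high half,
     with chunks 0-1 taken from op0 and 2-3 from op1.  */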
39467 rperm[0]
39468 = GEN_INT ((d->perm[0] / (nelt / 2))
39469 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
39470 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39471 return true;
39473 return false;
39476 else
39478 if (GET_MODE_SIZE (d->vmode) == 16)
39480 if (!TARGET_SSSE3)
39481 return false;
39483 else if (GET_MODE_SIZE (d->vmode) == 32)
39485 if (!TARGET_AVX2)
39486 return false;
39488 /* V4DImode should already be handled through
39489 expand_vselect by the vpermq instruction. */
39490 gcc_assert (d->vmode != V4DImode);
39492 vmode = V32QImode;
39493 if (d->vmode == V8SImode
39494 || d->vmode == V16HImode
39495 || d->vmode == V32QImode)
39497 /* First see if vpermq can be used for
39498 V8SImode/V16HImode/V32QImode. */
39499 if (valid_perm_using_mode_p (V4DImode, d))
39501 for (i = 0; i < 4; i++)
39502 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39503 if (d->testing_p)
39504 return true;
39505 return expand_vselect (gen_lowpart (V4DImode, d->target),
39506 gen_lowpart (V4DImode, d->op0),
39507 perm, 4, false);
39510 /* Next see if vpermd can be used. */
39511 if (valid_perm_using_mode_p (V8SImode, d))
39512 vmode = V8SImode;
39514 /* Or if vpermps can be used. */
39515 else if (d->vmode == V8SFmode)
39516 vmode = V8SImode;
39518 if (vmode == V32QImode)
39520 /* vpshufb only works within 128-bit lanes; it is not
39521 possible to shuffle bytes between the lanes. */
39522 for (i = 0; i < nelt; ++i)
39523 if ((d->perm[i] ^ i) & (nelt / 2))
39524 return false;
39527 else
39528 return false;
39531 if (d->testing_p)
39532 return true;
39534 if (vmode == V8SImode)
39535 for (i = 0; i < 8; ++i)
39536 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39537 else
39539 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39540 if (!d->one_operand_p)
39541 mask = 2 * nelt - 1;
39542 else if (vmode == V16QImode)
39543 mask = nelt - 1;
39544 else
39545 mask = nelt / 2 - 1;
39547 for (i = 0; i < nelt; ++i)
39549 unsigned j, e = d->perm[i] & mask;
39550 for (j = 0; j < eltsz; ++j)
39551 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39555 vperm = gen_rtx_CONST_VECTOR (vmode,
39556 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39557 vperm = force_reg (vmode, vperm);
39559 target = gen_lowpart (vmode, d->target);
39560 op0 = gen_lowpart (vmode, d->op0);
39561 if (d->one_operand_p)
39563 if (vmode == V16QImode)
39564 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39565 else if (vmode == V32QImode)
39566 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39567 else if (vmode == V8SFmode)
39568 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39569 else
39570 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39572 else
39574 op1 = gen_lowpart (vmode, d->op1);
39575 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39578 return true;
39581 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39582 in a single instruction. */
39584 static bool
39585 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39587 unsigned i, nelt = d->nelt;
39588 unsigned char perm2[MAX_VECT_LEN];
39590 /* Check plain VEC_SELECT first, because AVX has instructions that could
39591 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39592 input where SEL+CONCAT may not. */
39593 if (d->one_operand_p)
39595 int mask = nelt - 1;
39596 bool identity_perm = true;
39597 bool broadcast_perm = true;
39599 for (i = 0; i < nelt; i++)
39601 perm2[i] = d->perm[i] & mask;
39602 if (perm2[i] != i)
39603 identity_perm = false;
39604 if (perm2[i])
39605 broadcast_perm = false;
39608 if (identity_perm)
39610 if (!d->testing_p)
39611 emit_move_insn (d->target, d->op0);
39612 return true;
39614 else if (broadcast_perm && TARGET_AVX2)
39616 /* Use vpbroadcast{b,w,d}. */
39617 rtx (*gen) (rtx, rtx) = NULL;
39618 switch (d->vmode)
39620 case V32QImode:
39621 gen = gen_avx2_pbroadcastv32qi_1;
39622 break;
39623 case V16HImode:
39624 gen = gen_avx2_pbroadcastv16hi_1;
39625 break;
39626 case V8SImode:
39627 gen = gen_avx2_pbroadcastv8si_1;
39628 break;
39629 case V16QImode:
39630 gen = gen_avx2_pbroadcastv16qi;
39631 break;
39632 case V8HImode:
39633 gen = gen_avx2_pbroadcastv8hi;
39634 break;
39635 case V8SFmode:
39636 gen = gen_avx2_vec_dupv8sf_1;
39637 break;
39638 /* For other modes prefer other shuffles this function creates. */
39639 default: break;
39641 if (gen != NULL)
39643 if (!d->testing_p)
39644 emit_insn (gen (d->target, d->op0));
39645 return true;
39649 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39650 return true;
39652 /* There are plenty of patterns in sse.md that are written for
39653 SEL+CONCAT and are not replicated for a single op. Perhaps
39654 that should be changed, to avoid the nastiness here. */
39656 /* Recognize interleave style patterns, which means incrementing
39657 every other permutation operand. */
39658 for (i = 0; i < nelt; i += 2)
39660 perm2[i] = d->perm[i] & mask;
39661 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39663 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39664 d->testing_p))
39665 return true;
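  /* For example, the one-operand V4SImode permutation { 0, 0, 1, 1 } is
     rewritten above as { 0, 4, 1, 5 } on (vec_concat op0 op0), which
     matches punpckldq.  */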
39667 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39668 if (nelt >= 4)
39670 for (i = 0; i < nelt; i += 4)
39672 perm2[i + 0] = d->perm[i + 0] & mask;
39673 perm2[i + 1] = d->perm[i + 1] & mask;
39674 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39675 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39678 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39679 d->testing_p))
39680 return true;
39684 /* Finally, try the fully general two operand permute. */
39685 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39686 d->testing_p))
39687 return true;
39689 /* Recognize interleave style patterns with reversed operands. */
39690 if (!d->one_operand_p)
39692 for (i = 0; i < nelt; ++i)
39694 unsigned e = d->perm[i];
39695 if (e >= nelt)
39696 e -= nelt;
39697 else
39698 e += nelt;
39699 perm2[i] = e;
39702 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39703 d->testing_p))
39704 return true;
39707 /* Try the SSE4.1 blend variable merge instructions. */
39708 if (expand_vec_perm_blend (d))
39709 return true;
39711 /* Try one of the AVX vpermil variable permutations. */
39712 if (expand_vec_perm_vpermil (d))
39713 return true;
39715 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39716 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39717 if (expand_vec_perm_pshufb (d))
39718 return true;
39720 return false;
39723 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39724 in terms of a pair of pshuflw + pshufhw instructions. */
39726 static bool
39727 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39729 unsigned char perm2[MAX_VECT_LEN];
39730 unsigned i;
39731 bool ok;
39733 if (d->vmode != V8HImode || !d->one_operand_p)
39734 return false;
39736 /* The two permutations only operate in 64-bit lanes. */
39737 for (i = 0; i < 4; ++i)
39738 if (d->perm[i] >= 4)
39739 return false;
39740 for (i = 4; i < 8; ++i)
39741 if (d->perm[i] < 4)
39742 return false;
39744 if (d->testing_p)
39745 return true;
39747 /* Emit the pshuflw. */
39748 memcpy (perm2, d->perm, 4);
39749 for (i = 4; i < 8; ++i)
39750 perm2[i] = i;
39751 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39752 gcc_assert (ok);
39754 /* Emit the pshufhw. */
39755 memcpy (perm2 + 4, d->perm + 4, 4);
39756 for (i = 0; i < 4; ++i)
39757 perm2[i] = i;
39758 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39759 gcc_assert (ok);
39761 return true;
39764 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39765 the permutation using the SSSE3 palignr instruction. This succeeds
39766 when all of the elements in PERM fit within one vector and we merely
39767 need to shift them down so that a single vector permutation has a
39768 chance to succeed. */
39770 static bool
39771 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39773 unsigned i, nelt = d->nelt;
39774 unsigned min, max;
39775 bool in_order, ok;
39776 rtx shift;
39778 /* Even with AVX, palignr only operates on 128-bit vectors. */
39779 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39780 return false;
39782 min = nelt, max = 0;
39783 for (i = 0; i < nelt; ++i)
39785 unsigned e = d->perm[i];
39786 if (e < min)
39787 min = e;
39788 if (e > max)
39789 max = e;
39791 if (min == 0 || max - min >= nelt)
39792 return false;
39794 /* Given that we have SSSE3, we know we'll be able to implement the
39795 single operand permutation after the palignr with pshufb. */
39796 if (d->testing_p)
39797 return true;
39799 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39800 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39801 gen_lowpart (TImode, d->op1),
39802 gen_lowpart (TImode, d->op0), shift));
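  /* The palignr shifts the op1:op0 concatenation right by MIN elements, so
     every referenced element now lives in the single result vector; the
     permutation indices are rebased by subtracting MIN below.  */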
39804 d->op0 = d->op1 = d->target;
39805 d->one_operand_p = true;
39807 in_order = true;
39808 for (i = 0; i < nelt; ++i)
39810 unsigned e = d->perm[i] - min;
39811 if (e != i)
39812 in_order = false;
39813 d->perm[i] = e;
39816 /* Test for the degenerate case where the alignment by itself
39817 produces the desired permutation. */
39818 if (in_order)
39819 return true;
39821 ok = expand_vec_perm_1 (d);
39822 gcc_assert (ok);
39824 return ok;
39827 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39829 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39830 a two vector permutation into a single vector permutation by using
39831 an interleave operation to merge the vectors. */
39833 static bool
39834 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39836 struct expand_vec_perm_d dremap, dfinal;
39837 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39838 unsigned HOST_WIDE_INT contents;
39839 unsigned char remap[2 * MAX_VECT_LEN];
39840 rtx seq;
39841 bool ok, same_halves = false;
39843 if (GET_MODE_SIZE (d->vmode) == 16)
39845 if (d->one_operand_p)
39846 return false;
39848 else if (GET_MODE_SIZE (d->vmode) == 32)
39850 if (!TARGET_AVX)
39851 return false;
39852 /* For 32-byte modes allow even d->one_operand_p.
39853 The lack of cross-lane shuffling in some instructions
39854 might prevent a single insn shuffle. */
39855 dfinal = *d;
39856 dfinal.testing_p = true;
39857 /* If expand_vec_perm_interleave3 can expand this into
39858 a 3-insn sequence, give up and let it be expanded that
39859 way instead. While that is one insn longer, it doesn't
39860 need a memory operand, and in the common case where the
39861 interleave-low and interleave-high permutations with the
39862 same operands are adjacent, both together need only 4
39863 insns after CSE. */
39864 if (expand_vec_perm_interleave3 (&dfinal))
39865 return false;
39867 else
39868 return false;
39870 /* Examine from whence the elements come. */
39871 contents = 0;
39872 for (i = 0; i < nelt; ++i)
39873 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39875 memset (remap, 0xff, sizeof (remap));
39876 dremap = *d;
39878 if (GET_MODE_SIZE (d->vmode) == 16)
39880 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39882 /* Split the two input vectors into 4 halves. */
39883 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39884 h2 = h1 << nelt2;
39885 h3 = h2 << nelt2;
39886 h4 = h3 << nelt2;
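      /* E.g. for V8HImode (nelt == 8): h1 covers elements 0-3 of op0, h2
         elements 4-7 of op0, h3 elements 0-3 of op1 and h4 elements 4-7
         of op1.  */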
39888 /* If the elements come only from the low halves, use interleave low;
39889 similarly for interleave high. If the elements are from mis-matched
39890 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39891 if ((contents & (h1 | h3)) == contents)
39893 /* punpckl* */
39894 for (i = 0; i < nelt2; ++i)
39896 remap[i] = i * 2;
39897 remap[i + nelt] = i * 2 + 1;
39898 dremap.perm[i * 2] = i;
39899 dremap.perm[i * 2 + 1] = i + nelt;
39901 if (!TARGET_SSE2 && d->vmode == V4SImode)
39902 dremap.vmode = V4SFmode;
39904 else if ((contents & (h2 | h4)) == contents)
39906 /* punpckh* */
39907 for (i = 0; i < nelt2; ++i)
39909 remap[i + nelt2] = i * 2;
39910 remap[i + nelt + nelt2] = i * 2 + 1;
39911 dremap.perm[i * 2] = i + nelt2;
39912 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39914 if (!TARGET_SSE2 && d->vmode == V4SImode)
39915 dremap.vmode = V4SFmode;
39917 else if ((contents & (h1 | h4)) == contents)
39919 /* shufps */
39920 for (i = 0; i < nelt2; ++i)
39922 remap[i] = i;
39923 remap[i + nelt + nelt2] = i + nelt2;
39924 dremap.perm[i] = i;
39925 dremap.perm[i + nelt2] = i + nelt + nelt2;
39927 if (nelt != 4)
39929 /* shufpd */
39930 dremap.vmode = V2DImode;
39931 dremap.nelt = 2;
39932 dremap.perm[0] = 0;
39933 dremap.perm[1] = 3;
39936 else if ((contents & (h2 | h3)) == contents)
39938 /* shufps */
39939 for (i = 0; i < nelt2; ++i)
39941 remap[i + nelt2] = i;
39942 remap[i + nelt] = i + nelt2;
39943 dremap.perm[i] = i + nelt2;
39944 dremap.perm[i + nelt2] = i + nelt;
39946 if (nelt != 4)
39948 /* shufpd */
39949 dremap.vmode = V2DImode;
39950 dremap.nelt = 2;
39951 dremap.perm[0] = 1;
39952 dremap.perm[1] = 2;
39955 else
39956 return false;
39958 else
39960 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39961 unsigned HOST_WIDE_INT q[8];
39962 unsigned int nonzero_halves[4];
39964 /* Split the two input vectors into 8 quarters. */
39965 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39966 for (i = 1; i < 8; ++i)
39967 q[i] = q[0] << (nelt4 * i);
39968 for (i = 0; i < 4; ++i)
39969 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39971 nonzero_halves[nzcnt] = i;
39972 ++nzcnt;
39975 if (nzcnt == 1)
39977 gcc_assert (d->one_operand_p);
39978 nonzero_halves[1] = nonzero_halves[0];
39979 same_halves = true;
39981 else if (d->one_operand_p)
39983 gcc_assert (nonzero_halves[0] == 0);
39984 gcc_assert (nonzero_halves[1] == 1);
39987 if (nzcnt <= 2)
39989 if (d->perm[0] / nelt2 == nonzero_halves[1])
39991 /* Attempt to increase the likelihood that dfinal
39992 shuffle will be intra-lane. */
39993 char tmph = nonzero_halves[0];
39994 nonzero_halves[0] = nonzero_halves[1];
39995 nonzero_halves[1] = tmph;
39998 /* vperm2f128 or vperm2i128. */
39999 for (i = 0; i < nelt2; ++i)
40001 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40002 remap[i + nonzero_halves[0] * nelt2] = i;
40003 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40004 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40007 if (d->vmode != V8SFmode
40008 && d->vmode != V4DFmode
40009 && d->vmode != V8SImode)
40011 dremap.vmode = V8SImode;
40012 dremap.nelt = 8;
40013 for (i = 0; i < 4; ++i)
40015 dremap.perm[i] = i + nonzero_halves[0] * 4;
40016 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40020 else if (d->one_operand_p)
40021 return false;
40022 else if (TARGET_AVX2
40023 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40025 /* vpunpckl* */
40026 for (i = 0; i < nelt4; ++i)
40028 remap[i] = i * 2;
40029 remap[i + nelt] = i * 2 + 1;
40030 remap[i + nelt2] = i * 2 + nelt2;
40031 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40032 dremap.perm[i * 2] = i;
40033 dremap.perm[i * 2 + 1] = i + nelt;
40034 dremap.perm[i * 2 + nelt2] = i + nelt2;
40035 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40038 else if (TARGET_AVX2
40039 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40041 /* vpunpckh* */
40042 for (i = 0; i < nelt4; ++i)
40044 remap[i + nelt4] = i * 2;
40045 remap[i + nelt + nelt4] = i * 2 + 1;
40046 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40047 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40048 dremap.perm[i * 2] = i + nelt4;
40049 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40050 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40051 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40054 else
40055 return false;
40058 /* Use the remapping array set up above to move the elements from their
40059 swizzled locations into their final destinations. */
40060 dfinal = *d;
40061 for (i = 0; i < nelt; ++i)
40063 unsigned e = remap[d->perm[i]];
40064 gcc_assert (e < nelt);
40065 /* If same_halves is true, both halves of the remapped vector are the
40066 same. Avoid cross-lane accesses if possible. */
40067 if (same_halves && i >= nelt2)
40069 gcc_assert (e < nelt2);
40070 dfinal.perm[i] = e + nelt2;
40072 else
40073 dfinal.perm[i] = e;
40075 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
40076 dfinal.op1 = dfinal.op0;
40077 dfinal.one_operand_p = true;
40078 dremap.target = dfinal.op0;
40080 /* Test if the final remap can be done with a single insn. For V4SFmode or
40081 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40082 start_sequence ();
40083 ok = expand_vec_perm_1 (&dfinal);
40084 seq = get_insns ();
40085 end_sequence ();
40087 if (!ok)
40088 return false;
40090 if (d->testing_p)
40091 return true;
40093 if (dremap.vmode != dfinal.vmode)
40095 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
40096 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40097 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40100 ok = expand_vec_perm_1 (&dremap);
40101 gcc_assert (ok);
40103 emit_insn (seq);
40104 return true;
40107 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40108 a single vector cross-lane permutation into vpermq followed
40109 by any of the single insn permutations. */
40111 static bool
40112 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40114 struct expand_vec_perm_d dremap, dfinal;
40115 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40116 unsigned contents[2];
40117 bool ok;
40119 if (!(TARGET_AVX2
40120 && (d->vmode == V32QImode || d->vmode == V16HImode)
40121 && d->one_operand_p))
40122 return false;
40124 contents[0] = 0;
40125 contents[1] = 0;
40126 for (i = 0; i < nelt2; ++i)
40128 contents[0] |= 1u << (d->perm[i] / nelt4);
40129 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
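  /* contents[0] and contents[1] record which 64-bit quarters of the input
     feed the low resp. high half of the result; vpermq can place at most
     two distinct quarters into each 128-bit lane, which is checked below.  */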
40132 for (i = 0; i < 2; ++i)
40134 unsigned int cnt = 0;
40135 for (j = 0; j < 4; ++j)
40136 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40137 return false;
40140 if (d->testing_p)
40141 return true;
40143 dremap = *d;
40144 dremap.vmode = V4DImode;
40145 dremap.nelt = 4;
40146 dremap.target = gen_reg_rtx (V4DImode);
40147 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40148 dremap.op1 = dremap.op0;
40149 dremap.one_operand_p = true;
40150 for (i = 0; i < 2; ++i)
40152 unsigned int cnt = 0;
40153 for (j = 0; j < 4; ++j)
40154 if ((contents[i] & (1u << j)) != 0)
40155 dremap.perm[2 * i + cnt++] = j;
40156 for (; cnt < 2; ++cnt)
40157 dremap.perm[2 * i + cnt] = 0;
40160 dfinal = *d;
40161 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40162 dfinal.op1 = dfinal.op0;
40163 dfinal.one_operand_p = true;
40164 for (i = 0, j = 0; i < nelt; ++i)
40166 if (i == nelt2)
40167 j = 2;
40168 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40169 if ((d->perm[i] / nelt4) == dremap.perm[j])
40171 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40172 dfinal.perm[i] |= nelt4;
40173 else
40174 gcc_unreachable ();
40177 ok = expand_vec_perm_1 (&dremap);
40178 gcc_assert (ok);
40180 ok = expand_vec_perm_1 (&dfinal);
40181 gcc_assert (ok);
40183 return true;
40186 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40187 a vector permutation using two instructions: vperm2f128 (or
40188 vperm2i128) followed by any single in-lane permutation. */
40190 static bool
40191 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40193 struct expand_vec_perm_d dfirst, dsecond;
40194 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40195 bool ok;
40197 if (!TARGET_AVX
40198 || GET_MODE_SIZE (d->vmode) != 32
40199 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40200 return false;
40202 dsecond = *d;
40203 dsecond.one_operand_p = false;
40204 dsecond.testing_p = true;
40206 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40207 immediate. For perm < 16 the second permutation uses
40208 d->op0 as first operand, for perm >= 16 it uses d->op1
40209 as first operand. The second operand is the result of
40210 vperm2[fi]128. */
40211 for (perm = 0; perm < 32; perm++)
40213 /* Ignore permutations which do not move anything cross-lane. */
40214 if (perm < 16)
40216 /* The second shuffle for e.g. V4DFmode has
40217 0123 and ABCD operands.
40218 Ignore AB23, as 23 is already in the second lane
40219 of the first operand. */
40220 if ((perm & 0xc) == (1 << 2)) continue;
40221 /* And 01CD, as 01 is in the first lane of the first
40222 operand. */
40223 if ((perm & 3) == 0) continue;
40224 /* And 4567, as then the vperm2[fi]128 doesn't change
40225 anything on the original 4567 second operand. */
40226 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40228 else
40230 /* The second shuffle for e.g. V4DFmode has
40231 4567 and ABCD operands.
40232 Ignore AB67, as 67 is already in the second lane
40233 of the first operand. */
40234 if ((perm & 0xc) == (3 << 2)) continue;
40235 /* And 45CD, as 45 is in the first lane of the first
40236 operand. */
40237 if ((perm & 3) == 2) continue;
40238 /* And 0123, as then the vperm2[fi]128 doesn't change
40239 anything on the original 0123 first operand. */
40240 if ((perm & 0xf) == (1 << 2)) continue;
40243 for (i = 0; i < nelt; i++)
40245 j = d->perm[i] / nelt2;
40246 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40247 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40248 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40249 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40250 else
40251 break;
40254 if (i == nelt)
40256 start_sequence ();
40257 ok = expand_vec_perm_1 (&dsecond);
40258 end_sequence ();
40260 else
40261 ok = false;
40263 if (ok)
40265 if (d->testing_p)
40266 return true;
40268 /* Found a usable second shuffle. dfirst will be
40269 vperm2f128 on d->op0 and d->op1. */
40270 dsecond.testing_p = false;
40271 dfirst = *d;
40272 dfirst.target = gen_reg_rtx (d->vmode);
40273 for (i = 0; i < nelt; i++)
40274 dfirst.perm[i] = (i & (nelt2 - 1))
40275 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40277 ok = expand_vec_perm_1 (&dfirst);
40278 gcc_assert (ok);
40280 /* And dsecond is some single insn shuffle, taking
40281 d->op0 and result of vperm2f128 (if perm < 16) or
40282 d->op1 and result of vperm2f128 (otherwise). */
40283 dsecond.op1 = dfirst.target;
40284 if (perm >= 16)
40285 dsecond.op0 = dfirst.op1;
40287 ok = expand_vec_perm_1 (&dsecond);
40288 gcc_assert (ok);
40290 return true;
40293 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40294 if (d->one_operand_p)
40295 return false;
40298 return false;
40301 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40302 a two vector permutation using 2 intra-lane interleave insns
40303 and cross-lane shuffle for 32-byte vectors. */
40305 static bool
40306 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40308 unsigned i, nelt;
40309 rtx (*gen) (rtx, rtx, rtx);
40311 if (d->one_operand_p)
40312 return false;
40313 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40315 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40317 else
40318 return false;
40320 nelt = d->nelt;
40321 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40322 return false;
40323 for (i = 0; i < nelt; i += 2)
40324 if (d->perm[i] != d->perm[0] + i / 2
40325 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40326 return false;
40328 if (d->testing_p)
40329 return true;
40331 switch (d->vmode)
40333 case V32QImode:
40334 if (d->perm[0])
40335 gen = gen_vec_interleave_highv32qi;
40336 else
40337 gen = gen_vec_interleave_lowv32qi;
40338 break;
40339 case V16HImode:
40340 if (d->perm[0])
40341 gen = gen_vec_interleave_highv16hi;
40342 else
40343 gen = gen_vec_interleave_lowv16hi;
40344 break;
40345 case V8SImode:
40346 if (d->perm[0])
40347 gen = gen_vec_interleave_highv8si;
40348 else
40349 gen = gen_vec_interleave_lowv8si;
40350 break;
40351 case V4DImode:
40352 if (d->perm[0])
40353 gen = gen_vec_interleave_highv4di;
40354 else
40355 gen = gen_vec_interleave_lowv4di;
40356 break;
40357 case V8SFmode:
40358 if (d->perm[0])
40359 gen = gen_vec_interleave_highv8sf;
40360 else
40361 gen = gen_vec_interleave_lowv8sf;
40362 break;
40363 case V4DFmode:
40364 if (d->perm[0])
40365 gen = gen_vec_interleave_highv4df;
40366 else
40367 gen = gen_vec_interleave_lowv4df;
40368 break;
40369 default:
40370 gcc_unreachable ();
40373 emit_insn (gen (d->target, d->op0, d->op1));
40374 return true;
40377 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40378 a single vector permutation using a single intra-lane vector
40379 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40380 the non-swapped and swapped vectors together. */
40382 static bool
40383 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40385 struct expand_vec_perm_d dfirst, dsecond;
40386 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40387 rtx seq;
40388 bool ok;
40389 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40391 if (!TARGET_AVX
40392 || TARGET_AVX2
40393 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40394 || !d->one_operand_p)
40395 return false;
40397 dfirst = *d;
40398 for (i = 0; i < nelt; i++)
40399 dfirst.perm[i] = 0xff;
40400 for (i = 0, msk = 0; i < nelt; i++)
40402 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40403 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40404 return false;
40405 dfirst.perm[j] = d->perm[i];
40406 if (j != i)
40407 msk |= (1 << i);
40409 for (i = 0; i < nelt; i++)
40410 if (dfirst.perm[i] == 0xff)
40411 dfirst.perm[i] = i;
40413 if (!d->testing_p)
40414 dfirst.target = gen_reg_rtx (dfirst.vmode);
40416 start_sequence ();
40417 ok = expand_vec_perm_1 (&dfirst);
40418 seq = get_insns ();
40419 end_sequence ();
40421 if (!ok)
40422 return false;
40424 if (d->testing_p)
40425 return true;
40427 emit_insn (seq);
40429 dsecond = *d;
40430 dsecond.op0 = dfirst.target;
40431 dsecond.op1 = dfirst.target;
40432 dsecond.one_operand_p = true;
40433 dsecond.target = gen_reg_rtx (dsecond.vmode);
40434 for (i = 0; i < nelt; i++)
40435 dsecond.perm[i] = i ^ nelt2;
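  /* dsecond swaps the two 128-bit lanes of dfirst's result, so the final
     vblend* can choose, per element, between the in-lane value and the
     cross-lane value according to MSK.  */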
40437 ok = expand_vec_perm_1 (&dsecond);
40438 gcc_assert (ok);
40440 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40441 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40442 return true;
40445 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40446 permutation using two vperm2f128, followed by a vshufpd insn blending
40447 the two vectors together. */
40449 static bool
40450 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40452 struct expand_vec_perm_d dfirst, dsecond, dthird;
40453 bool ok;
40455 if (!TARGET_AVX || (d->vmode != V4DFmode))
40456 return false;
40458 if (d->testing_p)
40459 return true;
40461 dfirst = *d;
40462 dsecond = *d;
40463 dthird = *d;
40465 dfirst.perm[0] = (d->perm[0] & ~1);
40466 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40467 dfirst.perm[2] = (d->perm[2] & ~1);
40468 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40469 dsecond.perm[0] = (d->perm[1] & ~1);
40470 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40471 dsecond.perm[2] = (d->perm[3] & ~1);
40472 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40473 dthird.perm[0] = (d->perm[0] % 2);
40474 dthird.perm[1] = (d->perm[1] % 2) + 4;
40475 dthird.perm[2] = (d->perm[2] % 2) + 2;
40476 dthird.perm[3] = (d->perm[3] % 2) + 6;
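  /* dfirst and dsecond each use vperm2f128 to place, in every output lane,
     the 128-bit source lane holding the even resp. odd requested element;
     dthird then selects element 0 or 1 within each lane, which a single
     vshufpd can do.  */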
40478 dfirst.target = gen_reg_rtx (dfirst.vmode);
40479 dsecond.target = gen_reg_rtx (dsecond.vmode);
40480 dthird.op0 = dfirst.target;
40481 dthird.op1 = dsecond.target;
40482 dthird.one_operand_p = false;
40484 canonicalize_perm (&dfirst);
40485 canonicalize_perm (&dsecond);
40487 ok = expand_vec_perm_1 (&dfirst)
40488 && expand_vec_perm_1 (&dsecond)
40489 && expand_vec_perm_1 (&dthird);
40491 gcc_assert (ok);
40493 return true;
40496 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40497 permutation with two pshufb insns and an ior. We should have already
40498 failed all two-instruction sequences. */
40500 static bool
40501 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40503 rtx rperm[2][16], vperm, l, h, op, m128;
40504 unsigned int i, nelt, eltsz;
40506 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40507 return false;
40508 gcc_assert (!d->one_operand_p);
40510 nelt = d->nelt;
40511 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40513 /* Generate two permutation masks. If the required element is within
40514 the given vector it is shuffled into the proper lane. If the required
40515 element is in the other vector, force a zero into the lane by setting
40516 bit 7 in the permutation mask. */
40517 m128 = GEN_INT (-128);
40518 for (i = 0; i < nelt; ++i)
40520 unsigned j, e = d->perm[i];
40521 unsigned which = (e >= nelt);
40522 if (e >= nelt)
40523 e -= nelt;
40525 for (j = 0; j < eltsz; ++j)
40527 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40528 rperm[1-which][i*eltsz + j] = m128;
40532 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40533 vperm = force_reg (V16QImode, vperm);
40535 l = gen_reg_rtx (V16QImode);
40536 op = gen_lowpart (V16QImode, d->op0);
40537 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40539 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40540 vperm = force_reg (V16QImode, vperm);
40542 h = gen_reg_rtx (V16QImode);
40543 op = gen_lowpart (V16QImode, d->op1);
40544 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40546 op = gen_lowpart (V16QImode, d->target);
40547 emit_insn (gen_iorv16qi3 (op, l, h));
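  /* Each pshufb result contains the requested bytes from its own operand
     and zeros elsewhere, so a plain por merges them into the final value.  */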
40549 return true;
40552 /* Implement arbitrary permutation of a single V32QImode or V16HImode operand
40553 with two vpshufb insns, vpermq and vpor. We should have already failed
40554 all two- or three-instruction sequences. */
40556 static bool
40557 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40559 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40560 unsigned int i, nelt, eltsz;
40562 if (!TARGET_AVX2
40563 || !d->one_operand_p
40564 || (d->vmode != V32QImode && d->vmode != V16HImode))
40565 return false;
40567 if (d->testing_p)
40568 return true;
40570 nelt = d->nelt;
40571 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40573 /* Generate two permutation masks. If the required element is within
40574 the same lane, it is shuffled in. If the required element is from the
40575 other lane, force a zero by setting bit 7 in the permutation mask.
40576 The other mask has non-negative entries for elements that are
40577 requested from the other lane, but those entries are moved to the
40578 other lane, so that the result of vpshufb has its two V2TImode
40579 halves swapped. */
40580 m128 = GEN_INT (-128);
40581 for (i = 0; i < nelt; ++i)
40583 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40584 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40586 for (j = 0; j < eltsz; ++j)
40588 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40589 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40593 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40594 vperm = force_reg (V32QImode, vperm);
40596 h = gen_reg_rtx (V32QImode);
40597 op = gen_lowpart (V32QImode, d->op0);
40598 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40600 /* Swap the 128-bit lanes of h into hp. */
40601 hp = gen_reg_rtx (V4DImode);
40602 op = gen_lowpart (V4DImode, h);
40603 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40604 const1_rtx));
40606 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40607 vperm = force_reg (V32QImode, vperm);
40609 l = gen_reg_rtx (V32QImode);
40610 op = gen_lowpart (V32QImode, d->op0);
40611 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40613 op = gen_lowpart (V32QImode, d->target);
40614 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40616 return true;
40619 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40620 and extract-odd permutations of two V32QImode or V16HImode operands
40621 with two vpshufb insns, vpor and vpermq. We should have already
40622 failed all two- or three-instruction sequences. */
40624 static bool
40625 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40627 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40628 unsigned int i, nelt, eltsz;
40630 if (!TARGET_AVX2
40631 || d->one_operand_p
40632 || (d->vmode != V32QImode && d->vmode != V16HImode))
40633 return false;
40635 for (i = 0; i < d->nelt; ++i)
40636 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40637 return false;
40639 if (d->testing_p)
40640 return true;
40642 nelt = d->nelt;
40643 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40645 /* Generate two permutation masks. In the first permutation mask
40646 the first quarter will contain indexes for the first half
40647 of op0, the second quarter will contain bit 7 set, the third quarter
40648 will contain indexes for the second half of op0, and the
40649 last quarter bit 7 set. In the second permutation mask
40650 the first quarter will contain bit 7 set, the second quarter
40651 indexes for the first half of op1, the third quarter bit 7 set
40652 and the last quarter indexes for the second half of op1.
40653 I.e. the first mask e.g. for V32QImode extract even will be:
40654 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40655 (all values masked with 0xf except for -128) and the second mask
40656 for extract even will be
40657 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40658 m128 = GEN_INT (-128);
40659 for (i = 0; i < nelt; ++i)
40661 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40662 unsigned which = d->perm[i] >= nelt;
40663 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40665 for (j = 0; j < eltsz; ++j)
40667 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40668 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40672 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40673 vperm = force_reg (V32QImode, vperm);
40675 l = gen_reg_rtx (V32QImode);
40676 op = gen_lowpart (V32QImode, d->op0);
40677 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40679 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40680 vperm = force_reg (V32QImode, vperm);
40682 h = gen_reg_rtx (V32QImode);
40683 op = gen_lowpart (V32QImode, d->op1);
40684 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40686 ior = gen_reg_rtx (V32QImode);
40687 emit_insn (gen_iorv32qi3 (ior, l, h));
40689 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40690 op = gen_lowpart (V4DImode, d->target);
40691 ior = gen_lowpart (V4DImode, ior);
40692 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40693 const1_rtx, GEN_INT (3)));
40695 return true;
40698 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40699 and extract-odd permutations. */
40701 static bool
40702 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40704 rtx t1, t2, t3;
40706 switch (d->vmode)
40708 case V4DFmode:
40709 t1 = gen_reg_rtx (V4DFmode);
40710 t2 = gen_reg_rtx (V4DFmode);
40712 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40713 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40714 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40716 /* Now an unpck[lh]pd will produce the result required. */
40717 if (odd)
40718 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40719 else
40720 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40721 emit_insn (t3);
40722 break;
40724 case V8SFmode:
40726 int mask = odd ? 0xdd : 0x88;
40728 t1 = gen_reg_rtx (V8SFmode);
40729 t2 = gen_reg_rtx (V8SFmode);
40730 t3 = gen_reg_rtx (V8SFmode);
40732 /* Shuffle within the 128-bit lanes to produce:
40733 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40734 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40735 GEN_INT (mask)));
40737 /* Shuffle the lanes around to produce:
40738 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40739 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40740 GEN_INT (0x3)));
40742 /* Shuffle within the 128-bit lanes to produce:
40743 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40744 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40746 /* Shuffle within the 128-bit lanes to produce:
40747 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40748 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40750 /* Shuffle the lanes around to produce:
40751 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40752 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40753 GEN_INT (0x20)));
40755 break;
40757 case V2DFmode:
40758 case V4SFmode:
40759 case V2DImode:
40760 case V4SImode:
40761 /* These are always directly implementable by expand_vec_perm_1. */
40762 gcc_unreachable ();
40764 case V8HImode:
40765 if (TARGET_SSSE3)
40766 return expand_vec_perm_pshufb2 (d);
40767 else
40769 /* We need 2*log2(N)-1 operations to achieve odd/even
40770 with interleave. */
40771 t1 = gen_reg_rtx (V8HImode);
40772 t2 = gen_reg_rtx (V8HImode);
40773 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40774 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40775 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40776 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40777 if (odd)
40778 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40779 else
40780 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40781 emit_insn (t3);
40783 break;
40785 case V16QImode:
40786 if (TARGET_SSSE3)
40787 return expand_vec_perm_pshufb2 (d);
40788 else
40790 t1 = gen_reg_rtx (V16QImode);
40791 t2 = gen_reg_rtx (V16QImode);
40792 t3 = gen_reg_rtx (V16QImode);
40793 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40794 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40795 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40796 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40797 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40798 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40799 if (odd)
40800 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40801 else
40802 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40803 emit_insn (t3);
40805 break;
40807 case V16HImode:
40808 case V32QImode:
40809 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40811 case V4DImode:
40812 if (!TARGET_AVX2)
40814 struct expand_vec_perm_d d_copy = *d;
40815 d_copy.vmode = V4DFmode;
40816 d_copy.target = gen_lowpart (V4DFmode, d->target);
40817 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40818 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40819 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40822 t1 = gen_reg_rtx (V4DImode);
40823 t2 = gen_reg_rtx (V4DImode);
40825 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40826 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40827 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40829 /* Now a vpunpck[lh]qdq will produce the result required. */
40830 if (odd)
40831 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40832 else
40833 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40834 emit_insn (t3);
40835 break;
40837 case V8SImode:
40838 if (!TARGET_AVX2)
40840 struct expand_vec_perm_d d_copy = *d;
40841 d_copy.vmode = V8SFmode;
40842 d_copy.target = gen_lowpart (V8SFmode, d->target);
40843 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40844 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40845 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40848 t1 = gen_reg_rtx (V8SImode);
40849 t2 = gen_reg_rtx (V8SImode);
40851 /* Shuffle the lanes around into
40852 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40853 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40854 gen_lowpart (V4DImode, d->op0),
40855 gen_lowpart (V4DImode, d->op1),
40856 GEN_INT (0x20)));
40857 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40858 gen_lowpart (V4DImode, d->op0),
40859 gen_lowpart (V4DImode, d->op1),
40860 GEN_INT (0x31)));
40862 /* Swap the 2nd and 3rd position in each lane into
40863 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40864 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40865 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40866 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40867 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40869 /* Now a vpunpck[lh]qdq will produce
40870 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40871 if (odd)
40872 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40873 gen_lowpart (V4DImode, t1),
40874 gen_lowpart (V4DImode, t2));
40875 else
40876 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40877 gen_lowpart (V4DImode, t1),
40878 gen_lowpart (V4DImode, t2));
40879 emit_insn (t3);
40880 break;
40882 default:
40883 gcc_unreachable ();
40886 return true;
40889 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40890 extract-even and extract-odd permutations. */
40892 static bool
40893 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40895 unsigned i, odd, nelt = d->nelt;
40897 odd = d->perm[0];
40898 if (odd != 0 && odd != 1)
40899 return false;
40901 for (i = 1; i < nelt; ++i)
40902 if (d->perm[i] != 2 * i + odd)
40903 return false;
40905 return expand_vec_perm_even_odd_1 (d, odd);
40908 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40909 permutations. We assume that expand_vec_perm_1 has already failed. */
40911 static bool
40912 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40914 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40915 enum machine_mode vmode = d->vmode;
40916 unsigned char perm2[4];
40917 rtx op0 = d->op0;
40918 bool ok;
40920 switch (vmode)
40922 case V4DFmode:
40923 case V8SFmode:
40924 /* These are special-cased in sse.md so that we can optionally
40925 use the vbroadcast instruction. They expand to two insns
40926 if the input happens to be in a register. */
40927 gcc_unreachable ();
40929 case V2DFmode:
40930 case V2DImode:
40931 case V4SFmode:
40932 case V4SImode:
40933 /* These are always implementable using standard shuffle patterns. */
40934 gcc_unreachable ();
40936 case V8HImode:
40937 case V16QImode:
40938 /* These can be implemented via interleave. We save one insn by
40939 stopping once we have promoted to V4SImode and then use pshufd. */
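/* For instance, to broadcast element 5 of a V8HImode vector, a single
interleave-high (punpckhwd) of the operand with itself leaves copies of
that element in positions 2 and 3, i.e. in V4SImode element 1; one pshufd
replicating that V4SImode element then completes the broadcast. */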
40942 rtx dest;
40943 rtx (*gen) (rtx, rtx, rtx)
40944 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40945 : gen_vec_interleave_lowv8hi;
40947 if (elt >= nelt2)
40949 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40950 : gen_vec_interleave_highv8hi;
40951 elt -= nelt2;
40953 nelt2 /= 2;
40955 dest = gen_reg_rtx (vmode);
40956 emit_insn (gen (dest, op0, op0));
40957 vmode = get_mode_wider_vector (vmode);
40958 op0 = gen_lowpart (vmode, dest);
40960 while (vmode != V4SImode);
40962 memset (perm2, elt, 4);
40963 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40964 d->testing_p);
40965 gcc_assert (ok);
40966 return true;
40968 case V32QImode:
40969 case V16HImode:
40970 case V8SImode:
40971 case V4DImode:
40972 /* For AVX2, broadcasts of the first element should already have been
40973 handled by expand_vec_perm_1 using vpbroadcast* or vpermq. */
40974 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40975 return false;
40977 default:
40978 gcc_unreachable ();
40982 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40983 broadcast permutations. */
40985 static bool
40986 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40988 unsigned i, elt, nelt = d->nelt;
40990 if (!d->one_operand_p)
40991 return false;
40993 elt = d->perm[0];
40994 for (i = 1; i < nelt; ++i)
40995 if (d->perm[i] != elt)
40996 return false;
40998 return expand_vec_perm_broadcast_1 (d);
41001 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41002 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41003 all the shorter instruction sequences. */
41005 static bool
41006 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41008 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41009 unsigned int i, nelt, eltsz;
41010 bool used[4];
41012 if (!TARGET_AVX2
41013 || d->one_operand_p
41014 || (d->vmode != V32QImode && d->vmode != V16HImode))
41015 return false;
41017 if (d->testing_p)
41018 return true;
41020 nelt = d->nelt;
41021 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41023 /* Generate 4 permutation masks. If the required element is within
41024 the same lane, it is shuffled in. If the required element is from the
41025 other lane, force a zero by setting bit 7 in the permutation mask.
41026 The cross-lane masks have a non-negative element whenever an element
41027 is requested from the other lane; that element is also placed in the
41028 other lane, so that the result of vpshufb can have its two V2TImode
41029 halves swapped. */
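/* In the masks built below, mask 0 covers the in-lane bytes taken from
op0 and mask 1 the bytes op0 must supply from the other lane; masks 2
and 3 play the same roles for op1. The cross-lane vpshufb results are
lane-swapped by the vpermq below, and all the pieces are then combined
with vpor. */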
41030 m128 = GEN_INT (-128);
41031 for (i = 0; i < 32; ++i)
41033 rperm[0][i] = m128;
41034 rperm[1][i] = m128;
41035 rperm[2][i] = m128;
41036 rperm[3][i] = m128;
41038 used[0] = false;
41039 used[1] = false;
41040 used[2] = false;
41041 used[3] = false;
41042 for (i = 0; i < nelt; ++i)
41044 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41045 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41046 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41048 for (j = 0; j < eltsz; ++j)
41049 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41050 used[which] = true;
41053 for (i = 0; i < 2; ++i)
41055 if (!used[2 * i + 1])
41057 h[i] = NULL_RTX;
41058 continue;
41060 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41061 gen_rtvec_v (32, rperm[2 * i + 1]));
41062 vperm = force_reg (V32QImode, vperm);
41063 h[i] = gen_reg_rtx (V32QImode);
41064 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41065 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41068 /* Swap the 128-bit lanes of h[X]. */
41069 for (i = 0; i < 2; ++i)
41071 if (h[i] == NULL_RTX)
41072 continue;
41073 op = gen_reg_rtx (V4DImode);
41074 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41075 const2_rtx, GEN_INT (3), const0_rtx,
41076 const1_rtx));
41077 h[i] = gen_lowpart (V32QImode, op);
41080 for (i = 0; i < 2; ++i)
41082 if (!used[2 * i])
41084 l[i] = NULL_RTX;
41085 continue;
41087 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41088 vperm = force_reg (V32QImode, vperm);
41089 l[i] = gen_reg_rtx (V32QImode);
41090 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41091 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41094 for (i = 0; i < 2; ++i)
41096 if (h[i] && l[i])
41098 op = gen_reg_rtx (V32QImode);
41099 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41100 l[i] = op;
41102 else if (h[i])
41103 l[i] = h[i];
41106 gcc_assert (l[0] && l[1]);
41107 op = gen_lowpart (V32QImode, d->target);
41108 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41109 return true;
41112 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41113 With all of the interface bits taken care of, perform the expansion
41114 in D and return true on success. */
41116 static bool
41117 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41119 /* Try a single instruction expansion. */
41120 if (expand_vec_perm_1 (d))
41121 return true;
41123 /* Try sequences of two instructions. */
41125 if (expand_vec_perm_pshuflw_pshufhw (d))
41126 return true;
41128 if (expand_vec_perm_palignr (d))
41129 return true;
41131 if (expand_vec_perm_interleave2 (d))
41132 return true;
41134 if (expand_vec_perm_broadcast (d))
41135 return true;
41137 if (expand_vec_perm_vpermq_perm_1 (d))
41138 return true;
41140 if (expand_vec_perm_vperm2f128 (d))
41141 return true;
41143 /* Try sequences of three instructions. */
41145 if (expand_vec_perm_2vperm2f128_vshuf (d))
41146 return true;
41148 if (expand_vec_perm_pshufb2 (d))
41149 return true;
41151 if (expand_vec_perm_interleave3 (d))
41152 return true;
41154 if (expand_vec_perm_vperm2f128_vblend (d))
41155 return true;
41157 /* Try sequences of four instructions. */
41159 if (expand_vec_perm_vpshufb2_vpermq (d))
41160 return true;
41162 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41163 return true;
41165 /* ??? Look for narrow permutations whose element orderings would
41166 allow the promotion to a wider mode. */
41168 /* ??? Look for sequences of interleave or a wider permute that place
41169 the data into the correct lanes for a half-vector shuffle like
41170 pshuf[lh]w or vpermilps. */
41172 /* ??? Look for sequences of interleave that produce the desired results.
41173 The combinatorics of punpck[lh] get pretty ugly... */
41175 if (expand_vec_perm_even_odd (d))
41176 return true;
41178 /* Even longer sequences. */
41179 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41180 return true;
41182 return false;
41185 /* If a permutation only uses one operand, make it clear. Returns true
41186 if the permutation references both operands. */
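/* For example, with nelt == 4 a selector of { 4, 5, 6, 4 } references only
the second operand; the indices are folded to { 0, 1, 2, 0 }, op0 is
replaced by op1, and one_operand_p stays true. */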
41188 static bool
41189 canonicalize_perm (struct expand_vec_perm_d *d)
41191 int i, which, nelt = d->nelt;
41193 for (i = which = 0; i < nelt; ++i)
41194 which |= (d->perm[i] < nelt ? 1 : 2);
41196 d->one_operand_p = true;
41197 switch (which)
41199 default:
41200 gcc_unreachable();
41202 case 3:
41203 if (!rtx_equal_p (d->op0, d->op1))
41205 d->one_operand_p = false;
41206 break;
41208 /* The elements of PERM do not suggest that only the first operand
41209 is used, but both operands are identical. Allow easier matching
41210 of the permutation by folding the permutation into the single
41211 input vector. */
41212 /* FALLTHRU */
41214 case 2:
41215 for (i = 0; i < nelt; ++i)
41216 d->perm[i] &= nelt - 1;
41217 d->op0 = d->op1;
41218 break;
41220 case 1:
41221 d->op1 = d->op0;
41222 break;
41225 return (which == 3);
41228 bool
41229 ix86_expand_vec_perm_const (rtx operands[4])
41231 struct expand_vec_perm_d d;
41232 unsigned char perm[MAX_VECT_LEN];
41233 int i, nelt;
41234 bool two_args;
41235 rtx sel;
41237 d.target = operands[0];
41238 d.op0 = operands[1];
41239 d.op1 = operands[2];
41240 sel = operands[3];
41242 d.vmode = GET_MODE (d.target);
41243 gcc_assert (VECTOR_MODE_P (d.vmode));
41244 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41245 d.testing_p = false;
41247 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41248 gcc_assert (XVECLEN (sel, 0) == nelt);
41249 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41251 for (i = 0; i < nelt; ++i)
41253 rtx e = XVECEXP (sel, 0, i);
41254 int ei = INTVAL (e) & (2 * nelt - 1);
41255 d.perm[i] = ei;
41256 perm[i] = ei;
41259 two_args = canonicalize_perm (&d);
41261 if (ix86_expand_vec_perm_const_1 (&d))
41262 return true;
41264 /* If the selector says both arguments are needed, but the operands are the
41265 same, the above tried to expand with one_operand_p and flattened selector.
41266 If that didn't work, retry without one_operand_p; we succeeded with that
41267 during testing. */
41268 if (two_args && d.one_operand_p)
41270 d.one_operand_p = false;
41271 memcpy (d.perm, perm, sizeof (perm));
41272 return ix86_expand_vec_perm_const_1 (&d);
41275 return false;
41278 /* Implement targetm.vectorize.vec_perm_const_ok. */
41280 static bool
41281 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41282 const unsigned char *sel)
41284 struct expand_vec_perm_d d;
41285 unsigned int i, nelt, which;
41286 bool ret;
41288 d.vmode = vmode;
41289 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41290 d.testing_p = true;
41292 /* Given sufficient ISA support we can just return true here
41293 for selected vector modes. */
41294 if (GET_MODE_SIZE (d.vmode) == 16)
41296 /* All implementable with a single vpperm insn. */
41297 if (TARGET_XOP)
41298 return true;
41299 /* All implementable with 2 pshufb + 1 ior. */
41300 if (TARGET_SSSE3)
41301 return true;
41302 /* All implementable with shufpd or unpck[lh]pd. */
41303 if (d.nelt == 2)
41304 return true;
41307 /* Extract the values from the vector CST into the permutation
41308 array in D. */
41309 memcpy (d.perm, sel, nelt);
41310 for (i = which = 0; i < nelt; ++i)
41312 unsigned char e = d.perm[i];
41313 gcc_assert (e < 2 * nelt);
41314 which |= (e < nelt ? 1 : 2);
41317 /* If all elements are from the second vector, fold them to the first. */
41318 if (which == 2)
41319 for (i = 0; i < nelt; ++i)
41320 d.perm[i] -= nelt;
41322 /* Check whether the mask can be applied to the vector type. */
41323 d.one_operand_p = (which != 3);
41325 /* Implementable with shufps or pshufd. */
41326 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41327 return true;
41329 /* Otherwise we have to go through the motions and see if we can
41330 figure out how to generate the requested permutation. */
41331 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41332 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41333 if (!d.one_operand_p)
41334 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41336 start_sequence ();
41337 ret = ix86_expand_vec_perm_const_1 (&d);
41338 end_sequence ();
41340 return ret;
41343 void
41344 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41346 struct expand_vec_perm_d d;
41347 unsigned i, nelt;
41349 d.target = targ;
41350 d.op0 = op0;
41351 d.op1 = op1;
41352 d.vmode = GET_MODE (targ);
41353 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41354 d.one_operand_p = false;
41355 d.testing_p = false;
41357 for (i = 0; i < nelt; ++i)
41358 d.perm[i] = i * 2 + odd;
41360 /* We'll either be able to implement the permutation directly... */
41361 if (expand_vec_perm_1 (&d))
41362 return;
41364 /* ... or we use the special-case patterns. */
41365 expand_vec_perm_even_odd_1 (&d, odd);
41368 static void
41369 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41371 struct expand_vec_perm_d d;
41372 unsigned i, nelt, base;
41373 bool ok;
41375 d.target = targ;
41376 d.op0 = op0;
41377 d.op1 = op1;
41378 d.vmode = GET_MODE (targ);
41379 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41380 d.one_operand_p = false;
41381 d.testing_p = false;
41383 base = high_p ? nelt / 2 : 0;
41384 for (i = 0; i < nelt / 2; ++i)
41386 d.perm[i * 2] = i + base;
41387 d.perm[i * 2 + 1] = i + base + nelt;
41390 /* Note that for AVX this isn't one instruction. */
41391 ok = ix86_expand_vec_perm_const_1 (&d);
41392 gcc_assert (ok);
41396 /* Expand a vector operation CODE for a V*QImode in terms of the
41397 same operation on V*HImode. */
41399 void
41400 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41402 enum machine_mode qimode = GET_MODE (dest);
41403 enum machine_mode himode;
41404 rtx (*gen_il) (rtx, rtx, rtx);
41405 rtx (*gen_ih) (rtx, rtx, rtx);
41406 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41407 struct expand_vec_perm_d d;
41408 bool ok, full_interleave;
41409 bool uns_p = false;
41410 int i;
41412 switch (qimode)
41414 case V16QImode:
41415 himode = V8HImode;
41416 gen_il = gen_vec_interleave_lowv16qi;
41417 gen_ih = gen_vec_interleave_highv16qi;
41418 break;
41419 case V32QImode:
41420 himode = V16HImode;
41421 gen_il = gen_avx2_interleave_lowv32qi;
41422 gen_ih = gen_avx2_interleave_highv32qi;
41423 break;
41424 default:
41425 gcc_unreachable ();
41428 op2_l = op2_h = op2;
41429 switch (code)
41431 case MULT:
41432 /* Unpack data such that we've got a source byte in each low byte of
41433 each word. We don't care what goes into the high byte of each word.
41434 Rather than trying to get zero in there, it is most convenient to let
41435 it be a copy of the low byte. */
41436 op2_l = gen_reg_rtx (qimode);
41437 op2_h = gen_reg_rtx (qimode);
41438 emit_insn (gen_il (op2_l, op2, op2));
41439 emit_insn (gen_ih (op2_h, op2, op2));
41440 /* FALLTHRU */
41442 op1_l = gen_reg_rtx (qimode);
41443 op1_h = gen_reg_rtx (qimode);
41444 emit_insn (gen_il (op1_l, op1, op1));
41445 emit_insn (gen_ih (op1_h, op1, op1));
41446 full_interleave = qimode == V16QImode;
41447 break;
41449 case ASHIFT:
41450 case LSHIFTRT:
41451 uns_p = true;
41452 /* FALLTHRU */
41453 case ASHIFTRT:
41454 op1_l = gen_reg_rtx (himode);
41455 op1_h = gen_reg_rtx (himode);
41456 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41457 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41458 full_interleave = true;
41459 break;
41460 default:
41461 gcc_unreachable ();
41464 /* Perform the operation. */
41465 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41466 1, OPTAB_DIRECT);
41467 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41468 1, OPTAB_DIRECT);
41469 gcc_assert (res_l && res_h);
41471 /* Merge the data back into the right place. */
41472 d.target = dest;
41473 d.op0 = gen_lowpart (qimode, res_l);
41474 d.op1 = gen_lowpart (qimode, res_h);
41475 d.vmode = qimode;
41476 d.nelt = GET_MODE_NUNITS (qimode);
41477 d.one_operand_p = false;
41478 d.testing_p = false;
41480 if (full_interleave)
41482 /* For SSE2, we used a full interleave, so the desired
41483 results are in the even elements. */
41484 for (i = 0; i < 32; ++i)
41485 d.perm[i] = i * 2;
41487 else
41489 /* For AVX, the interleave used above was not cross-lane. So the
41490 extraction is of the even elements, but with the second and third quarters
41491 swapped. Happily, that is even one insn shorter than plain even extraction. */
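/* Concretely, for V32QImode the permutation built here is
{ 0, 2, ..., 14, 32, 34, ..., 46, 16, 18, ..., 30, 48, 50, ..., 62 },
i.e. the even bytes of the low lanes of res_l and res_h followed by
the even bytes of their high lanes. */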
41492 for (i = 0; i < 32; ++i)
41493 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
41496 ok = ix86_expand_vec_perm_const_1 (&d);
41497 gcc_assert (ok);
41499 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41500 gen_rtx_fmt_ee (code, qimode, op1, op2));
41503 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41504 if op is CONST_VECTOR with all odd elements equal to their
41505 preceding element. */
41507 static bool
41508 const_vector_equal_evenodd_p (rtx op)
41510 enum machine_mode mode = GET_MODE (op);
41511 int i, nunits = GET_MODE_NUNITS (mode);
41512 if (GET_CODE (op) != CONST_VECTOR
41513 || nunits != CONST_VECTOR_NUNITS (op))
41514 return false;
41515 for (i = 0; i < nunits; i += 2)
41516 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41517 return false;
41518 return true;
41521 void
41522 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41523 bool uns_p, bool odd_p)
41525 enum machine_mode mode = GET_MODE (op1);
41526 enum machine_mode wmode = GET_MODE (dest);
41527 rtx x;
41528 rtx orig_op1 = op1, orig_op2 = op2;
41530 if (!nonimmediate_operand (op1, mode))
41531 op1 = force_reg (mode, op1);
41532 if (!nonimmediate_operand (op2, mode))
41533 op2 = force_reg (mode, op2);
41535 /* We only play even/odd games with vectors of SImode. */
41536 gcc_assert (mode == V4SImode || mode == V8SImode);
41538 /* If we're looking for the odd results, shift those members down to
41539 the even slots. For some cpus this is faster than a PSHUFD. */
41540 if (odd_p)
41542 /* For XOP use vpmacsdqh, but only for smult, as it is only
41543 signed. */
41544 if (TARGET_XOP && mode == V4SImode && !uns_p)
41546 x = force_reg (wmode, CONST0_RTX (wmode));
41547 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41548 return;
41551 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41552 if (!const_vector_equal_evenodd_p (orig_op1))
41553 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41554 x, NULL, 1, OPTAB_DIRECT);
41555 if (!const_vector_equal_evenodd_p (orig_op2))
41556 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41557 x, NULL, 1, OPTAB_DIRECT);
41558 op1 = gen_lowpart (mode, op1);
41559 op2 = gen_lowpart (mode, op2);
41562 if (mode == V8SImode)
41564 if (uns_p)
41565 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41566 else
41567 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41569 else if (uns_p)
41570 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41571 else if (TARGET_SSE4_1)
41572 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41573 else
41575 rtx s1, s2, t0, t1, t2;
41577 /* The easiest way to implement this without PMULDQ is to go through
41578 the motions as if we are performing a full 64-bit multiply, except
41579 that we need to do less shuffling of the elements. */
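/* A sketch of the identity relied on here: for 32-bit elements a and b,
the 64-bit signed product equals, modulo 2^64,
lo(a)*lo(b) + ((s(a)*lo(b) + s(b)*lo(a)) << 32),
where s(x) is the comparison mask computed below (-1 if x < 0, else 0),
i.e. the high word of the 64-bit sign extension of x. */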
41581 /* Compute the sign-extension, aka highparts, of the two operands. */
41582 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41583 op1, pc_rtx, pc_rtx);
41584 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41585 op2, pc_rtx, pc_rtx);
41587 /* Multiply LO(A) * HI(B), and vice-versa. */
41588 t1 = gen_reg_rtx (wmode);
41589 t2 = gen_reg_rtx (wmode);
41590 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41591 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41593 /* Multiply LO(A) * LO(B). */
41594 t0 = gen_reg_rtx (wmode);
41595 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41597 /* Combine and shift the highparts into place. */
41598 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41599 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41600 1, OPTAB_DIRECT);
41602 /* Combine high and low parts. */
41603 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41604 return;
41606 emit_insn (x);
41609 void
41610 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41611 bool uns_p, bool high_p)
41613 enum machine_mode wmode = GET_MODE (dest);
41614 enum machine_mode mode = GET_MODE (op1);
41615 rtx t1, t2, t3, t4, mask;
41617 switch (mode)
41619 case V4SImode:
41620 t1 = gen_reg_rtx (mode);
41621 t2 = gen_reg_rtx (mode);
41622 if (TARGET_XOP && !uns_p)
41624 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41625 shuffle the elements once so that all elements are in the right
41626 place for immediate use: { A C B D }. */
41627 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41628 const1_rtx, GEN_INT (3)));
41629 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41630 const1_rtx, GEN_INT (3)));
41632 else
41634 /* Put the elements into place for the multiply. */
41635 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41636 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41637 high_p = false;
41639 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41640 break;
41642 case V8SImode:
41643 /* Shuffle the elements between the lanes. After this we
41644 have { A B E F | C D G H } for each operand. */
41645 t1 = gen_reg_rtx (V4DImode);
41646 t2 = gen_reg_rtx (V4DImode);
41647 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41648 const0_rtx, const2_rtx,
41649 const1_rtx, GEN_INT (3)));
41650 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41651 const0_rtx, const2_rtx,
41652 const1_rtx, GEN_INT (3)));
41654 /* Shuffle the elements within the lanes. After this we
41655 have { A A B B | C C D D } or { E E F F | G G H H }. */
41656 t3 = gen_reg_rtx (V8SImode);
41657 t4 = gen_reg_rtx (V8SImode);
41658 mask = GEN_INT (high_p
41659 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41660 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
41661 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41662 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41664 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41665 break;
41667 case V8HImode:
41668 case V16HImode:
41669 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41670 uns_p, OPTAB_DIRECT);
41671 t2 = expand_binop (mode,
41672 uns_p ? umul_highpart_optab : smul_highpart_optab,
41673 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41674 gcc_assert (t1 && t2);
41676 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41677 break;
41679 case V16QImode:
41680 case V32QImode:
41681 t1 = gen_reg_rtx (wmode);
41682 t2 = gen_reg_rtx (wmode);
41683 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41684 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41686 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41687 break;
41689 default:
41690 gcc_unreachable ();
41694 void
41695 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41697 rtx res_1, res_2;
41699 res_1 = gen_reg_rtx (V4SImode);
41700 res_2 = gen_reg_rtx (V4SImode);
41701 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41702 op1, op2, true, false);
41703 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41704 op1, op2, true, true);
41706 /* Move the results in element 2 down to element 1; we don't care
41707 what goes in elements 2 and 3. Then we can merge the parts
41708 back together with an interleave.
41710 Note that two other sequences were tried:
41711 (1) Use interleaves at the start instead of psrldq, which allows
41712 us to use a single shufps to merge things back at the end.
41713 (2) Use shufps here to combine the two vectors, then pshufd to
41714 put the elements in the correct order.
41715 In both cases the cost of the reformatting stall was too high
41716 and the overall sequence slower. */
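/* For example, with op1 = { a0 a1 a2 a3 } and op2 = { b0 b1 b2 b3 }, the
pshufds below leave res_1 = { lo(a0*b0), lo(a2*b2), X, X } and
res_2 = { lo(a1*b1), lo(a3*b3), X, X }, and the interleave produces
{ lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }. */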
41718 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41719 const0_rtx, const0_rtx));
41720 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41721 const0_rtx, const0_rtx));
41722 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41724 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41727 void
41728 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41730 enum machine_mode mode = GET_MODE (op0);
41731 rtx t1, t2, t3, t4, t5, t6;
41733 if (TARGET_XOP && mode == V2DImode)
41735 /* op1: A,B,C,D, op2: E,F,G,H */
41736 op1 = gen_lowpart (V4SImode, op1);
41737 op2 = gen_lowpart (V4SImode, op2);
41739 t1 = gen_reg_rtx (V4SImode);
41740 t2 = gen_reg_rtx (V4SImode);
41741 t3 = gen_reg_rtx (V2DImode);
41742 t4 = gen_reg_rtx (V2DImode);
41744 /* t1: B,A,D,C */
41745 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41746 GEN_INT (1),
41747 GEN_INT (0),
41748 GEN_INT (3),
41749 GEN_INT (2)));
41751 /* t2: (B*E),(A*F),(D*G),(C*H) */
41752 emit_insn (gen_mulv4si3 (t2, t1, op2));
41754 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41755 emit_insn (gen_xop_phadddq (t3, t2));
41757 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41758 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41760 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41761 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41763 else
41765 enum machine_mode nmode;
41766 rtx (*umul) (rtx, rtx, rtx);
41768 if (mode == V2DImode)
41770 umul = gen_vec_widen_umult_even_v4si;
41771 nmode = V4SImode;
41773 else if (mode == V4DImode)
41775 umul = gen_vec_widen_umult_even_v8si;
41776 nmode = V8SImode;
41778 else
41779 gcc_unreachable ();
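/* The code below computes the low 64 bits of each product as
lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32), using only the
widening even multiply selected above and 64-bit element shifts and adds. */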
41782 /* Multiply low parts. */
41783 t1 = gen_reg_rtx (mode);
41784 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41786 /* Shift input vectors right 32 bits so we can multiply high parts. */
41787 t6 = GEN_INT (32);
41788 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41789 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41791 /* Multiply high parts by low parts. */
41792 t4 = gen_reg_rtx (mode);
41793 t5 = gen_reg_rtx (mode);
41794 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41795 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41797 /* Combine and shift the highparts back. */
41798 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41799 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41801 /* Combine high and low parts. */
41802 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41805 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41806 gen_rtx_MULT (mode, op1, op2));
41809 /* Expand an insert into a vector register through pinsr insn.
41810 Return true if successful. */
41812 bool
41813 ix86_expand_pinsr (rtx *operands)
41815 rtx dst = operands[0];
41816 rtx src = operands[3];
41818 unsigned int size = INTVAL (operands[1]);
41819 unsigned int pos = INTVAL (operands[2]);
41821 if (GET_CODE (dst) == SUBREG)
41823 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41824 dst = SUBREG_REG (dst);
41827 if (GET_CODE (src) == SUBREG)
41828 src = SUBREG_REG (src);
41830 switch (GET_MODE (dst))
41832 case V16QImode:
41833 case V8HImode:
41834 case V4SImode:
41835 case V2DImode:
41837 enum machine_mode srcmode, dstmode;
41838 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41840 srcmode = mode_for_size (size, MODE_INT, 0);
41842 switch (srcmode)
41844 case QImode:
41845 if (!TARGET_SSE4_1)
41846 return false;
41847 dstmode = V16QImode;
41848 pinsr = gen_sse4_1_pinsrb;
41849 break;
41851 case HImode:
41852 if (!TARGET_SSE2)
41853 return false;
41854 dstmode = V8HImode;
41855 pinsr = gen_sse2_pinsrw;
41856 break;
41858 case SImode:
41859 if (!TARGET_SSE4_1)
41860 return false;
41861 dstmode = V4SImode;
41862 pinsr = gen_sse4_1_pinsrd;
41863 break;
41865 case DImode:
41866 gcc_assert (TARGET_64BIT);
41867 if (!TARGET_SSE4_1)
41868 return false;
41869 dstmode = V2DImode;
41870 pinsr = gen_sse4_1_pinsrq;
41871 break;
41873 default:
41874 return false;
41877 dst = gen_lowpart (dstmode, dst);
41878 src = gen_lowpart (srcmode, src);
41880 pos /= size;
41882 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41883 return true;
41886 default:
41887 return false;
41891 /* This function returns the calling abi specific va_list type node.
41892 It returns the FNDECL specific va_list type. */
41894 static tree
41895 ix86_fn_abi_va_list (tree fndecl)
41897 if (!TARGET_64BIT)
41898 return va_list_type_node;
41899 gcc_assert (fndecl != NULL_TREE);
41901 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41902 return ms_va_list_type_node;
41903 else
41904 return sysv_va_list_type_node;
41907 /* Returns the canonical va_list type specified by TYPE. If there
41908 is no valid TYPE provided, it returns NULL_TREE. */
41910 static tree
41911 ix86_canonical_va_list_type (tree type)
41913 tree wtype, htype;
41915 /* Resolve references and pointers to va_list type. */
41916 if (TREE_CODE (type) == MEM_REF)
41917 type = TREE_TYPE (type);
41918 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41919 type = TREE_TYPE (type);
41920 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41921 type = TREE_TYPE (type);
41923 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41925 wtype = va_list_type_node;
41926 gcc_assert (wtype != NULL_TREE);
41927 htype = type;
41928 if (TREE_CODE (wtype) == ARRAY_TYPE)
41930 /* If va_list is an array type, the argument may have decayed
41931 to a pointer type, e.g. by being passed to another function.
41932 In that case, unwrap both types so that we can compare the
41933 underlying records. */
41934 if (TREE_CODE (htype) == ARRAY_TYPE
41935 || POINTER_TYPE_P (htype))
41937 wtype = TREE_TYPE (wtype);
41938 htype = TREE_TYPE (htype);
41941 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41942 return va_list_type_node;
41943 wtype = sysv_va_list_type_node;
41944 gcc_assert (wtype != NULL_TREE);
41945 htype = type;
41946 if (TREE_CODE (wtype) == ARRAY_TYPE)
41948 /* If va_list is an array type, the argument may have decayed
41949 to a pointer type, e.g. by being passed to another function.
41950 In that case, unwrap both types so that we can compare the
41951 underlying records. */
41952 if (TREE_CODE (htype) == ARRAY_TYPE
41953 || POINTER_TYPE_P (htype))
41955 wtype = TREE_TYPE (wtype);
41956 htype = TREE_TYPE (htype);
41959 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41960 return sysv_va_list_type_node;
41961 wtype = ms_va_list_type_node;
41962 gcc_assert (wtype != NULL_TREE);
41963 htype = type;
41964 if (TREE_CODE (wtype) == ARRAY_TYPE)
41966 /* If va_list is an array type, the argument may have decayed
41967 to a pointer type, e.g. by being passed to another function.
41968 In that case, unwrap both types so that we can compare the
41969 underlying records. */
41970 if (TREE_CODE (htype) == ARRAY_TYPE
41971 || POINTER_TYPE_P (htype))
41973 wtype = TREE_TYPE (wtype);
41974 htype = TREE_TYPE (htype);
41977 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41978 return ms_va_list_type_node;
41979 return NULL_TREE;
41981 return std_canonical_va_list_type (type);
41984 /* Iterate through the target-specific builtin types for va_list.
41985 IDX denotes the iterator, *PTREE is set to the result type of
41986 the va_list builtin, and *PNAME to its internal type.
41987 Returns zero if there is no element for this index, otherwise
41988 IDX should be increased upon the next call.
41989 Note, do not iterate a base builtin's name like __builtin_va_list.
41990 Used from c_common_nodes_and_builtins. */
41992 static int
41993 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41995 if (TARGET_64BIT)
41997 switch (idx)
41999 default:
42000 break;
42002 case 0:
42003 *ptree = ms_va_list_type_node;
42004 *pname = "__builtin_ms_va_list";
42005 return 1;
42007 case 1:
42008 *ptree = sysv_va_list_type_node;
42009 *pname = "__builtin_sysv_va_list";
42010 return 1;
42014 return 0;
42017 #undef TARGET_SCHED_DISPATCH
42018 #define TARGET_SCHED_DISPATCH has_dispatch
42019 #undef TARGET_SCHED_DISPATCH_DO
42020 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42021 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42022 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42023 #undef TARGET_SCHED_REORDER
42024 #define TARGET_SCHED_REORDER ix86_sched_reorder
42025 #undef TARGET_SCHED_ADJUST_PRIORITY
42026 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42027 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42028 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42029 ix86_dependencies_evaluation_hook
42031 /* The size of the dispatch window is the total number of bytes of
42032 object code allowed in a window. */
42033 #define DISPATCH_WINDOW_SIZE 16
42035 /* Number of dispatch windows considered for scheduling. */
42036 #define MAX_DISPATCH_WINDOWS 3
42038 /* Maximum number of instructions in a window. */
42039 #define MAX_INSN 4
42041 /* Maximum number of immediate operands in a window. */
42042 #define MAX_IMM 4
42044 /* Maximum number of immediate bits allowed in a window. */
42045 #define MAX_IMM_SIZE 128
42047 /* Maximum number of 32 bit immediates allowed in a window. */
42048 #define MAX_IMM_32 4
42050 /* Maximum number of 64 bit immediates allowed in a window. */
42051 #define MAX_IMM_64 2
42053 /* Maximum total of loads or prefetches allowed in a window. */
42054 #define MAX_LOAD 2
42056 /* Maximum total of stores allowed in a window. */
42057 #define MAX_STORE 1
42059 #undef BIG
42060 #define BIG 100
42063 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42064 enum dispatch_group {
42065 disp_no_group = 0,
42066 disp_load,
42067 disp_store,
42068 disp_load_store,
42069 disp_prefetch,
42070 disp_imm,
42071 disp_imm_32,
42072 disp_imm_64,
42073 disp_branch,
42074 disp_cmp,
42075 disp_jcc,
42076 disp_last
42079 /* Number of allowable groups in a dispatch window. It is an array
42080 indexed by dispatch_group enum. 100 is used as a big number,
42081 because the number of these kinds of operations does not have any
42082 effect in a dispatch window, but we need them for other reasons in
42083 the table. */
42084 static unsigned int num_allowable_groups[disp_last] = {
42085 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42088 char group_name[disp_last + 1][16] = {
42089 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42090 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42091 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42094 /* Instruction path. */
42095 enum insn_path {
42096 no_path = 0,
42097 path_single, /* Single micro op. */
42098 path_double, /* Double micro op. */
42099 path_multi, /* Instructions with more than 2 micro ops. */
42100 last_path
42103 /* sched_insn_info defines a window to the instructions scheduled in
42104 the basic block. It contains a pointer to the insn_info table and
42105 the instruction scheduled.
42107 Windows are allocated for each basic block and are linked
42108 together. */
42109 typedef struct sched_insn_info_s {
42110 rtx insn;
42111 enum dispatch_group group;
42112 enum insn_path path;
42113 int byte_len;
42114 int imm_bytes;
42115 } sched_insn_info;
42117 /* Linked list of dispatch windows. This is a two way list of
42118 dispatch windows of a basic block. It contains information about
42119 the number of uops in the window and the total number of
42120 instructions and of bytes in the object code for this dispatch
42121 window. */
42122 typedef struct dispatch_windows_s {
42123 int num_insn; /* Number of insn in the window. */
42124 int num_uops; /* Number of uops in the window. */
42125 int window_size; /* Number of bytes in the window. */
42126 int window_num; /* Window number, 0 or 1. */
42127 int num_imm; /* Number of immediate operands in the window. */
42128 int num_imm_32; /* Number of 32 bit immediates in the window. */
42129 int num_imm_64; /* Number of 64 bit immediates in the window. */
42130 int imm_size; /* Total size of immediates in the window. */
42131 int num_loads; /* Total memory loads in the window. */
42132 int num_stores; /* Total memory stores in the window. */
42133 int violation; /* Violation exists in window. */
42134 sched_insn_info *window; /* Pointer to the window. */
42135 struct dispatch_windows_s *next;
42136 struct dispatch_windows_s *prev;
42137 } dispatch_windows;
42139 /* Immediate values used in an insn. */
42140 typedef struct imm_info_s
42142 int imm;
42143 int imm32;
42144 int imm64;
42145 } imm_info;
42147 static dispatch_windows *dispatch_window_list;
42148 static dispatch_windows *dispatch_window_list1;
42150 /* Get dispatch group of insn. */
42152 static enum dispatch_group
42153 get_mem_group (rtx insn)
42155 enum attr_memory memory;
42157 if (INSN_CODE (insn) < 0)
42158 return disp_no_group;
42159 memory = get_attr_memory (insn);
42160 if (memory == MEMORY_STORE)
42161 return disp_store;
42163 if (memory == MEMORY_LOAD)
42164 return disp_load;
42166 if (memory == MEMORY_BOTH)
42167 return disp_load_store;
42169 return disp_no_group;
42172 /* Return true if insn is a compare instruction. */
42174 static bool
42175 is_cmp (rtx insn)
42177 enum attr_type type;
42179 type = get_attr_type (insn);
42180 return (type == TYPE_TEST
42181 || type == TYPE_ICMP
42182 || type == TYPE_FCMP
42183 || GET_CODE (PATTERN (insn)) == COMPARE);
42186 /* Return true if a dispatch violation was encountered. */
42188 static bool
42189 dispatch_violation (void)
42191 if (dispatch_window_list->next)
42192 return dispatch_window_list->next->violation;
42193 return dispatch_window_list->violation;
42196 /* Return true if insn is a branch instruction. */
42198 static bool
42199 is_branch (rtx insn)
42201 return (CALL_P (insn) || JUMP_P (insn));
42204 /* Return true if insn is a prefetch instruction. */
42206 static bool
42207 is_prefetch (rtx insn)
42209 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42212 /* This function initializes a dispatch window and the list container holding a
42213 pointer to the window. */
42215 static void
42216 init_window (int window_num)
42218 int i;
42219 dispatch_windows *new_list;
42221 if (window_num == 0)
42222 new_list = dispatch_window_list;
42223 else
42224 new_list = dispatch_window_list1;
42226 new_list->num_insn = 0;
42227 new_list->num_uops = 0;
42228 new_list->window_size = 0;
42229 new_list->next = NULL;
42230 new_list->prev = NULL;
42231 new_list->window_num = window_num;
42232 new_list->num_imm = 0;
42233 new_list->num_imm_32 = 0;
42234 new_list->num_imm_64 = 0;
42235 new_list->imm_size = 0;
42236 new_list->num_loads = 0;
42237 new_list->num_stores = 0;
42238 new_list->violation = false;
42240 for (i = 0; i < MAX_INSN; i++)
42242 new_list->window[i].insn = NULL;
42243 new_list->window[i].group = disp_no_group;
42244 new_list->window[i].path = no_path;
42245 new_list->window[i].byte_len = 0;
42246 new_list->window[i].imm_bytes = 0;
42248 return;
42251 /* This function allocates and initializes a dispatch window and the
42252 list container holding a pointer to the window. */
42254 static dispatch_windows *
42255 allocate_window (void)
42257 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42258 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42260 return new_list;
42263 /* This routine initializes the dispatch scheduling information. It
42264 initiates building dispatch scheduler tables and constructs the
42265 first dispatch window. */
42267 static void
42268 init_dispatch_sched (void)
42270 /* Allocate a dispatch list and a window. */
42271 dispatch_window_list = allocate_window ();
42272 dispatch_window_list1 = allocate_window ();
42273 init_window (0);
42274 init_window (1);
42277 /* This function returns true if a branch is detected. End of a basic block
42278 does not have to be a branch, but here we assume only branches end a
42279 window. */
42281 static bool
42282 is_end_basic_block (enum dispatch_group group)
42284 return group == disp_branch;
42287 /* This function is called when the end of a window processing is reached. */
42289 static void
42290 process_end_window (void)
42292 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42293 if (dispatch_window_list->next)
42295 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42296 gcc_assert (dispatch_window_list->window_size
42297 + dispatch_window_list1->window_size <= 48);
42298 init_window (1);
42300 init_window (0);
42303 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42304 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42305 for 48 bytes of instructions. Note that these windows are not dispatch
42306 windows whose size is DISPATCH_WINDOW_SIZE. */
42308 static dispatch_windows *
42309 allocate_next_window (int window_num)
42311 if (window_num == 0)
42313 if (dispatch_window_list->next)
42314 init_window (1);
42315 init_window (0);
42316 return dispatch_window_list;
42319 dispatch_window_list->next = dispatch_window_list1;
42320 dispatch_window_list1->prev = dispatch_window_list;
42322 return dispatch_window_list1;
42325 /* Increment the number of immediate operands of an instruction. */
42327 static int
42328 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42330 if (*in_rtx == 0)
42331 return 0;
42333 switch ( GET_CODE (*in_rtx))
42335 case CONST:
42336 case SYMBOL_REF:
42337 case CONST_INT:
42338 (imm_values->imm)++;
42339 if (x86_64_immediate_operand (*in_rtx, SImode))
42340 (imm_values->imm32)++;
42341 else
42342 (imm_values->imm64)++;
42343 break;
42345 case CONST_DOUBLE:
42346 (imm_values->imm)++;
42347 (imm_values->imm64)++;
42348 break;
42350 case CODE_LABEL:
42351 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42353 (imm_values->imm)++;
42354 (imm_values->imm32)++;
42356 break;
42358 default:
42359 break;
42362 return 0;
42365 /* Compute number of immediate operands of an instruction. */
42367 static void
42368 find_constant (rtx in_rtx, imm_info *imm_values)
42370 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42371 (rtx_function) find_constant_1, (void *) imm_values);
42374 /* Return total size of immediate operands of an instruction along with number
42375 of corresponding immediate operands. It initializes its parameters to zero
42376 before calling FIND_CONSTANT.
42377 INSN is the input instruction. IMM is the total of immediates.
42378 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42379 bit immediates. */
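/* For example, an insn with one 32-bit and one 64-bit immediate sets *IMM
to 2, *IMM32 to 1 and *IMM64 to 1, and returns 12. */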
42381 static int
42382 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42384 imm_info imm_values = {0, 0, 0};
42386 find_constant (insn, &imm_values);
42387 *imm = imm_values.imm;
42388 *imm32 = imm_values.imm32;
42389 *imm64 = imm_values.imm64;
42390 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
42393 /* This function indicates whether an instruction has any
42394 immediate operands. */
42396 static bool
42397 has_immediate (rtx insn)
42399 int num_imm_operand;
42400 int num_imm32_operand;
42401 int num_imm64_operand;
42403 if (insn)
42404 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42405 &num_imm64_operand);
42406 return false;
42409 /* Return single or double path for instructions. */
42411 static enum insn_path
42412 get_insn_path (rtx insn)
42414 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42416 if ((int)path == 0)
42417 return path_single;
42419 if ((int)path == 1)
42420 return path_double;
42422 return path_multi;
42425 /* Return insn dispatch group. */
42427 static enum dispatch_group
42428 get_insn_group (rtx insn)
42430 enum dispatch_group group = get_mem_group (insn);
42431 if (group)
42432 return group;
42434 if (is_branch (insn))
42435 return disp_branch;
42437 if (is_cmp (insn))
42438 return disp_cmp;
42440 if (has_immediate (insn))
42441 return disp_imm;
42443 if (is_prefetch (insn))
42444 return disp_prefetch;
42446 return disp_no_group;
42449 /* Count number of GROUP restricted instructions in a dispatch
42450 window WINDOW_LIST. */
42452 static int
42453 count_num_restricted (rtx insn, dispatch_windows *window_list)
42455 enum dispatch_group group = get_insn_group (insn);
42456 int imm_size;
42457 int num_imm_operand;
42458 int num_imm32_operand;
42459 int num_imm64_operand;
42461 if (group == disp_no_group)
42462 return 0;
42464 if (group == disp_imm)
42466 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42467 &num_imm64_operand);
42468 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42469 || num_imm_operand + window_list->num_imm > MAX_IMM
42470 || (num_imm32_operand > 0
42471 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42472 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42473 || (num_imm64_operand > 0
42474 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42475 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42476 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42477 && num_imm64_operand > 0
42478 && ((window_list->num_imm_64 > 0
42479 && window_list->num_insn >= 2)
42480 || window_list->num_insn >= 3)))
42481 return BIG;
42483 return 1;
42486 if ((group == disp_load_store
42487 && (window_list->num_loads >= MAX_LOAD
42488 || window_list->num_stores >= MAX_STORE))
42489 || ((group == disp_load
42490 || group == disp_prefetch)
42491 && window_list->num_loads >= MAX_LOAD)
42492 || (group == disp_store
42493 && window_list->num_stores >= MAX_STORE))
42494 return BIG;
42496 return 1;
42499 /* This function returns true if insn satisfies dispatch rules on the
42500 last window scheduled. */
42502 static bool
42503 fits_dispatch_window (rtx insn)
42505 dispatch_windows *window_list = dispatch_window_list;
42506 dispatch_windows *window_list_next = dispatch_window_list->next;
42507 unsigned int num_restrict;
42508 enum dispatch_group group = get_insn_group (insn);
42509 enum insn_path path = get_insn_path (insn);
42510 int sum;
42512 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
42513 instructions should be given the lowest priority in the
42514 scheduling process in the Haifa scheduler to make sure they will be
42515 scheduled in the same dispatch window as the reference to them. */
42516 if (group == disp_jcc || group == disp_cmp)
42517 return false;
42519 /* Check nonrestricted. */
42520 if (group == disp_no_group || group == disp_branch)
42521 return true;
42523 /* Get last dispatch window. */
42524 if (window_list_next)
42525 window_list = window_list_next;
42527 if (window_list->window_num == 1)
42529 sum = window_list->prev->window_size + window_list->window_size;
42531 if (sum == 32
42532 || (min_insn_size (insn) + sum) >= 48)
42533 /* Window 1 is full. Go for next window. */
42534 return true;
42537 num_restrict = count_num_restricted (insn, window_list);
42539 if (num_restrict > num_allowable_groups[group])
42540 return false;
42542 /* See if it fits in the first window. */
42543 if (window_list->window_num == 0)
42545 /* The first window should have only single and double path
42546 uops. */
42547 if (path == path_double
42548 && (window_list->num_uops + 2) > MAX_INSN)
42549 return false;
42550 else if (path != path_single)
42551 return false;
42553 return true;
42556 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42557 dispatch window WINDOW_LIST. */
42559 static void
42560 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42562 int byte_len = min_insn_size (insn);
42563 int num_insn = window_list->num_insn;
42564 int imm_size;
42565 sched_insn_info *window = window_list->window;
42566 enum dispatch_group group = get_insn_group (insn);
42567 enum insn_path path = get_insn_path (insn);
42568 int num_imm_operand;
42569 int num_imm32_operand;
42570 int num_imm64_operand;
42572 if (!window_list->violation && group != disp_cmp
42573 && !fits_dispatch_window (insn))
42574 window_list->violation = true;
42576 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42577 &num_imm64_operand);
42579 /* Initialize window with new instruction. */
42580 window[num_insn].insn = insn;
42581 window[num_insn].byte_len = byte_len;
42582 window[num_insn].group = group;
42583 window[num_insn].path = path;
42584 window[num_insn].imm_bytes = imm_size;
42586 window_list->window_size += byte_len;
42587 window_list->num_insn = num_insn + 1;
42588 window_list->num_uops = window_list->num_uops + num_uops;
42589 window_list->imm_size += imm_size;
42590 window_list->num_imm += num_imm_operand;
42591 window_list->num_imm_32 += num_imm32_operand;
42592 window_list->num_imm_64 += num_imm64_operand;
42594 if (group == disp_store)
42595 window_list->num_stores += 1;
42596 else if (group == disp_load
42597 || group == disp_prefetch)
42598 window_list->num_loads += 1;
42599 else if (group == disp_load_store)
42601 window_list->num_stores += 1;
42602 window_list->num_loads += 1;
42606 /* Add a scheduled instruction, INSN, to the current dispatch window.
42607 If the total bytes of the instructions or the number of instructions
42608 in the window exceeds the allowable limit, allocate a new window.  */
42610 static void
42611 add_to_dispatch_window (rtx insn)
42613 int byte_len;
42614 dispatch_windows *window_list;
42615 dispatch_windows *next_list;
42616 dispatch_windows *window0_list;
42617 enum insn_path path;
42618 enum dispatch_group insn_group;
42619 bool insn_fits;
42620 int num_insn;
42621 int num_uops;
42622 int window_num;
42623 int insn_num_uops;
42624 int sum;
42626 if (INSN_CODE (insn) < 0)
42627 return;
42629 byte_len = min_insn_size (insn);
42630 window_list = dispatch_window_list;
42631 next_list = window_list->next;
42632 path = get_insn_path (insn);
42633 insn_group = get_insn_group (insn);
42635 /* Get the last dispatch window. */
42636 if (next_list)
42637 window_list = dispatch_window_list->next;
42639 if (path == path_single)
42640 insn_num_uops = 1;
42641 else if (path == path_double)
42642 insn_num_uops = 2;
42643 else
42644 insn_num_uops = (int) path;
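/* Note: single- and double-uop paths were handled explicitly above;
   for any other path the enumerator's numeric value is used directly
   as an approximate uop count.  */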
42646 /* If the current window is full, get a new window.
42647 Window number zero is full if MAX_INSN uops are scheduled in it.
42648 Window number one is full if window zero's bytes plus window
42649 one's bytes reach 32, if adding the bytes of the new instruction
42650 would bring the total to 48 or more, or if it already holds
42651 MAX_INSN instructions.  */
42652 num_insn = window_list->num_insn;
42653 num_uops = window_list->num_uops;
42654 window_num = window_list->window_num;
42655 insn_fits = fits_dispatch_window (insn);
42657 if (num_insn >= MAX_INSN
42658 || num_uops + insn_num_uops > MAX_INSN
42659 || !(insn_fits))
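/* The current window is full (too many insns or uops) or INSN does
   not satisfy the dispatch rules: flip to the other dispatch window
   (0 <-> 1) and allocate it.  */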
42661 window_num = ~window_num & 1;
42662 window_list = allocate_next_window (window_num);
42665 if (window_num == 0)
42667 add_insn_window (insn, window_list, insn_num_uops);
42668 if (window_list->num_insn >= MAX_INSN
42669 && insn_group == disp_branch)
42671 process_end_window ();
42672 return;
42675 else if (window_num == 1)
42677 window0_list = window_list->prev;
42678 sum = window0_list->window_size + window_list->window_size;
42679 if (sum == 32
42680 || (byte_len + sum) >= 48)
42682 process_end_window ();
42683 window_list = dispatch_window_list;
42686 add_insn_window (insn, window_list, insn_num_uops);
42688 else
42689 gcc_unreachable ();
42691 if (is_end_basic_block (insn_group))
42693 /* End of basic block is reached; do end-of-basic-block processing.  */
42694 process_end_window ();
42695 return;
42699 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42701 DEBUG_FUNCTION static void
42702 debug_dispatch_window_file (FILE *file, int window_num)
42704 dispatch_windows *list;
42705 int i;
42707 if (window_num == 0)
42708 list = dispatch_window_list;
42709 else
42710 list = dispatch_window_list1;
42712 fprintf (file, "Window #%d:\n", list->window_num);
42713 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42714 list->num_insn, list->num_uops, list->window_size);
42715 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42716 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42718 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42719 list->num_stores);
42720 fprintf (file, " insn info:\n");
42722 for (i = 0; i < MAX_INSN; i++)
42724 if (!list->window[i].insn)
42725 break;
42726 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42727 i, group_name[list->window[i].group],
42728 i, (void *)list->window[i].insn,
42729 i, list->window[i].path,
42730 i, list->window[i].byte_len,
42731 i, list->window[i].imm_bytes);
42735 /* Print to stdout a dispatch window. */
42737 DEBUG_FUNCTION void
42738 debug_dispatch_window (int window_num)
42740 debug_dispatch_window_file (stdout, window_num);
42743 /* Print INSN dispatch information to FILE. */
42745 DEBUG_FUNCTION static void
42746 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42748 int byte_len;
42749 enum insn_path path;
42750 enum dispatch_group group;
42751 int imm_size;
42752 int num_imm_operand;
42753 int num_imm32_operand;
42754 int num_imm64_operand;
42756 if (INSN_CODE (insn) < 0)
42757 return;
42759 byte_len = min_insn_size (insn);
42760 path = get_insn_path (insn);
42761 group = get_insn_group (insn);
42762 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42763 &num_imm64_operand);
42765 fprintf (file, " insn info:\n");
42766 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42767 group_name[group], path, byte_len);
42768 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42769 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42772 /* Print to STDOUT the status of the ready list with respect to
42773 dispatch windows.  */
42775 DEBUG_FUNCTION void
42776 debug_ready_dispatch (void)
42778 int i;
42779 int no_ready = number_in_ready ();
42781 fprintf (stdout, "Number of ready: %d\n", no_ready);
42783 for (i = 0; i < no_ready; i++)
42784 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42787 /* This routine is the driver of the dispatch scheduler. */
42789 static void
42790 do_dispatch (rtx insn, int mode)
42792 if (mode == DISPATCH_INIT)
42793 init_dispatch_sched ();
42794 else if (mode == ADD_TO_DISPATCH_WINDOW)
42795 add_to_dispatch_window (insn);
42798 /* Return TRUE if dispatch scheduling is supported.  */
42800 static bool
42801 has_dispatch (rtx insn, int action)
42803 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42804 && flag_dispatch_scheduler)
42805 switch (action)
42807 default:
42808 return false;
42810 case IS_DISPATCH_ON:
42811 return true;
42814 case IS_CMP:
42815 return is_cmp (insn);
42817 case DISPATCH_VIOLATION:
42818 return dispatch_violation ();
42820 case FITS_DISPATCH_WINDOW:
42821 return fits_dispatch_window (insn);
42824 return false;
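/* Note: dispatch scheduling is in effect only when compiling for the
   Bulldozer family (TARGET_BDVER1/2/3) with -mdispatch-scheduler
   (flag_dispatch_scheduler); otherwise has_dispatch answers false for
   every query.  */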
42827 /* Implementation of the reassociation_width target hook used by the
42828 reassoc phase to identify the parallelism level in a reassociated
42829 tree.  The statement's tree_code is passed in OPC.  The mode of the
42830 arguments is passed in MODE.
42832 Currently parallel reassociation is enabled only for Atom
42833 processors, and we set the reassociation width to 2 because Atom
42834 may issue up to 2 instructions per cycle.
42836 The return value should be revisited if parallel reassociation is
42837 enabled for other processors.  */
42839 static int
42840 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42841 enum machine_mode mode)
42843 int res = 1;
42845 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42846 res = 2;
42847 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42848 res = 2;
42850 return res;
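/* Illustrative example (not from this file): with a reassociation
   width of 2, the reassoc pass may rewrite ((a + b) + c) + d as
   (a + b) + (c + d), exposing two independent additions that Atom
   can issue in parallel.  */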
42853 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42854 place emms and femms instructions. */
42856 static enum machine_mode
42857 ix86_preferred_simd_mode (enum machine_mode mode)
42859 if (!TARGET_SSE)
42860 return word_mode;
42862 switch (mode)
42864 case QImode:
42865 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42866 case HImode:
42867 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42868 case SImode:
42869 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42870 case DImode:
42871 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42873 case SFmode:
42874 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42875 return V8SFmode;
42876 else
42877 return V4SFmode;
42879 case DFmode:
42880 if (!TARGET_VECTORIZE_DOUBLE)
42881 return word_mode;
42882 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42883 return V4DFmode;
42884 else if (TARGET_SSE2)
42885 return V2DFmode;
42886 /* FALLTHRU */
42888 default:
42889 return word_mode;
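/* For example (illustrative): with -mavx and without -mprefer-avx128,
   SFmode data is vectorized in V8SFmode (eight floats per vector),
   while with only SSE enabled it stays in V4SFmode.  DFmode gets no
   vector mode at all when the tuning clears TARGET_VECTORIZE_DOUBLE.  */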
42893 /* If AVX is enabled, try vectorizing with both 256-bit and 128-bit
42894 vectors.  */
42896 static unsigned int
42897 ix86_autovectorize_vector_sizes (void)
42899 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
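/* The returned value is a bit mask of vector sizes in bytes to try:
   32 | 16 asks the vectorizer to attempt 256-bit vectors and fall back
   to 128-bit ones, while 0 means only the preferred SIMD mode above is
   used.  */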
42904 /* Return a class of registers that could be used to spill a pseudo of
42905 MODE and of class RCLASS instead of spilling it to memory.  Return
42906 NO_REGS if that is not possible or not profitable.  */
42907 static reg_class_t
42908 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42910 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42911 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42912 && INTEGER_CLASS_P (rclass))
42913 return SSE_REGS;
42914 return NO_REGS;
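/* In effect, on tunings with TARGET_GENERAL_REGS_SSE_SPILL this allows
   the register allocator to spill SImode (and, on 64-bit targets,
   DImode) integer pseudos into SSE registers rather than to stack
   slots, provided MMX is disabled.  */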
42917 /* Implement targetm.vectorize.init_cost. */
42919 static void *
42920 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42922 unsigned *cost = XNEWVEC (unsigned, 3);
42923 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42924 return cost;
42927 /* Implement targetm.vectorize.add_stmt_cost. */
42929 static unsigned
42930 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42931 struct _stmt_vec_info *stmt_info, int misalign,
42932 enum vect_cost_model_location where)
42934 unsigned *cost = (unsigned *) data;
42935 unsigned retval = 0;
42937 if (flag_vect_cost_model)
42939 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42940 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42942 /* Statements in an inner loop relative to the loop being
42943 vectorized are weighted more heavily. The value here is
42944 arbitrary and could potentially be improved with analysis. */
42945 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42946 count *= 50; /* FIXME. */
42948 retval = (unsigned) (count * stmt_cost);
42949 cost[where] += retval;
42952 return retval;
42955 /* Implement targetm.vectorize.finish_cost. */
42957 static void
42958 ix86_finish_cost (void *data, unsigned *prologue_cost,
42959 unsigned *body_cost, unsigned *epilogue_cost)
42961 unsigned *cost = (unsigned *) data;
42962 *prologue_cost = cost[vect_prologue];
42963 *body_cost = cost[vect_body];
42964 *epilogue_cost = cost[vect_epilogue];
42967 /* Implement targetm.vectorize.destroy_cost_data. */
42969 static void
42970 ix86_destroy_cost_data (void *data)
42972 free (data);
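/* Taken together, the four hooks above implement the vectorizer cost
   model bookkeeping: ix86_init_cost allocates a three-slot accumulator
   (prologue, body, epilogue), ix86_add_stmt_cost adds each statement's
   cost into the slot named by WHERE, ix86_finish_cost reads the totals
   back, and ix86_destroy_cost_data frees the accumulator.  */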
42975 /* Validate the target-specific memory model bits in VAL.  */
42977 static unsigned HOST_WIDE_INT
42978 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42980 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42981 bool strong;
42983 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42984 |MEMMODEL_MASK)
42985 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42987 warning (OPT_Winvalid_memory_model,
42988 "Unknown architecture specific memory model");
42989 return MEMMODEL_SEQ_CST;
42991 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42992 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42994 warning (OPT_Winvalid_memory_model,
42995 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42996 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42998 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43000 warning (OPT_Winvalid_memory_model,
43001 "HLE_RELEASE not used with RELEASE or stronger memory model");
43002 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43004 return val;
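/* Illustrative example (not part of this file), assuming the
   __ATOMIC_HLE_ACQUIRE/__ATOMIC_HLE_RELEASE macros GCC predefines on
   x86:

     __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);

   passes MEMMODEL_ACQUIRE together with IX86_HLE_ACQUIRE, which is
   accepted unchanged; combining IX86_HLE_ACQUIRE with a weaker model
   such as __ATOMIC_RELAXED triggers the warning above and falls back
   to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */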
43007 /* Initialize the GCC target structure. */
43008 #undef TARGET_RETURN_IN_MEMORY
43009 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43011 #undef TARGET_LEGITIMIZE_ADDRESS
43012 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43014 #undef TARGET_ATTRIBUTE_TABLE
43015 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43016 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43017 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43018 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43019 # undef TARGET_MERGE_DECL_ATTRIBUTES
43020 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43021 #endif
43023 #undef TARGET_COMP_TYPE_ATTRIBUTES
43024 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43026 #undef TARGET_INIT_BUILTINS
43027 #define TARGET_INIT_BUILTINS ix86_init_builtins
43028 #undef TARGET_BUILTIN_DECL
43029 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43030 #undef TARGET_EXPAND_BUILTIN
43031 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43033 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43034 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43035 ix86_builtin_vectorized_function
43037 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43038 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43040 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43041 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43043 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43044 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43046 #undef TARGET_BUILTIN_RECIPROCAL
43047 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43049 #undef TARGET_ASM_FUNCTION_EPILOGUE
43050 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43052 #undef TARGET_ENCODE_SECTION_INFO
43053 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43054 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43055 #else
43056 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43057 #endif
43059 #undef TARGET_ASM_OPEN_PAREN
43060 #define TARGET_ASM_OPEN_PAREN ""
43061 #undef TARGET_ASM_CLOSE_PAREN
43062 #define TARGET_ASM_CLOSE_PAREN ""
43064 #undef TARGET_ASM_BYTE_OP
43065 #define TARGET_ASM_BYTE_OP ASM_BYTE
43067 #undef TARGET_ASM_ALIGNED_HI_OP
43068 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43069 #undef TARGET_ASM_ALIGNED_SI_OP
43070 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43071 #ifdef ASM_QUAD
43072 #undef TARGET_ASM_ALIGNED_DI_OP
43073 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43074 #endif
43076 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43077 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43079 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43080 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43082 #undef TARGET_ASM_UNALIGNED_HI_OP
43083 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43084 #undef TARGET_ASM_UNALIGNED_SI_OP
43085 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43086 #undef TARGET_ASM_UNALIGNED_DI_OP
43087 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43089 #undef TARGET_PRINT_OPERAND
43090 #define TARGET_PRINT_OPERAND ix86_print_operand
43091 #undef TARGET_PRINT_OPERAND_ADDRESS
43092 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43093 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43094 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43095 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43096 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43098 #undef TARGET_SCHED_INIT_GLOBAL
43099 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43100 #undef TARGET_SCHED_ADJUST_COST
43101 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43102 #undef TARGET_SCHED_ISSUE_RATE
43103 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43104 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43105 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43106 ia32_multipass_dfa_lookahead
43108 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43109 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43111 #undef TARGET_MEMMODEL_CHECK
43112 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43114 #ifdef HAVE_AS_TLS
43115 #undef TARGET_HAVE_TLS
43116 #define TARGET_HAVE_TLS true
43117 #endif
43118 #undef TARGET_CANNOT_FORCE_CONST_MEM
43119 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43120 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43121 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43123 #undef TARGET_DELEGITIMIZE_ADDRESS
43124 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43126 #undef TARGET_MS_BITFIELD_LAYOUT_P
43127 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43129 #if TARGET_MACHO
43130 #undef TARGET_BINDS_LOCAL_P
43131 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43132 #endif
43133 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43134 #undef TARGET_BINDS_LOCAL_P
43135 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43136 #endif
43138 #undef TARGET_ASM_OUTPUT_MI_THUNK
43139 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43140 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43141 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43143 #undef TARGET_ASM_FILE_START
43144 #define TARGET_ASM_FILE_START x86_file_start
43146 #undef TARGET_OPTION_OVERRIDE
43147 #define TARGET_OPTION_OVERRIDE ix86_option_override
43149 #undef TARGET_REGISTER_MOVE_COST
43150 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43151 #undef TARGET_MEMORY_MOVE_COST
43152 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43153 #undef TARGET_RTX_COSTS
43154 #define TARGET_RTX_COSTS ix86_rtx_costs
43155 #undef TARGET_ADDRESS_COST
43156 #define TARGET_ADDRESS_COST ix86_address_cost
43158 #undef TARGET_FIXED_CONDITION_CODE_REGS
43159 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43160 #undef TARGET_CC_MODES_COMPATIBLE
43161 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43163 #undef TARGET_MACHINE_DEPENDENT_REORG
43164 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43166 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43167 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43169 #undef TARGET_BUILD_BUILTIN_VA_LIST
43170 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43172 #undef TARGET_FOLD_BUILTIN
43173 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43175 #undef TARGET_COMPARE_VERSION_PRIORITY
43176 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43178 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43179 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43180 ix86_generate_version_dispatcher_body
43182 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43183 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43184 ix86_get_function_versions_dispatcher
43186 #undef TARGET_ENUM_VA_LIST_P
43187 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43189 #undef TARGET_FN_ABI_VA_LIST
43190 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43192 #undef TARGET_CANONICAL_VA_LIST_TYPE
43193 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43195 #undef TARGET_EXPAND_BUILTIN_VA_START
43196 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43198 #undef TARGET_MD_ASM_CLOBBERS
43199 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43201 #undef TARGET_PROMOTE_PROTOTYPES
43202 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43203 #undef TARGET_STRUCT_VALUE_RTX
43204 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
43205 #undef TARGET_SETUP_INCOMING_VARARGS
43206 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
43207 #undef TARGET_MUST_PASS_IN_STACK
43208 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
43209 #undef TARGET_FUNCTION_ARG_ADVANCE
43210 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
43211 #undef TARGET_FUNCTION_ARG
43212 #define TARGET_FUNCTION_ARG ix86_function_arg
43213 #undef TARGET_FUNCTION_ARG_BOUNDARY
43214 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
43215 #undef TARGET_PASS_BY_REFERENCE
43216 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
43217 #undef TARGET_INTERNAL_ARG_POINTER
43218 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
43219 #undef TARGET_UPDATE_STACK_BOUNDARY
43220 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
43221 #undef TARGET_GET_DRAP_RTX
43222 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
43223 #undef TARGET_STRICT_ARGUMENT_NAMING
43224 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
43225 #undef TARGET_STATIC_CHAIN
43226 #define TARGET_STATIC_CHAIN ix86_static_chain
43227 #undef TARGET_TRAMPOLINE_INIT
43228 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
43229 #undef TARGET_RETURN_POPS_ARGS
43230 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
43232 #undef TARGET_LEGITIMATE_COMBINED_INSN
43233 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
43235 #undef TARGET_ASAN_SHADOW_OFFSET
43236 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
43238 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
43239 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
43241 #undef TARGET_SCALAR_MODE_SUPPORTED_P
43242 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
43244 #undef TARGET_VECTOR_MODE_SUPPORTED_P
43245 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
43247 #undef TARGET_C_MODE_FOR_SUFFIX
43248 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
43250 #ifdef HAVE_AS_TLS
43251 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
43252 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
43253 #endif
43255 #ifdef SUBTARGET_INSERT_ATTRIBUTES
43256 #undef TARGET_INSERT_ATTRIBUTES
43257 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
43258 #endif
43260 #undef TARGET_MANGLE_TYPE
43261 #define TARGET_MANGLE_TYPE ix86_mangle_type
43263 #if !TARGET_MACHO
43264 #undef TARGET_STACK_PROTECT_FAIL
43265 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
43266 #endif
43268 #undef TARGET_FUNCTION_VALUE
43269 #define TARGET_FUNCTION_VALUE ix86_function_value
43271 #undef TARGET_FUNCTION_VALUE_REGNO_P
43272 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
43274 #undef TARGET_PROMOTE_FUNCTION_MODE
43275 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
43277 #undef TARGET_MEMBER_TYPE_FORCES_BLK
43278 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
43280 #undef TARGET_INSTANTIATE_DECLS
43281 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
43283 #undef TARGET_SECONDARY_RELOAD
43284 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
43286 #undef TARGET_CLASS_MAX_NREGS
43287 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
43289 #undef TARGET_PREFERRED_RELOAD_CLASS
43290 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
43291 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
43292 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
43293 #undef TARGET_CLASS_LIKELY_SPILLED_P
43294 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
43296 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
43297 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
43298 ix86_builtin_vectorization_cost
43299 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
43300 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
43301 ix86_vectorize_vec_perm_const_ok
43302 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
43303 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
43304 ix86_preferred_simd_mode
43305 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
43306 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
43307 ix86_autovectorize_vector_sizes
43308 #undef TARGET_VECTORIZE_INIT_COST
43309 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
43310 #undef TARGET_VECTORIZE_ADD_STMT_COST
43311 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
43312 #undef TARGET_VECTORIZE_FINISH_COST
43313 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
43314 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
43315 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
43317 #undef TARGET_SET_CURRENT_FUNCTION
43318 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
43320 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
43321 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
43323 #undef TARGET_OPTION_SAVE
43324 #define TARGET_OPTION_SAVE ix86_function_specific_save
43326 #undef TARGET_OPTION_RESTORE
43327 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
43329 #undef TARGET_OPTION_PRINT
43330 #define TARGET_OPTION_PRINT ix86_function_specific_print
43332 #undef TARGET_OPTION_FUNCTION_VERSIONS
43333 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
43335 #undef TARGET_CAN_INLINE_P
43336 #define TARGET_CAN_INLINE_P ix86_can_inline_p
43338 #undef TARGET_EXPAND_TO_RTL_HOOK
43339 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
43341 #undef TARGET_LEGITIMATE_ADDRESS_P
43342 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
43344 #undef TARGET_LRA_P
43345 #define TARGET_LRA_P hook_bool_void_true
43347 #undef TARGET_REGISTER_PRIORITY
43348 #define TARGET_REGISTER_PRIORITY ix86_register_priority
43350 #undef TARGET_REGISTER_USAGE_LEVELING_P
43351 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
43353 #undef TARGET_LEGITIMATE_CONSTANT_P
43354 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
43356 #undef TARGET_FRAME_POINTER_REQUIRED
43357 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
43359 #undef TARGET_CAN_ELIMINATE
43360 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
43362 #undef TARGET_EXTRA_LIVE_ON_ENTRY
43363 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
43365 #undef TARGET_ASM_CODE_END
43366 #define TARGET_ASM_CODE_END ix86_code_end
43368 #undef TARGET_CONDITIONAL_REGISTER_USAGE
43369 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
43371 #if TARGET_MACHO
43372 #undef TARGET_INIT_LIBFUNCS
43373 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
43374 #endif
43376 #undef TARGET_SPILL_CLASS
43377 #define TARGET_SPILL_CLASS ix86_spill_class
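/* Build the target hook vector from the definitions above; any hook
   not explicitly overridden here keeps its default from target-def.h.  */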
43379 struct gcc_target targetm = TARGET_INITIALIZER;
43381 #include "gt-i386.h"